~bzr-pqm/bzr/bzr.dev : revision 6432

1

#

2

# ElementTree

3

# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $

4

#

5

# light-weight XML support for Python 1.5.2 and later.

6

#

7

# history:

8

# 2001-10-20 fl created (from various sources)

9

# 2001-11-01 fl return root from parse method

10

# 2002-02-16 fl sort attributes in lexical order

11

# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup

12

# 2002-05-01 fl finished TreeBuilder refactoring

13

# 2002-07-14 fl added basic namespace support to ElementTree.write

14

# 2002-07-25 fl added QName attribute support

15

# 2002-10-20 fl fixed encoding in write

16

# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding

17

# 2002-11-27 fl accept file objects or file names for parse/write

18

# 2002-12-04 fl moved XMLTreeBuilder back to this module

19

# 2003-01-11 fl fixed entity encoding glitch for us-ascii

20

# 2003-02-13 fl added XML literal factory

21

# 2003-02-21 fl added ProcessingInstruction/PI factory

22

# 2003-05-11 fl added tostring/fromstring helpers

23

# 2003-05-26 fl added ElementPath support

24

# 2003-07-05 fl added makeelement factory method

25

# 2003-07-28 fl added more well-known namespace prefixes

26

# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)

27

# 2003-09-04 fl fall back on emulator if ElementPath is not installed

28

# 2003-10-31 fl markup updates

29

# 2003-11-15 fl fixed nested namespace bug

30

# 2004-03-28 fl added XMLID helper

31

# 2004-06-02 fl added default support to findtext

32

# 2004-06-08 fl fixed encoding of non-ascii element/attribute names

33

# 2004-08-23 fl take advantage of post-2.1 expat features

34

# 2005-02-01 fl added iterparse implementation

35

# 2005-03-02 fl fixed iterparse support for pre-2.2 versions

36

#

37

38

#

39

# fredrik@pythonware.com

40

# http://www.pythonware.com

41

#

42

# --------------------------------------------------------------------

43

# The ElementTree toolkit is

44

#

45

46

#

47

# By obtaining, using, and/or copying this software and/or its

48

# associated documentation, you agree that you have read, understood,

49

# and will comply with the following terms and conditions:

50

#

51

# Permission to use, copy, modify, and distribute this software and

52

# its associated documentation for any purpose and without fee is

53

# hereby granted, provided that the above copyright notice appears in

54

# all copies, and that both that copyright notice and this permission

55

# notice appear in supporting documentation, and that the name of

56

# Secret Labs AB or the author not be used in advertising or publicity

57

# pertaining to distribution of the software without specific, written

58

# prior permission.

59

#

60

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

61

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

62

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

63

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

64

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

65

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

66

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

67

# OF THIS SOFTWARE.

68

# --------------------------------------------------------------------

69

70

from __future__ import absolute_import

71

72

# Names exported by "from ElementTree import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement", "iterparse",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML",
    "XMLTreeBuilder",
    ]

88

89

##

90

# The Element type is a flexible container object, designed to

91

# store hierarchical data structures in memory. The type can be

92

# described as a cross between a list and a dictionary.

93

#

94

# Each element has a number of properties associated with it:

95

# <ul>

96

# <li>a tag. This is a string identifying what kind of data

97

# this element represents (the element type, in other words).</li>

98

# <li>a number of attributes, stored in a Python dictionary.</li>

99

# <li>a text string.</li>

100

# <li>an optional tail string.</li>

101

# <li>a number of child elements, stored in a Python sequence</li>

102

# </ul>

103

#

104

# To create an element instance, use the {@link #Element} or {@link

105

# #SubElement} factory functions.

106

#

107

# The {@link #ElementTree} class can be used to wrap an element

108

# structure, and convert it from and to XML.

109

##

110

111

import string, sys, re

112

113

class _SimpleElementPath:
    """Fallback used when the ElementPath module cannot be imported.

    Only plain tag names are understood: find, findtext and findall
    compare the given tag against the direct children of the element.
    findall additionally accepts a leading ".//", which it delegates to
    the element's getiterator method.
    """
    # emulate pre-1.2 find/findtext/findall behaviour

    def find(self, element, tag):
        """Return the first direct child whose tag equals tag, or None."""
        for child in element:
            if child.tag == tag:
                return child
        return None

    def findtext(self, element, tag, default=None):
        """Return the text of the first matching child.

        A matching child without text yields "", not the default; the
        default is only returned when no child matches.
        """
        match = self.find(element, tag)
        if match is None:
            return default
        return match.text or ""

    def findall(self, element, tag):
        """Return all direct children matching tag.

        If tag starts with ".//", the rest of the tag is passed to
        element.getiterator() and that result is returned instead.
        """
        if tag[:3] == ".//":
            return element.getiterator(tag[3:])
        return [child for child in element if child.tag == tag]

133

134

# Prefer the full ElementPath implementation; fall back on the minimal
# emulation defined in this module when it is not importable.
# NOTE(review): this module enables absolute_import, so the import below
# only succeeds if a top-level "ElementPath" module exists -- confirm
# that this is intended.
try:
    import ElementPath
except ImportError:
    # FIXME: issue warning in this case?
    ElementPath = _SimpleElementPath()

# TODO: add support for custom namespace resolvers/default namespaces
# TODO: add improved support for incremental parsing

VERSION = "1.2.6"

144

145

##

146

# Internal element class. This class defines the Element interface,

147

# and provides a reference implementation of this interface.

148

#

149

# You should not create instances of this class directly. Use the

150

# appropriate factory functions instead, such as {@link #Element}

151

# and {@link #SubElement}.

152

#

153

# @see Element

154

# @see SubElement

155

# @see Comment

156

# @see ProcessingInstruction

157

158

class _ElementInterface:
    """Reference implementation of the Element interface.

    An element carries a tag, an attribute dictionary, optional text and
    tail strings, and an ordered list of child elements.  Instances are
    normally created through the Element and SubElement factories.
    """
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #_ElementInterface.get},
    # {@link #_ElementInterface.set},
    # {@link #_ElementInterface.keys}, and
    # {@link #_ElementInterface.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.

    tail = None # text after end tag, if any

    def __init__(self, tag, attrib):
        # attrib is stored as given (not copied); the factory functions
        # copy it before calling this constructor.
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    ##
    # Returns the given subelement.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement.  If the index is a slice object
    # (extended slicing), the value must be a sequence of elements.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        if isinstance(index, slice):
            # elem[a:b:c] = seq arrives here, not in __setslice__;
            # validate the members rather than the sequence itself
            element = list(element)
            for item in element:
                assert iselement(item)
        else:
            assert iselement(element)
        self._children[index] = element

    ##
    # Deletes the given subelement.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        # materialise first: validating a one-shot iterator would
        # exhaust it and leave nothing to assign
        elements = list(elements)
        for element in elements:
            assert iselement(element)
        self._children[start:stop] = elements

    ##
    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement.  Unlike the find methods,
    # this method compares elements based on identity, not on tag
    # value or contents.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # Returns all subelements.  The elements are returned in document
    # order.  The returned list is the element's own child list, not
    # a copy.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.

    def clear(self):
        # the attribute dictionary is emptied in place; the child list
        # is replaced by a new list
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets a list of attribute names.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as a sequence.  The attributes are
    # returned in an arbitrary order.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    #
    # If the tree structure is modified during iteration, the result
    # is undefined.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return A list or iterator containing all the matching elements.
    # @defreturn list or iterator

    def getiterator(self, tag=None):
        nodes = []
        if tag == "*":
            tag = None # "*" is a wildcard, same as no filter
        if tag is None or self.tag == tag:
            nodes.append(self)
        for node in self._children:
            nodes.extend(node.getiterator(tag))
        return nodes

# compatibility
_Element = _ElementInterface

430

431

##

432

# Element factory. This function returns an object implementing the

433

# standard Element interface. The exact class or type of that object

434

# is implementation dependent, but it will always be compatible with

435

# the {@link #_ElementInterface} class in this module.

436

#

437

# The element name, attribute names, and attribute values can be

438

# either 8-bit ASCII strings or Unicode strings.

439

#

440

# @param tag The element name.

441

# @param attrib An optional dictionary, containing element attributes.

442

# @param **extra Additional attributes, given as keyword arguments.

443

# @return An element instance.

444

# @defreturn Element

445

446

def Element(tag, attrib={}, **extra):
    """Create an element with the given tag and attributes.

    The attributes are the union of a copy of attrib and the keyword
    arguments; keyword arguments win on conflict.
    """
    # The shared default dictionary is never mutated: it is copied
    # before the keyword attributes are merged in.
    attributes = attrib.copy()
    attributes.update(extra)
    return _ElementInterface(tag, attributes)

450

451

##

452

# Subelement factory. This function creates an element instance, and

453

# appends it to an existing element.

454

#

455

# The element name, attribute names, and attribute values can be

456

# either 8-bit ASCII strings or Unicode strings.

457

#

458

# @param parent The parent element.

459

# @param tag The subelement name.

460

# @param attrib An optional dictionary, containing element attributes.

461

# @param **extra Additional attributes, given as keyword arguments.

462

# @return An element instance.

463

# @defreturn Element

464

465

def SubElement(parent, tag, attrib={}, **extra):
    """Create an element via parent.makeelement and append it to parent.

    The attributes are the union of a copy of attrib and the keyword
    arguments; keyword arguments win on conflict.  Returns the new
    element.
    """
    # The shared default dictionary is never mutated: it is copied
    # before the keyword attributes are merged in.
    attributes = attrib.copy()
    attributes.update(extra)
    element = parent.makeelement(tag, attributes)
    parent.append(element)
    return element

471

472

##

473

# Comment element factory. This factory function creates a special

474

# element that will be serialized as an XML comment.

475

#

476

# The comment string can be either an 8-bit ASCII string or a Unicode

477

# string.

478

#

479

# @param text A string containing the comment string.

480

# @return An element instance, representing a comment.

481

# @defreturn Element

482

483

def Comment(text=None):
    """Create a comment element.

    The element's tag is the Comment function object itself, which is
    how the serializer recognises comments; text holds the comment
    string.
    """
    element = Element(Comment)
    element.text = text
    return element

487

488

##

489

# PI element factory. This factory function creates a special element

490

# that will be serialized as an XML processing instruction.

491

#

492

# @param target A string containing the PI target.

493

# @param text A string containing the PI contents, if any.

494

# @return An element instance, representing a PI.

495

# @defreturn Element

496

497

def ProcessingInstruction(target, text=None):
    """Create a processing instruction element.

    The element's tag is the ProcessingInstruction function object
    itself; its text is the target, followed by a space and text when
    text is non-empty.
    """
    element = Element(ProcessingInstruction)
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element

PI = ProcessingInstruction

505

506

##

507

# QName wrapper. This can be used to wrap a QName attribute value, in

508

# order to get proper namespace handling on output.

509

#

510

# @param text A string containing the QName value, in the form {uri}local,

511

# or, if the tag argument is given, the URI part of a QName.

512

# @param tag Optional tag. If given, the first argument is interpreted as

513

# an URI, and this argument is interpreted as a local name.

514

# @return An opaque object, representing the QName.

515

516

class QName:
    """Wrapper marking a string as a qualified name ({uri}local).

    Instances hash and compare by their text, and compare against plain
    strings as that text would.
    """

    def __init__(self, text_or_uri, tag=None):
        # with a tag, the first argument is the namespace URI
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    def __cmp__(self, other):
        if isinstance(other, QName):
            return cmp(self.text, other.text)
        return cmp(self.text, other)

    # Rich comparisons mirror __cmp__, so that equality and ordering
    # also work where __cmp__ is not consulted.

    def __eq__(self, other):
        if isinstance(other, QName):
            return self.text == other.text
        return self.text == other

    def __ne__(self, other):
        if isinstance(other, QName):
            return self.text != other.text
        return self.text != other

    def __lt__(self, other):
        if isinstance(other, QName):
            return self.text < other.text
        return self.text < other

    def __le__(self, other):
        if isinstance(other, QName):
            return self.text <= other.text
        return self.text <= other

    def __gt__(self, other):
        if isinstance(other, QName):
            return self.text > other.text
        return self.text > other

    def __ge__(self, other):
        if isinstance(other, QName):
            return self.text >= other.text
        return self.text >= other

529

530

##

531

# ElementTree wrapper class. This class represents an entire element

532

# hierarchy, and adds some extra support for serialization to and from

533

# standard XML.

534

#

535

# @param element Optional root element.

536

# @keyparam file Optional file handle or name. If given, the

537

# tree is initialized with the contents of this XML file.

538

539

class ElementTree:
    """Wrapper around a root element, adding XML parsing and writing."""

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    #
    # @param source A file name or file object.  A file opened here
    #     from a file name is closed again before returning; a file
    #     object passed in by the caller is left open.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        opened_here = getattr(source, "read", None) is None
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path # make the path relative to the root
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    #
    # @param file A file name, or a file object opened for writing.
    #     A file opened here from a file name is closed again before
    #     returning; a file object passed in by the caller is left open.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        opened_here = getattr(file, "write", None) is None
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only written for other encodings
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        #
        # namespaces maps namespace URIs to the prefixes declared by
        # the enclosing elements; declarations added for this node are
        # removed again once the node has been written.
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = list(node.items())
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            for k, v in xmlns_items:
                # v is the namespace URI, which is the key in namespaces
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))

714

715

# --------------------------------------------------------------------

716

# helpers

717

718

##

719

# Checks if an object appears to be a valid element object.

720

#

721

# @param element An element instance.

722

# @return A true value if this is an element object.

723

# @defreturn flag

724

725

def iselement(element):
    """Return a true value if element looks like an element object.

    That is the case for instances of _ElementInterface and for any
    other object that has a tag attribute which is not None.
    """
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    if isinstance(element, _ElementInterface):
        return True
    return getattr(element, "tag", None) is not None

729

730

##

731

# Writes an element tree or element structure to sys.stdout. This

732

# function should be used for debugging only.

733

#

734

# The exact output format is implementation dependent. In this

735

# version, it's written as an ordinary XML file.

736

#

737

# @param elem An element tree or an individual element.

738

739

def dump(elem):
    """Write an element tree or element to sys.stdout, for debugging.

    A newline is added unless the root element's tail already ends
    with one.
    """
    # debugging
    if not isinstance(elem, ElementTree):
        elem = ElementTree(elem)
    elem.write(sys.stdout)
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")

747

748

def _encode(s, encoding):
    """Return s.encode(encoding), or s itself if it has no encode method."""
    try:
        return s.encode(encoding)
    except AttributeError:
        return s # 1.5.2: assume the string uses the right encoding

753

754

# Pattern matching runs of characters that _encode_entity must replace:
# the markup characters & < > " and everything outside the ASCII range.
if sys.version[:3] == "1.5":
    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
else:
    # eval is applied to a constant literal only; it keeps the unicode
    # literal away from parsers that do not know the u"" syntax.
    _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))

758

759

# Reserved markup characters and the entity references that replace
# them; used by _encode_entity.
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

765

766

# Maps namespace URIs to the prefixes that fixtag prefers for them.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}

773

774

def _raise_serialization_error(text):
    """Raise a TypeError naming the value that could not be serialized."""
    message = "cannot serialize %r (type %s)" % (text, type(text).__name__)
    raise TypeError(message)

778

779

def _encode_entity(text, pattern=_escape):
    """Replace every character matched by pattern with an entity.

    Characters found in _escape_map become the mapped entity; all other
    matched characters become numerical character references.  The
    result is passed through _encode with the "ascii" encoding.  A
    TypeError raised along the way is reported as a serialization
    error.
    """
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(m, table=_escape_map):
        pieces = []
        for char in m.group():
            entity = table.get(char)
            if entity is None:
                entity = "&#%d;" % ord(char)
            pieces.append(entity)
        return "".join(pieces)
    try:
        return _encode(pattern.sub(escape_entities, text), "ascii")
    except TypeError:
        _raise_serialization_error(text)

794

795

#

796

# the following functions assume an ascii-compatible encoding

797

# (or "utf-16")

798

799

def _escape_cdata(text, encoding=None, replace=string.replace):
    """Escape &, < and > in character data.

    If an encoding is given, the text is encoded first; text that the
    encoding cannot represent is handed to _encode_entity instead.  A
    TypeError or AttributeError raised along the way is reported as a
    serialization error.
    """
    # escape character data
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" must be replaced first, or the entities added below
        # would be escaped a second time
        text = replace(text, "&", "&amp;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

813

814

def _escape_attrib(text, encoding=None, replace=string.replace):
    """Escape &, both quote characters, < and > in an attribute value.

    If an encoding is given, the text is encoded first; text that the
    encoding cannot represent is handed to _encode_entity instead.  A
    TypeError or AttributeError raised along the way is reported as a
    serialization error.
    """
    # escape attribute value
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" must be replaced first, or the entities added below
        # would be escaped a second time
        text = replace(text, "&", "&amp;")
        text = replace(text, "'", "&apos;") # FIXME: overkill
        text = replace(text, "\"", "&quot;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

830

831

def fixtag(tag, namespaces):
    """Turn a {uri}local name into a prefix:local name.

    namespaces maps namespace URIs to prefixes already in scope.  A URI
    not found there gets a prefix from _namespace_map, or a generated
    "ns<N>" prefix, and is added to namespaces.

    Returns a (prefixed name, xmlns) tuple, where xmlns is an
    (attribute name, uri) pair for a newly needed declaration, or None
    if the prefix was already in scope or is the "xml" prefix.
    """
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, local = tag[1:].split("}", 1)
    xmlns = None
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix != "xml":
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, local), xmlns

850

851

##

852

# Parses an XML document into an element tree.

853

#

854

# @param source A filename or file object containing XML data.

855

# @param parser An optional parser instance. If not given, the

856

# standard {@link XMLTreeBuilder} parser is used.

857

# @return An ElementTree instance

858

859

def parse(source, parser=None):
    """Parse source into a new ElementTree and return the tree.

    source and parser are passed unchanged to ElementTree.parse.
    """
    tree = ElementTree()
    tree.parse(source, parser)
    return tree

863

864

##

865

# Parses an XML document into an element tree incrementally, and reports

866

# what's going on to the user.

867

#

868

# @param source A filename or file object containing XML data.

869

# @param events A list of events to report back. If omitted, only "end"

870

# events are reported.

871

# @return A (event, elem) iterator.

872

873

class iterparse:

874

875

def __init__(self, source, events=None):

876

if getattr(source, "read", None) is None:

877

source = open(source, "rb")

878

self._file = source

879

self._events = []

880

self._index = 0

881

self.root = self._root = None

882

self._parser = XMLTreeBuilder()

883

# wire up the parser for event reporting

884

parser = self._parser._parser

885

append = self._events.append

886

if events is None:

887

events = ["end"]

888

for event in events:

889

if event == "start":

890

try:

891

parser.ordered_attributes = 1

892

parser.specified_attributes = 1

893

def handler(tag, attrib_in, event=event, append=append,

894

start=self._parser._start_list):

895

append((event, start(tag, attrib_in)))

896

parser.StartElementHandler = handler

897

except AttributeError:

898

def handler(tag, attrib_in, event=event, append=append,

899

start=self._parser._start):

900

append((event, start(tag, attrib_in)))

901

parser.StartElementHandler = handler

902

elif event == "end":

903

def handler(tag, event=event, append=append,

904

end=self._parser._end):

905

append((event, end(tag)))

906

parser.EndElementHandler = handler

907

elif event == "start-ns":

908

def handler(prefix, uri, event=event, append=append):

909

try:

910

uri = _encode(uri, "ascii")

911

except UnicodeError:

912

pass

913

append((event, (prefix or "", uri)))

914

parser.StartNamespaceDeclHandler = handler

915

elif event == "end-ns":

916

def handler(prefix, event=event, append=append):

917

append((event, None))

918

parser.EndNamespaceDeclHandler = handler

919

920

def next(self):

921

while 1:

922

try:

923

item = self._events[self._index]

924

except IndexError:

925

if self._parser is None:

926

self.root = self._root

927

try:

928

raise StopIteration

929

except NameError:

930

raise IndexError

931

# load event buffer

932

del self._events[:]

933

self._index = 0

934

data = self._file.read(16384)

935

if data:

936

self._parser.feed(data)

937

else:

938

self._root = self._parser.close()

939

self._parser = None

940

else:

941

self._index = self._index + 1

942

return item

943

944

# Choose which iteration hook to define: if the "iter" builtin exists,
# define __iter__; if looking it up raises NameError, define __getitem__
# instead.
try:
    iter # bare name lookup; raises NameError where the builtin is missing
    def __iter__(self):
        # the object acts as its own iterator
        return self
except NameError:
    def __getitem__(self, index):
        # the index is ignored; every lookup returns the next event
        return self.next()

951

952

##
# Builds an element structure from XML data held in a string.  Handy
# for embedding "XML literals" directly in Python code.
#
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

def XML(text):
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    return root

964

965

##
# Builds an element structure from XML data held in a string, and also
# collects a dictionary mapping each non-empty "id" attribute value to
# the element that carries it.
#
# @param text A string containing XML data.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)

def XMLID(text):
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    id_map = {}
    for node in root.getiterator():
        node_id = node.get("id")
        if node_id:
            # a later element with the same id replaces an earlier one
            id_map[node_id] = node
    return root, id_map

983

984

##

985

# Parses an XML document from a string constant. Same as {@link #XML}.

986

#

987

# @def fromstring(text)

988

# @param source A string containing XML data.

989

# @return An Element instance.

990

# @defreturn Element

991

992

fromstring = XML

993

994

##
# Generates a string representation of an XML element, including all
# subelements.
#
# @param element An Element instance.
# @param encoding Optional output encoding; passed unchanged to
#     {@link #ElementTree.write}.
# @return An encoded string containing the XML data.
# @defreturn string

def tostring(element, encoding=None):
    # minimal file-like object: write() just collects the fragments
    class dummy:
        pass
    data = []
    sink = dummy()
    sink.write = data.append
    ElementTree(element).write(sink, encoding)
    # str.join replaces the deprecated string.join(data, "")
    return "".join(data)

1010

1011

##
# Generic element structure builder.  This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
# <p>
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory.  This factory
#    is called to create new Element instances, as necessary.

class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    # Attaches any collected character data to the most recent element:
    # as its tail if that element has already been closed, otherwise as
    # its text.  Data collected before the first element is discarded.

    def _flush(self):
        if self._data:
            if self._last is not None:
                # str.join replaces the deprecated string.join(seq, "")
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # nest the new element under the one currently open
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last

1098

1099

##
# Element structure builder for XML source data, based on the
# <b>expat</b> parser.
#
# @keyparam target Target object.  If omitted, the builder uses an
#     instance of the standard {@link #TreeBuilder} class.
# @keyparam html Predefine HTML entities.  This flag is not supported
#     by the current implementation.
# @see #ElementTree
# @see #TreeBuilder

class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is the namespace separator; _fixname adds the leading "{"
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # entity name -> replacement text, see _default

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for attributes delivered as a dictionary
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for attributes delivered as a flat
        # [name, value, name, value, ...] sequence
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # handles everything the other callbacks do not: undefined
        # entity references and the tokens of a doctype declaration
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    # not enough tokens yet, or an unrecognised form
                    return
                if pubid:
                    pubid = pubid[1:-1] # strip the surrounding quotes
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree