~bzr-pqm/bzr/bzr.dev : revision 628

1

#

2

# ElementTree

3

# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $

4

#

5

# light-weight XML support for Python 1.5.2 and later.

6

#

7

# history:

8

# 2001-10-20 fl created (from various sources)

9

# 2001-11-01 fl return root from parse method

10

# 2002-02-16 fl sort attributes in lexical order

11

# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup

12

# 2002-05-01 fl finished TreeBuilder refactoring

13

# 2002-07-14 fl added basic namespace support to ElementTree.write

14

# 2002-07-25 fl added QName attribute support

15

# 2002-10-20 fl fixed encoding in write

16

# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding

17

# 2002-11-27 fl accept file objects or file names for parse/write

18

# 2002-12-04 fl moved XMLTreeBuilder back to this module

19

# 2003-01-11 fl fixed entity encoding glitch for us-ascii

20

# 2003-02-13 fl added XML literal factory

21

# 2003-02-21 fl added ProcessingInstruction/PI factory

22

# 2003-05-11 fl added tostring/fromstring helpers

23

# 2003-05-26 fl added ElementPath support

24

# 2003-07-05 fl added makeelement factory method

25

# 2003-07-28 fl added more well-known namespace prefixes

26

# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)

27

# 2003-09-04 fl fall back on emulator if ElementPath is not installed

28

# 2003-10-31 fl markup updates

29

# 2003-11-15 fl fixed nested namespace bug

30

# 2004-03-28 fl added XMLID helper

31

# 2004-06-02 fl added default support to findtext

32

# 2004-06-08 fl fixed encoding of non-ascii element/attribute names

33

# 2004-08-23 fl take advantage of post-2.1 expat features

34

# 2005-02-01 fl added iterparse implementation

35

# 2005-03-02 fl fixed iterparse support for pre-2.2 versions

36

#

37

38

#

39

# fredrik@pythonware.com

40

# http://www.pythonware.com

41

#

42

# --------------------------------------------------------------------

43

# The ElementTree toolkit is

44

#

45

46

#

47

# By obtaining, using, and/or copying this software and/or its

48

# associated documentation, you agree that you have read, understood,

49

# and will comply with the following terms and conditions:

50

#

51

# Permission to use, copy, modify, and distribute this software and

52

# its associated documentation for any purpose and without fee is

53

# hereby granted, provided that the above copyright notice appears in

54

# all copies, and that both that copyright notice and this permission

55

# notice appear in supporting documentation, and that the name of

56

# Secret Labs AB or the author not be used in advertising or publicity

57

# pertaining to distribution of the software without specific, written

58

# prior permission.

59

#

60

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

61

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

62

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

63

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

64

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

65

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

66

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

67

# OF THIS SOFTWARE.

68

# --------------------------------------------------------------------

69

70

# Names exported by "from ElementTree import *".  XMLID is a documented
# public helper in this module, so it is exported along with XML.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement", "iterparse",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML", "XMLID",
    "XMLTreeBuilder",
    ]

86

87

##

88

# The Element type is a flexible container object, designed to

89

# store hierarchical data structures in memory. The type can be

90

# described as a cross between a list and a dictionary.

91

#

92

# Each element has a number of properties associated with it:

93

# <ul>

94

# <li>a tag. This is a string identifying what kind of data

95

# this element represents (the element type, in other words).</li>

96

# <li>a number of attributes, stored in a Python dictionary.</li>

97

# <li>a text string.</li>

98

# <li>an optional tail string.</li>

99

# <li>a number of child elements, stored in a Python sequence</li>

100

# </ul>

101

#

102

# To create an element instance, use the {@link #Element} or {@link

103

# #SubElement} factory functions.

104

#

105

# The {@link #ElementTree} class can be used to wrap an element

106

# structure, and convert it from and to XML.

107

##

108

109

import string, sys, re

110

111

class _SimpleElementPath:

112

# emulate pre-1.2 find/findtext/findall behaviour

113

def find(self, element, tag):

114

for elem in element:

115

if elem.tag == tag:

116

return elem

117

return None

118

def findtext(self, element, tag, default=None):

119

for elem in element:

120

if elem.tag == tag:

121

return elem.text or ""

122

return default

123

def findall(self, element, tag):

124

if tag[:3] == ".//":

125

return element.getiterator(tag[3:])

126

result = []

127

for elem in element:

128

if elem.tag == tag:

129

result.append(elem)

130

return result

131

132

try:

133

import ElementPath

134

except ImportError:

135

# FIXME: issue warning in this case?

136

ElementPath = _SimpleElementPath()

137

138

# TODO: add support for custom namespace resolvers/default namespaces

139

# TODO: add improved support for incremental parsing

140

141

VERSION = "1.2.6"

142

143

##

144

# Internal element class. This class defines the Element interface,

145

# and provides a reference implementation of this interface.

146

#

147

# You should not create instances of this class directly. Use the

148

# appropriate factory functions instead, such as {@link #Element}

149

# and {@link #SubElement}.

150

#

151

# @see Element

152

# @see SubElement

153

# @see Comment

154

# @see ProcessingInstruction

155

156

class _ElementInterface:

157

# <tag attrib>text<child/>...</tag>tail

158

159

##

160

# (Attribute) Element tag.

161

162

tag = None

163

164

##

165

# (Attribute) Element attribute dictionary. Where possible, use

166

# {@link #_ElementInterface.get},

167

# {@link #_ElementInterface.set},

168

# {@link #_ElementInterface.keys}, and

169

# {@link #_ElementInterface.items} to access

170

# element attributes.

171

172

attrib = None

173

174

##

175

# (Attribute) Text before first subelement. This is either a

176

# string or the value None, if there was no text.

177

178

text = None

179

180

##

181

# (Attribute) Text after this element's end tag, but before the

182

# next sibling element's start tag. This is either a string or

183

# the value None, if there was no text.

184

185

tail = None # text after end tag, if any

186

187

def __init__(self, tag, attrib):

188

self.tag = tag

189

self.attrib = attrib

190

self._children = []

191

192

def __repr__(self):

193

return "<Element %s at %x>" % (self.tag, id(self))

194

195

##

196

# Creates a new element object of the same type as this element.

197

#

198

# @param tag Element tag.

199

# @param attrib Element attributes, given as a dictionary.

200

# @return A new element instance.

201

202

def makeelement(self, tag, attrib):

203

return Element(tag, attrib)

204

205

##

206

# Returns the number of subelements.

207

#

208

# @return The number of subelements.

209

210

def __len__(self):

211

return len(self._children)

212

213

##

214

# Returns the given subelement.

215

#

216

# @param index What subelement to return.

217

# @return The given subelement.

218

# @exception IndexError If the given element does not exist.

219

220

def __getitem__(self, index):

221

return self._children[index]

222

223

##

224

# Replaces the given subelement.

225

#

226

# @param index What subelement to replace.

227

# @param element The new element value.

228

# @exception IndexError If the given element does not exist.

229

# @exception AssertionError If element is not a valid object.

230

231

def __setitem__(self, index, element):

232

assert iselement(element)

233

self._children[index] = element

234

235

##

236

# Deletes the given subelement.

237

#

238

# @param index What subelement to delete.

239

# @exception IndexError If the given element does not exist.

240

241

def __delitem__(self, index):

242

del self._children[index]

243

244

##

245

# Returns a list containing subelements in the given range.

246

#

247

# @param start The first subelement to return.

248

# @param stop The first subelement that shouldn't be returned.

249

# @return A sequence object containing subelements.

250

251

def __getslice__(self, start, stop):

252

return self._children[start:stop]

253

254

##

255

# Replaces a number of subelements with elements from a sequence.

256

#

257

# @param start The first subelement to replace.

258

# @param stop The first subelement that shouldn't be replaced.

259

# @param elements A sequence object with zero or more elements.

260

# @exception AssertionError If a sequence member is not a valid object.

261

262

def __setslice__(self, start, stop, elements):

263

for element in elements:

264

assert iselement(element)

265

self._children[start:stop] = list(elements)

266

267

##

268

# Deletes a number of subelements.

269

#

270

# @param start The first subelement to delete.

271

# @param stop The first subelement to leave in there.

272

273

def __delslice__(self, start, stop):

274

del self._children[start:stop]

275

276

##

277

# Adds a subelement to the end of this element.

278

#

279

# @param element The element to add.

280

# @exception AssertionError If a sequence member is not a valid object.

281

282

def append(self, element):

283

assert iselement(element)

284

self._children.append(element)

285

286

##

287

# Inserts a subelement at the given position in this element.

288

#

289

# @param index Where to insert the new subelement.

290

# @exception AssertionError If the element is not a valid object.

291

292

def insert(self, index, element):

293

assert iselement(element)

294

self._children.insert(index, element)

295

296

##

297

# Removes a matching subelement. Unlike the find methods,

298

# this method compares elements based on identity, not on tag

299

# value or contents.

300

#

301

# @param element What element to remove.

302

# @exception ValueError If a matching element could not be found.

303

# @exception AssertionError If the element is not a valid object.

304

305

def remove(self, element):

306

assert iselement(element)

307

self._children.remove(element)

308

309

##

310

# Returns all subelements. The elements are returned in document

311

# order.

312

#

313

# @return A list of subelements.

314

# @defreturn list of Element instances

315

316

def getchildren(self):

317

return self._children

318

319

##

320

# Finds the first matching subelement, by tag name or path.

321

#

322

# @param path What element to look for.

323

# @return The first matching element, or None if no element was found.

324

# @defreturn Element or None

325

326

def find(self, path):

327

return ElementPath.find(self, path)

328

329

##

330

# Finds text for the first matching subelement, by tag name or path.

331

#

332

# @param path What element to look for.

333

# @param default What to return if the element was not found.

334

# @return The text content of the first matching element, or the

335

# default value no element was found. Note that if the element

336

# has is found, but has no text content, this method returns an

337

# empty string.

338

# @defreturn string

339

340

def findtext(self, path, default=None):

341

return ElementPath.findtext(self, path, default)

342

343

##

344

# Finds all matching subelements, by tag name or path.

345

#

346

# @param path What element to look for.

347

# @return A list or iterator containing all matching elements,

348

# in document order.

349

# @defreturn list of Element instances

350

351

def findall(self, path):

352

return ElementPath.findall(self, path)

353

354

##

355

# Resets an element. This function removes all subelements, clears

356

# all attributes, and sets the text and tail attributes to None.

357

358

def clear(self):

359

self.attrib.clear()

360

self._children = []

361

self.text = self.tail = None

362

363

##

364

# Gets an element attribute.

365

#

366

# @param key What attribute to look for.

367

# @param default What to return if the attribute was not found.

368

# @return The attribute value, or the default value, if the

369

# attribute was not found.

370

# @defreturn string or None

371

372

def get(self, key, default=None):

373

return self.attrib.get(key, default)

374

375

##

376

# Sets an element attribute.

377

#

378

# @param key What attribute to set.

379

# @param value The attribute value.

380

381

def set(self, key, value):

382

self.attrib[key] = value

383

384

##

385

# Gets a list of attribute names. The names are returned in an

386

# arbitrary order (just like for an ordinary Python dictionary).

387

#

388

# @return A list of element attribute names.

389

# @defreturn list of strings

390

391

def keys(self):

392

return self.attrib.keys()

393

394

##

395

# Gets element attributes, as a sequence. The attributes are

396

# returned in an arbitrary order.

397

#

398

# @return A list of (name, value) tuples for all attributes.

399

# @defreturn list of (string, string) tuples

400

401

def items(self):

402

return self.attrib.items()

403

404

##

405

# Creates a tree iterator. The iterator loops over this element

406

# and all subelements, in document order, and returns all elements

407

# with a matching tag.

408

#

409

# If the tree structure is modified during iteration, the result

410

# is undefined.

411

#

412

# @param tag What tags to look for (default is to return all elements).

413

# @return A list or iterator containing all the matching elements.

414

# @defreturn list or iterator

415

416

def getiterator(self, tag=None):

417

nodes = []

418

if tag == "*":

419

tag = None

420

if tag is None or self.tag == tag:

421

nodes.append(self)

422

for node in self._children:

423

nodes.extend(node.getiterator(tag))

424

return nodes

425

426

# compatibility

427

_Element = _ElementInterface

428

429

##

430

# Element factory. This function returns an object implementing the

431

# standard Element interface. The exact class or type of that object

432

# is implementation dependent, but it will always be compatible with

433

# the {@link #_ElementInterface} class in this module.

434

#

435

# The element name, attribute names, and attribute values can be

436

# either 8-bit ASCII strings or Unicode strings.

437

#

438

# @param tag The element name.

439

# @param attrib An optional dictionary, containing element attributes.

440

# @param **extra Additional attributes, given as keyword arguments.

441

# @return An element instance.

442

# @defreturn Element

443

444

def Element(tag, attrib={}, **extra):
    # work on a private copy, so that neither the caller's dictionary
    # nor the shared default value is ever modified
    merged = attrib.copy()
    # keyword arguments are added on top of the dictionary entries
    merged.update(extra)
    return _ElementInterface(tag, merged)

448

449

##

450

# Subelement factory. This function creates an element instance, and

451

# appends it to an existing element.

452

#

453

# The element name, attribute names, and attribute values can be

454

# either 8-bit ASCII strings or Unicode strings.

455

#

456

# @param parent The parent element.

457

# @param tag The subelement name.

458

# @param attrib An optional dictionary, containing element attributes.

459

# @param **extra Additional attributes, given as keyword arguments.

460

# @return An element instance.

461

# @defreturn Element

462

463

def SubElement(parent, tag, attrib={}, **extra):
    # work on a private copy, so that neither the caller's dictionary
    # nor the shared default value is ever modified
    merged = attrib.copy()
    merged.update(extra)
    # the parent creates the child (via its makeelement method), and
    # the child is then added as the parent's last subelement
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child

469

470

##

471

# Comment element factory. This factory function creates a special

472

# element that will be serialized as an XML comment.

473

#

474

# The comment string can be either an 8-bit ASCII string or a Unicode

475

# string.

476

#

477

# @param text A string containing the comment string.

478

# @return An element instance, representing a comment.

479

# @defreturn Element

480

481

def Comment(text=None):
    # the factory function itself serves as the tag; the serializer
    # recognizes comment nodes with an identity test ("tag is Comment")
    node = Element(Comment)
    node.text = text
    return node

485

486

##

487

# PI element factory. This factory function creates a special element

488

# that will be serialized as an XML processing instruction.

489

#

490

# @param target A string containing the PI target.

491

# @param text A string containing the PI contents, if any.

492

# @return An element instance, representing a PI.

493

# @defreturn Element

494

495

def ProcessingInstruction(target, text=None):
    # the factory function itself serves as the tag; the serializer
    # recognizes PI nodes with an identity test
    node = Element(ProcessingInstruction)
    if text:
        # target and contents are kept in a single text string
        node.text = target + " " + text
    else:
        node.text = target
    return node

# short alias
PI = ProcessingInstruction

503

504

##

505

# QName wrapper. This can be used to wrap a QName attribute value, in

506

# order to get proper namespace handling on output.

507

#

508

# @param text A string containing the QName value, in the form {uri}local,

509

# or, if the tag argument is given, the URI part of a QName.

510

# @param tag Optional tag. If given, the first argument is interpreted as

511

# an URI, and this argument is interpreted as a local name.

512

# @return An opaque object, representing the QName.

513

514

class QName:
    # Wraps a qualified name, stored in the text attribute using the
    # "{uri}local" form.  Instances hash like, and compare against,
    # the wrapped string.

    def __init__(self, text_or_uri, tag=None):
        if tag:
            # two-argument form: (uri, local name)
            self.text = "{%s}%s" % (text_or_uri, tag)
        else:
            # one-argument form: already in "{uri}local" form
            self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    def __cmp__(self, other):
        # another QName is compared by its text; anything else is
        # compared directly against our text
        if isinstance(other, QName):
            other = other.text
        return cmp(self.text, other)

527

528

##

529

# ElementTree wrapper class. This class represents an entire element

530

# hierarchy, and adds some extra support for serialization to and from

531

# standard XML.

532

#

533

# @param element Optional root element.

534

# @keyparam file Optional file handle or name. If given, the

535

# tree is initialized with the contents of this XML file.

536

537

class ElementTree:

538

539

def __init__(self, element=None, file=None):

540

assert element is None or iselement(element)

541

self._root = element # first node

542

if file:

543

self.parse(file)

544

545

##

546

# Gets the root element for this tree.

547

#

548

# @return An element instance.

549

# @defreturn Element

550

551

def getroot(self):

552

return self._root

553

554

##

555

# Replaces the root element for this tree. This discards the

556

# current contents of the tree, and replaces it with the given

557

# element. Use with care.

558

#

559

# @param element An element instance.

560

561

def _setroot(self, element):

562

assert iselement(element)

563

self._root = element

564

565

##

566

# Loads an external XML document into this element tree.

567

#

568

# @param source A file name or file object.

569

# @param parser An optional parser instance. If not given, the

570

# standard {@link XMLTreeBuilder} parser is used.

571

# @return The document root element.

572

# @defreturn Element

573

574

def parse(self, source, parser=None):

575

if not hasattr(source, "read"):

576

source = open(source, "rb")

577

if not parser:

578

parser = XMLTreeBuilder()

579

while 1:

580

data = source.read(32768)

581

if not data:

582

break

583

parser.feed(data)

584

self._root = parser.close()

585

return self._root

586

587

##

588

# Creates a tree iterator for the root element. The iterator loops

589

# over all elements in this tree, in document order.

590

#

591

# @param tag What tags to look for (default is to return all elements)

592

# @return An iterator.

593

# @defreturn iterator

594

595

def getiterator(self, tag=None):

596

assert self._root is not None

597

return self._root.getiterator(tag)

598

599

##

600

# Finds the first toplevel element with given tag.

601

# Same as getroot().find(path).

602

#

603

# @param path What element to look for.

604

# @return The first matching element, or None if no element was found.

605

# @defreturn Element or None

606

607

def find(self, path):

608

assert self._root is not None

609

if path[:1] == "/":

610

path = "." + path

611

return self._root.find(path)

612

613

##

614

# Finds the element text for the first toplevel element with given

615

# tag. Same as getroot().findtext(path).

616

#

617

# @param path What toplevel element to look for.

618

# @param default What to return if the element was not found.

619

# @return The text content of the first matching element, or the

620

# default value no element was found. Note that if the element

621

# has is found, but has no text content, this method returns an

622

# empty string.

623

# @defreturn string

624

625

def findtext(self, path, default=None):

626

assert self._root is not None

627

if path[:1] == "/":

628

path = "." + path

629

return self._root.findtext(path, default)

630

631

##

632

# Finds all toplevel elements with the given tag.

633

# Same as getroot().findall(path).

634

#

635

# @param path What element to look for.

636

# @return A list or iterator containing all matching elements,

637

# in document order.

638

# @defreturn list of Element instances

639

640

def findall(self, path):

641

assert self._root is not None

642

if path[:1] == "/":

643

path = "." + path

644

return self._root.findall(path)

645

646

##

647

# Writes the element tree to a file, as XML.

648

#

649

# @param file A file name, or a file object opened for writing.

650

# @param encoding Optional output encoding (default is US-ASCII).

651

652

def write(self, file, encoding="us-ascii"):

653

assert self._root is not None

654

if not hasattr(file, "write"):

655

file = open(file, "wb")

656

if not encoding:

657

encoding = "us-ascii"

658

elif encoding != "utf-8" and encoding != "us-ascii":

659

file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)

660

self._write(file, self._root, encoding, {})

661

662

def _write(self, file, node, encoding, namespaces):

663

# write XML to file

664

tag = node.tag

665

if tag is Comment:

666

file.write("" % _escape_cdata(node.text, encoding))

667

elif tag is ProcessingInstruction:

668

file.write("<?%s?>" % _escape_cdata(node.text, encoding))

669

else:

670

items = node.items()

671

xmlns_items = [] # new namespaces in this scope

672

try:

673

if isinstance(tag, QName) or tag[:1] == "{":

674

tag, xmlns = fixtag(tag, namespaces)

675

if xmlns: xmlns_items.append(xmlns)

676

except TypeError:

677

_raise_serialization_error(tag)

678

file.write("<" + _encode(tag, encoding))

679

if items or xmlns_items:

680

items.sort() # lexical order

681

for k, v in items:

682

try:

683

if isinstance(k, QName) or k[:1] == "{":

684

k, xmlns = fixtag(k, namespaces)

685

if xmlns: xmlns_items.append(xmlns)

686

except TypeError:

687

_raise_serialization_error(k)

688

try:

689

if isinstance(v, QName):

690

v, xmlns = fixtag(v, namespaces)

691

if xmlns: xmlns_items.append(xmlns)

692

except TypeError:

693

_raise_serialization_error(v)

694

file.write(" %s=\"%s\"" % (_encode(k, encoding),

695

_escape_attrib(v, encoding)))

696

for k, v in xmlns_items:

697

file.write(" %s=\"%s\"" % (_encode(k, encoding),

698

_escape_attrib(v, encoding)))

699

if node.text or len(node):

700

file.write(">")

701

if node.text:

702

file.write(_escape_cdata(node.text, encoding))

703

for n in node:

704

self._write(file, n, encoding, namespaces)

705

file.write("</" + _encode(tag, encoding) + ">")

706

else:

707

file.write(" />")

708

for k, v in xmlns_items:

709

del namespaces[v]

710

if node.tail:

711

file.write(_escape_cdata(node.tail, encoding))

712

713

# --------------------------------------------------------------------

714

# helpers

715

716

##

717

# Checks if an object appears to be a valid element object.

718

#

719

# @param element An element instance.

720

# @return A true value if this is an element object.

721

# @defreturn flag

722

723

def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    #
    # instances of the reference implementation always qualify; other
    # objects are accepted if they at least provide a tag attribute
    is_reference_type = isinstance(element, _ElementInterface)
    return is_reference_type or hasattr(element, "tag")

727

728

##

729

# Writes an element tree or element structure to sys.stdout. This

730

# function should be used for debugging only.

731

#

732

# The exact output format is implementation dependent. In this

733

# version, it's written as an ordinary XML file.

734

#

735

# @param elem An element tree or an individual element.

736

737

def dump(elem):
    # debugging
    if isinstance(elem, ElementTree):
        tree = elem
    else:
        # wrap individual elements, so that we can use the tree writer
        tree = ElementTree(elem)
    tree.write(sys.stdout)
    # make sure the output ends with a newline
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")

745

746

def _encode(s, encoding):

747

try:

748

return s.encode(encoding)

749

except AttributeError:

750

return s # 1.5.2: assume the string uses the right encoding

751

752

if sys.version[:3] == "1.5":

753

_escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2

754

else:

755

_escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))

756

757

_escape_map = {

758

"&": "&",

759

"<": "<",

760

">": ">",

761

'"': """,

762

}

763

764

_namespace_map = {

765

# "well-known" namespace prefixes

766

"http://www.w3.org/XML/1998/namespace": "xml",

767

"http://www.w3.org/1999/xhtml": "html",

768

"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",

769

"http://schemas.xmlsoap.org/wsdl/": "wsdl",

770

}

771

772

def _raise_serialization_error(text):

773

raise TypeError(

774

"cannot serialize %r (type %s)" % (text, type(text).__name__)

775

)

776

777

def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(m, map=_escape_map):
        # translate a run of matched characters, one at a time:
        # reserved characters map to their named entity, all others
        # to a decimal character reference
        pieces = []
        for char in m.group():
            entity = map.get(char)
            if entity is None:
                entity = "&#%d;" % ord(char)
            pieces.append(entity)
        return string.join(pieces, "")
    try:
        escaped = pattern.sub(escape_entities, text)
        return _encode(escaped, "ascii")
    except TypeError:
        # not something the pattern can be applied to
        _raise_serialization_error(text)

792

793

#

794

# the following functions assume an ascii-compatible encoding

795

# (or "utf-16")

796

797

def _escape_cdata(text, encoding=None, replace=string.replace):
    # escape character data: returns the text with the markup
    # characters & < > replaced by entities, encoded using the given
    # encoding (if any).  raises TypeError (via
    # _raise_serialization_error) if the text is not a string
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # not representable in the target encoding; fall back
                # on character references
                return _encode_entity(text)
        # the ampersand must be handled first, or the ampersands
        # introduced by the other replacements would be escaped again
        text = replace(text, "&", "&amp;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

811

812

def _escape_attrib(text, encoding=None, replace=string.replace):
    # escape attribute value: returns the text with the markup
    # characters & ' " < > replaced by entities, encoded using the
    # given encoding (if any).  raises TypeError (via
    # _raise_serialization_error) if the text is not a string
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # not representable in the target encoding; fall back
                # on character references
                return _encode_entity(text)
        # the ampersand must be handled first, or the ampersands
        # introduced by the other replacements would be escaped again
        text = replace(text, "&", "&amp;")
        text = replace(text, "'", "&apos;") # FIXME: overkill
        text = replace(text, "\"", "&quot;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

828

829

def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any.  "namespaces" maps URIs
    # to the prefixes already in scope, and is updated in place when
    # a new prefix is assigned
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, tag = string.split(tag[1:], "}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is not None:
        # already in scope; no declaration needed
        return "%s:%s" % (prefix, tag), None
    # new in this scope: use the well-known prefix if there is one,
    # or make one up
    prefix = _namespace_map.get(namespace_uri)
    if prefix is None:
        prefix = "ns%d" % len(namespaces)
    namespaces[namespace_uri] = prefix
    if prefix == "xml":
        # the xml prefix is never declared
        xmlns = None
    else:
        xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, tag), xmlns

848

849

##

850

# Parses an XML document into an element tree.

851

#

852

# @param source A filename or file object containing XML data.

853

# @param parser An optional parser instance. If not given, the

854

# standard {@link XMLTreeBuilder} parser is used.

855

# @return An ElementTree instance

856

857

def parse(source, parser=None):
    # start with an empty tree, and let it load the document; the
    # tree wrapper is returned, not the root element
    document = ElementTree()
    document.parse(source, parser)
    return document

861

862

##

863

# Parses an XML document into an element tree incrementally, and reports

864

# what's going on to the user.

865

#

866

# @param source A filename or file object containing XML data.

867

# @param events A list of events to report back. If omitted, only "end"

868

# events are reported.

869

# @return A (event, elem) iterator.

870

871

class iterparse:
    # Incremental parser.  Each call to next returns an (event, elem)
    # pair; the source is read in chunks as more events are needed.
    # When the document has been parsed completely, the root element
    # is made available via the root attribute.

    def __init__(self, source, events=None):
        # remember if the file was opened here, so that it can be
        # closed again when the document has been read
        self._close_file = not hasattr(source, "read")
        if self._close_file:
            source = open(source, "rb")
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # the handlers get what they need via default arguments
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back on the builder's plain start method
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass # keep the uri as is
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def next(self):
        # return the next (event, elem) pair, reading more data from
        # the source when the event buffer has been used up
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # end of document
                    self.root = self._root
                    try:
                        raise StopIteration
                    except NameError:
                        # no StopIteration in this Python version
                        raise IndexError
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    self._root = self._parser.close()
                    self._parser = None
                    # done with the source; only close what we opened
                    # ourselves
                    if self._close_file:
                        self._file.close()
            else:
                self._index = self._index + 1
                return item

    # use the iterator protocol if available, or else emulate it via
    # the sequence protocol
    try:
        iter
        def __iter__(self):
            return self
    except NameError:
        def __getitem__(self, index):
            return self.next()

949

950

##

951

# Parses an XML document from a string constant. This function can

952

# be used to embed "XML literals" in Python code.

953

#

954

# @param text A string containing XML data.

955

# @return An Element instance.

956

# @defreturn Element

957

958

def XML(text):
    # Push the complete string through a fresh builder; close() hands
    # back whatever the builder produced for the document.
    builder = XMLTreeBuilder()
    builder.feed(text)
    return builder.close()

962

963

##

964

# Parses an XML document from a string constant, and also returns

965

# a dictionary which maps from element id:s to elements.

966

#

967

# @param text A string containing XML data.

968

# @return A tuple containing an Element instance and a dictionary.

969

# @defreturn (Element, dictionary)

970

971

def XMLID(text):
    # Parse the string, then walk every element of the resulting tree
    # and index the ones that carry a non-empty "id" attribute.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if not key:
            continue # no id attribute, or an empty one
        by_id[key] = node
    return root, by_id

981

982

##

983

# Parses an XML document from a string constant. Same as {@link #XML}.

984

#

985

# @def fromstring(text)

986

# @param text A string containing XML data.

987

# @return An Element instance.

988

# @defreturn Element

989

990

fromstring = XML # plain alias: both names refer to the same function object

991

992

##

993

# Generates a string representation of an XML element, including all

994

# subelements.

995

#

996

# @param element An Element instance.

997

# @return An encoded string containing the XML data.

998

# @defreturn string

999

1000

def tostring(element, encoding=None):
    # Serializes through ElementTree.write() into an in-memory sink.
    # The sink is a bare object whose write attribute is the append
    # method of a list, so every chunk the writer emits is collected
    # and the pieces are joined once at the end.  The encoding argument
    # is passed on to write() unchanged.
    class dummy:
        pass
    data = []
    file = dummy()
    file.write = data.append
    ElementTree(element).write(file, encoding)
    # string method instead of the deprecated string.join() function;
    # the result is the same
    return "".join(data)

1008

1009

##

1010

# Generic element structure builder. This builder converts a sequence

1011

# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link

1012

# #TreeBuilder.end} method calls to a well-formed element structure.

1013

#

1014

# You can use this class to build an element structure using a custom XML

1015

# parser, or a parser for some other XML-like format.

1016

#

1017

# @param element_factory Optional element factory. This factory

1018

# is called to create new Element instances, as necessary.

1019

1020

class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # identity test: never goes through the element's own comparison
        # methods, which a custom element factory may define
        assert self._last is not None, "missing toplevel element"
        return self._last

    # Attaches the collected character data to the most recent element:
    # as its tail if that element has already been closed, otherwise as
    # its text.  Data seen before any element exists is discarded.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # nested element: hook it into its parent
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last

1096

1097

##

1098

# Element structure builder for XML source data, based on the

1099

# expat parser.

1100

#

1101

# @keyparam target Target object. If omitted, the builder uses an

1102

# instance of the standard {@link #TreeBuilder} class.

1103

# @keyparam html Predefine HTML entities. This flag is not supported

1104

# by the current implementation.

1105

# @see #ElementTree

1106

# @see #TreeBuilder

1107

1108

class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is the separator expat puts between namespace uri and
        # local name; _fixname adds the leading "{"
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        # note: no encoding is derived from parser.returns_unicode here;
        # nothing consumed that value, and the attribute does not exist
        # on every pyexpat version
        self._doctype = None
        self.entity = {}

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for parsers that report attributes as a dictionary
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for parsers that report attributes as a flat
        # [name, value, name, value, ...] list
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # handler for everything the other callbacks do not cover; used
        # here for entity references and for doctype declarations
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    # not enough tokens yet (or not a recognized form)
                    return
                if pubid:
                    pubid = pubid[1:-1] # drop the surrounding quotes
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree