3
# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
5
# light-weight XML support for Python 1.5.2 and later.
8
# 2001-10-20 fl created (from various sources)
9
# 2001-11-01 fl return root from parse method
10
# 2002-02-16 fl sort attributes in lexical order
11
# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
12
# 2002-05-01 fl finished TreeBuilder refactoring
13
# 2002-07-14 fl added basic namespace support to ElementTree.write
14
# 2002-07-25 fl added QName attribute support
15
# 2002-10-20 fl fixed encoding in write
16
# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
17
# 2002-11-27 fl accept file objects or file names for parse/write
18
# 2002-12-04 fl moved XMLTreeBuilder back to this module
19
# 2003-01-11 fl fixed entity encoding glitch for us-ascii
20
# 2003-02-13 fl added XML literal factory
21
# 2003-02-21 fl added ProcessingInstruction/PI factory
22
# 2003-05-11 fl added tostring/fromstring helpers
23
# 2003-05-26 fl added ElementPath support
24
# 2003-07-05 fl added makeelement factory method
25
# 2003-07-28 fl added more well-known namespace prefixes
26
# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
27
# 2003-09-04 fl fall back on emulator if ElementPath is not installed
28
# 2003-10-31 fl markup updates
29
# 2003-11-15 fl fixed nested namespace bug
30
# 2004-03-28 fl added XMLID helper
31
# 2004-06-02 fl added default support to findtext
32
# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
33
# 2004-08-23 fl take advantage of post-2.1 expat features
34
# 2005-02-01 fl added iterparse implementation
35
# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
37
# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
39
# fredrik@pythonware.com
40
# http://www.pythonware.com
42
# --------------------------------------------------------------------
43
# The ElementTree toolkit is
45
# Copyright (c) 1999-2005 by Fredrik Lundh
47
# By obtaining, using, and/or copying this software and/or its
48
# associated documentation, you agree that you have read, understood,
49
# and will comply with the following terms and conditions:
51
# Permission to use, copy, modify, and distribute this software and
52
# its associated documentation for any purpose and without fee is
53
# hereby granted, provided that the above copyright notice appears in
54
# all copies, and that both that copyright notice and this permission
55
# notice appear in supporting documentation, and that the name of
56
# Secret Labs AB or the author not be used in advertising or publicity
57
# pertaining to distribution of the software without specific, written
60
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
68
# --------------------------------------------------------------------
70
from __future__ import absolute_import
76
"Element", "ElementTree",
78
"iselement", "iterparse",
80
"PI", "ProcessingInstruction",
90
# The <b>Element</b> type is a flexible container object, designed to
91
# store hierarchical data structures in memory. The type can be
92
# described as a cross between a list and a dictionary.
94
# Each element has a number of properties associated with it:
96
# <li>a <i>tag</i>. This is a string identifying what kind of data
97
# this element represents (the element type, in other words).</li>
98
# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
99
# <li>a <i>text</i> string.</li>
100
# <li>an optional <i>tail</i> string.</li>
101
# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
104
# To create an element instance, use the {@link #Element} or {@link
105
# #SubElement} factory functions.
107
# The {@link #ElementTree} class can be used to wrap an element
108
# structure, and convert it from and to XML.
111
import string, sys, re
113
class _SimpleElementPath:
114
# emulate pre-1.2 find/findtext/findall behaviour
115
def find(self, element, tag):
120
def findtext(self, element, tag, default=None):
123
return elem.text or ""
125
def findall(self, element, tag):
127
return element.getiterator(tag[3:])
137
# FIXME: issue warning in this case?
138
ElementPath = _SimpleElementPath()
140
# TODO: add support for custom namespace resolvers/default namespaces
141
# TODO: add improved support for incremental parsing
146
# Internal element class. This class defines the Element interface,
147
# and provides a reference implementation of this interface.
149
# You should not create instances of this class directly. Use the
150
# appropriate factory functions instead, such as {@link #Element}
151
# and {@link #SubElement}.
156
# @see ProcessingInstruction
158
class _ElementInterface:
159
# <tag attrib>text<child/>...</tag>tail
162
# (Attribute) Element tag.
167
# (Attribute) Element attribute dictionary. Where possible, use
168
# {@link #_ElementInterface.get},
169
# {@link #_ElementInterface.set},
170
# {@link #_ElementInterface.keys}, and
171
# {@link #_ElementInterface.items} to access
172
# element attributes.
177
# (Attribute) Text before first subelement. This is either a
178
# string or the value None, if there was no text.
183
# (Attribute) Text after this element's end tag, but before the
184
# next sibling element's start tag. This is either a string or
185
# the value None, if there was no text.
187
tail = None # text after end tag, if any
189
def __init__(self, tag, attrib):
195
return "<Element %s at %x>" % (self.tag, id(self))
198
# Creates a new element object of the same type as this element.
200
# @param tag Element tag.
201
# @param attrib Element attributes, given as a dictionary.
202
# @return A new element instance.
204
def makeelement(self, tag, attrib):
    """Build a new element through the module-level Element factory.

    tag -- tag for the new element.
    attrib -- dictionary with the new element's attributes.

    Returns the object produced by Element(tag, attrib).
    """
    fresh = Element(tag, attrib)
    return fresh
208
# Returns the number of subelements.
210
# @return The number of subelements.
213
return len(self._children)
216
# Returns the given subelement.
218
# @param index What subelement to return.
219
# @return The given subelement.
220
# @exception IndexError If the given element does not exist.
222
def __getitem__(self, index):
223
return self._children[index]
226
# Replaces the given subelement.
228
# @param index What subelement to replace.
229
# @param element The new element value.
230
# @exception IndexError If the given element does not exist.
231
# @exception AssertionError If element is not a valid object.
233
def __setitem__(self, index, element):
    """Replace the subelement at position index with element.

    element is checked with iselement() through an assert before the
    child container is touched; the assignment itself is delegated to
    self._children.
    """
    assert iselement(element)
    kids = self._children
    kids[index] = element
238
# Deletes the given subelement.
240
# @param index What subelement to delete.
241
# @exception IndexError If the given element does not exist.
243
def __delitem__(self, index):
244
del self._children[index]
247
# Returns a list containing subelements in the given range.
249
# @param start The first subelement to return.
250
# @param stop The first subelement that shouldn't be returned.
251
# @return A sequence object containing subelements.
253
def __getslice__(self, start, stop):
254
return self._children[start:stop]
257
# Replaces a number of subelements with elements from a sequence.
259
# @param start The first subelement to replace.
260
# @param stop The first subelement that shouldn't be replaced.
261
# @param elements A sequence object with zero or more elements.
262
# @exception AssertionError If a sequence member is not a valid object.
264
def __setslice__(self, start, stop, elements):
    """Replace the subelements in [start, stop) with those in elements.

    start -- index of the first subelement to replace.
    stop -- index of the first subelement that is left alone.
    elements -- iterable with zero or more element objects.

    Every member is checked with iselement() through an assert before
    self._children is modified.
    """
    # Materialise once, up front.  Validating a one-shot iterator in
    # place would exhaust it, and the list() built afterwards would be
    # empty, so the slice would silently be replaced with nothing.
    replacement = list(elements)
    for element in replacement:
        assert iselement(element)
    self._children[start:stop] = replacement
270
# Deletes a number of subelements.
272
# @param start The first subelement to delete.
273
# @param stop The first subelement to leave in there.
275
def __delslice__(self, start, stop):
276
del self._children[start:stop]
279
# Adds a subelement to the end of this element.
281
# @param element The element to add.
282
# @exception AssertionError If the element is not a valid object.
284
def append(self, element):
    """Add element after the current last subelement.

    element is checked with iselement() through an assert first.
    """
    assert iselement(element)
    kids = self._children
    kids.append(element)
289
# Inserts a subelement at the given position in this element.
291
# @param index Where to insert the new subelement.
# @param element The element to insert.
292
# @exception AssertionError If the element is not a valid object.
294
def insert(self, index, element):
    """Insert element into the child container at position index.

    element is checked with iselement() through an assert first; the
    insertion is delegated to self._children.insert.
    """
    assert iselement(element)
    kids = self._children
    kids.insert(index, element)
299
# Removes a matching subelement. Unlike the <b>find</b> methods,
300
# this method compares elements based on identity, not on tag
303
# @param element What element to remove.
304
# @exception ValueError If a matching element could not be found.
305
# @exception AssertionError If the element is not a valid object.
307
def remove(self, element):
    """Remove element from this element's subelements.

    element is checked with iselement() through an assert first; the
    removal, and any error for a missing element, comes from
    self._children.remove.
    """
    assert iselement(element)
    kids = self._children
    kids.remove(element)
312
# Returns all subelements. The elements are returned in document
315
# @return A list of subelements.
316
# @defreturn list of Element instances
318
def getchildren(self):
    """Return the subelements of this element.

    The object handed back is self._children itself, not a copy.
    """
    kids = self._children
    return kids
322
# Finds the first matching subelement, by tag name or path.
324
# @param path What element to look for.
325
# @return The first matching element, or None if no element was found.
326
# @defreturn Element or None
328
def find(self, path):
    """Look up the first subelement matching path.

    The search is delegated to ElementPath.find, with this element as
    the starting point; its result is returned unchanged.
    """
    match = ElementPath.find(self, path)
    return match
332
# Finds text for the first matching subelement, by tag name or path.
334
# @param path What element to look for.
335
# @param default What to return if the element was not found.
336
# @return The text content of the first matching element, or the
337
# default value if no element was found. Note that if the element
338
# is found, but has no text content, this method returns an
342
def findtext(self, path, default=None):
    """Look up the text of the first subelement matching path.

    path -- tag name or path to search for.
    default -- value passed on for the "no match" case.

    The search is delegated to ElementPath.findtext, with this element
    as the starting point; its result is returned unchanged.
    """
    found = ElementPath.findtext(self, path, default)
    return found
346
# Finds all matching subelements, by tag name or path.
348
# @param path What element to look for.
349
# @return A list or iterator containing all matching elements,
351
# @defreturn list of Element instances
353
def findall(self, path):
    """Look up every subelement matching path.

    The search is delegated to ElementPath.findall, with this element
    as the starting point; its result is returned unchanged.
    """
    matches = ElementPath.findall(self, path)
    return matches
357
# Resets an element. This function removes all subelements, clears
358
# all attributes, and sets the text and tail attributes to None.
363
self.text = self.tail = None
366
# Gets an element attribute.
368
# @param key What attribute to look for.
369
# @param default What to return if the attribute was not found.
370
# @return The attribute value, or the default value, if the
371
# attribute was not found.
372
# @defreturn string or None
374
def get(self, key, default=None):
    """Fetch the attribute named key from self.attrib.

    key -- attribute name to look up.
    default -- value returned when the attribute is not present.
    """
    attributes = self.attrib
    return attributes.get(key, default)
378
# Sets an element attribute.
380
# @param key What attribute to set.
381
# @param value The attribute value.
383
def set(self, key, value):
    """Store value under the attribute name key in self.attrib.

    An existing attribute with the same name is overwritten.
    """
    attributes = self.attrib
    attributes[key] = value
387
# Gets a list of attribute names. The names are returned in an
388
# arbitrary order (just like for an ordinary Python dictionary).
390
# @return A list of element attribute names.
391
# @defreturn list of strings
394
return self.attrib.keys()
397
# Gets element attributes, as a sequence. The attributes are
398
# returned in an arbitrary order.
400
# @return A list of (name, value) tuples for all attributes.
401
# @defreturn list of (string, string) tuples
404
return self.attrib.items()
407
# Creates a tree iterator. The iterator loops over this element
408
# and all subelements, in document order, and returns all elements
409
# with a matching tag.
411
# If the tree structure is modified during iteration, the result
414
# @param tag What tags to look for (default is to return all elements).
415
# @return A list or iterator containing all the matching elements.
416
# @defreturn list or iterator
418
def getiterator(self, tag=None):
422
if tag is None or self.tag == tag:
424
for node in self._children:
425
nodes.extend(node.getiterator(tag))
429
_Element = _ElementInterface
432
# Element factory. This function returns an object implementing the
433
# standard Element interface. The exact class or type of that object
434
# is implementation dependent, but it will always be compatible with
435
# the {@link #_ElementInterface} class in this module.
437
# The element name, attribute names, and attribute values can be
438
# either 8-bit ASCII strings or Unicode strings.
440
# @param tag The element name.
441
# @param attrib An optional dictionary, containing element attributes.
442
# @param **extra Additional attributes, given as keyword arguments.
443
# @return An element instance.
446
def Element(tag, attrib={}, **extra):
447
attrib = attrib.copy()
449
return _ElementInterface(tag, attrib)
452
# Subelement factory. This function creates an element instance, and
453
# appends it to an existing element.
455
# The element name, attribute names, and attribute values can be
456
# either 8-bit ASCII strings or Unicode strings.
458
# @param parent The parent element.
459
# @param tag The subelement name.
460
# @param attrib An optional dictionary, containing element attributes.
461
# @param **extra Additional attributes, given as keyword arguments.
462
# @return An element instance.
465
def SubElement(parent, tag, attrib={}, **extra):
466
attrib = attrib.copy()
468
element = parent.makeelement(tag, attrib)
469
parent.append(element)
473
# Comment element factory. This factory function creates a special
474
# element that will be serialized as an XML comment.
476
# The comment string can be either an 8-bit ASCII string or a Unicode
479
# @param text A string containing the comment string.
480
# @return An element instance, representing a comment.
483
def Comment(text=None):
484
element = Element(Comment)
489
# PI element factory. This factory function creates a special element
490
# that will be serialized as an XML processing instruction.
492
# @param target A string containing the PI target.
493
# @param text A string containing the PI contents, if any.
494
# @return An element instance, representing a PI.
497
def ProcessingInstruction(target, text=None):
498
element = Element(ProcessingInstruction)
499
element.text = target
501
element.text = element.text + " " + text
504
PI = ProcessingInstruction
507
# QName wrapper. This can be used to wrap a QName attribute value, in
508
# order to get proper namespace handling on output.
510
# @param text A string containing the QName value, in the form {uri}local,
511
# or, if the tag argument is given, the URI part of a QName.
512
# @param tag Optional tag. If given, the first argument is interpreted as
513
# an URI, and this argument is interpreted as a local name.
514
# @return An opaque object, representing the QName.
517
def __init__(self, text_or_uri, tag=None):
519
text_or_uri = "{%s}%s" % (text_or_uri, tag)
520
self.text = text_or_uri
524
return hash(self.text)
525
def __cmp__(self, other):
    """Order this QName against other by its text.

    When other is a QName as well, the two text values are compared;
    otherwise self.text is compared with other directly.
    """
    rhs = other
    if isinstance(other, QName):
        rhs = other.text
    return cmp(self.text, rhs)
531
# ElementTree wrapper class. This class represents an entire element
532
# hierarchy, and adds some extra support for serialization to and from
535
# @param element Optional root element.
536
# @keyparam file Optional file handle or name. If given, the
537
# tree is initialized with the contents of this XML file.
541
def __init__(self, element=None, file=None):
542
assert element is None or iselement(element)
543
self._root = element # first node
548
# Gets the root element for this tree.
550
# @return An element instance.
557
# Replaces the root element for this tree. This discards the
558
# current contents of the tree, and replaces it with the given
559
# element. Use with care.
561
# @param element An element instance.
563
def _setroot(self, element):
564
assert iselement(element)
568
# Loads an external XML document into this element tree.
570
# @param source A file name or file object.
571
# @param parser An optional parser instance. If not given, the
572
# standard {@link XMLTreeBuilder} parser is used.
573
# @return The document root element.
576
def parse(self, source, parser=None):
577
if getattr(source, "read", None) is None:
578
source = open(source, "rb")
580
parser = XMLTreeBuilder()
582
data = source.read(32768)
586
self._root = parser.close()
590
# Creates a tree iterator for the root element. The iterator loops
591
# over all elements in this tree, in document order.
593
# @param tag What tags to look for (default is to return all elements)
594
# @return An iterator.
595
# @defreturn iterator
597
def getiterator(self, tag=None):
    """Iterate over the tree by delegating to the root element.

    tag -- tag to pass on to the root's getiterator (default None).

    Asserts that a root element is present, then returns whatever
    self._root.getiterator(tag) returns.
    """
    root = self._root
    assert root is not None
    return root.getiterator(tag)
602
# Finds the first toplevel element with given tag.
603
# Same as getroot().find(path).
605
# @param path What element to look for.
606
# @return The first matching element, or None if no element was found.
607
# @defreturn Element or None
609
def find(self, path):
610
assert self._root is not None
613
return self._root.find(path)
616
# Finds the element text for the first toplevel element with given
617
# tag. Same as getroot().findtext(path).
619
# @param path What toplevel element to look for.
620
# @param default What to return if the element was not found.
621
# @return The text content of the first matching element, or the
622
# default value if no element was found. Note that if the element
623
# is found, but has no text content, this method returns an
627
def findtext(self, path, default=None):
628
assert self._root is not None
631
return self._root.findtext(path, default)
634
# Finds all toplevel elements with the given tag.
635
# Same as getroot().findall(path).
637
# @param path What element to look for.
638
# @return A list or iterator containing all matching elements,
640
# @defreturn list of Element instances
642
def findall(self, path):
643
assert self._root is not None
646
return self._root.findall(path)
649
# Writes the element tree to a file, as XML.
651
# @param file A file name, or a file object opened for writing.
652
# @param encoding Optional output encoding (default is US-ASCII).
654
def write(self, file, encoding="us-ascii"):
655
assert self._root is not None
656
if getattr(file, "write", None) is None:
657
file = open(file, "wb")
659
encoding = "us-ascii"
660
elif encoding != "utf-8" and encoding != "us-ascii":
661
file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
662
self._write(file, self._root, encoding, {})
664
def _write(self, file, node, encoding, namespaces):
668
file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
669
elif tag is ProcessingInstruction:
670
file.write("<?%s?>" % _escape_cdata(node.text, encoding))
673
xmlns_items = [] # new namespaces in this scope
675
if isinstance(tag, QName) or tag[:1] == "{":
676
tag, xmlns = fixtag(tag, namespaces)
677
if xmlns: xmlns_items.append(xmlns)
679
_raise_serialization_error(tag)
680
file.write("<" + _encode(tag, encoding))
681
if items or xmlns_items:
682
items.sort() # lexical order
685
if isinstance(k, QName) or k[:1] == "{":
686
k, xmlns = fixtag(k, namespaces)
687
if xmlns: xmlns_items.append(xmlns)
689
_raise_serialization_error(k)
691
if isinstance(v, QName):
692
v, xmlns = fixtag(v, namespaces)
693
if xmlns: xmlns_items.append(xmlns)
695
_raise_serialization_error(v)
696
file.write(" %s=\"%s\"" % (_encode(k, encoding),
697
_escape_attrib(v, encoding)))
698
for k, v in xmlns_items:
699
file.write(" %s=\"%s\"" % (_encode(k, encoding),
700
_escape_attrib(v, encoding)))
701
if node.text or len(node):
704
file.write(_escape_cdata(node.text, encoding))
706
self._write(file, n, encoding, namespaces)
707
file.write("</" + _encode(tag, encoding) + ">")
710
for k, v in xmlns_items:
713
file.write(_escape_cdata(node.tail, encoding))
715
# --------------------------------------------------------------------
719
# Checks if an object appears to be a valid element object.
721
# @param element An element instance.
722
# @return A true value if this is an element object.
725
def iselement(element):
    """Tell whether element looks like an element object.

    True for instances of _ElementInterface, and for any other object
    whose "tag" attribute exists and is not None.
    """
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    if isinstance(element, _ElementInterface):
        return True
    return getattr(element, "tag", None) is not None
731
# Writes an element tree or element structure to sys.stdout. This
732
# function should be used for debugging only.
734
# The exact output format is implementation dependent. In this
735
# version, it's written as an ordinary XML file.
737
# @param elem An element tree or an individual element.
741
if not isinstance(elem, ElementTree):
742
elem = ElementTree(elem)
743
elem.write(sys.stdout)
744
tail = elem.getroot().tail
745
if not tail or tail[-1] != "\n":
746
sys.stdout.write("\n")
748
def _encode(s, encoding):
750
return s.encode(encoding)
751
except AttributeError:
752
return s # 1.5.2: assume the string uses the right encoding
754
if sys.version[:3] == "1.5":
755
_escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
757
_escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
767
# "well-known" namespace prefixes
768
"http://www.w3.org/XML/1998/namespace": "xml",
769
"http://www.w3.org/1999/xhtml": "html",
770
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
771
"http://schemas.xmlsoap.org/wsdl/": "wsdl",
774
def _raise_serialization_error(text):
776
"cannot serialize %r (type %s)" % (text, type(text).__name__)
779
def _encode_entity(text, pattern=_escape):
780
# map reserved and non-ascii characters to numerical entities
781
def escape_entities(m, map=_escape_map):
784
for char in m.group():
787
text = "&#%d;" % ord(char)
789
return string.join(out, "")
791
return _encode(pattern.sub(escape_entities, text), "ascii")
793
_raise_serialization_error(text)
796
# the following functions assume an ascii-compatible encoding
799
def _escape_cdata(text, encoding=None, replace=string.replace):
800
# escape character data
804
text = _encode(text, encoding)
806
return _encode_entity(text)
807
text = replace(text, "&", "&")
808
text = replace(text, "<", "<")
809
text = replace(text, ">", ">")
811
except (TypeError, AttributeError):
812
_raise_serialization_error(text)
814
def _escape_attrib(text, encoding=None, replace=string.replace):
815
# escape attribute value
819
text = _encode(text, encoding)
821
return _encode_entity(text)
822
text = replace(text, "&", "&")
823
text = replace(text, "'", "'") # FIXME: overkill
824
text = replace(text, "\"", """)
825
text = replace(text, "<", "<")
826
text = replace(text, ">", ">")
828
except (TypeError, AttributeError):
829
_raise_serialization_error(text)
831
def fixtag(tag, namespaces):
832
# given a decorated tag (of the form {uri}tag), return prefixed
833
# tag and namespace declaration, if any
834
if isinstance(tag, QName):
836
namespace_uri, tag = string.split(tag[1:], "}", 1)
837
prefix = namespaces.get(namespace_uri)
839
prefix = _namespace_map.get(namespace_uri)
841
prefix = "ns%d" % len(namespaces)
842
namespaces[namespace_uri] = prefix
846
xmlns = ("xmlns:%s" % prefix, namespace_uri)
849
return "%s:%s" % (prefix, tag), xmlns
852
# Parses an XML document into an element tree.
854
# @param source A filename or file object containing XML data.
855
# @param parser An optional parser instance. If not given, the
856
# standard {@link XMLTreeBuilder} parser is used.
857
# @return An ElementTree instance
859
def parse(source, parser=None):
861
tree.parse(source, parser)
865
# Parses an XML document into an element tree incrementally, and reports
866
# what's going on to the user.
868
# @param source A filename or file object containing XML data.
869
# @param events A list of events to report back. If omitted, only "end"
870
# events are reported.
871
# @return A (event, elem) iterator.
875
def __init__(self, source, events=None):
876
if getattr(source, "read", None) is None:
877
source = open(source, "rb")
881
self.root = self._root = None
882
self._parser = XMLTreeBuilder()
883
# wire up the parser for event reporting
884
parser = self._parser._parser
885
append = self._events.append
891
parser.ordered_attributes = 1
892
parser.specified_attributes = 1
893
def handler(tag, attrib_in, event=event, append=append,
894
start=self._parser._start_list):
895
append((event, start(tag, attrib_in)))
896
parser.StartElementHandler = handler
897
except AttributeError:
898
def handler(tag, attrib_in, event=event, append=append,
899
start=self._parser._start):
900
append((event, start(tag, attrib_in)))
901
parser.StartElementHandler = handler
903
def handler(tag, event=event, append=append,
904
end=self._parser._end):
905
append((event, end(tag)))
906
parser.EndElementHandler = handler
907
elif event == "start-ns":
908
def handler(prefix, uri, event=event, append=append):
910
uri = _encode(uri, "ascii")
913
append((event, (prefix or "", uri)))
914
parser.StartNamespaceDeclHandler = handler
915
elif event == "end-ns":
916
def handler(prefix, event=event, append=append):
917
append((event, None))
918
parser.EndNamespaceDeclHandler = handler
923
item = self._events[self._index]
925
if self._parser is None:
926
self.root = self._root
934
data = self._file.read(16384)
936
self._parser.feed(data)
938
self._root = self._parser.close()
941
self._index = self._index + 1
949
def __getitem__(self, index):
953
# Parses an XML document from a string constant. This function can
954
# be used to embed "XML literals" in Python code.
956
# @param source A string containing XML data.
957
# @return An Element instance.
961
parser = XMLTreeBuilder()
963
return parser.close()
966
# Parses an XML document from a string constant, and also returns
967
# a dictionary which maps from element id:s to elements.
969
# @param source A string containing XML data.
970
# @return A tuple containing an Element instance and a dictionary.
971
# @defreturn (Element, dictionary)
974
parser = XMLTreeBuilder()
976
tree = parser.close()
978
for elem in tree.getiterator():
985
# Parses an XML document from a string constant. Same as {@link #XML}.
987
# @def fromstring(text)
988
# @param source A string containing XML data.
989
# @return An Element instance.
995
# Generates a string representation of an XML element, including all
998
# @param element An Element instance.
999
# @return An encoded string containing the XML data.
1002
def tostring(element, encoding=None):
1007
file.write = data.append
1008
ElementTree(element).write(file, encoding)
1009
return string.join(data, "")
1012
# Generic element structure builder. This builder converts a sequence
1013
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1014
# #TreeBuilder.end} method calls to a well-formed element structure.
1016
# You can use this class to build an element structure using a custom XML
1017
# parser, or a parser for some other XML-like format.
1019
# @param element_factory Optional element factory. This factory
1020
# is called to create new Element instances, as necessary.
1024
def __init__(self, element_factory=None):
1025
self._data = [] # data collector
1026
self._elem = [] # element stack
1027
self._last = None # last element
1028
self._tail = None # true if we're after an end tag
1029
if element_factory is None:
1030
element_factory = _ElementInterface
1031
self._factory = element_factory
1034
# Flushes the parser buffers, and returns the toplevel document
1037
# @return An Element instance.
1038
# @defreturn Element
1041
assert len(self._elem) == 0, "missing end tags"
1042
assert self._last is not None, "missing toplevel element"
1047
if self._last is not None:
1048
text = string.join(self._data, "")
1050
assert self._last.tail is None, "internal error (tail)"
1051
self._last.tail = text
1053
assert self._last.text is None, "internal error (text)"
1054
self._last.text = text
1058
# Adds text to the current element.
1060
# @param data A string. This should be either an 8-bit string
1061
# containing ASCII text, or a Unicode string.
1063
def data(self, data):
    """Queue a chunk of text on the builder's data collector.

    The chunk is only appended to self._data here.
    """
    collector = self._data
    collector.append(data)
1067
# Opens a new element.
1069
# @param tag The element name.
1070
# @param attrs A dictionary containing element attributes.
1071
# @return The opened element.
1072
# @defreturn Element
1074
def start(self, tag, attrs):
1076
self._last = elem = self._factory(tag, attrs)
1078
self._elem[-1].append(elem)
1079
self._elem.append(elem)
1084
# Closes the current element.
1086
# @param tag The element name.
1087
# @return The closed element.
1088
# @defreturn Element
1092
self._last = self._elem.pop()
1093
assert self._last.tag == tag,\
1094
"end tag mismatch (expected %s, got %s)" % (
1095
self._last.tag, tag)
1100
# Element structure builder for XML source data, based on the
1101
# <b>expat</b> parser.
1103
# @keyparam target Target object. If omitted, the builder uses an
1104
# instance of the standard {@link #TreeBuilder} class.
1105
# @keyparam html Predefine HTML entities. This flag is not supported
1106
# by the current implementation.
1110
class XMLTreeBuilder:
1112
def __init__(self, html=0, target=None):
1114
from xml.parsers import expat
1117
"No module named expat; use SimpleXMLTreeBuilder instead"
1119
self._parser = parser = expat.ParserCreate(None, "}")
1121
target = TreeBuilder()
1122
self._target = target
1123
self._names = {} # name memo cache
1125
parser.DefaultHandlerExpand = self._default
1126
parser.StartElementHandler = self._start
1127
parser.EndElementHandler = self._end
1128
parser.CharacterDataHandler = self._data
1129
# let expat do the buffering, if supported
1131
self._parser.buffer_text = 1
1132
except AttributeError:
1134
# use new-style attribute handling, if supported
1136
self._parser.ordered_attributes = 1
1137
self._parser.specified_attributes = 1
1138
parser.StartElementHandler = self._start_list
1139
except AttributeError:
1142
if not parser.returns_unicode:
1144
# target.xml(encoding, None)
1145
self._doctype = None
1148
def _fixtext(self, text):
1149
# convert text string to ascii, if possible
1151
return _encode(text, "ascii")
1152
except UnicodeError:
1155
def _fixname(self, key):
1156
# expand qname, and convert name string to ascii, if possible
1158
name = self._names[key]
1163
self._names[key] = name = self._fixtext(name)
1166
def _start(self, tag, attrib_in):
1167
fixname = self._fixname
1170
for key, value in attrib_in.items():
1171
attrib[fixname(key)] = self._fixtext(value)
1172
return self._target.start(tag, attrib)
1174
def _start_list(self, tag, attrib_in):
1175
fixname = self._fixname
1179
for i in range(0, len(attrib_in), 2):
1180
attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
1181
return self._target.start(tag, attrib)
1183
def _data(self, text):
1184
return self._target.data(self._fixtext(text))
1186
def _end(self, tag):
1187
return self._target.end(self._fixname(tag))
1189
def _default(self, text):
1192
# deal with undefined entities
1194
self._target.data(self.entity[text[1:-1]])
1196
from xml.parsers import expat
1198
"undefined entity %s: line %d, column %d" %
1199
(text, self._parser.ErrorLineNumber,
1200
self._parser.ErrorColumnNumber)
1202
elif prefix == "<" and text[:9] == "<!DOCTYPE":
1203
self._doctype = [] # inside a doctype declaration
1204
elif self._doctype is not None:
1205
# parse doctype contents
1207
self._doctype = None
1209
text = string.strip(text)
1212
self._doctype.append(text)
1213
n = len(self._doctype)
1215
type = self._doctype[1]
1216
if type == "PUBLIC" and n == 4:
1217
name, type, pubid, system = self._doctype
1218
elif type == "SYSTEM" and n == 3:
1219
name, type, system = self._doctype
1225
self.doctype(name, pubid, system[1:-1])
1226
self._doctype = None
1229
# Handles a doctype declaration.
1231
# @param name Doctype name.
1232
# @param pubid Public identifier.
1233
# @param system System identifier.
1235
def doctype(self, name, pubid, system):
1239
# Feeds data to the parser.
1241
# @param data Encoded data.
1243
def feed(self, data):
    """Hand a chunk of encoded data to the underlying parser.

    The chunk goes to self._parser.Parse with 0 as the second
    argument; nothing is returned.
    """
    parse = self._parser.Parse
    parse(data, 0)
1247
# Finishes feeding data to the parser.
1249
# @return An element structure.
1250
# @defreturn Element
1253
self._parser.Parse("", 1) # end of data
1254
tree = self._target.close()
1255
del self._target, self._parser # get rid of circular references