3
# $Id: ElementTree.py 1862 2004-06-18 07:31:02Z Fredrik $
5
# light-weight XML support for Python 1.5.2 and later.
7
# this is a stripped-down version of Secret Labs' effDOM library (part
8
# of xmlToolkit). compared to effDOM, this implementation has:
10
# - no support for observers
11
# - no html-specific extensions (e.g. entity preload)
12
# - no custom entities, doctypes, etc
13
# - no accelerator module
16
# 2001-10-20 fl created (from various sources)
17
# 2001-11-01 fl return root from parse method
18
# 2002-02-16 fl sort attributes in lexical order
19
# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
20
# 2002-05-01 fl finished TreeBuilder refactoring
21
# 2002-07-14 fl added basic namespace support to ElementTree.write
22
# 2002-07-25 fl added QName attribute support
23
# 2002-10-20 fl fixed encoding in write
24
# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
25
# 2002-11-27 fl accept file objects or file names for parse/write
26
# 2002-12-04 fl moved XMLTreeBuilder back to this module
27
# 2003-01-11 fl fixed entity encoding glitch for us-ascii
28
# 2003-02-13 fl added XML literal factory
29
# 2003-02-21 fl added ProcessingInstruction/PI factory
30
# 2003-05-11 fl added tostring/fromstring helpers
31
# 2003-05-26 fl added ElementPath support
32
# 2003-07-05 fl added makeelement factory method
33
# 2003-07-28 fl added more well-known namespace prefixes
34
# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
35
# 2003-09-04 fl fall back on emulator if ElementPath is not installed
36
# 2003-10-31 fl markup updates
37
# 2003-11-15 fl fixed nested namespace bug
38
# 2004-03-28 fl added XMLID helper
39
# 2004-06-02 fl added default support to findtext
40
# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
42
# Copyright (c) 1999-2004 by Fredrik Lundh. All rights reserved.
44
# fredrik@pythonware.com
45
# http://www.pythonware.com
47
# --------------------------------------------------------------------
48
# The ElementTree toolkit is
50
# Copyright (c) 1999-2004 by Fredrik Lundh
52
# By obtaining, using, and/or copying this software and/or its
53
# associated documentation, you agree that you have read, understood,
54
# and will comply with the following terms and conditions:
56
# Permission to use, copy, modify, and distribute this software and
57
# its associated documentation for any purpose and without fee is
58
# hereby granted, provided that the above copyright notice appears in
59
# all copies, and that both that copyright notice and this permission
60
# notice appear in supporting documentation, and that the name of
61
# Secret Labs AB or the author not be used in advertising or publicity
62
# pertaining to distribution of the software without specific, written
65
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
66
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
67
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
68
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
69
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
70
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
71
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
73
# --------------------------------------------------------------------
79
"Element", "ElementTree",
83
"PI", "ProcessingInstruction",
93
# The <b>Element</b> type is a flexible container object, designed to
94
# store hierarchical data structures in memory. The type can be
95
# described as a cross between a list and a dictionary.
97
# Each element has a number of properties associated with it:
99
# <li>a <i>tag</i>. This is a string identifying what kind of data
100
# this element represents (the element type, in other words).</li>
101
# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
102
# <li>a <i>text</i> string.</li>
103
# <li>an optional <i>tail</i> string.</li>
104
# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
107
# To create an element instance, use the {@link #Element} or {@link
108
# #SubElement} factory functions.
110
# The {@link #ElementTree} class can be used to wrap an element
111
# structure, and convert it from and to XML.
114
import string, sys, re
116
class _SimpleElementPath:
117
# emulate pre-1.2 find/findtext/findall behaviour
118
def find(self, element, tag):
123
def findtext(self, element, tag, default=None):
126
return elem.text or ""
128
def findall(self, element, tag):
130
return element.getiterator(tag[3:])
140
# FIXME: issue warning in this case?
141
ElementPath = _SimpleElementPath()
143
# TODO: add support for custom namespace resolvers/default namespaces
144
# TODO: add improved support for incremental parsing
149
# Internal element class. This class defines the Element interface,
150
# and provides a reference implementation of this interface.
152
# You should not create instances of this class directly. Use the
153
# appropriate factory functions instead, such as {@link #Element}
154
# and {@link #SubElement}.
159
# @see ProcessingInstruction
161
class _ElementInterface:
162
# <tag attrib>text<child/>...</tag>tail
165
# (Attribute) Element tag.
170
# (Attribute) Element attribute dictionary. Where possible, use
171
# {@link #_ElementInterface.get},
172
# {@link #_ElementInterface.set},
173
# {@link #_ElementInterface.keys}, and
174
# {@link #_ElementInterface.items} to access
175
# element attributes.
180
# (Attribute) Text before first subelement. This is either a
181
# string or the value None, if there was no text.
186
# (Attribute) Text after this element's end tag, but before the
187
# next sibling element's start tag. This is either a string or
188
# the value None, if there was no text.
190
tail = None # text after end tag, if any
192
def __init__(self, tag, attrib):
198
return "<Element %s at %x>" % (self.tag, id(self))
201
# Creates a new element object of the same type as this element.
203
# @param tag Element tag.
204
# @param attrib Element attributes, given as a dictionary.
205
# @return A new element instance.
207
def makeelement(self, tag, attrib):
    # Delegates to the module-level Element factory; the receiver
    # itself is not consulted when building the new element.
    new_element = Element(tag, attrib)
    return new_element
211
# Returns the number of subelements.
213
# @return The number of subelements.
216
return len(self._children)
219
# Returns the given subelement.
221
# @param index What subelement to return.
222
# @return The given subelement.
223
# @exception IndexError If the given element does not exist.
225
def __getitem__(self, index):
226
return self._children[index]
229
# Replaces the given subelement.
231
# @param index What subelement to replace.
232
# @param element The new element value.
233
# @exception IndexError If the given element does not exist.
234
# @exception AssertionError If element is not a valid object.
236
def __setitem__(self, index, element):
    # Validate before touching the child list, so a rejected object
    # never replaces an existing subelement.
    assert iselement(element)
    children = self._children
    children[index] = element
241
# Deletes the given subelement.
243
# @param index What subelement to delete.
244
# @exception IndexError If the given element does not exist.
246
def __delitem__(self, index):
247
del self._children[index]
250
# Returns a list containing subelements in the given range.
252
# @param start The first subelement to return.
253
# @param stop The first subelement that shouldn't be returned.
254
# @return A sequence object containing subelements.
256
def __getslice__(self, start, stop):
257
return self._children[start:stop]
260
# Replaces a number of subelements with elements from a sequence.
262
# @param start The first subelement to replace.
263
# @param stop The first subelement that shouldn't be replaced.
264
# @param elements A sequence object with zero or more elements.
265
# @exception AssertionError If a sequence member is not a valid object.
267
def __setslice__(self, start, stop, elements):
    # Replaces the subelements in [start:stop] with the given elements.
    #
    # The input is materialised exactly once.  Validating by iterating
    # "elements" and then calling list(elements) again would consume a
    # one-shot iterable (e.g. a generator) during validation and then
    # assign an empty list, silently deleting the slice.
    #
    # Raises AssertionError if any member is not a valid element; in
    # that case the child list is left untouched.
    replacement = list(elements)
    for element in replacement:
        assert iselement(element)
    self._children[start:stop] = replacement
273
# Deletes a number of subelements.
275
# @param start The first subelement to delete.
276
# @param stop The first subelement to leave in there.
278
def __delslice__(self, start, stop):
279
del self._children[start:stop]
282
# Adds a subelement to the end of this element.
284
# @param element The element to add.
285
# @exception AssertionError If the element is not a valid object.
287
def append(self, element):
    # Check the object first; only valid elements reach the child list.
    assert iselement(element)
    children = self._children
    children.append(element)
292
# Inserts a subelement at the given position in this element.
294
# @param index Where to insert the new subelement.
295
# @exception AssertionError If the element is not a valid object.
297
def insert(self, index, element):
    # Check the object first, then place it at the requested position
    # in the child list.
    assert iselement(element)
    children = self._children
    children.insert(index, element)
302
# Removes a matching subelement. Unlike the <b>find</b> methods,
303
# this method compares elements based on identity, not on tag
306
# @param element What element to remove.
307
# @exception ValueError If a matching element could not be found.
308
# @exception AssertionError If the element is not a valid object.
310
def remove(self, element):
    # Check the object first, then let the child list locate and
    # drop it.
    assert iselement(element)
    children = self._children
    children.remove(element)
315
# Returns all subelements. The elements are returned in document
318
# @return A list of subelements.
319
# @defreturn list of Element instances
321
def getchildren(self):
    # Hands back the internal child list itself (not a copy), so
    # changes made by the caller are visible on this element.
    children = self._children
    return children
325
# Finds the first matching subelement, by tag name or path.
327
# @param path What element to look for.
328
# @return The first matching element, or None if no element was found.
329
# @defreturn Element or None
331
def find(self, path):
    # The search is carried out by the module-level ElementPath
    # object, starting from this element.
    match = ElementPath.find(self, path)
    return match
335
# Finds text for the first matching subelement, by tag name or path.
337
# @param path What element to look for.
338
# @param default What to return if the element was not found.
339
# @return The text content of the first matching element, or the
340
# default value if no element was found. Note that if the element
341
# is found, but has no text content, this method returns an
345
def findtext(self, path, default=None):
    # The search is carried out by the module-level ElementPath
    # object, which also receives the fallback value.
    text = ElementPath.findtext(self, path, default)
    return text
349
# Finds all matching subelements, by tag name or path.
351
# @param path What element to look for.
352
# @return A list or iterator containing all matching elements,
354
# @defreturn list of Element instances
356
def findall(self, path):
    # The search is carried out by the module-level ElementPath
    # object, starting from this element.
    matches = ElementPath.findall(self, path)
    return matches
360
# Resets an element. This function removes all subelements, clears
361
# all attributes, and sets the text and tail attributes to None.
366
self.text = self.tail = None
369
# Gets an element attribute.
371
# @param key What attribute to look for.
372
# @param default What to return if the attribute was not found.
373
# @return The attribute value, or the default value, if the
374
# attribute was not found.
375
# @defreturn string or None
377
def get(self, key, default=None):
    # Look the name up in the attribute dictionary, falling back to
    # the supplied default when it is absent.
    attributes = self.attrib
    return attributes.get(key, default)
381
# Sets an element attribute.
383
# @param key What attribute to set.
384
# @param value The attribute value.
386
def set(self, key, value):
    # Store the value in the attribute dictionary, overwriting any
    # existing entry for the same name.
    attributes = self.attrib
    attributes[key] = value
390
# Gets a list of attribute names. The names are returned in an
391
# arbitrary order (just like for an ordinary Python dictionary).
393
# @return A list of element attribute names.
394
# @defreturn list of strings
397
return self.attrib.keys()
400
# Gets element attributes, as a sequence. The attributes are
401
# returned in an arbitrary order.
403
# @return A list of (name, value) tuples for all attributes.
404
# @defreturn list of (string, string) tuples
407
return self.attrib.items()
410
# Creates a tree iterator. The iterator loops over this element
411
# and all subelements, in document order, and returns all elements
412
# with a matching tag.
414
# If the tree structure is modified during iteration, the result
417
# @param tag What tags to look for (default is to return all elements).
418
# @return A list or iterator containing all the matching elements.
419
# @defreturn list or iterator
421
def getiterator(self, tag=None):
425
if tag is None or self.tag == tag:
427
for node in self._children:
428
nodes.extend(node.getiterator(tag))
432
_Element = _ElementInterface
435
# Element factory. This function returns an object implementing the
436
# standard Element interface. The exact class or type of that object
437
# is implementation dependent, but it will always be compatible with
438
# the {@link #_ElementInterface} class in this module.
440
# The element name, attribute names, and attribute values can be
441
# either 8-bit ASCII strings or Unicode strings.
443
# @param tag The element name.
444
# @param attrib An optional dictionary, containing element attributes.
445
# @param **extra Additional attributes, given as keyword arguments.
446
# @return An element instance.
449
def Element(tag, attrib={}, **extra):
450
attrib = attrib.copy()
452
return _ElementInterface(tag, attrib)
455
# Subelement factory. This function creates an element instance, and
456
# appends it to an existing element.
458
# The element name, attribute names, and attribute values can be
459
# either 8-bit ASCII strings or Unicode strings.
461
# @param parent The parent element.
462
# @param tag The subelement name.
463
# @param attrib An optional dictionary, containing element attributes.
464
# @param **extra Additional attributes, given as keyword arguments.
465
# @return An element instance.
468
def SubElement(parent, tag, attrib={}, **extra):
469
attrib = attrib.copy()
471
element = parent.makeelement(tag, attrib)
472
parent.append(element)
476
# Comment element factory. This factory function creates a special
477
# element that will be serialized as an XML comment.
479
# The comment string can be either an 8-bit ASCII string or a Unicode
482
# @param text A string containing the comment string.
483
# @return An element instance, representing a comment.
486
def Comment(text=None):
487
element = Element(Comment)
492
# PI element factory. This factory function creates a special element
493
# that will be serialized as an XML processing instruction.
495
# @param target A string containing the PI target.
496
# @param text A string containing the PI contents, if any.
497
# @return An element instance, representing a PI.
500
def ProcessingInstruction(target, text=None):
501
element = Element(ProcessingInstruction)
502
element.text = target
504
element.text = element.text + " " + text
507
PI = ProcessingInstruction
510
# QName wrapper. This can be used to wrap a QName attribute value, in
511
# order to get proper namespace handling on output.
513
# @param text_or_uri A string containing the QName value, in the form {uri}local,
514
# or, if the tag argument is given, the URI part of a QName.
515
# @param tag Optional tag. If given, the first argument is interpreted as
516
# a URI, and this argument is interpreted as a local name.
517
# @return An opaque object, representing the QName.
520
def __init__(self, text_or_uri, tag=None):
522
text_or_uri = "{%s}%s" % (text_or_uri, tag)
523
self.text = text_or_uri
527
return hash(self.text)
528
def __cmp__(self, other):
    # Ordering is defined by the wrapped text.  Another QName is
    # unwrapped first; anything else is compared against the text
    # as-is.
    if isinstance(other, QName):
        other = other.text
    return cmp(self.text, other)
534
# ElementTree wrapper class. This class represents an entire element
535
# hierarchy, and adds some extra support for serialization to and from
538
# @param element Optional root element.
539
# @keyparam file Optional file handle or name. If given, the
540
# tree is initialized with the contents of this XML file.
544
def __init__(self, element=None, file=None):
545
assert element is None or iselement(element)
546
self._root = element # first node
551
# Gets the root element for this tree.
553
# @return An element instance.
560
# Replaces the root element for this tree. This discards the
561
# current contents of the tree, and replaces it with the given
562
# element. Use with care.
564
# @param element An element instance.
566
def _setroot(self, element):
567
assert iselement(element)
571
# Loads an external XML document into this element tree.
573
# @param source A file name or file object.
574
# @param parser An optional parser instance. If not given, the
575
# standard {@link XMLTreeBuilder} parser is used.
576
# @return The document root element.
579
def parse(self, source, parser=None):
580
if not hasattr(source, "read"):
581
source = open(source, "rb")
583
parser = XMLTreeBuilder()
585
data = source.read(32768)
589
self._root = parser.close()
593
# Creates a tree iterator for the root element. The iterator loops
594
# over all elements in this tree, in document order.
596
# @param tag What tags to look for (default is to return all elements)
597
# @return An iterator.
598
# @defreturn iterator
600
def getiterator(self, tag=None):
    # A tree without a root cannot be iterated; otherwise the root
    # element does the actual work.
    root = self._root
    assert root is not None
    return root.getiterator(tag)
605
# Finds the first toplevel element with given tag.
606
# Same as getroot().find(path).
608
# @param path What element to look for.
609
# @return The first matching element, or None if no element was found.
610
# @defreturn Element or None
612
def find(self, path):
613
assert self._root is not None
616
return self._root.find(path)
619
# Finds the element text for the first toplevel element with given
620
# tag. Same as getroot().findtext(path).
622
# @param path What toplevel element to look for.
623
# @param default What to return if the element was not found.
624
# @return The text content of the first matching element, or the
625
# default value if no element was found. Note that if the element
626
# is found, but has no text content, this method returns an
630
def findtext(self, path, default=None):
631
assert self._root is not None
634
return self._root.findtext(path, default)
637
# Finds all toplevel elements with the given tag.
638
# Same as getroot().findall(path).
640
# @param path What element to look for.
641
# @return A list or iterator containing all matching elements,
643
# @defreturn list of Element instances
645
def findall(self, path):
646
assert self._root is not None
649
return self._root.findall(path)
652
# Writes the element tree to a file, as XML.
654
# @param file A file name, or a file object opened for writing.
655
# @param encoding Optional output encoding (default is US-ASCII).
657
def write(self, file, encoding="us-ascii"):
658
assert self._root is not None
659
if not hasattr(file, "write"):
660
file = open(file, "wb")
662
encoding = "us-ascii"
663
elif encoding != "utf-8" and encoding != "us-ascii":
664
file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
665
self._write(file, self._root, encoding, {})
667
def _write(self, file, node, encoding, namespaces):
671
file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
672
elif tag is ProcessingInstruction:
673
file.write("<?%s?>" % _escape_cdata(node.text, encoding))
676
xmlns_items = [] # new namespaces in this scope
678
if isinstance(tag, QName) or tag[:1] == "{":
679
tag, xmlns = fixtag(tag, namespaces)
680
if xmlns: xmlns_items.append(xmlns)
682
_raise_serialization_error(tag)
683
file.write("<" + _encode(tag, encoding))
684
if items or xmlns_items:
685
items.sort() # lexical order
688
if isinstance(k, QName) or k[:1] == "{":
689
k, xmlns = fixtag(k, namespaces)
690
if xmlns: xmlns_items.append(xmlns)
692
_raise_serialization_error(k)
694
if isinstance(v, QName):
695
v, xmlns = fixtag(v, namespaces)
696
if xmlns: xmlns_items.append(xmlns)
698
_raise_serialization_error(v)
699
file.write(" %s=\"%s\"" % (_encode(k, encoding),
700
_escape_attrib(v, encoding)))
701
for k, v in xmlns_items:
702
file.write(" %s=\"%s\"" % (_encode(k, encoding),
703
_escape_attrib(v, encoding)))
704
if node.text or node:
707
file.write(_escape_cdata(node.text, encoding))
709
self._write(file, n, encoding, namespaces)
710
file.write("</" + _encode(tag, encoding) + ">")
713
for k, v in xmlns_items:
716
file.write(_escape_cdata(node.tail, encoding))
718
# --------------------------------------------------------------------
722
# Checks if an object appears to be a valid element object.
724
# @param element An element instance.
725
# @return A true value if this is an element object.
728
def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    #
    # Instances of the reference element class are accepted outright;
    # any other object qualifies if it exposes a "tag" attribute.
    is_reference_type = isinstance(element, _ElementInterface)
    if is_reference_type:
        return is_reference_type
    return hasattr(element, "tag")
734
# Writes an element tree or element structure to sys.stdout. This
735
# function should be used for debugging only.
737
# The exact output format is implementation dependent. In this
738
# version, it's written as an ordinary XML file.
740
# @param elem An element tree or an individual element.
744
if not isinstance(elem, ElementTree):
745
elem = ElementTree(elem)
746
elem.write(sys.stdout)
747
tail = elem.getroot().tail
748
if not tail or tail[-1] != "\n":
749
sys.stdout.write("\n")
751
def _encode(s, encoding):
753
return s.encode(encoding)
754
except AttributeError:
755
return s # 1.5.2: assume the string uses the right encoding
757
if sys.version[:3] == "1.5":
758
_escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
760
_escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
770
# "well-known" namespace prefixes
771
"http://www.w3.org/XML/1998/namespace": "xml",
772
"http://www.w3.org/1999/xhtml": "html",
773
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
774
"http://schemas.xmlsoap.org/wsdl/": "wsdl",
777
def _raise_serialization_error(text):
779
"cannot serialize %r (type %s)" % (text, type(text).__name__)
782
def _encode_entity(text, pattern=_escape):
783
# map reserved and non-ascii characters to numerical entities
784
def escape_entities(m, map=_escape_map):
787
for char in m.group():
790
text = "&#%d;" % ord(char)
792
return string.join(out, "")
794
return _encode(pattern.sub(escape_entities, text), "ascii")
796
_raise_serialization_error(text)
799
# the following functions assume an ascii-compatible encoding
802
def _escape_cdata(text, encoding=None, replace=string.replace):
803
# escape character data
807
text = _encode(text, encoding)
809
return _encode_entity(text)
810
text = replace(text, "&", "&")
811
text = replace(text, "<", "<")
812
text = replace(text, ">", ">")
814
except (TypeError, AttributeError):
815
_raise_serialization_error(text)
817
def _escape_attrib(text, encoding=None, replace=string.replace):
818
# escape attribute value
822
text = _encode(text, encoding)
824
return _encode_entity(text)
825
text = replace(text, "&", "&")
826
text = replace(text, "'", "'") # FIXME: overkill
827
text = replace(text, "\"", """)
828
text = replace(text, "<", "<")
829
text = replace(text, ">", ">")
831
except (TypeError, AttributeError):
832
_raise_serialization_error(text)
834
def fixtag(tag, namespaces):
835
# given a decorated tag (of the form {uri}tag), return prefixed
836
# tag and namespace declaration, if any
837
if isinstance(tag, QName):
839
namespace_uri, tag = string.split(tag[1:], "}", 1)
840
prefix = namespaces.get(namespace_uri)
842
prefix = _namespace_map.get(namespace_uri)
844
prefix = "ns%d" % len(namespaces)
845
namespaces[namespace_uri] = prefix
849
xmlns = ("xmlns:%s" % prefix, namespace_uri)
852
return "%s:%s" % (prefix, tag), xmlns
855
# Parses an XML document into an element tree.
857
# @param source A filename or file object containing XML data.
858
# @param parser An optional parser instance. If not given, the
859
# standard {@link XMLTreeBuilder} parser is used.
860
# @return An ElementTree instance
862
def parse(source, parser=None):
864
tree.parse(source, parser)
868
# Parses an XML document from a string constant. This function can
869
# be used to embed "XML literals" in Python code.
871
# @param source A string containing XML data.
872
# @return An Element instance.
876
parser = XMLTreeBuilder()
878
return parser.close()
881
# Parses an XML document from a string constant, and also returns
882
# a dictionary which maps from element ids to elements.
884
# @param source A string containing XML data.
885
# @return A tuple containing an Element instance and a dictionary.
886
# @defreturn (Element, dictionary)
889
parser = XMLTreeBuilder()
891
tree = parser.close()
893
for elem in tree.getiterator():
900
# Parses an XML document from a string constant. Same as {@link #XML}.
902
# @def fromstring(text)
903
# @param source A string containing XML data.
904
# @return An Element instance.
910
# Generates a string representation of an XML element, including all
913
# @param element An Element instance.
914
# @return An encoded string containing the XML data.
917
def tostring(element, encoding=None):
922
file.write = data.append
923
ElementTree(element).write(file, encoding)
924
return string.join(data, "")
927
# Generic element structure builder. This builder converts a sequence
928
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
929
# #TreeBuilder.end} method calls to a well-formed element structure.
931
# You can use this class to build an element structure using a custom XML
932
# parser, or a parser for some other XML-like format.
934
# @param element_factory Optional element factory. This factory
935
# is called to create new Element instances, as necessary.
939
def __init__(self, element_factory=None):
940
self._data = [] # data collector
941
self._elem = [] # element stack
942
self._last = None # last element
943
self._tail = None # true if we're after an end tag
944
if element_factory is None:
945
element_factory = _ElementInterface
946
self._factory = element_factory
949
# Flushes the parser buffers, and returns the toplevel document
952
# @return An Element instance.
956
assert len(self._elem) == 0, "missing end tags"
957
assert self._last != None, "missing toplevel element"
962
if self._last is not None:
963
text = string.join(self._data, "")
965
assert self._last.tail is None, "internal error (tail)"
966
self._last.tail = text
968
assert self._last.text is None, "internal error (text)"
969
self._last.text = text
973
# Adds text to the current element.
975
# @param data A string. This should be either an 8-bit string
976
# containing ASCII text, or a Unicode string.
978
def data(self, data):
    # Text is only collected here; the chunk is added to the pending
    # data list as-is.
    pending = self._data
    pending.append(data)
982
# Opens a new element.
984
# @param tag The element name.
985
# @param attrs A dictionary containing element attributes.
986
# @return The opened element.
989
def start(self, tag, attrs):
991
self._last = elem = self._factory(tag, attrs)
993
self._elem[-1].append(elem)
994
self._elem.append(elem)
999
# Closes the current element.
1001
# @param tag The element name.
1002
# @return The closed element.
1003
# @defreturn Element
1007
self._last = self._elem.pop()
1008
assert self._last.tag == tag,\
1009
"end tag mismatch (expected %s, got %s)" % (
1010
self._last.tag, tag)
1015
# Element structure builder for XML source data, based on the
1016
# <b>expat</b> parser.
1018
# @keyparam target Target object. If omitted, the builder uses an
1019
# instance of the standard {@link #TreeBuilder} class.
1020
# @keyparam html Predefine HTML entities. This flag is not supported
1021
# by the current implementation.
1025
class XMLTreeBuilder:
1027
def __init__(self, html=0, target=None):
1028
from xml.parsers import expat
1029
self._parser = parser = expat.ParserCreate(None, "}")
1031
target = TreeBuilder()
1032
self._target = target
1033
self._names = {} # name memo cache
1034
parser.DefaultHandler = self._default
1035
parser.StartElementHandler = self._start
1036
parser.EndElementHandler = self._end
1037
parser.CharacterDataHandler = self._data
1039
if not parser.returns_unicode:
1041
# target.xml(encoding, None)
1042
self._doctype = None
1045
def _fixtext(self, text):
1046
# convert text string to ascii, if possible
1048
return str(text) # what if the default encoding is changed?
1049
except UnicodeError:
1052
def _fixname(self, key):
1053
# expand qname, and convert name string to ascii, if possible
1055
name = self._names[key]
1060
self._names[key] = name = self._fixtext(name)
1063
def _start(self, tag, attrib_in):
1064
fixname = self._fixname
1067
for key, value in attrib_in.items():
1068
attrib[fixname(key)] = self._fixtext(value)
1069
return self._target.start(tag, attrib)
1071
def _data(self, text):
1072
return self._target.data(self._fixtext(text))
1074
def _end(self, tag):
1075
return self._target.end(self._fixname(tag))
1077
def _default(self, text):
1080
# deal with undefined entities
1082
self._target.data(self.entity[text[1:-1]])
1084
from xml.parsers import expat
1086
"undefined entity %s: line %d, column %d" %
1087
(text, self._parser.ErrorLineNumber,
1088
self._parser.ErrorColumnNumber)
1090
elif prefix == "<" and text[:9] == "<!DOCTYPE":
1091
self._doctype = [] # inside a doctype declaration
1092
elif self._doctype is not None:
1093
# parse doctype contents
1095
self._doctype = None
1097
text = string.strip(text)
1100
self._doctype.append(text)
1101
n = len(self._doctype)
1103
type = self._doctype[1]
1104
if type == "PUBLIC" and n == 4:
1105
name, type, pubid, system = self._doctype
1106
elif type == "SYSTEM" and n == 3:
1107
name, type, system = self._doctype
1113
self.doctype(name, pubid, system[1:-1])
1114
self._doctype = None
1117
# Handles a doctype declaration.
1119
# @param name Doctype name.
1120
# @param pubid Public identifier.
1121
# @param system System identifier.
1123
def doctype(self, name, pubid, system):
1127
# Feeds data to the parser.
1129
# @param data Encoded data.
1131
def feed(self, data):
    # Pass the chunk to the underlying parser; the 0 flag marks it
    # as not being the final chunk.
    parser = self._parser
    parser.Parse(data, 0)
1135
# Finishes feeding data to the parser.
1137
# @return An element structure.
1138
# @defreturn Element
1141
self._parser.Parse("", 1) # end of data
1142
tree = self._target.close()
1143
del self._target, self._parser # get rid of circular references