~bzr-pqm/bzr/bzr.dev

1185.1.29 by Robert Collins
merge merge tweaks from aaron, which includes latest .dev
1
#
2
# ElementTree
3
# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4
#
5
# light-weight XML support for Python 1.5.2 and later.
6
#
7
# history:
8
# 2001-10-20 fl   created (from various sources)
9
# 2001-11-01 fl   return root from parse method
10
# 2002-02-16 fl   sort attributes in lexical order
11
# 2002-04-06 fl   TreeBuilder refactoring, added PythonDoc markup
12
# 2002-05-01 fl   finished TreeBuilder refactoring
13
# 2002-07-14 fl   added basic namespace support to ElementTree.write
14
# 2002-07-25 fl   added QName attribute support
15
# 2002-10-20 fl   fixed encoding in write
16
# 2002-11-24 fl   changed default encoding to ascii; fixed attribute encoding
17
# 2002-11-27 fl   accept file objects or file names for parse/write
18
# 2002-12-04 fl   moved XMLTreeBuilder back to this module
19
# 2003-01-11 fl   fixed entity encoding glitch for us-ascii
20
# 2003-02-13 fl   added XML literal factory
21
# 2003-02-21 fl   added ProcessingInstruction/PI factory
22
# 2003-05-11 fl   added tostring/fromstring helpers
23
# 2003-05-26 fl   added ElementPath support
24
# 2003-07-05 fl   added makeelement factory method
25
# 2003-07-28 fl   added more well-known namespace prefixes
26
# 2003-08-15 fl   fixed typo in ElementTree.findtext (Thomas Dartsch)
27
# 2003-09-04 fl   fall back on emulator if ElementPath is not installed
28
# 2003-10-31 fl   markup updates
29
# 2003-11-15 fl   fixed nested namespace bug
30
# 2004-03-28 fl   added XMLID helper
31
# 2004-06-02 fl   added default support to findtext
32
# 2004-06-08 fl   fixed encoding of non-ascii element/attribute names
33
# 2004-08-23 fl   take advantage of post-2.1 expat features
34
# 2005-02-01 fl   added iterparse implementation
35
# 2005-03-02 fl   fixed iterparse support for pre-2.2 versions
36
#
37
# Copyright (c) 1999-2005 by Fredrik Lundh.  All rights reserved.
38
#
39
# fredrik@pythonware.com
40
# http://www.pythonware.com
41
#
42
# --------------------------------------------------------------------
43
# The ElementTree toolkit is
44
#
45
# Copyright (c) 1999-2005 by Fredrik Lundh
46
#
47
# By obtaining, using, and/or copying this software and/or its
48
# associated documentation, you agree that you have read, understood,
49
# and will comply with the following terms and conditions:
50
#
51
# Permission to use, copy, modify, and distribute this software and
52
# its associated documentation for any purpose and without fee is
53
# hereby granted, provided that the above copyright notice appears in
54
# all copies, and that both that copyright notice and this permission
55
# notice appear in supporting documentation, and that the name of
56
# Secret Labs AB or the author not be used in advertising or publicity
57
# pertaining to distribution of the software without specific, written
58
# prior permission.
59
#
60
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67
# OF THIS SOFTWARE.
68
# --------------------------------------------------------------------
69
6379.6.3 by Jelmer Vernooij
Use absolute_import.
70
from __future__ import absolute_import
71
1185.1.29 by Robert Collins
merge merge tweaks from aaron, which includes latest .dev
72
__all__ = [
73
    # public symbols
74
    "Comment",
75
    "dump",
76
    "Element", "ElementTree",
77
    "fromstring",
78
    "iselement", "iterparse",
79
    "parse",
80
    "PI", "ProcessingInstruction",
81
    "QName",
82
    "SubElement",
83
    "tostring",
84
    "TreeBuilder",
85
    "VERSION", "XML",
86
    "XMLTreeBuilder",
87
    ]
88
89
##
90
# The <b>Element</b> type is a flexible container object, designed to
91
# store hierarchical data structures in memory. The type can be
92
# described as a cross between a list and a dictionary.
93
# <p>
94
# Each element has a number of properties associated with it:
95
# <ul>
96
# <li>a <i>tag</i>. This is a string identifying what kind of data
97
# this element represents (the element type, in other words).</li>
98
# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
99
# <li>a <i>text</i> string.</li>
100
# <li>an optional <i>tail</i> string.</li>
101
# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
102
# </ul>
103
#
104
# To create an element instance, use the {@link #Element} or {@link
105
# #SubElement} factory functions.
106
# <p>
107
# The {@link #ElementTree} class can be used to wrap an element
108
# structure, and convert it from and to XML.
109
##
110
111
import string, sys, re
112
113
class _SimpleElementPath:
    """Fallback path engine emulating pre-1.2 find/findtext/findall.

    Only direct children are searched, except that findall() accepts a
    leading ".//" and then delegates to the element's getiterator().
    """

    def find(self, element, tag):
        """Return the first direct child whose tag equals tag, or None."""
        for child in element:
            if child.tag != tag:
                continue
            return child
        return None

    def findtext(self, element, tag, default=None):
        """Return the text of the first direct child tagged tag.

        A matching child without text yields ""; default is returned
        only when no child matches.
        """
        for child in element:
            if child.tag != tag:
                continue
            return child.text or ""
        return default

    def findall(self, element, tag):
        """Return the matching elements in document order.

        A tag starting with ".//" returns whatever
        element.getiterator() gives for the remainder of the tag;
        otherwise a new list of matching direct children is returned.
        """
        if tag[:3] == ".//":
            return element.getiterator(tag[3:])
        return [child for child in element if child.tag == tag]
133
134
try:
135
    import ElementPath
136
except ImportError:
137
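    # FIXME: issue warning in this case?
    # NOTE(review): absolute_import is enabled at the top of this module,
    # so the import above only finds a top-level ElementPath module; a
    # module of that name next to this file would not be picked up.
    # Confirm that silently using the emulator is intended.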
138
    ElementPath = _SimpleElementPath()
139
140
# TODO: add support for custom namespace resolvers/default namespaces
141
# TODO: add improved support for incremental parsing
142
143
VERSION = "1.2.6"
144
145
##
146
# Internal element class.  This class defines the Element interface,
147
# and provides a reference implementation of this interface.
148
# <p>
149
# You should not create instances of this class directly.  Use the
150
# appropriate factory functions instead, such as {@link #Element}
151
# and {@link #SubElement}.
152
#
153
# @see Element
154
# @see SubElement
155
# @see Comment
156
# @see ProcessingInstruction
157
158
class _ElementInterface:
    """Internal element class; reference implementation of the Element
    interface.

    Serialized layout: <tag attrib>text<child/>...</tag>tail

    Do not create instances of this class directly.  Use the factory
    functions (Element, SubElement, Comment, ProcessingInstruction)
    instead.
    """

    # (Attribute) Element tag.
    tag = None

    # (Attribute) Element attribute dictionary.  Where possible, use the
    # get, set, keys and items methods to access element attributes.
    attrib = None

    # (Attribute) Text before the first subelement.  This is either a
    # string or the value None, if there was no text.
    text = None

    # (Attribute) Text after this element's end tag, but before the next
    # sibling element's start tag.  This is either a string or the value
    # None, if there was no text.
    tail = None

    def __init__(self, tag, attrib):
        """Store tag and attrib (not copied) and start with no children."""
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element through the Element factory.

        tag -- element tag.
        attrib -- element attributes, given as a dictionary.
        Returns the new element instance.
        """
        return Element(tag, attrib)

    def __len__(self):
        """Return the number of subelements."""
        return len(self._children)

    def __getitem__(self, index):
        """Return the subelement at index.

        Raises IndexError if that subelement does not exist.
        """
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement at index with element.

        Raises IndexError if that subelement does not exist, and
        AssertionError if element is not a valid element object.
        """
        assert iselement(element)
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement at index.

        Raises IndexError if that subelement does not exist.
        """
        del self._children[index]

    def __getslice__(self, start, stop):
        """Return a list holding the subelements in start:stop."""
        return self._children[start:stop]

    def __setslice__(self, start, stop, elements):
        """Replace the subelements in start:stop with elements.

        elements -- a sequence with zero or more elements.
        Raises AssertionError if a member is not a valid element object.
        """
        for candidate in elements:
            assert iselement(candidate)
        self._children[start:stop] = list(elements)

    def __delslice__(self, start, stop):
        """Delete the subelements in start:stop."""
        del self._children[start:stop]

    def append(self, element):
        """Add element as the last subelement.

        Raises AssertionError if element is not a valid element object.
        """
        assert iselement(element)
        self._children.append(element)

    def insert(self, index, element):
        """Insert element as a subelement at position index.

        Raises AssertionError if element is not a valid element object.
        """
        assert iselement(element)
        self._children.insert(index, element)

    def remove(self, element):
        """Remove element from the list of subelements.

        Removal is delegated to list.remove() on the child list.
        Raises ValueError if no matching subelement is found, and
        AssertionError if element is not a valid element object.
        """
        assert iselement(element)
        self._children.remove(element)

    def getchildren(self):
        """Return all subelements, in document order.

        The list returned is the element's own child list, not a copy.
        """
        return self._children

    def find(self, path):
        """Find the first matching subelement, by tag name or path.

        Returns the element, or None if no element was found.
        """
        return ElementPath.find(self, path)

    def findtext(self, path, default=None):
        """Find the text of the first matching subelement.

        path -- tag name or path of the element to look for.
        default -- what to return if the element was not found.
        Returns the text content of the first matching element, or the
        default value if no element was found.  If the element is found
        but has no text content, an empty string is returned.
        """
        return ElementPath.findtext(self, path, default)

    def findall(self, path):
        """Find all matching subelements, by tag name or path.

        Returns a list or iterator with the matching elements, in
        document order.
        """
        return ElementPath.findall(self, path)

    def clear(self):
        """Reset the element.

        Removes all subelements, clears all attributes, and sets the
        text and tail attributes to None.
        """
        self.attrib.clear()
        self._children = []
        self.text = None
        self.tail = None

    def get(self, key, default=None):
        """Return attribute key, or default if it is not set."""
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set attribute key to value."""
        self.attrib[key] = value

    def keys(self):
        """Return the attribute names, in arbitrary order."""
        return self.attrib.keys()

    def items(self):
        """Return the attributes as (name, value) pairs, in arbitrary
        order."""
        return self.attrib.items()

    def getiterator(self, tag=None):
        """Collect this element and all subelements, in document order.

        tag -- only elements with this tag are returned; None or "*"
            returns every element.
        Returns a list of the matching elements.  If the tree structure
        is modified during iteration, the result is undefined.
        """
        # "*" is a wildcard; normalise it once so children get None too
        wanted = None if tag == "*" else tag
        matches = []
        if wanted is None or self.tag == wanted:
            matches.append(self)
        for child in self._children:
            matches.extend(child.getiterator(wanted))
        return matches
427
428
# compatibility
429
_Element = _ElementInterface
430
431
##
432
# Element factory.  This function returns an object implementing the
433
# standard Element interface.  The exact class or type of that object
434
# is implementation dependent, but it will always be compatible with
435
# the {@link #_ElementInterface} class in this module.
436
# <p>
437
# The element name, attribute names, and attribute values can be
438
# either 8-bit ASCII strings or Unicode strings.
439
#
440
# @param tag The element name.
441
# @param attrib An optional dictionary, containing element attributes.
442
# @param **extra Additional attributes, given as keyword arguments.
443
# @return An element instance.
444
# @defreturn Element
445
446
def Element(tag, attrib={}, **extra):
    """Element factory.

    tag -- the element name.
    attrib -- optional dictionary of element attributes; it is copied,
        so neither the caller's dictionary nor the shared default is
        modified.
    **extra -- additional attributes, given as keyword arguments.
    Returns a new _ElementInterface instance.
    """
    attributes = attrib.copy()
    attributes.update(extra)
    return _ElementInterface(tag, attributes)
450
451
##
452
# Subelement factory.  This function creates an element instance, and
453
# appends it to an existing element.
454
# <p>
455
# The element name, attribute names, and attribute values can be
456
# either 8-bit ASCII strings or Unicode strings.
457
#
458
# @param parent The parent element.
459
# @param tag The subelement name.
460
# @param attrib An optional dictionary, containing element attributes.
461
# @param **extra Additional attributes, given as keyword arguments.
462
# @return An element instance.
463
# @defreturn Element
464
465
def SubElement(parent, tag, attrib={}, **extra):
    """Subelement factory: create an element and append it to parent.

    parent -- the parent element; its makeelement() builds the child.
    tag -- the subelement name.
    attrib -- optional dictionary of element attributes (copied).
    **extra -- additional attributes, given as keyword arguments.
    Returns the new element, which has already been appended to parent.
    """
    attributes = attrib.copy()
    attributes.update(extra)
    child = parent.makeelement(tag, attributes)
    parent.append(child)
    return child
471
472
##
473
# Comment element factory.  This factory function creates a special
474
# element that will be serialized as an XML comment.
475
# <p>
476
# The comment string can be either an 8-bit ASCII string or a Unicode
477
# string.
478
#
479
# @param text A string containing the comment string.
480
# @return An element instance, representing a comment.
481
# @defreturn Element
482
483
def Comment(text=None):
    """Comment element factory.

    The returned element uses this function itself as its tag, which is
    how ElementTree._write recognises it and emits an XML comment.

    text -- the comment string.
    Returns an element instance representing a comment.
    """
    comment = Element(Comment)
    comment.text = text
    return comment
487
488
##
489
# PI element factory.  This factory function creates a special element
490
# that will be serialized as an XML processing instruction.
491
#
492
# @param target A string containing the PI target.
493
# @param text A string containing the PI contents, if any.
494
# @return An element instance, representing a PI.
495
# @defreturn Element
496
497
def ProcessingInstruction(target, text=None):
    """PI element factory.

    The returned element uses this function itself as its tag, which is
    how ElementTree._write recognises it and emits a processing
    instruction.

    target -- a string containing the PI target.
    text -- a string containing the PI contents, if any.
    Returns an element whose text is the target, followed by a space
    and the contents when contents are given.
    """
    instruction = Element(ProcessingInstruction)
    if text:
        instruction.text = target + " " + text
    else:
        instruction.text = target
    return instruction
503
504
PI = ProcessingInstruction
505
506
##
507
# QName wrapper.  This can be used to wrap a QName attribute value, in
508
# order to get proper namespace handling on output.
509
#
510
# @param text A string containing the QName value, in the form {uri}local,
511
#     or, if the tag argument is given, the URI part of a QName.
512
# @param tag Optional tag.  If given, the first argument is interpreted as
513
#     an URI, and this argument is interpreted as a local name.
514
# @return An opaque object, representing the QName.
515
516
class QName:
    """QName wrapper, used to get namespace handling on output.

    text_or_uri -- the QName value in the form {uri}local or, if tag is
        given, only the URI part of the QName.
    tag -- optional local name; when given (and non-empty) it is
        combined with the first argument as {uri}tag.
    """

    def __init__(self, text_or_uri, tag=None):
        self.text = text_or_uri
        if tag:
            self.text = "{%s}%s" % (text_or_uri, tag)

    def __str__(self):
        return self.text

    def __hash__(self):
        # hash like the plain text, matching the comparison below
        return hash(self.text)

    def __cmp__(self, other):
        # compare by text against another QName or a plain string
        if isinstance(other, QName):
            other = other.text
        return cmp(self.text, other)
529
530
##
531
# ElementTree wrapper class.  This class represents an entire element
532
# hierarchy, and adds some extra support for serialization to and from
533
# standard XML.
534
#
535
# @param element Optional root element.
536
# @keyparam file Optional file handle or name.  If given, the
537
#     tree is initialized with the contents of this XML file.
538
539
class ElementTree:
    """Wrapper for an entire element hierarchy, with support for
    serialization to and from standard XML.

    element -- optional root element.
    file -- optional file handle or name.  If given, the tree is
        initialized with the contents of this XML file.
    """

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return the root element of this tree (None if there is none)."""
        return self._root

    def _setroot(self, element):
        """Replace the root element of this tree.

        This discards the current contents of the tree and replaces it
        with the given element.  Use with care.
        Raises AssertionError if element is not a valid element object.
        """
        assert iselement(element)
        self._root = element

    def parse(self, source, parser=None):
        """Load an external XML document into this element tree.

        source -- a file name or a file object (anything with a "read"
            attribute).
        parser -- optional parser instance providing feed() and
            close(); if not given, XMLTreeBuilder is used.
        Returns the document root element.

        A file opened here from a file name is closed before returning,
        also when parsing fails.  A file object passed in by the caller
        is left open.
        """
        opened_here = getattr(source, "read", None) is None
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()

    def getiterator(self, tag=None):
        """Return the root element's getiterator(tag) result.

        tag -- what tags to look for (default is to return all
            elements).
        """
        assert self._root is not None
        return self._root.getiterator(tag)

    def find(self, path):
        """Find the first toplevel element with the given tag.

        Same as getroot().find(path), except that a path starting with
        "/" is first prefixed with ".".
        Returns the first matching element, or None if none was found.
        """
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    def findtext(self, path, default=None):
        """Find the text of the first toplevel element with the given
        tag.

        Same as getroot().findtext(path, default), except that a path
        starting with "/" is first prefixed with ".".
        Returns the text content of the first matching element, or the
        default value if no element was found.  If the element is found
        but has no text content, an empty string is returned.
        """
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    def findall(self, path):
        """Find all toplevel elements with the given tag.

        Same as getroot().findall(path), except that a path starting
        with "/" is first prefixed with ".".
        Returns a list or iterator with all matching elements, in
        document order.
        """
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    def write(self, file, encoding="us-ascii"):
        """Write the element tree to a file, as XML.

        file -- a file name, or a file object opened for writing
            (anything with a "write" attribute).
        encoding -- optional output encoding (default is US-ASCII).  An
            XML declaration is written only for encodings other than
            "utf-8" and "us-ascii".

        A file opened here from a file name is closed before returning,
        also when serialization fails.  A file object passed in by the
        caller is left open.
        """
        assert self._root is not None
        opened_here = getattr(file, "write", None) is None
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        """Serialize node, its subtree and its tail text to file.

        namespaces maps namespace URIs to the prefixes currently in
        scope; entries added for this element are removed again once
        the element has been written.
        Raises TypeError (via _raise_serialization_error) for tags,
        attribute names or attribute values that cannot be serialized.
        """
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            # sorted() gives lexical order without assuming that
            # items() returned a list
            items = sorted(node.items())
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # declarations for namespaces first used on this element
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # xmlns_items holds (attribute name, uri) pairs; drop the
            # uris declared on this element now that it is closed
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
714
715
# --------------------------------------------------------------------
716
# helpers
717
718
##
719
# Checks if an object appears to be a valid element object.
720
#
721
# @param element An element instance.
722
# @return A true value if this is an element object.
723
# @defreturn flag
724
725
def iselement(element):
    """Return a true value if element appears to be an element object.

    Instances of _ElementInterface always qualify; any other object
    qualifies when it has a "tag" attribute that is not None.
    """
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    if isinstance(element, _ElementInterface):
        return True
    return getattr(element, "tag", None) is not None
1185.1.29 by Robert Collins
merge merge tweaks from aaron, which includes latest .dev
729
730
##
731
# Writes an element tree or element structure to sys.stdout.  This
732
# function should be used for debugging only.
733
# <p>
734
# The exact output format is implementation dependent.  In this
735
# version, it's written as an ordinary XML file.
736
#
737
# @param elem An element tree or an individual element.
738
739
def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    For debugging only.  The output is written as ordinary XML, and a
    newline is added unless the root's tail already ends with one.

    elem -- an element tree or an individual element.
    """
    if isinstance(elem, ElementTree):
        tree = elem
    else:
        tree = ElementTree(elem)
    tree.write(sys.stdout)
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
747
748
def _encode(s, encoding):
    """Return s.encode(encoding), or s itself if that raises
    AttributeError.

    The fallback dates from Python 1.5.2: the string is then assumed to
    use the right encoding already.
    """
    try:
        encoded = s.encode(encoding)
    except AttributeError:
        encoded = s # 1.5.2: assume the string uses the right encoding
    return encoded
753
754
# Pattern matching runs of characters that _encode_entity rewrites as
# entities: the reserved characters & < > " and everything non-ascii.
if sys.version[:3] != "1.5":
    # NOTE(review): the pattern is built with eval(), presumably so the
    # u"" literal is never seen by the 1.5.2 compiler -- confirm.
    _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
else:
    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2

# reserved characters and the named entities written in their place
_escape_map = {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;"}

# "well-known" namespace prefixes, keyed by namespace URI
_namespace_map = {
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
773
774
def _raise_serialization_error(text):
    """Raise TypeError reporting that text cannot be serialized.

    The message contains the repr of text and the name of its type.
    """
    message = "cannot serialize %r (type %s)" % (text, type(text).__name__)
    raise TypeError(message)
778
779
def _encode_entity(text, pattern=_escape):
    """Map reserved and non-ascii characters to entities.

    Every run of characters matched by pattern is replaced: characters
    listed in _escape_map become their named entity, all others become
    numerical (&#NNN;) entities.  The result is encoded as ascii.
    Raises TypeError (via _raise_serialization_error) if the
    substitution or encoding raises TypeError.
    """
    def escape_entities(m, table=_escape_map):
        pieces = []
        for char in m.group():
            entity = table.get(char)
            if entity is None:
                entity = "&#%d;" % ord(char)
            pieces.append(entity)
        return "".join(pieces)
    try:
        return _encode(pattern.sub(escape_entities, text), "ascii")
    except TypeError:
        _raise_serialization_error(text)
794
795
#
796
# the following functions assume an ascii-compatible encoding
797
# (or "utf-16")
798
799
def _escape_cdata(text, encoding=None, replace=string.replace):
    """Escape character data for use as element content.

    When encoding is given the text is encoded first; text that cannot
    be represented in that encoding (UnicodeError) is returned in
    entity-encoded form from _encode_entity instead.  Otherwise &, <
    and > are replaced by their entities.
    Raises TypeError (via _raise_serialization_error) if text cannot be
    processed as a string.
    """
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" has to be handled first so the entities written by the
        # later replacements are not escaped a second time
        for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            text = replace(text, char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
813
814
def _escape_attrib(text, encoding=None, replace=string.replace):
    """Escape text for use as an attribute value.

    When encoding is given the text is encoded first; text that cannot
    be represented in that encoding (UnicodeError) is returned in
    entity-encoded form from _encode_entity instead.  Otherwise &, ',
    ", < and > are replaced by their entities.
    Raises TypeError (via _raise_serialization_error) if text cannot be
    processed as a string.
    """
    substitutions = (
        ("&", "&amp;"),
        ("'", "&apos;"), # FIXME: overkill
        ("\"", "&quot;"),
        ("<", "&lt;"),
        (">", "&gt;"),
    )
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" comes first so the entities written by the later
        # replacements are not escaped a second time
        for char, entity in substitutions:
            text = replace(text, char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
830
831
def fixtag(tag, namespaces):
    """Turn a decorated tag ({uri}tag) into a prefixed tag.

    tag -- a string of the form {uri}local, or a QName wrapping one.
    namespaces -- dictionary mapping namespace URIs to prefixes; a
        prefix chosen here for a new URI is stored in it.
    Returns (prefixed tag, xmlns), where xmlns is an (attribute name,
    uri) pair declaring a newly assigned prefix, or None if the URI was
    already mapped or the prefix is "xml".
    """
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, local = tag[1:].split("}", 1)
    xmlns = None
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # unknown in this scope: use a well-known prefix if there is
        # one, otherwise generate a numbered one
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix != "xml":
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, local), xmlns
850
851
##
852
# Parses an XML document into an element tree.
853
#
854
# @param source A filename or file object containing XML data.
855
# @param parser An optional parser instance.  If not given, the
856
#     standard {@link XMLTreeBuilder} parser is used.
857
# @return An ElementTree instance
858
859
def parse(source, parser=None):
    # Create an empty ElementTree wrapper, let it load the given
    # source (using the given parser, if any), and hand the wrapper
    # back to the caller.
    document = ElementTree()
    document.parse(source, parser)
    return document
863
864
##
865
# Parses an XML document into an element tree incrementally, and reports
866
# what's going on to the user.
867
#
868
# @param source A filename or file object containing XML data.
869
# @param events A list of events to report back.  If omitted, only "end"
870
#     events are reported.
871
# @return A (event, elem) iterator.
872
873
class iterparse:
    # Incremental parser.  Data is read from the source in blocks of
    # 16384 bytes and fed to an XMLTreeBuilder; the expat callbacks
    # selected by the events argument append (event, value) pairs to
    # an internal buffer, which next() hands out one item at a time.
    # Once the source is exhausted, the element returned by the
    # builder's close() method is made available as the root attribute.
    #
    # If the source was given as a file name, the file opened here is
    # closed again when the end of the data is reached, or when reading
    # or parsing fails.  A file object passed in by the caller is never
    # closed.

    def __init__(self, source, events=None):
        # create the builder before touching the file system, so that
        # a failure here cannot leave an open file behind
        self._parser = XMLTreeBuilder()
        self._close_file = 0 # true if we own (and must close) the file
        if getattr(source, "read", None) is None:
            source = open(source, "rb")
            self._close_file = 1
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # note: the handlers bind their context via default arguments,
        # which also works on interpreters without nested scopes
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # old expat; attributes arrive as a dictionary
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def _close_source(self):
        # close the source file, but only if it was opened by __init__
        if self._close_file:
            self._close_file = 0
            self._file.close()

    def next(self):
        # Return the next (event, value) pair, reading and parsing more
        # data whenever the buffer runs dry.  At the end of the data
        # this raises StopIteration (IndexError on interpreters that do
        # not have StopIteration).
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    self.root = self._root
                    try:
                        raise StopIteration
                    except NameError:
                        raise IndexError
                # load event buffer
                del self._events[:]
                self._index = 0
                finished = 1 # cleared only if more data may follow
                try:
                    data = self._file.read(16384)
                    if data:
                        self._parser.feed(data)
                        finished = 0
                    else:
                        self._root = self._parser.close()
                        self._parser = None
                finally:
                    # end of data, or an error: release the file
                    if finished:
                        self._close_source()
            else:
                self._index = self._index + 1
                return item

    try:
        iter
        def __iter__(self):
            return self
    except NameError:
        # no iterator protocol; emulate it via the sequence protocol
        def __getitem__(self, index):
            return self.next()
951
952
##
953
# Parses an XML document from a string constant.  This function can
954
# be used to embed "XML literals" in Python code.
955
#
956
# @param text A string containing XML data.
957
# @return An Element instance.
958
# @defreturn Element
959
960
def XML(text):
    # Feed the complete string to a fresh XMLTreeBuilder in one go,
    # and return whatever the builder's close method hands back.
    builder = XMLTreeBuilder()
    builder.feed(text)
    return builder.close()
964
965
##
966
# Parses an XML document from a string constant, and also returns
967
# a dictionary which maps from element id:s to elements.
968
#
969
# @param text A string containing XML data.
970
# @return A tuple containing an Element instance and a dictionary.
971
# @defreturn (Element, dictionary)
972
973
def XMLID(text):
    # Parse the string, then walk the resulting tree and collect every
    # element that has a non-empty "id" attribute.  If the same id is
    # used more than once, the element visited last wins.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            by_id[key] = node
    return root, by_id
983
984
##
985
# Parses an XML document from a string constant.  Same as {@link #XML}.
986
#
987
# @def fromstring(text)
988
# @param text A string containing XML data.
989
# @return An Element instance.
990
# @defreturn Element
991
992
# plain alias; fromstring is bound to the very same function object as XML
fromstring = XML
993
994
##
995
# Generates a string representation of an XML element, including all
996
# subelements.
997
#
998
# @param element An Element instance.
999
# @return An encoded string containing the XML data.
1000
# @defreturn string
1001
1002
def tostring(element, encoding=None):
    # Serialize via ElementTree.write, using a minimal stand-in for a
    # file object: its write attribute is simply the append method of
    # a list, so every chunk written ends up in that list.  The chunks
    # are joined into a single string at the end.
    class _Sink:
        pass
    chunks = []
    sink = _Sink()
    sink.write = chunks.append
    ElementTree(element).write(sink, encoding)
    return string.join(chunks, "")
1010
1011
##
1012
# Generic element structure builder.  This builder converts a sequence
1013
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1014
# #TreeBuilder.end} method calls to a well-formed element structure.
1015
# <p>
1016
# You can use this class to build an element structure using a custom XML
1017
# parser, or a parser for some other XML-like format.
1018
#
1019
# @param element_factory Optional element factory.  This factory
1020
#    is called to create new Element instances, as necessary.
1021
1022
class TreeBuilder:

    def __init__(self, element_factory=None):
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory
        self._elem = [] # stack of elements that are still open
        self._data = [] # text fragments collected since the last flush
        self._last = None # most recently opened or closed element
        self._tail = None # true if the last event was an end tag

    ##
    # Checks that the structure is complete, and returns the toplevel
    # document element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert not self._elem, "missing end tags"
        root = self._last
        assert root is not None, "missing toplevel element"
        return root

    def _flush(self):
        # attach the collected text to the last element; it becomes
        # the tail if that element has been closed, the text otherwise
        if not self._data:
            return
        target = self._last
        if target is not None:
            text = string.join(self._data, "")
            if self._tail:
                assert target.tail is None, "internal error (tail)"
                target.tail = text
            else:
                assert target.text is None, "internal error (text)"
                target.text = text
        self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        stack = self._elem
        if stack:
            # nest the new element inside the innermost open one
            stack[-1].append(elem)
        stack.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (closed.tag, tag)
        self._tail = 1
        return closed
1098
1099
##
1100
# Element structure builder for XML source data, based on the
1101
# <b>expat</b> parser.
1102
#
1103
# @keyparam target Target object.  If omitted, the builder uses an
1104
#     instance of the standard {@link #TreeBuilder} class.
1105
# @keyparam html Predefine HTML entities.  This flag is not supported
1106
#     by the current implementation.
1107
# @see #ElementTree
1108
# @see #TreeBuilder
1109
1110
class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        # note: the html flag is accepted, but not used in here
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" separates the namespace uri from the local name in the
        # names reported by expat; see _fixname
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        # (the parser's returns_unicode flag is deliberately not looked
        # at here; its value was never used, and reading it fails on
        # expat bindings that do not provide the attribute)
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # maps entity names to replacement text

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                # expat reports "uri}tag"; turn it into "{uri}tag"
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for parsers that pass attributes as a dictionary
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for parsers that pass attributes as a flat
        # [name, value, name, value, ...] list
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # catch-all handler; used to resolve entities via the entity
        # dictionary, and to pick up the doctype declaration
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = string.strip(text)
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # strip the surrounding quotes from the identifiers
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing; override this method to get hold of the values.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree