~bzr-pqm/bzr/bzr.dev

1185.1.29 by Robert Collins
merge merge tweaks from aaron, which includes latest .dev
1
#
2
# ElementTree
3
# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4
#
5
# light-weight XML support for Python 1.5.2 and later.
6
#
7
# history:
8
# 2001-10-20 fl   created (from various sources)
9
# 2001-11-01 fl   return root from parse method
10
# 2002-02-16 fl   sort attributes in lexical order
11
# 2002-04-06 fl   TreeBuilder refactoring, added PythonDoc markup
12
# 2002-05-01 fl   finished TreeBuilder refactoring
13
# 2002-07-14 fl   added basic namespace support to ElementTree.write
14
# 2002-07-25 fl   added QName attribute support
15
# 2002-10-20 fl   fixed encoding in write
16
# 2002-11-24 fl   changed default encoding to ascii; fixed attribute encoding
17
# 2002-11-27 fl   accept file objects or file names for parse/write
18
# 2002-12-04 fl   moved XMLTreeBuilder back to this module
19
# 2003-01-11 fl   fixed entity encoding glitch for us-ascii
20
# 2003-02-13 fl   added XML literal factory
21
# 2003-02-21 fl   added ProcessingInstruction/PI factory
22
# 2003-05-11 fl   added tostring/fromstring helpers
23
# 2003-05-26 fl   added ElementPath support
24
# 2003-07-05 fl   added makeelement factory method
25
# 2003-07-28 fl   added more well-known namespace prefixes
26
# 2003-08-15 fl   fixed typo in ElementTree.findtext (Thomas Dartsch)
27
# 2003-09-04 fl   fall back on emulator if ElementPath is not installed
28
# 2003-10-31 fl   markup updates
29
# 2003-11-15 fl   fixed nested namespace bug
30
# 2004-03-28 fl   added XMLID helper
31
# 2004-06-02 fl   added default support to findtext
32
# 2004-06-08 fl   fixed encoding of non-ascii element/attribute names
33
# 2004-08-23 fl   take advantage of post-2.1 expat features
34
# 2005-02-01 fl   added iterparse implementation
35
# 2005-03-02 fl   fixed iterparse support for pre-2.2 versions
36
#
37
# Copyright (c) 1999-2005 by Fredrik Lundh.  All rights reserved.
38
#
39
# fredrik@pythonware.com
40
# http://www.pythonware.com
41
#
42
# --------------------------------------------------------------------
43
# The ElementTree toolkit is
44
#
45
# Copyright (c) 1999-2005 by Fredrik Lundh
46
#
47
# By obtaining, using, and/or copying this software and/or its
48
# associated documentation, you agree that you have read, understood,
49
# and will comply with the following terms and conditions:
50
#
51
# Permission to use, copy, modify, and distribute this software and
52
# its associated documentation for any purpose and without fee is
53
# hereby granted, provided that the above copyright notice appears in
54
# all copies, and that both that copyright notice and this permission
55
# notice appear in supporting documentation, and that the name of
56
# Secret Labs AB or the author not be used in advertising or publicity
57
# pertaining to distribution of the software without specific, written
58
# prior permission.
59
#
60
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67
# OF THIS SOFTWARE.
68
# --------------------------------------------------------------------
69
70
__all__ = [
71
    # public symbols
72
    "Comment",
73
    "dump",
74
    "Element", "ElementTree",
75
    "fromstring",
76
    "iselement", "iterparse",
77
    "parse",
78
    "PI", "ProcessingInstruction",
79
    "QName",
80
    "SubElement",
81
    "tostring",
82
    "TreeBuilder",
83
    "VERSION", "XML",
84
    "XMLTreeBuilder",
85
    ]
86
87
##
88
# The <b>Element</b> type is a flexible container object, designed to
89
# store hierarchical data structures in memory. The type can be
90
# described as a cross between a list and a dictionary.
91
# <p>
92
# Each element has a number of properties associated with it:
93
# <ul>
94
# <li>a <i>tag</i>. This is a string identifying what kind of data
95
# this element represents (the element type, in other words).</li>
96
# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
97
# <li>a <i>text</i> string.</li>
98
# <li>an optional <i>tail</i> string.</li>
99
# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
100
# </ul>
101
#
102
# To create an element instance, use the {@link #Element} or {@link
103
# #SubElement} factory functions.
104
# <p>
105
# The {@link #ElementTree} class can be used to wrap an element
106
# structure, and convert it from and to XML.
107
##
108
109
import string, sys, re
110
111
class _SimpleElementPath:
    # Minimal stand-in that emulates the pre-1.2 find/findtext/findall
    # behaviour: only direct children are inspected, and they are
    # matched by comparing their tag with the requested one.

    def find(self, element, tag):
        # first direct child carrying the requested tag, else None
        for child in element:
            if child.tag != tag:
                continue
            return child
        return None

    def findtext(self, element, tag, default=None):
        # text of the first matching direct child; a match without
        # text gives "", no match at all gives the default
        for child in element:
            if child.tag != tag:
                continue
            return child.text or ""
        return default

    def findall(self, element, tag):
        # a ".//" prefix delegates to the element's own tree iterator;
        # anything else collects the matching direct children
        if tag[:3] == ".//":
            return element.getiterator(tag[3:])
        matches = []
        for child in element:
            if child.tag != tag:
                continue
            matches.append(child)
        return matches
131
132
try:
133
    import ElementPath
134
except ImportError:
135
    # FIXME: issue warning in this case?
136
    ElementPath = _SimpleElementPath()
137
138
# TODO: add support for custom namespace resolvers/default namespaces
139
# TODO: add improved support for incremental parsing
140
141
VERSION = "1.2.6" # version string of this module; listed in __all__ as a public symbol
142
143
##
# Internal element class.  This class defines the Element interface,
# and doubles as the reference implementation of that interface.
# <p>
# Do not instantiate this class directly.  Go through the factory
# functions instead, such as {@link #Element} and {@link #SubElement}.
#
# @see Element
# @see SubElement
# @see Comment
# @see ProcessingInstruction

class _ElementInterface:
    # serialized layout: <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #_ElementInterface.get},
    # {@link #_ElementInterface.set},
    # {@link #_ElementInterface.keys}, and
    # {@link #_ElementInterface.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before the first subelement.  Either a string,
    # or None if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  Either a string, or None if
    # there was no text.

    tail = None # text after end tag, if any

    def __init__(self, tag, attrib):
        # the attribute dictionary is stored as given (not copied)
        self._children = []
        self.attrib = attrib
        self.tag = tag

    def __repr__(self):
        details = (self.tag, id(self))
        return "<Element %s at %x>" % details

    ##
    # Creates a new element object through the {@link #Element}
    # factory.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        children = self._children
        return len(children)

    ##
    # Returns the given subelement.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        children = self._children
        return children[index]

    ##
    # Replaces the given subelement.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        assert iselement(element)
        children = self._children
        children[index] = element

    ##
    # Deletes the given subelement.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        children = self._children
        del children[index]

    ##
    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        children = self._children
        return children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        # validate every member first, then store a list copy
        for candidate in elements:
            assert iselement(candidate)
        replacement = list(elements)
        self._children[start:stop] = replacement

    ##
    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        children = self._children
        del children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        children = self._children
        children.append(element)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        children = self._children
        children.insert(index, element)

    ##
    # Removes a matching subelement.  The subelement is located with
    # the child list's own <b>remove</b> method, not by tag value as
    # the <b>find</b> methods do.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        children = self._children
        children.remove(element)

    ##
    # Returns all subelements.  The elements are returned in document
    # order.  The returned object is the element's internal child list
    # itself, not a copy.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        children = self._children
        return children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.
    # The attribute dictionary is emptied in place; the child list is
    # replaced by a new, empty list.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = None
        self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        attributes = self.attrib
        return attributes.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        attributes = self.attrib
        attributes[key] = value

    ##
    # Gets a list of attribute names.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        attributes = self.attrib
        return attributes.keys()

    ##
    # Gets element attributes, as a sequence.  The attributes are
    # returned in an arbitrary order.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        attributes = self.attrib
        return attributes.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, the result
    # is undefined.
    #
    # @param tag What tags to look for (default is to return all
    #     elements; "*" is treated the same way).
    # @return A list or iterator containing all the matching elements.
    # @defreturn list or iterator

    def getiterator(self, tag=None):
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            matches = [self]
        else:
            matches = []
        # each child contributes its own subtree, in order
        for child in self._children:
            matches.extend(child.getiterator(tag))
        return matches

# compatibility
_Element = _ElementInterface
428
429
##
# Element factory.  This function returns an object implementing the
# standard Element interface.  The exact class or type of that object
# is implementation dependent, but it will always be compatible with
# the {@link #_ElementInterface} class in this module.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param tag The element name.
# @param attrib An optional dictionary, containing element attributes.
#     The dictionary is copied, so the caller's object is left alone.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def Element(tag, attrib={}, **extra):
    # work on a private copy; keyword attributes are merged on top
    attributes = attrib.copy()
    attributes.update(extra)
    return _ElementInterface(tag, attributes)
448
449
##
# Subelement factory.  This function creates an element instance, and
# appends it to an existing element.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param parent The parent element.
# @param tag The subelement name.
# @param attrib An optional dictionary, containing element attributes.
#     The dictionary is copied, so the caller's object is left alone.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def SubElement(parent, tag, attrib={}, **extra):
    # work on a private copy; keyword attributes are merged on top
    attributes = attrib.copy()
    attributes.update(extra)
    # the parent builds the child, so it decides the concrete type
    child = parent.makeelement(tag, attributes)
    parent.append(child)
    return child
469
470
##
# Comment element factory.  This factory function creates a special
# element that will be serialized as an XML comment.  The element is
# marked by using this factory function itself as its tag.
# <p>
# The comment string can be either an 8-bit ASCII string or a Unicode
# string.
#
# @param text A string containing the comment string.
# @return An element instance, representing a comment.
# @defreturn Element

def Comment(text=None):
    node = Element(Comment)
    node.text = text
    return node
485
486
##
# PI element factory.  This factory function creates a special element
# that will be serialized as an XML processing instruction.  The
# element is marked by using this factory function itself as its tag.
#
# @param target A string containing the PI target.
# @param text A string containing the PI contents, if any.
# @return An element instance, representing a PI.
# @defreturn Element

def ProcessingInstruction(target, text=None):
    node = Element(ProcessingInstruction)
    # the element text is the target, followed by a space and the
    # contents when contents were given
    content = target
    if text:
        content = content + " " + text
    node.text = content
    return node

# short alias for the factory above
PI = ProcessingInstruction
503
504
##
# QName wrapper.  This can be used to wrap a QName attribute value, in
# order to get proper namespace handling on output.
# <p>
# A QName compares and hashes like its text, so it can be compared
# with another QName or with a plain string.
#
# @param text A string containing the QName value, in the form {uri}local,
#     or, if the tag argument is given, the URI part of a QName.
# @param tag Optional tag.  If given, the first argument is interpreted as
#     an URI, and this argument is interpreted as a local name.
# @return An opaque object, representing the QName.

class QName:

    def __init__(self, text_or_uri, tag=None):
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        # must stay consistent with the comparisons below
        return hash(self.text)

    def _unwrap(self, other):
        # value to compare self.text against: the text of another
        # QName, or the other object itself
        if isinstance(other, QName):
            return other.text
        return other

    def __cmp__(self, other):
        # only consulted by interpreters that still honour __cmp__
        return cmp(self.text, self._unwrap(other))

    # Rich comparisons.  Interpreters that ignore __cmp__ would
    # otherwise fall back to identity comparison, so a QName would
    # never equal the string (or QName) it hashes the same as.

    def __eq__(self, other):
        return self.text == self._unwrap(other)

    def __ne__(self, other):
        return self.text != self._unwrap(other)

    def __lt__(self, other):
        return self.text < self._unwrap(other)

    def __le__(self, other):
        return self.text <= self._unwrap(other)

    def __gt__(self, other):
        return self.text > self._unwrap(other)

    def __ge__(self, other):
        return self.text >= self._unwrap(other)
527
528
##
# ElementTree wrapper class.  This class represents an entire element
# hierarchy, and adds some extra support for serialization to and from
# standard XML.
#
# @param element Optional root element.
# @keyparam file Optional file handle or name.  If given, the
#     tree is initialized with the contents of this XML file.

class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    # <p>
    # If a file name is given, the file is opened here and closed again
    # before this method returns, also when parsing fails.  A file
    # object passed in by the caller is left open.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            # feed the parser in 32 KiB chunks until the source is empty
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only release handles this method created itself
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path), except that a path starting with
    # "/" is first prefixed with ".".
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path), except that a path
    # starting with "/" is first prefixed with ".".
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path), except that a path starting
    # with "/" is first prefixed with ".".
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    # <p>
    # If a file name is given, the file is opened here and closed again
    # before this method returns, also when serialization fails.  A
    # file object passed in by the caller is left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only emitted for other encodings
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            # only release handles this method created itself
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        # write XML to file; "namespaces" maps namespace URIs to the
        # prefixes that are declared in the current scope
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = node.items()
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # namespace declarations go after the ordinary attributes
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # declarations made on this element go out of scope with it
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
712
713
# --------------------------------------------------------------------
714
# helpers
715
716
##
# Checks if an object appears to be a valid element object.
#
# @param element An element instance.
# @return A true value if this is an element object.
# @defreturn flag

def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    native = isinstance(element, _ElementInterface)
    if native:
        return native
    # anything else qualifies as soon as it has a "tag" attribute
    return hasattr(element, "tag")
727
728
##
# Writes an element tree or element structure to sys.stdout.  This
# function should be used for debugging only.
# <p>
# The exact output format is implementation dependent.  In this
# version, it's written as an ordinary XML file.
#
# @param elem An element tree or an individual element.

def dump(elem):
    # debugging
    tree = elem
    if not isinstance(tree, ElementTree):
        # a bare element is wrapped so it can be written like a tree
        tree = ElementTree(tree)
    tree.write(sys.stdout)
    trailing = tree.getroot().tail
    if trailing and trailing[-1] == "\n":
        # the root's tail already ended the output with a newline
        return
    sys.stdout.write("\n")
745
746
def _encode(s, encoding):
    # Encode s with the given encoding.  An AttributeError (raised,
    # for example, when s has no encode method) means s is handed
    # back unchanged.
    try:
        encoded = s.encode(encoding)
    except AttributeError:
        encoded = s # 1.5.2: assume the string uses the right encoding
    return encoded
751
752
# Pattern matching runs of characters that _encode_entity has to
# replace: the reserved characters & < > " and anything non-ascii.
if sys.version[:3] != "1.5":
    # NOTE(review): the unicode literal is built through eval,
    # presumably so that this module still parses on 1.5.2 -- confirm
    _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
else:
    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2

# reserved characters and the named entities they are written as
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

# namespace URI -> prefix
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
771
772
def _raise_serialization_error(text):
    # Always raises TypeError; the message names the offending value
    # and the name of its type.
    message = "cannot serialize %r (type %s)" % (text, type(text).__name__)
    raise TypeError(message)
776
777
def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def replace_run(match, table=_escape_map):
        # rewrite one matched run, character by character: a named
        # entity where the table has one, a decimal character
        # reference otherwise
        pieces = []
        for char in match.group():
            entity = table.get(char)
            if entity is None:
                entity = "&#%d;" % ord(char)
            pieces.append(entity)
        return string.join(pieces, "")
    try:
        escaped = pattern.sub(replace_run, text)
        return _encode(escaped, "ascii")
    except TypeError:
        # text was not something the pattern could be applied to
        _raise_serialization_error(text)
792
793
#
794
# the following functions assume an ascii-compatible encoding
795
# (or "utf-16")
796
797
def _escape_cdata(text, encoding=None, replace=string.replace):
    # escape character data
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # not representable in the target encoding: fall back
                # to entity/character references
                return _encode_entity(text)
        # "&" has to go first, so the entities added afterwards are
        # not escaped a second time
        for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            text = replace(text, raw, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
811
812
def _escape_attrib(text, encoding=None, replace=string.replace):
    # escape attribute value
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # not representable in the target encoding: fall back
                # to entity/character references
                return _encode_entity(text)
        # "&" has to go first, so the entities added afterwards are
        # not escaped a second time
        substitutions = (
            ("&", "&amp;"),
            ("'", "&apos;"), # FIXME: overkill
            ("\"", "&quot;"),
            ("<", "&lt;"),
            (">", "&gt;"),
        )
        for raw, entity in substitutions:
            text = replace(text, raw, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
828
829
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    if isinstance(tag, QName):
        tag = tag.text
    # drop the leading "{" and split at the first "}"
    namespace_uri, tag = string.split(tag[1:], "}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is not None:
        # already declared in the current scope: nothing to add
        return "%s:%s" % (prefix, tag), None
    # not in scope yet: use the well-known prefix if there is one,
    # or else generate one, and record it
    prefix = _namespace_map.get(namespace_uri)
    if prefix is None:
        prefix = "ns%d" % len(namespaces)
    namespaces[namespace_uri] = prefix
    xmlns = None
    if prefix != "xml":
        # no declaration is generated for the "xml" prefix
        xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, tag), xmlns
848
849
##
# Parses an XML document into an element tree.
#
# @param source A filename or file object containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLTreeBuilder} parser is used.
# @return An ElementTree instance

def parse(source, parser=None):
    # start from an empty tree and let it load the document itself
    document = ElementTree()
    document.parse(source, parser)
    return document
861
862
##
# Parses an XML document into an element tree incrementally, and reports
# what's going on to the user.
# <p>
# If source is a filename, the file is opened by the iterator, and is
# closed again when the end of the document has been reached, or when
# reading or parsing fails.  File objects passed in by the caller are
# never closed.  When the iterator is exhausted, the toplevel element
# is available as the <b>root</b> attribute.
#
# @param source A filename or file object containing XML data.
# @param events A list of events to report back ("start", "end",
#     "start-ns", and "end-ns" are recognized; other names are
#     ignored).  If omitted, only "end" events are reported.
# @return A (event, elem) iterator.

class iterparse:

    def __init__(self, source, events=None):
        # remember if we opened the file ourselves; only then is it
        # ours to close
        self._close_file = 0
        if not hasattr(source, "read"):
            source = open(source, "rb")
            self._close_file = 1
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # note: the handlers bind their context via default arguments,
        # so they don't depend on nested scopes
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # no support for list-style attributes
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def _close_source(self):
        # close the input file, but only if it was opened by __init__
        # (and only once)
        if self._close_file:
            self._close_file = 0
            self._file.close()

    def next(self):
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # end of document; publish the root element
                    self.root = self._root
                    try:
                        raise StopIteration
                    except NameError:
                        # no StopIteration in this Python version
                        raise IndexError
                # load event buffer
                del self._events[:]
                self._index = 0
                keep_open = 0
                try:
                    data = self._file.read(16384)
                    if data:
                        self._parser.feed(data)
                        keep_open = 1 # there may be more data to read
                    else:
                        self._root = self._parser.close()
                        self._parser = None
                finally:
                    # release the file at end of data, and also if
                    # reading or parsing failed
                    if not keep_open:
                        self._close_source()
            else:
                self._index = self._index + 1
                return item

    try:
        iter
        def __iter__(self):
            return self
    except NameError:
        # no iterator protocol; emulate it via the sequence protocol
        def __getitem__(self, index):
            return self.next()
949
950
##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
#
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

def XML(text):
    # feed the whole string to a fresh parser, and return the result
    builder = XMLTreeBuilder()
    builder.feed(text)
    return builder.close()
962
963
##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element id:s to elements.  Elements
# that have no "id" attribute, or an empty one, are not included in
# the dictionary.
#
# @param text A string containing XML data.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)

def XMLID(text):
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    # index all elements that carry a non-empty "id" attribute
    elements_by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            elements_by_id[key] = node
    return root, elements_by_id
981
982
##
983
# Parses an XML document from a string constant.  Same as {@link #XML}.
984
#
985
# @def fromstring(text)
986
# @param text A string containing XML data.
987
# @return An Element instance.
988
# @defreturn Element
989
990
# fromstring is bound to the same function object as XML
fromstring = XML
991
992
##
# Generates a string representation of an XML element, including all
# subelements.
#
# @param element An Element instance.
# @param encoding Optional output encoding; passed on to the write
#     method of a temporary ElementTree wrapper.
# @return An encoded string containing the XML data.
# @defreturn string

def tostring(element, encoding=None):
    # minimal file-like object; collects everything that is written
    # to it in a list
    class collector:
        def __init__(self):
            self.chunks = []
        def write(self, chunk):
            self.chunks.append(chunk)
    sink = collector()
    ElementTree(element).write(sink, encoding)
    return string.join(sink.chunks, "")
1008
1009
##
# Generic element structure builder.  This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
# <p>
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory.  This factory
#    is called to create new Element instances, as necessary.

class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Returns the toplevel document element (the element that was
    # closed last).  Character data that is still waiting in the data
    # collector is not flushed by this method.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If there are elements that have not
    #     been closed, or if no element has been seen at all.

    def close(self):
        # these checks validate the caller's call sequence, so raise
        # explicitly instead of using assert (which is stripped by -O)
        if self._elem:
            raise AssertionError("missing end tags")
        if self._last is None:
            raise AssertionError("missing toplevel element")
        return self._last

    def _flush(self):
        # attach collected character data to the most recent element;
        # as its tail if we're after its end tag, as its text otherwise.
        # data seen before the first element is silently dropped.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        # just collect the fragment; it's joined and attached by _flush
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # add to the innermost open element
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag doesn't match the name of
    #     the innermost open element.
    # @exception IndexError If there is no open element.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        # validates caller input, so this must not be stripped by -O
        if self._last.tag != tag:
            raise AssertionError(
                "end tag mismatch (expected %s, got %s)" % (
                    self._last.tag, tag)
                )
        self._tail = 1
        return self._last
1096
1097
##
# Element structure builder for XML source data, based on the
# <b>expat</b> parser.
#
# @keyparam target Target object.  If omitted, the builder uses an
#     instance of the standard {@link #TreeBuilder} class.
# @keyparam html Predefine HTML entities.  This flag is not supported
#     by the current implementation.
# @see #ElementTree
# @see #TreeBuilder

class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        # note: the html argument is accepted, but never looked at
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # the second argument is passed to expat as the namespace
        # separator; see _fixname for how qualified names are decorated
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # replacement text for undefined entities

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                # name contains the namespace separator; add the
                # opening brace to get the {uri}tag form
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for dictionary-style attributes
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for list-style attributes, where names and
        # values alternate: [name, value, name, value, ...]
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities.  only the lookup is guarded,
            # so that a KeyError raised by the target itself isn't
            # reported as an undefined entity
            try:
                replacement = self.entity[text[1:-1]]
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
            self._target.data(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # tokens collected so far: name, keyword, literal(s)
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # strip the surrounding quotes from the literals
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The builder drops its
    # references to the parser and the target, and cannot be used
    # after this call.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree