~bzr-pqm/bzr/bzr.dev

72 by mbp at sourcefrog
- import a subset of elementtree for easier installation
1
#
2
# ElementTree
3
# $Id: ElementTree.py 1862 2004-06-18 07:31:02Z Fredrik $
4
#
5
# light-weight XML support for Python 1.5.2 and later.
6
#
7
# this is a stripped-down version of Secret Labs' effDOM library (part
8
# of xmlToolkit).  compared to effDOM, this implementation has:
9
#
10
# - no support for observers
11
# - no html-specific extensions (e.g. entity preload)
12
# - no custom entities, doctypes, etc
13
# - no accelerator module
14
#
15
# history:
16
# 2001-10-20 fl   created (from various sources)
17
# 2001-11-01 fl   return root from parse method
18
# 2002-02-16 fl   sort attributes in lexical order
19
# 2002-04-06 fl   TreeBuilder refactoring, added PythonDoc markup
20
# 2002-05-01 fl   finished TreeBuilder refactoring
21
# 2002-07-14 fl   added basic namespace support to ElementTree.write
22
# 2002-07-25 fl   added QName attribute support
23
# 2002-10-20 fl   fixed encoding in write
24
# 2002-11-24 fl   changed default encoding to ascii; fixed attribute encoding
25
# 2002-11-27 fl   accept file objects or file names for parse/write
26
# 2002-12-04 fl   moved XMLTreeBuilder back to this module
27
# 2003-01-11 fl   fixed entity encoding glitch for us-ascii
28
# 2003-02-13 fl   added XML literal factory
29
# 2003-02-21 fl   added ProcessingInstruction/PI factory
30
# 2003-05-11 fl   added tostring/fromstring helpers
31
# 2003-05-26 fl   added ElementPath support
32
# 2003-07-05 fl   added makeelement factory method
33
# 2003-07-28 fl   added more well-known namespace prefixes
34
# 2003-08-15 fl   fixed typo in ElementTree.findtext (Thomas Dartsch)
35
# 2003-09-04 fl   fall back on emulator if ElementPath is not installed
36
# 2003-10-31 fl   markup updates
37
# 2003-11-15 fl   fixed nested namespace bug
38
# 2004-03-28 fl   added XMLID helper
39
# 2004-06-02 fl   added default support to findtext
40
# 2004-06-08 fl   fixed encoding of non-ascii element/attribute names
41
#
42
# Copyright (c) 1999-2004 by Fredrik Lundh.  All rights reserved.
43
#
44
# fredrik@pythonware.com
45
# http://www.pythonware.com
46
#
47
# --------------------------------------------------------------------
48
# The ElementTree toolkit is
49
#
50
# Copyright (c) 1999-2004 by Fredrik Lundh
51
#
52
# By obtaining, using, and/or copying this software and/or its
53
# associated documentation, you agree that you have read, understood,
54
# and will comply with the following terms and conditions:
55
#
56
# Permission to use, copy, modify, and distribute this software and
57
# its associated documentation for any purpose and without fee is
58
# hereby granted, provided that the above copyright notice appears in
59
# all copies, and that both that copyright notice and this permission
60
# notice appear in supporting documentation, and that the name of
61
# Secret Labs AB or the author not be used in advertising or publicity
62
# pertaining to distribution of the software without specific, written
63
# prior permission.
64
#
65
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
66
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
67
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
68
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
69
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
70
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
71
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
72
# OF THIS SOFTWARE.
73
# --------------------------------------------------------------------
74
75
# Names exported by "from <module> import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML",
    "XMLTreeBuilder",
    ]
91
92
##
93
# The <b>Element</b> type is a flexible container object, designed to
94
# store hierarchical data structures in memory. The type can be
95
# described as a cross between a list and a dictionary.
96
# <p>
97
# Each element has a number of properties associated with it:
98
# <ul>
99
# <li>a <i>tag</i>. This is a string identifying what kind of data
100
# this element represents (the element type, in other words).</li>
101
# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
102
# <li>a <i>text</i> string.</li>
103
# <li>an optional <i>tail</i> string.</li>
104
# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
105
# </ul>
106
#
107
# To create an element instance, use the {@link #Element} or {@link
108
# #SubElement} factory functions.
109
# <p>
110
# The {@link #ElementTree} class can be used to wrap an element
111
# structure, and convert it from and to XML.
112
##
113
114
import string, sys, re
115
116
class _SimpleElementPath:
    # emulate pre-1.2 find/findtext/findall behaviour
    #
    # Only direct children are compared, by exact tag equality.  The one
    # exception is a ".//tag" path passed to findall, which is delegated
    # to the element's own getiterator method.

    def find(self, element, tag):
        # Return the first direct child whose tag equals `tag`, or None.
        for elem in element:
            if elem.tag == tag:
                return elem
        return None

    def findtext(self, element, tag, default=None):
        # Return the text of the first direct child whose tag equals
        # `tag` ("" if that child has no text), or `default` if no
        # child matches.
        for elem in element:
            if elem.tag == tag:
                return elem.text or ""
        return default

    def findall(self, element, tag):
        # Return all direct children whose tag equals `tag`, as a list.
        if tag[:3] == ".//":
            # result is whatever element.getiterator returns for the
            # tag that follows the ".//" prefix
            return element.getiterator(tag[3:])
        result = []
        for elem in element:
            if elem.tag == tag:
                result.append(elem)
        return result
136
137
# Use the real ElementPath module when it can be imported; otherwise
# bind the name to an instance of the simple emulator class, which
# offers the same find/findtext/findall entry points.
try:
    import ElementPath
except ImportError:
    # FIXME: issue warning in this case?
    ElementPath = _SimpleElementPath()
142
143
# TODO: add support for custom namespace resolvers/default namespaces
144
# TODO: add improved support for incremental parsing
145
146
# Version string of this module; exported through __all__.
VERSION = "1.2"
147
148
##
149
# Internal element class.  This class defines the Element interface,
150
# and provides a reference implementation of this interface.
151
# <p>
152
# You should not create instances of this class directly.  Use the
153
# appropriate factory functions instead, such as {@link #Element}
154
# and {@link #SubElement}.
155
#
156
# @see Element
157
# @see SubElement
158
# @see Comment
159
# @see ProcessingInstruction
160
161
class _ElementInterface:
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #_ElementInterface.get},
    # {@link #_ElementInterface.set},
    # {@link #_ElementInterface.keys}, and
    # {@link #_ElementInterface.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.

    tail = None # text after end tag, if any

    def __init__(self, tag, attrib):
        # The attrib dictionary is stored as given (not copied); the
        # child list always starts out empty.
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    ##
    # Returns the given subelement.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        assert iselement(element)
        self._children[index] = element

    ##
    # Deletes the given subelement.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        for element in elements:
            assert iselement(element)
        self._children[start:stop] = list(elements)

    ##
    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement.  Unlike the <b>find</b> methods,
    # this method compares elements based on identity, not on tag
    # value or contents.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # Returns all subelements.  The elements are returned in document
    # order.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        # NOTE: this is the internal child list itself, not a copy.
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method returns
    #     an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.

    def clear(self):
        # The attribute dictionary is emptied in place; the child list
        # is replaced by a new empty list.
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets a list of attribute names.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as a sequence.  The attributes are
    # returned in an arbitrary order.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, the result
    # is undefined.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return A list or iterator containing all the matching elements.
    # @defreturn list or iterator

    def getiterator(self, tag=None):
        nodes = []
        if tag == "*":
            # "*" is treated exactly like "no tag filter"
            tag = None
        if tag is None or self.tag == tag:
            nodes.append(self)
        # depth-first: each child contributes its own matching subtree
        for node in self._children:
            nodes.extend(node.getiterator(tag))
        return nodes
430
431
# compatibility
432
# _Element is a second name bound to the same class object.
_Element = _ElementInterface
433
434
##
435
# Element factory.  This function returns an object implementing the
436
# standard Element interface.  The exact class or type of that object
437
# is implementation dependent, but it will always be compatible with
438
# the {@link #_ElementInterface} class in this module.
439
# <p>
440
# The element name, attribute names, and attribute values can be
441
# either 8-bit ASCII strings or Unicode strings.
442
#
443
# @param tag The element name.
444
# @param attrib An optional dictionary, containing element attributes.
445
# @param **extra Additional attributes, given as keyword arguments.
446
# @return An element instance.
447
# @defreturn Element
448
449
def Element(tag, attrib={}, **extra):
    # The shared {} default is never mutated: the dictionary is copied
    # before the keyword attributes are merged in, so the caller's
    # dictionary is left untouched as well.
    attrib = attrib.copy()
    attrib.update(extra)
    return _ElementInterface(tag, attrib)
453
454
##
455
# Subelement factory.  This function creates an element instance, and
456
# appends it to an existing element.
457
# <p>
458
# The element name, attribute names, and attribute values can be
459
# either 8-bit ASCII strings or Unicode strings.
460
#
461
# @param parent The parent element.
462
# @param tag The subelement name.
463
# @param attrib An optional dictionary, containing element attributes.
464
# @param **extra Additional attributes, given as keyword arguments.
465
# @return An element instance.
466
# @defreturn Element
467
468
def SubElement(parent, tag, attrib={}, **extra):
    # Work on a copy so neither the shared {} default nor the caller's
    # dictionary is modified when the keyword attributes are merged in.
    attrib = attrib.copy()
    attrib.update(extra)
    # The parent creates the child via its makeelement method, then
    # the child is appended to the parent.
    element = parent.makeelement(tag, attrib)
    parent.append(element)
    return element
474
475
##
476
# Comment element factory.  This factory function creates a special
477
# element that will be serialized as an XML comment.
478
# <p>
479
# The comment string can be either an 8-bit ASCII string or a Unicode
480
# string.
481
#
482
# @param text A string containing the comment string.
483
# @return An element instance, representing a comment.
484
# @defreturn Element
485
486
def Comment(text=None):
    # The factory function object itself is used as the element tag;
    # the serializer recognizes comments with "tag is Comment".
    element = Element(Comment)
    element.text = text
    return element
490
491
##
492
# PI element factory.  This factory function creates a special element
493
# that will be serialized as an XML processing instruction.
494
#
495
# @param target A string containing the PI target.
496
# @param text A string containing the PI contents, if any.
497
# @return An element instance, representing a PI.
498
# @defreturn Element
499
500
def ProcessingInstruction(target, text=None):
    # The factory function object itself is used as the element tag;
    # the serializer recognizes PIs with "tag is ProcessingInstruction".
    element = Element(ProcessingInstruction)
    # The element text holds the target, followed by a single space
    # and the PI contents when contents are given.
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element
506
507
# PI is a short alias for the ProcessingInstruction factory.
PI = ProcessingInstruction
508
509
##
510
# QName wrapper.  This can be used to wrap a QName attribute value, in
511
# order to get proper namespace handling on output.
512
#
513
# @param text A string containing the QName value, in the form {uri}local,
514
#     or, if the tag argument is given, the URI part of a QName.
515
# @param tag Optional tag.  If given, the first argument is interpreted as
516
#     an URI, and this argument is interpreted as a local name.
517
# @return An opaque object, representing the QName.
518
519
class QName:
    # Wraps a qualified name.  The value is stored in self.text, in the
    # form {uri}local when both an URI and a tag are given.

    def __init__(self, text_or_uri, tag=None):
        if tag:
            # two-argument form: build "{uri}local"
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        # hash like the wrapped text, so a QName and its plain text
        # land in the same dictionary slot
        return hash(self.text)

    def __eq__(self, other):
        # Equality mirrors __cmp__ (compare by text), so that it stays
        # consistent with __hash__ even where __cmp__ is not consulted.
        if isinstance(other, QName):
            return self.text == other.text
        return self.text == other

    def __ne__(self, other):
        return not self.__eq__(other)

    def __cmp__(self, other):
        # ordering: compare by text, against another QName or a value
        if isinstance(other, QName):
            return cmp(self.text, other.text)
        return cmp(self.text, other)
532
533
##
534
# ElementTree wrapper class.  This class represents an entire element
535
# hierarchy, and adds some extra support for serialization to and from
536
# standard XML.
537
#
538
# @param element Optional root element.
539
# @keyparam file Optional file handle or name.  If given, the
540
#     tree is initialized with the contents of this XML file.
541
542
class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            # replaces the root with the parsed document's root
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        # Anything without a "read" attribute is treated as a file name.
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            # feed the parser in 32 KB chunks until the source is empty
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # Only close files opened by this method; file objects
            # handed in by the caller stay open.
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            # a leading "/" is rewritten to "./" before delegating
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method returns
    #     an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        # Anything without a "write" attribute is treated as a file name.
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only emitted for encodings
                # other than utf-8 and us-ascii
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            # Only close files opened by this method, so the output is
            # flushed; file objects handed in by the caller stay open.
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        #
        # `namespaces` maps namespace URIs to the prefixes currently in
        # scope; entries added for this node are removed again before
        # returning.
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = node.items()
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        # attribute values are only namespace-mapped
                        # when wrapped in a QName
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # namespace declarations follow the regular attributes
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or node:
                # element has text and/or children: full open/close form
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # v is the namespace URI, which is the key used in the
            # namespaces dictionary
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
717
718
# --------------------------------------------------------------------
719
# helpers
720
721
##
# Checks if an object appears to be a valid element object.
#
# @param element An element instance.
# @return A true value if this is an element object.
# @defreturn flag
727
728
def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    #
    # Accepts instances of the element class, and also any other
    # object that merely has a "tag" attribute.
    return isinstance(element, _ElementInterface) or hasattr(element, "tag")
732
733
##
734
# Writes an element tree or element structure to sys.stdout.  This
735
# function should be used for debugging only.
736
# <p>
737
# The exact output format is implementation dependent.  In this
738
# version, it's written as an ordinary XML file.
739
#
740
# @param elem An element tree or an individual element.
741
742
def dump(elem):
    # debugging
    if not isinstance(elem, ElementTree):
        # wrap a bare element so it can be written like a tree
        elem = ElementTree(elem)
    elem.write(sys.stdout)
    # make sure the output ends with a newline
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")
750
751
def _encode(s, encoding):
    # Encode s using the given encoding.  Objects without an encode
    # method are returned unchanged.
    try:
        return s.encode(encoding)
    except AttributeError:
        return s # 1.5.2: assume the string uses the right encoding
756
757
# Pattern matching runs of characters that _encode_entity replaces:
# the four reserved characters & < > " and everything outside ASCII.
if sys.version[:3] == "1.5":
    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
else:
    # NOTE: eval is applied to a constant literal written right here,
    # never to external data.  Presumably this keeps the u"" literal
    # away from the 1.5.2 compiler -- TODO confirm.
    _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
761
762
# Reserved characters and the named entities that replace them.
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}
768
769
# Maps namespace URIs to the prefixes used for them on output.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
776
777
def _raise_serialization_error(text):
    # Always raises TypeError; the message names the offending value
    # and its type.
    raise TypeError(
        "cannot serialize %r (type %s)" % (text, type(text).__name__)
        )
781
782
def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(m, map=_escape_map):
        # Replace every character of the matched run: named entity if
        # the map has one, numeric character reference otherwise.
        # (map is bound as a default argument so the helper does not
        # depend on nested scopes.)
        out = []
        append = out.append
        for char in m.group():
            entity = map.get(char)
            if entity is None:
                entity = "&#%d;" % ord(char)
            append(entity)
        return string.join(out, "")
    try:
        # after substitution only ascii characters should remain
        return _encode(pattern.sub(escape_entities, text), "ascii")
    except TypeError:
        _raise_serialization_error(text)
797
798
#
799
# the following functions assume an ascii-compatible encoding
800
# (or "utf-16")
801
802
def _escape_cdata(text, encoding=None, replace=string.replace):
    # escape character data
    #
    # Raises TypeError (via _raise_serialization_error) when text
    # cannot be processed as a string.
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # text does not fit the target encoding: fall back to
                # entity/character references for the whole string
                return _encode_entity(text)
        # "&" must be replaced first, so the ampersands introduced by
        # the later replacements are not escaped again
        text = replace(text, "&", "&amp;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
816
817
def _escape_attrib(text, encoding=None, replace=string.replace):
    # escape attribute value
    #
    # Raises TypeError (via _raise_serialization_error) when text
    # cannot be processed as a string.
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # text does not fit the target encoding: fall back to
                # entity/character references for the whole string
                return _encode_entity(text)
        # "&" must be replaced first, so the ampersands introduced by
        # the later replacements are not escaped again
        text = replace(text, "&", "&amp;")
        text = replace(text, "'", "&apos;") # FIXME: overkill
        text = replace(text, "\"", "&quot;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
833
834
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    #
    # `namespaces` maps URIs to prefixes and is updated in place when
    # a new prefix is assigned.  The declaration is returned as an
    # ("xmlns:prefix", uri) tuple, or None when the URI already has a
    # prefix in `namespaces` or when the prefix is "xml".
    if isinstance(tag, QName):
        tag = tag.text
    # drop the leading "{" and split at the first "}"; unpacking
    # fails with ValueError if there is no "}" in the tag
    namespace_uri, tag = string.split(tag[1:], "}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            # not a well-known namespace: generate a prefix
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix == "xml":
            xmlns = None
        else:
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    else:
        xmlns = None
    return "%s:%s" % (prefix, tag), xmlns
853
854
##
855
# Parses an XML document into an element tree.
856
#
857
# @param source A filename or file object containing XML data.
858
# @param parser An optional parser instance.  If not given, the
859
#     standard {@link XMLTreeBuilder} parser is used.
860
# @return An ElementTree instance
861
862
def parse(source, parser=None):
    # Build an empty tree, load the document into it, and return the
    # tree object (not the root element).
    tree = ElementTree()
    tree.parse(source, parser)
    return tree
866
867
##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
#
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element
874
875
def XML(text):
    # feed the whole literal to a fresh builder in one go; close()
    # hands back whatever the builder's target produced
    builder = XMLTreeBuilder()
    builder.feed(text)
    return builder.close()
879
880
##
881
# Parses an XML document from a string constant, and also returns
882
# a dictionary which maps from element id:s to elements.
883
#
884
# @param text A string containing XML data.
885
# @return A tuple containing an Element instance and a dictionary.
886
# @defreturn (Element, dictionary)
887
888
def XMLID(text):
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    # index every element that carries a non-empty "id" attribute; if
    # the same id occurs more than once, the element visited last wins
    id_map = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            id_map[key] = node
    return root, id_map
898
899
##
900
# Parses an XML document from a string constant.  Same as {@link #XML}.
901
#
902
# @def fromstring(text)
903
# @param text A string containing XML data.
904
# @return An Element instance.
905
# @defreturn Element
906
907
# plain alias, not a wrapper: both names are bound to the same function
fromstring = XML
908
909
##
910
# Generates a string representation of an XML element, including all
911
# subelements.
912
#
913
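# @param element An Element instance.
# @param encoding Optional output encoding; passed on unchanged to
#     the write method of {@link ElementTree}.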
914
# @return An encoded string containing the XML data.
915
# @defreturn string
916
917
def tostring(element, encoding=None):
    # Serialize by handing ElementTree.write a minimal file-like object
    # whose write() simply collects every chunk in a list.
    class dummy:
        pass
    data = []
    sink = dummy()
    sink.write = data.append
    ElementTree(element).write(sink, encoding)
    # string method instead of the deprecated string.join() function
    return "".join(data)
925
926
##
927
# Generic element structure builder.  This builder converts a sequence
928
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
929
# #TreeBuilder.end} method calls to a well-formed element structure.
930
# <p>
931
# You can use this class to build an element structure using a custom XML
932
# parser, or a parser for some other XML-like format.
933
#
934
# @param element_factory Optional element factory.  This factory
935
#    is called to create new Element instances, as necessary.
936
937
class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        # called as factory(tag, attrs) for every start() call
        self._factory = element_factory

    ##
    # Checks that every opened element has been closed, and returns
    # the toplevel document element.  Character data that is still
    # buffered at this point is not attached to any element.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If end tags are missing, or if no
    #     element was ever opened.

    def close(self):
        assert not self._elem, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # attach the buffered character data to the most recent
        # element: as its tail if that element has already been closed,
        # as its text otherwise.  data collected before the first
        # element is dropped.
        if self._data:
            if self._last is not None:
                # string method instead of the deprecated string.join()
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.  The text is only buffered
    # here; it is attached to an element by the next call to
    # {@link #TreeBuilder.start} or {@link #TreeBuilder.end}.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.  The element is appended to the element
    # that is currently open, if there is one.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0 # following data is the new element's text
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag does not match the element
    #     that is currently open.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1 # following data is the closed element's tail
        return self._last
1013
1014
##
1015
# Element structure builder for XML source data, based on the
1016
# <b>expat</b> parser.
1017
#
1018
# @keyparam target Target object.  If omitted, the builder uses an
1019
#     instance of the standard {@link #TreeBuilder} class.
1020
# @keyparam html Predefine HTML entities.  This flag is not supported
1021
#     by the current implementation.
1022
# @see #ElementTree
1023
# @see #TreeBuilder
1024
1025
class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        # html is accepted for interface compatibility only; it is not
        # used by this implementation
        from xml.parsers import expat
        # namespace-aware parser; "}" separates uri and local name in
        # the names expat reports (see _fixname)
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        parser.DefaultHandler = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # entity name -> replacement text, see _default

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return str(text) # what if the default encoding is changed?
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                # "uri}local" as reported by expat -> "{uri}local"
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # expat start-element callback: normalize the names and values,
        # then forward to the target
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _data(self, text):
        # expat character-data callback
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        # expat end-element callback
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # expat default callback: handles entity references that expat
        # passes through, and picks the doctype declaration apart
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            # string method instead of the deprecated string.strip()
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # tokens so far: name, PUBLIC|SYSTEM, then the quoted
                # identifier(s); wait until the expected number is in
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if pubid:
                    pubid = pubid[1:-1] # strip the quotes
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing; override it in a subclass to get at the values.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The builder cannot be used
    # after this call, whether it succeeds or not.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
            return self._target.close()
        finally:
            # get rid of circular references, also if parsing failed
            del self._target, self._parser