Xorn
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Pages
xml_writer.py
Go to the documentation of this file.
1 # Copyright (C) 2013-2016 Roland Lutz
2 #
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
12 #
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software Foundation,
15 # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 
17 ## \namespace xorn.xml_writer
18 ## Writing XML documents.
19 #
20 # See the documentation of the class XMLWriter for more information.
21 
# Writer states tracked in XMLWriter.state:
#   STATE_IDLE            nothing pending on the current output line
#   STATE_CHARACTER_DATA  character data or a CDATA section was written last
#   STATE_START_TAG       a start tag is still open and may take attributes
# range() (rather than the Python-2-only xrange()) keeps the module
# importable on both Python 2 and Python 3.
STATE_IDLE, STATE_CHARACTER_DATA, STATE_START_TAG = range(3)
23 
# Inclusive code point ranges of the characters accepted anywhere in a
# name.  Code points above U+FFFD (including the supplementary planes
# U+10000-U+EFFFF) are deliberately not listed and are therefore
# rejected.
_NAME_CHAR_RANGES = (
    (0x002d, 0x002e),   # '-' and '.'
    (0x0030, 0x003a),   # '0'-'9' and ':'
    (0x0041, 0x005a),   # 'A'-'Z'
    (0x005f, 0x005f),   # '_'
    (0x0061, 0x007a),   # 'a'-'z'
    (0x00b7, 0x00b7),   # middle dot
    (0x00c0, 0x00d6),
    (0x00d8, 0x00f6),
    (0x00f8, 0x02ff),
    (0x0300, 0x036f),   # combining diacritical marks
    (0x0370, 0x037d),
    (0x037f, 0x1fff),
    (0x200c, 0x200d),
    (0x203f, 0x2040),
    (0x2070, 0x218f),
    (0x2c00, 0x2fef),
    (0x3001, 0xd7ff),
    (0xf900, 0xfdcf),
    (0xfdf0, 0xfffd),
)

# Subset of the ranges above which must not appear as the first
# character of a name.
_NAME_START_EXCLUDED = (
    (0x002d, 0x002e),   # '-' and '.'
    (0x0030, 0x0039),   # '0'-'9'
    (0x00b7, 0x00b7),   # middle dot
    (0x0300, 0x036f),   # combining diacritical marks
    (0x203f, 0x2040),
)


# Return whether code point `cp` lies in one of the inclusive `ranges`.
def _in_ranges(cp, ranges):
    return any(low <= cp <= high for low, high in ranges)


## Checks whether a string is a valid XML name.
#
# \a name must be a unicode string; any other type raises \c TypeError.
#
# Returns \c True if \a name is non-empty, its first character is
# allowed to start a name, and all of its characters are name
# characters.  Returns \c False otherwise (including for the empty
# string).
#
# Characters are compared by code point.  The previous implementation
# compared one-character strings with integers in several places,
# which never matched: names starting with a combining mark were
# accepted and names containing U+00B7 were rejected.

def valid_name(name):
    # type(u'') is `unicode` on Python 2 and `str` on Python 3
    if not isinstance(name, type(u'')):
        raise TypeError("invalid argument type (must be unicode)")

    if not name:
        return False

    code_points = [ord(c) for c in name]

    if _in_ranges(code_points[0], _NAME_START_EXCLUDED):
        return False

    return all(_in_ranges(cp, _NAME_CHAR_RANGES) for cp in code_points)
64 
# Replacement text for each XML metacharacter, keyed by code point as
# required by the translate() method of unicode strings.
_XML_ESCAPES = {
    ord(u'&'): u'&amp;',
    ord(u'<'): u'&lt;',
    ord(u'>'): u'&gt;',
    ord(u'"'): u'&quot;',
}


## Escape XML metacharacters in a string.
#
# \a data must be a unicode string; any other type raises \c TypeError.
#
# Returns a copy of \a data in which the characters \c '&', \c '<',
# \c '>', and \c '"' are replaced with their entity representations.
# The single quote is left untouched.

def escape(data):
    # type(u'') is `unicode` on Python 2 and `str` on Python 3
    if not isinstance(data, type(u'')):
        raise TypeError("invalid argument type (must be unicode)")

    # Each input character is looked at exactly once, so the '&' of an
    # inserted entity is never escaped a second time.
    return data.translate(_XML_ESCAPES)
75 
# Text type used to coerce arguments: `unicode` on Python 2, `str` on
# Python 3.
_text_type = type(u'')


## Writing XML documents.
#
# This class is used for creating XML documents and writing them to
# some output function in a stream-like fashion.  It does not create
# an in-memory representation of the document.
#
# Example usage:
# \snippet xml_writer.py use XMLWriter
#
# This yields the following output:
# \code{.xml}
# <?xml version="1.0" encoding="UTF-8"?>
# <document attribute="value">
#  some text
# </document>
# \endcode
#
# \note The caller must only pass characters which are valid in XML,
#       i.e. none of 00-1f (except 09, 0a, and 0d), d800-dfff, fffe,
#       ffff, or 110000 and above.

class XMLWriter:
    ## Create an \c %XMLWriter instance.
    #
    # \a write must be a function accepting a single \c unicode object
    # as an argument which is called to write the actual data.
    #
    # You need to create a separate \c %XMLWriter instance for each
    # XML document.

    def __init__(self, write):
        self.write = write
        # Whether the XML declaration has been written yet.
        self.is_initialized = False
        # Whether a root element has been started (only one is allowed).
        self.has_root_element = False
        # Names of the currently open elements, outermost first.
        self.stack = []
        # Stack depth of the outermost open element that was started
        # with preserve_whitespace, or None while formatting
        # whitespace is being inserted.
        self.preserve_depth = None
        self.state = STATE_IDLE
        # Attribute names already written to the currently open start tag.
        self.current_attrs = set()

    ## Return whether the document written so far is complete.
    #
    # This is the case once a root element has been started and all
    # open elements have been ended again.

    def is_done(self):
        return self.has_root_element and not self.stack

    ## Callback for writing out data.
    #
    # When creating an instance, this is set to the \a write argument
    # to the \c %XMLWriter constructor.

    def write(self, data):
        raise NotImplementedError

    # Emit the XML declaration if it hasn't been written yet, and
    # finish a start tag which is still open so that content can
    # follow it.

    def _prepare_for_data(self):
        if not self.is_initialized:
            self.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            self.is_initialized = True

        if self.state != STATE_START_TAG:
            return

        self.write('>')
        if self.preserve_depth is None:
            self.write('\n')
        self.current_attrs.clear()
        self.state = STATE_IDLE

    # Common prelude of write_character_data and write_cdata_section:
    # refuse to write outside of the root element (raising ValueError
    # with \a error_message), close a pending start tag, indent if
    # this is the start of a formatted line, and enter the character
    # data state.

    def _begin_text(self, error_message):
        if not self.stack:
            raise ValueError(error_message)

        self._prepare_for_data()
        if self.state == STATE_IDLE and self.preserve_depth is None:
            self.write(' ' * len(self.stack))
        self.state = STATE_CHARACTER_DATA

    ## Write an opening tag for a new element.
    #
    # \a name must be a valid XML element name.  If \a
    # preserve_whitespace is \c True, no extra formatting whitespace
    # will be inserted inside this element.
    #
    # Raises \c ValueError if \a name is not a valid name or if a
    # second root element would be started.

    def start_element(self, name, preserve_whitespace = False):
        name = _text_type(name)
        if not valid_name(name):
            raise ValueError("invalid element name '%s'" % name)

        if not self.stack:
            if self.has_root_element:
                raise ValueError("only one root element allowed")
            self.has_root_element = True

        self._prepare_for_data()
        if self.preserve_depth is None:
            # character data doesn't end its own line
            if self.state == STATE_CHARACTER_DATA:
                self.write('\n')
            self.write(' ' * len(self.stack))
        self.write('<' + name)
        # Only the outermost whitespace-preserving element is
        # remembered; it is the one whose end re-enables formatting.
        if preserve_whitespace and self.preserve_depth is None:
            self.preserve_depth = len(self.stack)
        self.stack.append(name)
        self.state = STATE_START_TAG

    ## Write a closing tag for the innermost element.
    #
    # If nothing has been written to the element since its start tag,
    # the start tag is turned into an empty-element tag instead.
    #
    # Raises \c ValueError if there is no open element.

    def end_element(self):
        if self.state == STATE_START_TAG:
            self.write('/>')
        elif not self.stack:
            raise ValueError("can't end element at root level")
        else:
            if self.preserve_depth is None:
                if self.state == STATE_CHARACTER_DATA:
                    self.write('\n')
                self.write(' ' * (len(self.stack) - 1))
            self.write('</%s>' % self.stack[-1])

        self.state = STATE_IDLE

        del self.stack[-1]
        # Leaving the element which asked for whitespace preservation
        # switches formatting back on.
        if self.preserve_depth == len(self.stack):
            self.preserve_depth = None
        if self.preserve_depth is None:
            self.write('\n')
        # Cleared unconditionally so that an empty-element tag inside
        # whitespace-preserving content doesn't leak its attribute
        # names to the next element.
        self.current_attrs.clear()

    ## Write an attribute for the innermost element.
    #
    # \a name must be a valid XML attribute name.  In \a value, the
    # characters \c '&', \c '<', \c '>', and \c '"' are replaced with
    # their entity representations.
    #
    # All attributes must be written before any character data, \c
    # CDATA sections, or child elements are written.
    #
    # Raises \c ValueError if no start tag is currently open, if \a
    # name is not a valid name or has already been written for this
    # element, or if \a value contains a line feed character.

    def write_attribute(self, name, value):
        name = _text_type(name)
        value = _text_type(value)
        if self.state != STATE_START_TAG:
            raise ValueError("can't write attributes right now")
        if not valid_name(name):
            raise ValueError("invalid attribute name '%s'" % name)
        if '\n' in value:
            raise ValueError("line feed character in attribute value")
        # TODO: validate value

        if name in self.current_attrs:
            raise ValueError("duplicate attribute name '%s'" % name)
        self.current_attrs.add(name)

        self.write(' %s="%s"' % (name, escape(value)))

    ## Write character data.
    #
    # The characters \c '&', \c '<', \c '>', and \c '"' are replaced
    # with their entity representations.
    #
    # Raises \c ValueError if there is no open element.

    def write_character_data(self, data):
        data = _text_type(data)
        # TODO: validate data
        self._begin_text(
            "can't write character data outside of root element")

        self.write(escape(data))

    ## Write a \c CDATA section.
    #
    # Special characters in this section are not escaped except for
    # the character sequence <tt>']]>'</tt>.
    #
    # Raises \c ValueError if there is no open element.

    def write_cdata_section(self, data):
        data = _text_type(data)
        # TODO: validate data
        self._begin_text(
            "can't write CDATA section outside of root element")

        self.write('<![CDATA[')
        # ']]>' can't occur inside a CDATA section: end the section,
        # emit the sequence as escaped character data, and open a new
        # section for the remaining text.
        self.write(data.replace(']]>', ']]>]]&gt;<![CDATA['))
        self.write(']]>')
def escape
Escape XML metacharacters in a string.
Definition: xml_writer.py:67
def write_cdata_section
Write a CDATA section.
Definition: xml_writer.py:240
def write_character_data
Write character data.
Definition: xml_writer.py:221
def end_element
Write a closing tag for the innermost element.
Definition: xml_writer.py:169
def start_element
Write an opening tag for a new element.
Definition: xml_writer.py:146
def valid_name
Checks whether a string is a valid XML name.
Definition: xml_writer.py:26
def write_attribute
Write an attribute for the innermost element.
Definition: xml_writer.py:199
def __init__
Create an XMLWriter instance.
Definition: xml_writer.py:106
Writing XML documents.
Definition: xml_writer.py:97
def is_done
Return whether the document written so far is complete.
Definition: xml_writer.py:117