00001
00002 '''XML Canonicalization
00003
00004 Patches Applied to xml.dom.ext.c14n:
00005 http://sourceforge.net/projects/pyxml/
00006
00007 [ 1444526 ] c14n.py: http://www.w3.org/TR/xml-exc-c14n/ fix
00008 -- includes [ 829905 ] c14n.py fix for bug #825115,
00009 Date Submitted: 2003-10-24 23:43
00010 -- include dependent namespace declarations declared in ancestor nodes
00011 (checking attributes and tags),
00012 -- handle InclusiveNamespaces PrefixList parameter
00013
00014 This module generates canonical XML of a document or element.
00015 http://www.w3.org/TR/2001/REC-xml-c14n-20010315
00016 and includes a prototype of exclusive canonicalization
00017 http://www.w3.org/Signature/Drafts/xml-exc-c14n
00018
00019 Requires PyXML 0.7.0 or later.
00020
00021 Known issues if using Ft.Lib.pDomlette:
00022 1. Unicode
00023 2. does not white space normalize attributes of type NMTOKEN and ID?
00024 3. seems to include "\n" after importing external entities?
00025
00026 Note, this version processes a DOM tree, and consequently it processes
00027 namespace nodes as attributes, not from a node's namespace axis. This
00028 permits simple document and element canonicalization without
00029 XPath. When XPath is used, the XPath result node list is passed and used to
00030 determine if the node is in the XPath result list, but little else.
00031
00032 Authors:
00033 "Joseph M. Reagle Jr." <reagle@w3.org>
00034 "Rich Salz" <rsalz@zolera.com>
00035
00036 $Date: 2006-03-30 23:47:16 +0000 (Thu, 30 Mar 2006) $ by $Author: boverhof $
00037 '''
00038
00039 _copyright = '''Copyright 2001, Zolera Systems Inc. All Rights Reserved.
00040 Copyright 2001, MIT. All Rights Reserved.
00041
00042 Distributed under the terms of:
00043 Python 2.0 License or later.
00044 http://www.python.org/2.0.1/license.html
00045 or
00046 W3C Software License
00047 http://www.w3.org/Consortium/Legal/copyright-software-19980720
00048 '''
00049
import string
from functools import cmp_to_key
from xml.dom import Node
try:
    from xml.ns import XMLNS
except ImportError:
    # xml.ns could not be imported: provide the two namespace URIs this
    # module reads (XMLNS.BASE and XMLNS.XML) locally.  Only ImportError is
    # caught so that any other failure is not silently hidden.
    class XMLNS:
        BASE = "http://www.w3.org/2000/xmlns/"
        XML = "http://www.w3.org/XML/1998/namespace"
00058 try:
00059 import cStringIO
00060 StringIO = cStringIO
00061 except ImportError:
00062 import StringIO
00063
00064 _attrs = lambda E: (E.attributes and E.attributes.values()) or []
00065 _children = lambda E: E.childNodes or []
00066 _IN_XML_NS = lambda n: n.name.startswith("xmlns")
00067 _inclusive = lambda n: n.unsuppressedPrefixes == None
00068
00069
00070
00071
00072 _LesserElement, _Element, _GreaterElement = range(3)
00073
00074 def _sorter(n1,n2):
00075 '''_sorter(n1,n2) -> int
00076 Sorting predicate for non-NS attributes.'''
00077
00078 i = cmp(n1.namespaceURI, n2.namespaceURI)
00079 if i: return i
00080 return cmp(n1.localName, n2.localName)
00081
00082
00083 def _sorter_ns(n1,n2):
00084 '''_sorter_ns((n,v),(n,v)) -> int
00085 "(an empty namespace URI is lexicographically least)."'''
00086
00087 if n1[0] == 'xmlns': return -1
00088 if n2[0] == 'xmlns': return 1
00089 return cmp(n1[0], n2[0])
00090
00091 def _utilized(n, node, other_attrs, unsuppressedPrefixes):
00092 '''_utilized(n, node, other_attrs, unsuppressedPrefixes) -> boolean
00093 Return true if that nodespace is utilized within the node'''
00094 if n.startswith('xmlns:'):
00095 n = n[6:]
00096 elif n.startswith('xmlns'):
00097 n = n[5:]
00098 if (n=="" and node.prefix in ["#default", None]) or \
00099 n == node.prefix or n in unsuppressedPrefixes:
00100 return 1
00101 for attr in other_attrs:
00102 if n == attr.prefix: return 1
00103
00104 if unsuppressedPrefixes is not None:
00105 for attr in _attrs(node):
00106 if n == attr.prefix: return 1
00107
00108 return 0
00109
00110
def _inclusiveNamespacePrefixes(node, context, unsuppressedPrefixes):
    '''http://www.w3.org/TR/xml-exc-c14n/
    InclusiveNamespaces PrefixList parameter, which lists namespace prefixes that
    are handled in the manner described by the Canonical XML Recommendation

    Splits the declarations in context into a list of those that are named
    by unsuppressedPrefixes or used by node (its own prefix or the prefix
    of one of its attributes), and a dict mapping the names of the
    remaining "xmlns:..." declarations to their values.  Returns the pair
    (list, dict).'''
    # Declaration names the element itself relies on.
    element_decl = 'xmlns'
    if node.prefix:
        element_decl = 'xmlns:%s' %node.prefix
    needed = [element_decl]

    for attribute in _attrs(node):
        if not attribute.nodeName.startswith('xmlns') and attribute.prefix:
            needed.append('xmlns:%s' %attribute.prefix)

    kept, leftover = [], {}
    for declaration in context:
        name = declaration.nodeName
        # Listed either under its full name or under its bare prefix.
        listed = (
            name in unsuppressedPrefixes
            or (name.startswith('xmlns:') and name[6:] in unsuppressedPrefixes)
            or (name.startswith('xmlns') and name[5:] in unsuppressedPrefixes)
        )
        if listed or name in needed:
            kept.append(declaration)
        elif name.startswith('xmlns:'):
            leftover[name] = declaration.value

    return kept, leftover
00140
00141
00142 _in_subset = lambda subset, node: subset is None or node in subset
00143
00144
00145 class _implementation:
00146 '''Implementation class for C14N. This accompanies a node during it's
00147 processing and includes the parameters and processing state.'''
00148
00149
00150 handlers = {}
00151
00152 def __init__(self, node, write, **kw):
00153 '''Create and run the implementation.'''
00154 self.write = write
00155 self.subset = kw.get('subset')
00156 self.comments = kw.get('comments', 0)
00157 self.unsuppressedPrefixes = kw.get('unsuppressedPrefixes')
00158 nsdict = kw.get('nsdict', { 'xml': XMLNS.XML, 'xmlns': XMLNS.BASE })
00159
00160
00161 self.state = (nsdict, {'xml':''}, {}, {})
00162
00163 if node.nodeType == Node.DOCUMENT_NODE:
00164 self._do_document(node)
00165 elif node.nodeType == Node.ELEMENT_NODE:
00166 self.documentOrder = _Element
00167 if not _inclusive(self):
00168 inherited,unused = _inclusiveNamespacePrefixes(node, self._inherit_context(node),
00169 self.unsuppressedPrefixes)
00170 self._do_element(node, inherited, unused=unused)
00171 else:
00172 inherited = self._inherit_context(node)
00173 self._do_element(node, inherited)
00174 elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
00175 pass
00176 else:
00177 raise TypeError, str(node)
00178
00179
00180 def _inherit_context(self, node):
00181 '''_inherit_context(self, node) -> list
00182 Scan ancestors of attribute and namespace context. Used only
00183 for single element node canonicalization, not for subset
00184 canonicalization.'''
00185
00186
00187 xmlattrs = filter(_IN_XML_NS, _attrs(node))
00188
00189
00190 inherited, parent = [], node.parentNode
00191 while parent and parent.nodeType == Node.ELEMENT_NODE:
00192 for a in filter(_IN_XML_NS, _attrs(parent)):
00193 n = a.localName
00194 if n not in xmlattrs:
00195 xmlattrs.append(n)
00196 inherited.append(a)
00197 parent = parent.parentNode
00198 return inherited
00199
00200
00201 def _do_document(self, node):
00202 '''_do_document(self, node) -> None
00203 Process a document node. documentOrder holds whether the document
00204 element has been encountered such that PIs/comments can be written
00205 as specified.'''
00206
00207 self.documentOrder = _LesserElement
00208 for child in node.childNodes:
00209 if child.nodeType == Node.ELEMENT_NODE:
00210 self.documentOrder = _Element
00211 self._do_element(child)
00212 self.documentOrder = _GreaterElement
00213 elif child.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
00214 self._do_pi(child)
00215 elif child.nodeType == Node.COMMENT_NODE:
00216 self._do_comment(child)
00217 elif child.nodeType == Node.DOCUMENT_TYPE_NODE:
00218 pass
00219 else:
00220 raise TypeError, str(child)
00221 handlers[Node.DOCUMENT_NODE] = _do_document
00222
00223
00224 def _do_text(self, node):
00225 '''_do_text(self, node) -> None
00226 Process a text or CDATA node. Render various special characters
00227 as their C14N entity representations.'''
00228 if not _in_subset(self.subset, node): return
00229 s = string.replace(node.data, "&", "&")
00230 s = string.replace(s, "<", "<")
00231 s = string.replace(s, ">", ">")
00232 s = string.replace(s, "\015", "
")
00233 if s: self.write(s)
00234 handlers[Node.TEXT_NODE] = _do_text
00235 handlers[Node.CDATA_SECTION_NODE] = _do_text
00236
00237
00238 def _do_pi(self, node):
00239 '''_do_pi(self, node) -> None
00240 Process a PI node. Render a leading or trailing #xA if the
00241 document order of the PI is greater or lesser (respectively)
00242 than the document element.
00243 '''
00244 if not _in_subset(self.subset, node): return
00245 W = self.write
00246 if self.documentOrder == _GreaterElement: W('\n')
00247 W('<?')
00248 W(node.nodeName)
00249 s = node.data
00250 if s:
00251 W(' ')
00252 W(s)
00253 W('?>')
00254 if self.documentOrder == _LesserElement: W('\n')
00255 handlers[Node.PROCESSING_INSTRUCTION_NODE] = _do_pi
00256
00257
00258 def _do_comment(self, node):
00259 '''_do_comment(self, node) -> None
00260 Process a comment node. Render a leading or trailing #xA if the
00261 document order of the comment is greater or lesser (respectively)
00262 than the document element.
00263 '''
00264 if not _in_subset(self.subset, node): return
00265 if self.comments:
00266 W = self.write
00267 if self.documentOrder == _GreaterElement: W('\n')
00268 W('<!--')
00269 W(node.data)
00270 W('-->')
00271 if self.documentOrder == _LesserElement: W('\n')
00272 handlers[Node.COMMENT_NODE] = _do_comment
00273
00274
00275 def _do_attr(self, n, value):
00276 ''''_do_attr(self, node) -> None
00277 Process an attribute.'''
00278
00279 W = self.write
00280 W(' ')
00281 W(n)
00282 W('="')
00283 s = string.replace(value, "&", "&")
00284 s = string.replace(s, "<", "<")
00285 s = string.replace(s, '"', '"')
00286 s = string.replace(s, '\011', '	')
00287 s = string.replace(s, '\012', '
')
00288 s = string.replace(s, '\015', '
')
00289 W(s)
00290 W('"')
00291
00292
00293 def _do_element(self, node, initial_other_attrs = [], unused = None):
00294 '''_do_element(self, node, initial_other_attrs = [], unused = {}) -> None
00295 Process an element (and its children).'''
00296
00297
00298
00299
00300
00301
00302
00303
00304 ns_parent, ns_rendered, xml_attrs = \
00305 self.state[0], self.state[1].copy(), self.state[2].copy()
00306
00307 ns_unused_inherited = unused
00308 if unused is None:
00309 ns_unused_inherited = self.state[3].copy()
00310
00311 ns_local = ns_parent.copy()
00312 inclusive = _inclusive(self)
00313 xml_attrs_local = {}
00314
00315
00316 other_attrs = []
00317 in_subset = _in_subset(self.subset, node)
00318 for a in initial_other_attrs + _attrs(node):
00319 if a.namespaceURI == XMLNS.BASE:
00320 n = a.nodeName
00321 if n == "xmlns:": n = "xmlns"
00322 ns_local[n] = a.nodeValue
00323 elif a.namespaceURI == XMLNS.XML:
00324 if inclusive or (in_subset and _in_subset(self.subset, a)):
00325 xml_attrs_local[a.nodeName] = a
00326 else:
00327 if _in_subset(self.subset, a):
00328 other_attrs.append(a)
00329
00330
00331
00332
00333
00334
00335 xml_attrs.update(xml_attrs_local)
00336
00337
00338 W, name = self.write, None
00339 if in_subset:
00340 name = node.nodeName
00341 if not inclusive:
00342 if node.prefix is not None:
00343 prefix = 'xmlns:%s' %node.prefix
00344 else:
00345 prefix = 'xmlns'
00346
00347 if not ns_rendered.has_key(prefix) and not ns_local.has_key(prefix):
00348 if not ns_unused_inherited.has_key(prefix):
00349 raise RuntimeError,\
00350 'For exclusive c14n, unable to map prefix "%s" in %s' %(
00351 prefix, node)
00352
00353 ns_local[prefix] = ns_unused_inherited[prefix]
00354 del ns_unused_inherited[prefix]
00355
00356 W('<')
00357 W(name)
00358
00359
00360 ns_to_render = []
00361 for n,v in ns_local.items():
00362
00363
00364
00365 if n == "xmlns" and v in [ XMLNS.BASE, '' ] \
00366 and ns_rendered.get('xmlns') in [ XMLNS.BASE, '', None ]:
00367 continue
00368
00369
00370
00371
00372 if n in ["xmlns:xml", "xml"] \
00373 and v in [ 'http://www.w3.org/XML/1998/namespace' ]:
00374 continue
00375
00376
00377
00378
00379 if (n,v) not in ns_rendered.items():
00380 if inclusive or _utilized(n, node, other_attrs, self.unsuppressedPrefixes):
00381 ns_to_render.append((n, v))
00382 elif not inclusive:
00383 ns_unused_inherited[n] = v
00384
00385
00386 ns_to_render.sort(_sorter_ns)
00387 for n,v in ns_to_render:
00388 self._do_attr(n, v)
00389 ns_rendered[n]=v
00390
00391
00392
00393
00394 if not inclusive or _in_subset(self.subset,node.parentNode):
00395 other_attrs.extend(xml_attrs_local.values())
00396 else:
00397 other_attrs.extend(xml_attrs.values())
00398 other_attrs.sort(_sorter)
00399 for a in other_attrs:
00400 self._do_attr(a.nodeName, a.value)
00401 W('>')
00402
00403
00404 state, self.state = self.state, (ns_local, ns_rendered, xml_attrs, ns_unused_inherited)
00405 for c in _children(node):
00406 _implementation.handlers[c.nodeType](self, c)
00407 self.state = state
00408
00409 if name: W('</%s>' % name)
00410 handlers[Node.ELEMENT_NODE] = _do_element
00411
00412
def Canonicalize(node, output=None, **kw):
    '''Canonicalize(node, output=None, **kw) -> UTF-8

    Canonicalize a DOM document/element node and all descendents.
    Return the text; if output is specified then output.write will
    be called to output the text and None will be returned
    Keyword parameters:
        nsdict: a dictionary of prefix:uri namespace entries
                assumed to exist in the surrounding context
        comments: keep comments if non-zero (default is 0)
        subset: Canonical XML subsetting resulting from XPath;
                only nodes contained in it are written
                (default is None, meaning no subsetting)
        unsuppressedPrefixes: do exclusive C14N, and this specifies the
                prefixes that should be inherited.
    '''
    if output:
        _implementation(node, output.write, **kw)
    else:
        s = StringIO.StringIO()
        _implementation(node, s.write, **kw)
        return s.getvalue()