Commit e1d5dd64 authored by Stefan Behnel's avatar Stefan Behnel Committed by GitHub

bpo-13611: C14N 2.0 implementation for ElementTree (GH-12966)

* Implement C14N 2.0 as a new canonicalize() function in ElementTree.

Missing features:
- prefix renaming in XPath expressions (tag and attribute text is supported)
- preservation of original prefixes given redundant namespace declarations
parent ee88af3f
...@@ -465,6 +465,53 @@ Reference ...@@ -465,6 +465,53 @@ Reference
Functions Functions
^^^^^^^^^ ^^^^^^^^^
.. function:: canonicalize(xml_data=None, *, out=None, from_file=None, **options)
`C14N 2.0 <https://www.w3.org/TR/xml-c14n2/>`_ transformation function.
Canonicalization is a way to normalise XML output in a way that allows
byte-by-byte comparisons and digital signatures. It reduces the freedom
that XML serializers have and instead generates a more constrained XML
representation. The main restrictions regard the placement of namespace
declarations, the ordering of attributes, and ignorable whitespace.
This function takes an XML data string (*xml_data*) or a file path or
file-like object (*from_file*) as input, converts it to the canonical
form, and writes it out using the *out* file(-like) object, if provided,
or returns it as a text string if not. The output file receives text,
not bytes. It should therefore be opened in text mode with ``utf-8``
encoding.
Typical uses::
xml_data = "<root>...</root>"
print(canonicalize(xml_data))
with open("c14n_output.xml", mode='w', encoding='utf-8') as out_file:
canonicalize(xml_data, out=out_file)
with open("c14n_output.xml", mode='w', encoding='utf-8') as out_file:
canonicalize(from_file="inputfile.xml", out=out_file)
The configuration *options* are as follows:
- *with_comments*: set to true to include comments (default: false)
- *strip_text*: set to true to strip whitespace before and after text content
(default: false)
- *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
(default: false)
- *qname_aware_tags*: a set of qname aware tag names in which prefixes
should be replaced in text content (default: empty)
- *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
should be replaced in text content (default: empty)
- *exclude_attrs*: a set of attribute names that should not be serialised
- *exclude_tags*: a set of tag names that should not be serialised
In the option list above, "a set" refers to any collection or iterable of
strings; no ordering is expected.
.. versionadded:: 3.8
.. function:: Comment(text=None) .. function:: Comment(text=None)
...@@ -1114,6 +1161,19 @@ TreeBuilder Objects ...@@ -1114,6 +1161,19 @@ TreeBuilder Objects
.. versionadded:: 3.8 .. versionadded:: 3.8
.. class:: C14NWriterTarget(write, *, \
with_comments=False, strip_text=False, rewrite_prefixes=False, \
qname_aware_tags=None, qname_aware_attrs=None, \
exclude_attrs=None, exclude_tags=None)
A `C14N 2.0 <https://www.w3.org/TR/xml-c14n2/>`_ writer. Arguments are the
same as for the :func:`canonicalize` function. This class does not build a
tree but translates the callback events directly into a serialised form
using the *write* function.
.. versionadded:: 3.8
.. _elementtree-xmlparser-objects: .. _elementtree-xmlparser-objects:
XMLParser Objects XMLParser Objects
......
...@@ -525,6 +525,10 @@ xml ...@@ -525,6 +525,10 @@ xml
external entities by default. external entities by default.
(Contributed by Christian Heimes in :issue:`17239`.) (Contributed by Christian Heimes in :issue:`17239`.)
* The :mod:`xml.etree.ElementTree` module provides a new function
:func:`~xml.etree.ElementTree.canonicalize()` that implements C14N 2.0.
(Contributed by Stefan Behnel in :issue:`13611`.)
Optimizations Optimizations
============= =============
......
...@@ -12,6 +12,7 @@ import io ...@@ -12,6 +12,7 @@ import io
import itertools import itertools
import locale import locale
import operator import operator
import os
import pickle import pickle
import sys import sys
import textwrap import textwrap
...@@ -20,6 +21,7 @@ import unittest ...@@ -20,6 +21,7 @@ import unittest
import warnings import warnings
import weakref import weakref
from functools import partial
from itertools import product, islice from itertools import product, islice
from test import support from test import support
from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr
...@@ -3527,6 +3529,231 @@ class NoAcceleratorTest(unittest.TestCase): ...@@ -3527,6 +3529,231 @@ class NoAcceleratorTest(unittest.TestCase):
self.assertIsInstance(pyET.Element.__init__, types.FunctionType) self.assertIsInstance(pyET.Element.__init__, types.FunctionType)
self.assertIsInstance(pyET.XMLParser.__init__, types.FunctionType) self.assertIsInstance(pyET.XMLParser.__init__, types.FunctionType)
# --------------------------------------------------------------------
def c14n_roundtrip(xml, **options):
    """Return the C14N 2.0 text that ``pyET.canonicalize`` produces for *xml*.

    All keyword *options* are forwarded unchanged.
    """
    canonical_text = pyET.canonicalize(xml, **options)
    return canonical_text
class C14NTest(unittest.TestCase):
    """Tests for the C14N 2.0 serialisation done by ``canonicalize()``."""

    # Show complete diffs for the (long) XML strings compared below.
    maxDiff = None

    #
    # simple roundtrip tests (from c14n.py)

    def test_simple_roundtrip(self):
        """Canonicalize small inline documents and compare the exact output."""
        # Basics
        self.assertEqual(c14n_roundtrip("<doc/>"), '<doc></doc>')
        self.assertEqual(c14n_roundtrip("<doc xmlns='uri'/>"),  # FIXME
                         '<doc xmlns="uri"></doc>')
        self.assertEqual(c14n_roundtrip("<prefix:doc xmlns:prefix='uri'/>"),
                         '<prefix:doc xmlns:prefix="uri"></prefix:doc>')
        # The declaration is expected on the first element that uses the prefix.
        self.assertEqual(c14n_roundtrip("<doc xmlns:prefix='uri'><prefix:bar/></doc>"),
                         '<doc><prefix:bar xmlns:prefix="uri"></prefix:bar></doc>')
        # Declared but unused namespaces are expected to be dropped.
        self.assertEqual(c14n_roundtrip("<elem xmlns:wsu='http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd' xmlns:SOAP-ENV='http://schemas.xmlsoap.org/soap/envelope/' />"),
                         '<elem></elem>')

        # C14N spec
        self.assertEqual(c14n_roundtrip("<doc>Hello, world!<!-- Comment 1 --></doc>"),
                         '<doc>Hello, world!</doc>')
        self.assertEqual(c14n_roundtrip("<value>&#x32;</value>"),
                         '<value>2</value>')
        self.assertEqual(c14n_roundtrip('<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'),
                         '<compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute>')
        self.assertEqual(c14n_roundtrip('''<compute expr='value>"0" &amp;&amp; value&lt;"10" ?"valid":"error"'>valid</compute>'''),
                         '<compute expr="value>&quot;0&quot; &amp;&amp; value&lt;&quot;10&quot; ?&quot;valid&quot;:&quot;error&quot;">valid</compute>')
        self.assertEqual(c14n_roundtrip("<norm attr=' &apos; &#x20;&#13;&#xa;&#9; &apos; '/>"),
                         '<norm attr=" \' &#xD;&#xA;&#x9; \' "></norm>')
        self.assertEqual(c14n_roundtrip("<normNames attr=' A &#x20;&#13;&#xa;&#9; B '/>"),
                         '<normNames attr=" A &#xD;&#xA;&#x9; B "></normNames>')
        self.assertEqual(c14n_roundtrip("<normId id=' &apos; &#x20;&#13;&#xa;&#9; &apos; '/>"),
                         '<normId id=" \' &#xD;&#xA;&#x9; \' "></normId>')

        # fragments from PJ's tests
        # NOTE(review): the case below is disabled in the source; presumably it
        # does not pass with the current implementation - confirm before enabling.
        #self.assertEqual(c14n_roundtrip("<doc xmlns:x='http://example.com/x' xmlns='http://example.com/default'><b y:a1='1' xmlns='http://example.com/default' a3='3' xmlns:y='http://example.com/y' y:a2='2'/></doc>"),
        #'<doc xmlns:x="http://example.com/x"><b xmlns:y="http://example.com/y" a3="3" y:a1="1" y:a2="2"></b></doc>')

    def test_c14n_exclusion(self):
        """Exercise ``exclude_attrs`` / ``exclude_tags``, with and without ``strip_text``."""
        xml = textwrap.dedent("""\
<root xmlns:x="http://example.com/x">
<a x:attr="attrx">
<b>abtext</b>
</a>
<b>btext</b>
<c>
<x:d>dtext</x:d>
</c>
</root>
""")
        # Baseline: nothing excluded.
        self.assertEqual(
            c14n_roundtrip(xml, strip_text=True),
            '<root>'
            '<a xmlns:x="http://example.com/x" x:attr="attrx"><b>abtext</b></a>'
            '<b>btext</b>'
            '<c><x:d xmlns:x="http://example.com/x">dtext</x:d></c>'
            '</root>')
        # Excluding the only attribute also removes the namespace declaration on <a>.
        self.assertEqual(
            c14n_roundtrip(xml, strip_text=True, exclude_attrs=['{http://example.com/x}attr']),
            '<root>'
            '<a><b>abtext</b></a>'
            '<b>btext</b>'
            '<c><x:d xmlns:x="http://example.com/x">dtext</x:d></c>'
            '</root>')
        # Excluding a namespaced tag.
        self.assertEqual(
            c14n_roundtrip(xml, strip_text=True, exclude_tags=['{http://example.com/x}d']),
            '<root>'
            '<a xmlns:x="http://example.com/x" x:attr="attrx"><b>abtext</b></a>'
            '<b>btext</b>'
            '<c></c>'
            '</root>')
        # Both exclusion options combined.
        self.assertEqual(
            c14n_roundtrip(xml, strip_text=True, exclude_attrs=['{http://example.com/x}attr'],
                           exclude_tags=['{http://example.com/x}d']),
            '<root>'
            '<a><b>abtext</b></a>'
            '<b>btext</b>'
            '<c></c>'
            '</root>')
        # An excluded tag takes its whole subtree with it (<a> contains a <b>).
        self.assertEqual(
            c14n_roundtrip(xml, strip_text=True, exclude_tags=['a', 'b']),
            '<root>'
            '<c><x:d xmlns:x="http://example.com/x">dtext</x:d></c>'
            '</root>')
        # Without strip_text the whitespace around excluded elements is expected to remain.
        self.assertEqual(
            c14n_roundtrip(xml, exclude_tags=['a', 'b']),
            '<root>\n'
            ' \n'
            ' \n'
            ' <c>\n'
            ' <x:d xmlns:x="http://example.com/x">dtext</x:d>\n'
            ' </c>\n'
            '</root>')
        self.assertEqual(
            c14n_roundtrip(xml, strip_text=True, exclude_tags=['{http://example.com/x}d', 'b']),
            '<root>'
            '<a xmlns:x="http://example.com/x" x:attr="attrx"></a>'
            '<c></c>'
            '</root>')
        self.assertEqual(
            c14n_roundtrip(xml, exclude_tags=['{http://example.com/x}d', 'b']),
            '<root>\n'
            ' <a xmlns:x="http://example.com/x" x:attr="attrx">\n'
            ' \n'
            ' </a>\n'
            ' \n'
            ' <c>\n'
            ' \n'
            ' </c>\n'
            '</root>')

    #
    # basic method=c14n tests from the c14n 2.0 specification. uses
    # test files under xmltestdata/c14n-20.

    # note that this uses generated C14N versions of the standard ET.write
    # output, not roundtripped C14N (see above).

    def test_xml_c14n2(self):
        """Run ``ET.canonicalize`` over the data files in xmltestdata/c14n-20.

        File names (without the ``.xml`` suffix) are classified by prefix:
        ``in*`` are inputs, ``c14n*`` hold a set of options, and
        ``out_<input>_<config>`` hold the expected output for that pair.
        """
        datadir = findfile("c14n-20", subdir="xmltestdata")
        full_path = partial(os.path.join, datadir)

        # Base names (".xml" stripped) of all XML files in the data directory.
        files = [filename[:-4] for filename in sorted(os.listdir(datadir))
                 if filename.endswith('.xml')]
        input_files = [
            filename for filename in files
            if filename.startswith('in')
        ]
        # config name -> {option local name: (stripped text, option element)}
        configs = {
            filename: {
                # <c14n2:PrefixRewrite>sequential</c14n2:PrefixRewrite>
                option.tag.split('}')[-1]: ((option.text or '').strip(), option)
                for option in ET.parse(full_path(filename) + ".xml").getroot()
            }
            for filename in files
            if filename.startswith('c14n')
        }

        # input name -> [(output file name, config dict), ...]; the config is
        # identified by the part of the output name after the last underscore.
        tests = {
            input_file: [
                (filename, configs[filename.rsplit('_', 1)[-1]])
                for filename in files
                if filename.startswith(f'out_{input_file}_')
                and filename.rsplit('_', 1)[-1] in configs
            ]
            for input_file in input_files
        }

        # Make sure we found all test cases.
        self.assertEqual(30, len([
            output_file for output_files in tests.values()
            for output_file in output_files]))

        def get_option(config, option_name, default=None):
            """Return the text value stored for *option_name*, or *default* if absent."""
            return config.get(option_name, (default, ()))[0]

        for input_file, output_files in tests.items():
            for output_file, config in output_files:
                # NOTE(review): comments are kept when IgnoreComments is 'true';
                # this looks inverted but the original author marks it as intended.
                keep_comments = get_option(
                    config, 'IgnoreComments') == 'true'  # no, it's right :)
                strip_text = get_option(
                    config, 'TrimTextNodes') == 'true'
                rewrite_prefixes = get_option(
                    config, 'PrefixRewrite') == 'sequential'
                if 'QNameAware' in config:
                    # Build "{NS}Name" keys from the QNameAware child elements.
                    qattrs = [
                        f"{{{el.get('NS')}}}{el.get('Name')}"
                        for el in config['QNameAware'][1].findall(
                            '{http://www.w3.org/2010/xml-c14n2}QualifiedAttr')
                    ]
                    qtags = [
                        f"{{{el.get('NS')}}}{el.get('Name')}"
                        for el in config['QNameAware'][1].findall(
                            '{http://www.w3.org/2010/xml-c14n2}Element')
                    ]
                else:
                    qtags = qattrs = None

                # Build subtest description from config.
                config_descr = ','.join(
                    f"{name}={value or ','.join(c.tag.split('}')[-1] for c in children)}"
                    for name, (value, children) in sorted(config.items())
                )

                with self.subTest(f"{output_file}({config_descr})"):
                    # Skip the combinations that the implementation does not cover.
                    if input_file == 'inNsRedecl' and not rewrite_prefixes:
                        self.skipTest(
                            f"Redeclared namespace handling is not supported in {output_file}")
                    if input_file == 'inNsSuperfluous' and not rewrite_prefixes:
                        self.skipTest(
                            f"Redeclared namespace handling is not supported in {output_file}")
                    if 'QNameAware' in config and config['QNameAware'][1].find(
                            '{http://www.w3.org/2010/xml-c14n2}XPathElement') is not None:
                        self.skipTest(
                            f"QName rewriting in XPath text is not supported in {output_file}")

                    # 'f' starts as the input path and may be replaced by an
                    # in-memory file below.
                    f = full_path(input_file + ".xml")
                    if input_file == 'inC14N5':
                        # Hack: avoid setting up external entity resolution in the parser.
                        # The entity reference is replaced by the content of world.txt.
                        with open(full_path('world.txt'), 'rb') as entity_file:
                            with open(f, 'rb') as f:
                                f = io.BytesIO(f.read().replace(b'&ent2;', entity_file.read()))

                    text = ET.canonicalize(
                        from_file=f,
                        with_comments=keep_comments,
                        strip_text=strip_text,
                        rewrite_prefixes=rewrite_prefixes,
                        qname_aware_tags=qtags, qname_aware_attrs=qattrs)

                    with open(full_path(output_file + ".xml"), 'r', encoding='utf8') as f:
                        expected = f.read()
                    if input_file == 'inC14N3':
                        # FIXME: cET resolves default attributes but ET does not!
                        # Remove the defaulted attribute from both sides before comparing.
                        expected = expected.replace(' attr="default"', '')
                        text = text.replace(' attr="default"', '')
                    self.assertEqual(expected, text)
# -------------------------------------------------------------------- # --------------------------------------------------------------------
...@@ -3559,6 +3786,8 @@ def test_main(module=None): ...@@ -3559,6 +3786,8 @@ def test_main(module=None):
XMLParserTest, XMLParserTest,
XMLPullParserTest, XMLPullParserTest,
BugsTest, BugsTest,
KeywordArgsTest,
C14NTest,
] ]
# These tests will only run for the pure-Python version that doesn't import # These tests will only run for the pure-Python version that doesn't import
......
<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
<c14n2:IgnoreComments>true</c14n2:IgnoreComments>
</dsig:CanonicalizationMethod>
<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" Algorithm="http://www.w3.org/2010/xml-c14n2">
</dsig:CanonicalizationMethod>
<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
<c14n2:PrefixRewrite>sequential</c14n2:PrefixRewrite>
</dsig:CanonicalizationMethod>
<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
<c14n2:PrefixRewrite>sequential</c14n2:PrefixRewrite>
<c14n2:QNameAware>
<c14n2:QualifiedAttr Name="type" NS="http://www.w3.org/2001/XMLSchema-instance"/>
</c14n2:QNameAware>
</dsig:CanonicalizationMethod>
<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
<c14n2:PrefixRewrite>sequential</c14n2:PrefixRewrite>
<c14n2:QNameAware>
<c14n2:Element Name="bar" NS="http://a"/>
<c14n2:XPathElement Name="IncludedXPath" NS="http://www.w3.org/2010/xmldsig2#"/>
</c14n2:QNameAware>
</dsig:CanonicalizationMethod>
<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
<c14n2:QNameAware>
<c14n2:QualifiedAttr Name="type" NS="http://www.w3.org/2001/XMLSchema-instance"/>
</c14n2:QNameAware>
</dsig:CanonicalizationMethod>
<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
<c14n2:QNameAware>
<c14n2:Element Name="bar" NS="http://a"/>
</c14n2:QNameAware>
</dsig:CanonicalizationMethod>
<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
<c14n2:QNameAware>
<c14n2:Element Name="bar" NS="http://a"/>
<c14n2:XPathElement Name="IncludedXPath" NS="http://www.w3.org/2010/xmldsig2#"/>
</c14n2:QNameAware>
</dsig:CanonicalizationMethod>
<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
<c14n2:TrimTextNodes>true</c14n2:TrimTextNodes>
</dsig:CanonicalizationMethod>
<?xml version="1.0" encoding="UTF-8"?>
<!ELEMENT doc (#PCDATA)>
<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
>
</xsl:stylesheet>
<?xml version="1.0"?>
<?xml-stylesheet href="doc.xsl"
type="text/xsl" ?>
<!DOCTYPE doc SYSTEM "doc.dtd">
<doc>Hello, world!<!-- Comment 1 --></doc>
<?pi-without-data ?>
<!-- Comment 2 -->
<!-- Comment 3 -->
<doc>
<clean> </clean>
<dirty> A B </dirty>
<mixed>
A
<clean> </clean>
B
<dirty> A B </dirty>
C
</mixed>
</doc>
<!DOCTYPE doc [<!ATTLIST e9 attr CDATA "default">]>
<doc>
<e1 />
<e2 ></e2>
<e3 name = "elem3" id="elem3" />
<e4 name="elem4" id="elem4" ></e4>
<e5 a:attr="out" b:attr="sorted" attr2="all" attr="I'm"
xmlns:b="http://www.ietf.org"
xmlns:a="http://www.w3.org"
xmlns="http://example.org"/>
<e6 xmlns="" xmlns:a="http://www.w3.org">
<e7 xmlns="http://www.ietf.org">
<e8 xmlns="" xmlns:a="http://www.w3.org">
<e9 xmlns="" xmlns:a="http://www.ietf.org"/>
</e8>
</e7>
</e6>
</doc>
<!DOCTYPE doc [
<!ATTLIST normId id ID #IMPLIED>
<!ATTLIST normNames attr NMTOKENS #IMPLIED>
]>
<doc>
<text>First line&#x0d;&#10;Second line</text>
<value>&#x32;</value>
<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
<compute expr='value>"0" &amp;&amp; value&lt;"10" ?"valid":"error"'>valid</compute>
<norm attr=' &apos; &#x20;&#13;&#xa;&#9; &apos; '/>
<normNames attr=' A &#x20;&#13;&#xa;&#9; B '/>
<normId id=' &apos;&#x20;&#13;&#xa;&#9; &apos; '/>
</doc>
<!DOCTYPE doc [
<!ATTLIST doc attrExtEnt CDATA #IMPLIED>
<!ENTITY ent1 "Hello">
<!ENTITY ent2 SYSTEM "world.txt">
<!ENTITY entExt SYSTEM "earth.gif" NDATA gif>
<!NOTATION gif SYSTEM "viewgif.exe">
]>
<doc attrExtEnt="entExt">
&ent1;, &ent2;!
</doc>
<!-- Let world.txt contain "world" (excluding the quotes) -->
<?xml version="1.0" encoding="ISO-8859-1"?>
<doc>&#169;</doc>
<a:foo xmlns:a="http://a" xmlns:b="http://b" xmlns:child="http://c" xmlns:soap-env="http://schemas.xmlsoap.org/wsdl/soap/" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<a:bar>xsd:string</a:bar>
<dsig2:IncludedXPath xmlns:dsig2="http://www.w3.org/2010/xmldsig2#">/soap-env:body/child::b:foo[@att1 != "c:val" and @att2 != 'xsd:string']</dsig2:IncludedXPath>
</a:foo>
<foo xmlns:a="http://a" xmlns:b="http://b">
<b:bar b:att1="val" att2="val"/>
</foo>
<a:foo xmlns:a="http://a" xmlns:b="http://b" xmlns:c="http://c">
<b:bar/>
<b:bar/>
<b:bar/>
<a:bar b:att1="val"/>
</a:foo>
<foo xmlns:a="http://z3" xmlns:b="http://z2" a:att1="val1" b:att2="val2">
<bar xmlns="http://z0" xmlns:a="http://z2" a:att1="val1" b:att2="val2" xmlns:b="http://z3" />
</foo>
<a:foo xmlns:a="http://z3" xmlns:b="http://z2" b:att1="val1" c:att3="val3" b:att2="val2" xmlns:c="http://z1" xmlns:d="http://z0">
<c:bar/>
<c:bar d:att3="val3"/>
</a:foo>
<foo xmlns:a="http://z0" xmlns:b="http://z0" a:att1="val1" b:att2="val2" xmlns="http://z0">
<c:bar xmlns:a="http://z0" xmlns:c="http://z0" c:att3="val3"/>
<d:bar xmlns:d="http://z0"/>
</foo>
<foo xmlns="http://z0" xml:id="23">
<bar xsi:type="xsd:string" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">data</bar>
</foo>
<?xml-stylesheet href="doc.xsl"
type="text/xsl" ?>
<doc>Hello, world!<!-- Comment 1 --></doc>
<?pi-without-data?>
<!-- Comment 2 -->
<!-- Comment 3 -->
\ No newline at end of file
<?xml-stylesheet href="doc.xsl"
type="text/xsl" ?>
<doc>Hello, world!</doc>
<?pi-without-data?>
\ No newline at end of file
<doc>
<clean> </clean>
<dirty> A B </dirty>
<mixed>
A
<clean> </clean>
B
<dirty> A B </dirty>
C
</mixed>
</doc>
\ No newline at end of file
<doc><clean></clean><dirty>A B</dirty><mixed>A<clean></clean>B<dirty>A B</dirty>C</mixed></doc>
\ No newline at end of file
<doc>
<e1></e1>
<e2></e2>
<e3 id="elem3" name="elem3"></e3>
<e4 id="elem4" name="elem4"></e4>
<e5 xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I'm" attr2="all" b:attr="sorted" a:attr="out"></e5>
<e6>
<e7 xmlns="http://www.ietf.org">
<e8 xmlns="">
<e9 attr="default"></e9>
</e8>
</e7>
</e6>
</doc>
\ No newline at end of file
<n0:doc xmlns:n0="">
<n0:e1></n0:e1>
<n0:e2></n0:e2>
<n0:e3 id="elem3" name="elem3"></n0:e3>
<n0:e4 id="elem4" name="elem4"></n0:e4>
<n1:e5 xmlns:n1="http://example.org" xmlns:n2="http://www.ietf.org" xmlns:n3="http://www.w3.org" attr="I'm" attr2="all" n2:attr="sorted" n3:attr="out"></n1:e5>
<n0:e6>
<n2:e7 xmlns:n2="http://www.ietf.org">
<n0:e8>
<n0:e9 attr="default"></n0:e9>
</n0:e8>
</n2:e7>
</n0:e6>
</n0:doc>
\ No newline at end of file
<doc><e1></e1><e2></e2><e3 id="elem3" name="elem3"></e3><e4 id="elem4" name="elem4"></e4><e5 xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I'm" attr2="all" b:attr="sorted" a:attr="out"></e5><e6><e7 xmlns="http://www.ietf.org"><e8 xmlns=""><e9 attr="default"></e9></e8></e7></e6></doc>
\ No newline at end of file
<doc>
<text>First line&#xD;
Second line</text>
<value>2</value>
<compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute>
<compute expr="value>&quot;0&quot; &amp;&amp; value&lt;&quot;10&quot; ?&quot;valid&quot;:&quot;error&quot;">valid</compute>
<norm attr=" ' &#xD;&#xA;&#x9; ' "></norm>
<normNames attr="A &#xD;&#xA;&#x9; B"></normNames>
<normId id="' &#xD;&#xA;&#x9; '"></normId>
</doc>
\ No newline at end of file
<doc><text>First line&#xD;
Second line</text><value>2</value><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute><compute expr="value>&quot;0&quot; &amp;&amp; value&lt;&quot;10&quot; ?&quot;valid&quot;:&quot;error&quot;">valid</compute><norm attr=" ' &#xD;&#xA;&#x9; ' "></norm><normNames attr="A &#xD;&#xA;&#x9; B"></normNames><normId id="' &#xD;&#xA;&#x9; '"></normId></doc>
\ No newline at end of file
<doc attrExtEnt="entExt">
Hello, world!
</doc>
\ No newline at end of file
<doc attrExtEnt="entExt">Hello, world!</doc>
\ No newline at end of file
<doc>©</doc>
\ No newline at end of file
<a:foo xmlns:a="http://a">
<a:bar>xsd:string</a:bar>
<dsig2:IncludedXPath xmlns:dsig2="http://www.w3.org/2010/xmldsig2#">/soap-env:body/child::b:foo[@att1 != "c:val" and @att2 != 'xsd:string']</dsig2:IncludedXPath>
</a:foo>
\ No newline at end of file
<n0:foo xmlns:n0="http://a">
<n0:bar xmlns:n1="http://www.w3.org/2001/XMLSchema">n1:string</n0:bar>
<n4:IncludedXPath xmlns:n2="http://b" xmlns:n3="http://schemas.xmlsoap.org/wsdl/soap/" xmlns:n4="http://www.w3.org/2010/xmldsig2#">/n3:body/child::n2:foo[@att1 != "c:val" and @att2 != 'xsd:string']</n4:IncludedXPath>
</n0:foo>
\ No newline at end of file
<a:foo xmlns:a="http://a">
<a:bar xmlns:xsd="http://www.w3.org/2001/XMLSchema">xsd:string</a:bar>
<dsig2:IncludedXPath xmlns:dsig2="http://www.w3.org/2010/xmldsig2#">/soap-env:body/child::b:foo[@att1 != "c:val" and @att2 != 'xsd:string']</dsig2:IncludedXPath>
</a:foo>
\ No newline at end of file
<a:foo xmlns:a="http://a">
<a:bar xmlns:xsd="http://www.w3.org/2001/XMLSchema">xsd:string</a:bar>
<dsig2:IncludedXPath xmlns:b="http://b" xmlns:dsig2="http://www.w3.org/2010/xmldsig2#" xmlns:soap-env="http://schemas.xmlsoap.org/wsdl/soap/">/soap-env:body/child::b:foo[@att1 != "c:val" and @att2 != 'xsd:string']</dsig2:IncludedXPath>
</a:foo>
\ No newline at end of file
<foo>
<b:bar xmlns:b="http://b" att2="val" b:att1="val"></b:bar>
</foo>
\ No newline at end of file
<n0:foo xmlns:n0="">
<n1:bar xmlns:n1="http://b" att2="val" n1:att1="val"></n1:bar>
</n0:foo>
\ No newline at end of file
<a:foo xmlns:a="http://a">
<b:bar xmlns:b="http://b"></b:bar>
<b:bar xmlns:b="http://b"></b:bar>
<b:bar xmlns:b="http://b"></b:bar>
<a:bar xmlns:b="http://b" b:att1="val"></a:bar>
</a:foo>
\ No newline at end of file
<n0:foo xmlns:n0="http://a">
<n1:bar xmlns:n1="http://b"></n1:bar>
<n1:bar xmlns:n1="http://b"></n1:bar>
<n1:bar xmlns:n1="http://b"></n1:bar>
<n0:bar xmlns:n1="http://b" n1:att1="val"></n0:bar>
</n0:foo>
\ No newline at end of file
<foo xmlns:a="http://z3" xmlns:b="http://z2" b:att2="val2" a:att1="val1">
<bar xmlns="http://z0" xmlns:a="http://z2" xmlns:b="http://z3" a:att1="val1" b:att2="val2"></bar>
</foo>
\ No newline at end of file
<n0:foo xmlns:n0="" xmlns:n1="http://z2" xmlns:n2="http://z3" n1:att2="val2" n2:att1="val1">
<n3:bar xmlns:n3="http://z0" n1:att1="val1" n2:att2="val2"></n3:bar>
</n0:foo>
\ No newline at end of file
<a:foo xmlns:a="http://z3" xmlns:b="http://z2" xmlns:c="http://z1" c:att3="val3" b:att1="val1" b:att2="val2">
<c:bar></c:bar>
<c:bar xmlns:d="http://z0" d:att3="val3"></c:bar>
</a:foo>
\ No newline at end of file
<n2:foo xmlns:n0="http://z1" xmlns:n1="http://z2" xmlns:n2="http://z3" n0:att3="val3" n1:att1="val1" n1:att2="val2">
<n0:bar></n0:bar>
<n0:bar xmlns:n3="http://z0" n3:att3="val3"></n0:bar>
</n2:foo>
\ No newline at end of file
<foo xmlns="http://z0" xmlns:a="http://z0" xmlns:b="http://z0" a:att1="val1" b:att2="val2">
<c:bar xmlns:c="http://z0" c:att3="val3"></c:bar>
<d:bar xmlns:d="http://z0"></d:bar>
</foo>
\ No newline at end of file
<n0:foo xmlns:n0="http://z0" n0:att1="val1" n0:att2="val2">
<n0:bar n0:att3="val3"></n0:bar>
<n0:bar></n0:bar>
</n0:foo>
\ No newline at end of file
<foo xmlns="http://z0" xml:id="23">
<bar xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xsd:string">data</bar>
</foo>
\ No newline at end of file
<n0:foo xmlns:n0="http://z0" xml:id="23">
<n0:bar xmlns:n1="http://www.w3.org/2001/XMLSchema-instance" n1:type="xsd:string">data</n0:bar>
</n0:foo>
\ No newline at end of file
<n0:foo xmlns:n0="http://z0" xml:id="23">
<n0:bar xmlns:n1="http://www.w3.org/2001/XMLSchema" xmlns:n2="http://www.w3.org/2001/XMLSchema-instance" n2:type="n1:string">data</n0:bar>
</n0:foo>
\ No newline at end of file
<foo xmlns="http://z0" xml:id="23">
<bar xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xsd:string">data</bar>
</foo>
\ No newline at end of file
world
\ No newline at end of file
...@@ -87,6 +87,7 @@ __all__ = [ ...@@ -87,6 +87,7 @@ __all__ = [
"XML", "XMLID", "XML", "XMLID",
"XMLParser", "XMLPullParser", "XMLParser", "XMLPullParser",
"register_namespace", "register_namespace",
"canonicalize", "C14NWriterTarget",
] ]
VERSION = "1.3.0" VERSION = "1.3.0"
...@@ -1711,6 +1712,336 @@ class XMLParser: ...@@ -1711,6 +1712,336 @@ class XMLParser:
del self.target, self._target del self.target, self._target
# --------------------------------------------------------------------
# C14N 2.0
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Serialise XML input in its C14N 2.0 canonical form.

    Input is taken from *xml_data* (an XML string) or, if that is None, from
    *from_file* (a file path or file-like object).  At least one of the two
    must be given, otherwise ValueError is raised.

    If *out* is given, it must be a file or file-like object whose
    ``.write()`` method receives the canonical output as text (not bytes);
    a real file should be opened in text mode with encoding "utf-8".  In that
    case None is returned.  Without *out*, the output is collected and
    returned as a text string.

    All other keyword *options* are passed on to ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect into an in-memory buffer only when the caller gave no sink.
    buffer = None
    if out is None:
        buffer = io.StringIO()
        out = buffer

    writer = C14NWriterTarget(out.write, **options)
    parser = XMLParser(target=writer)

    if xml_data is None:
        # The guard above ensures that from_file is set on this path.
        parse(from_file, parser=parser)
    else:
        parser.feed(xml_data)
        parser.close()

    if buffer is None:
        return None
    return buffer.getvalue()
# Matcher for text that consists only of two runs of word characters joined
# by a single colon ("prefix:name"), i.e. text that may be a prefixed QName.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
class C14NWriterTarget:
"""
Canonicalization writer target for the XMLParser.
Serialises parse events to XML C14N 2.0.
The *write* function is used for writing out the resulting data stream
as text (not bytes). To write to a file, open it in text mode with encoding
"utf-8" and pass its ``.write`` method.
Configuration options:
- *with_comments*: set to true to include comments
- *strip_text*: set to true to strip whitespace before and after text content
- *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
- *qname_aware_tags*: a set of qname aware tag names in which prefixes
should be replaced in text content
- *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
should be replaced in text content
- *exclude_attrs*: a set of attribute names that should not be serialised
- *exclude_tags*: a set of tag names that should not be serialised
"""
def __init__(self, write, *,
with_comments=False, strip_text=False, rewrite_prefixes=False,
qname_aware_tags=None, qname_aware_attrs=None,
exclude_attrs=None, exclude_tags=None):
self._write = write
self._data = []
self._with_comments = with_comments
self._strip_text = strip_text
self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
self._exclude_tags = set(exclude_tags) if exclude_tags else None
self._rewrite_prefixes = rewrite_prefixes
if qname_aware_tags:
self._qname_aware_tags = set(qname_aware_tags)
else:
self._qname_aware_tags = None
if qname_aware_attrs:
self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
else:
self._find_qname_aware_attrs = None
# Stack with globally and newly declared namespaces as (uri, prefix) pairs.
self._declared_ns_stack = [[
("http://www.w3.org/XML/1998/namespace", "xml"),
]]
# Stack with user declared namespace prefixes as (uri, prefix) pairs.
self._ns_stack = []
if not rewrite_prefixes:
self._ns_stack.append(list(_namespace_map.items()))
self._ns_stack.append([])
self._prefix_map = {}
self._preserve_space = [False]
self._pending_start = None
self._root_seen = False
self._root_done = False
self._ignored_depth = 0
def _iter_namespaces(self, ns_stack, _reversed=reversed):
for namespaces in _reversed(ns_stack):
if namespaces: # almost no element declares new namespaces
yield from namespaces
def _resolve_prefix_name(self, prefixed_name):
prefix, name = prefixed_name.split(':', 1)
for uri, p in self._iter_namespaces(self._ns_stack):
if p == prefix:
return f'{{{uri}}}{name}'
raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')
def _qname(self, qname, uri=None):
if uri is None:
uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
else:
tag = qname
prefixes_seen = set()
for u, prefix in self._iter_namespaces(self._declared_ns_stack):
if u == uri and prefix not in prefixes_seen:
return f'{prefix}:{tag}' if prefix else tag, tag, uri
prefixes_seen.add(prefix)
# Not declared yet => add new declaration.
if self._rewrite_prefixes:
if uri in self._prefix_map:
prefix = self._prefix_map[uri]
else:
prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
self._declared_ns_stack[-1].append((uri, prefix))
return f'{prefix}:{tag}', tag, uri
if not uri and '' not in prefixes_seen:
# No default namespace declared => no prefix needed.
return tag, tag, uri
for u, prefix in self._iter_namespaces(self._ns_stack):
if u == uri:
self._declared_ns_stack[-1].append((uri, prefix))
return f'{prefix}:{tag}' if prefix else tag, tag, uri
raise ValueError(f'Namespace "{uri}" is not declared in scope')
def data(self, data):
if not self._ignored_depth:
self._data.append(data)
def _flush(self, _join_text=''.join):
data = _join_text(self._data)
del self._data[:]
if self._strip_text and not self._preserve_space[-1]:
data = data.strip()
if self._pending_start is not None:
args, self._pending_start = self._pending_start, None
qname_text = data if data and _looks_like_prefix_name(data) else None
self._start(*args, qname_text)
if qname_text is not None:
return
if data and self._root_seen:
self._write(_escape_cdata_c14n(data))
def start_ns(self, prefix, uri):
if self._ignored_depth:
return
# we may have to resolve qnames in text content
if self._data:
self._flush()
self._ns_stack[-1].append((uri, prefix))
def start(self, tag, attrs):
if self._exclude_tags is not None and (
self._ignored_depth or tag in self._exclude_tags):
self._ignored_depth += 1
return
if self._data:
self._flush()
new_namespaces = []
self._declared_ns_stack.append(new_namespaces)
if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
# Need to parse text first to see if it requires a prefix declaration.
self._pending_start = (tag, attrs, new_namespaces)
return
self._start(tag, attrs, new_namespaces)
def _start(self, tag, attrs, new_namespaces, qname_text=None):
if self._exclude_attrs is not None and attrs:
attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}
qnames = {tag, *attrs}
resolved_names = {}
# Resolve prefixes in attribute and tag text.
if qname_text is not None:
qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
qnames.add(qname)
if self._find_qname_aware_attrs is not None and attrs:
qattrs = self._find_qname_aware_attrs(attrs)
if qattrs:
for attr_name in qattrs:
value = attrs[attr_name]
if _looks_like_prefix_name(value):
qname = resolved_names[value] = self._resolve_prefix_name(value)
qnames.add(qname)
else:
qattrs = None
else:
qattrs = None
# Assign prefixes in lexicographical order of used URIs.
parse_qname = self._qname
parsed_qnames = {n: parse_qname(n) for n in sorted(
qnames, key=lambda n: n.split('}', 1))}
# Write namespace declarations in prefix order ...
if new_namespaces:
attr_list = [
('xmlns:' + prefix if prefix else 'xmlns', uri)
for uri, prefix in new_namespaces
]
attr_list.sort()
else:
# almost always empty
attr_list = []
# ... followed by attributes in URI+name order
if attrs:
for k, v in sorted(attrs.items()):
if qattrs is not None and k in qattrs and v in resolved_names:
v = parsed_qnames[resolved_names[v]][0]
attr_qname, attr_name, uri = parsed_qnames[k]
# No prefix for attributes in default ('') namespace.
attr_list.append((attr_qname if uri else attr_name, v))
# Honour xml:space attributes.
space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
self._preserve_space.append(
space_behaviour == 'preserve' if space_behaviour
else self._preserve_space[-1])
# Write the tag.
write = self._write
write('<' + parsed_qnames[tag][0])
if attr_list:
write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
write('>')
# Write the resolved qname text content.
if qname_text is not None:
write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))
self._root_seen = True
self._ns_stack.append([])
def end(self, tag):
    """Handle the end of an element and write its closing tag.

    Inside an excluded subtree only ``_ignored_depth`` is decremented.
    Otherwise pending text is flushed, the end tag is written and the
    per-element stacks are popped.  ``_root_done`` becomes true when
    exactly one entry is left on the ``xml:space`` stack.
    """
    if self._ignored_depth:
        self._ignored_depth -= 1
        return

    if self._data:
        self._flush()

    prefixed_name = self._qname(tag)[0]
    self._write(f'</{prefixed_name}>')

    space_stack = self._preserve_space
    space_stack.pop()
    self._root_done = len(space_stack) == 1

    self._declared_ns_stack.pop()
    self._ns_stack.pop()
def comment(self, text):
    """Write a comment, if comments are enabled.

    Comments are skipped unless ``_with_comments`` is set, and inside
    excluded elements.  A comment after the root element is preceded by
    a newline; a comment before the root element is followed by one.
    """
    if not self._with_comments or self._ignored_depth:
        return

    if self._root_done:
        self._write('\n')
    elif self._root_seen and self._data:
        # Pending text belongs in front of the comment.
        self._flush()

    markup = f'<!--{_escape_cdata_c14n(text)}-->'
    self._write(markup)

    if not self._root_seen:
        self._write('\n')
def pi(self, target, data):
    """Write a processing instruction.

    Nothing is written inside excluded elements.  A processing
    instruction after the root element is preceded by a newline; one
    before the root element is followed by a newline.  When *data* is
    empty, only the target is written.
    """
    if self._ignored_depth:
        return

    if self._root_done:
        self._write('\n')
    elif self._root_seen and self._data:
        # Pending text belongs in front of the processing instruction.
        self._flush()

    if data:
        markup = f'<?{target} {_escape_cdata_c14n(data)}?>'
    else:
        markup = f'<?{target}?>'
    self._write(markup)

    if not self._root_seen:
        self._write('\n')
def _escape_cdata_c14n(text):
# escape character data
try:
# it's worth avoiding do-nothing calls for strings that are
# shorter than 500 character, or so. assume that's, by far,
# the most common case in most applications.
if '&' in text:
text = text.replace('&', '&amp;')
if '<' in text:
text = text.replace('<', '&lt;')
if '>' in text:
text = text.replace('>', '&gt;')
if '\r' in text:
text = text.replace('\r', '&#xD;')
return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
def _escape_attrib_c14n(text):
# escape attribute value
try:
if '&' in text:
text = text.replace('&', '&amp;')
if '<' in text:
text = text.replace('<', '&lt;')
if '"' in text:
text = text.replace('"', '&quot;')
if '\t' in text:
text = text.replace('\t', '&#x9;')
if '\n' in text:
text = text.replace('\n', '&#xA;')
if '\r' in text:
text = text.replace('\r', '&#xD;')
return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
# --------------------------------------------------------------------
# Import the C accelerators # Import the C accelerators
try: try:
# Element is going to be shadowed by the C implementation. We need to keep # Element is going to be shadowed by the C implementation. We need to keep
......
The xml.etree.ElementTree package gained support for C14N 2.0 serialisation.
Patch by Stefan Behnel.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment