Commit dde3eebd authored by Stefan Behnel's avatar Stefan Behnel Committed by GitHub

bpo-36676: Namespace prefix aware parsing support for the ET.XMLParser target (GH-12885)

* bpo-36676: Implement namespace prefix aware parsing support for the XMLParser target in ElementTree.
parent 43851a20
......@@ -1086,7 +1086,7 @@ TreeBuilder Objects
In addition, a custom :class:`TreeBuilder` object can provide the
following method:
following methods:
.. method:: doctype(name, pubid, system)
......@@ -1096,6 +1096,23 @@ TreeBuilder Objects
.. versionadded:: 3.2
.. method:: start_ns(prefix, uri)
Is called whenever the parser encounters a new namespace declaration,
before the ``start()`` callback for the opening element that defines it.
*prefix* is ``''`` for the default namespace and the declared
namespace prefix name otherwise. *uri* is the namespace URI.
.. versionadded:: 3.8
.. method:: end_ns(prefix)
Is called after the ``end()`` callback of an element that declared
a namespace prefix mapping, with the name of the *prefix* that went
out of scope.
.. versionadded:: 3.8
.. _elementtree-xmlparser-objects:
......@@ -1131,7 +1148,8 @@ XMLParser Objects
:meth:`XMLParser.feed` calls *target*\'s ``start(tag, attrs_dict)`` method
for each opening tag, its ``end(tag)`` method for each closing tag, and data
is processed by method ``data(data)``. :meth:`XMLParser.close` calls
is processed by method ``data(data)``. For further supported callback
methods, see the :class:`TreeBuilder` class. :meth:`XMLParser.close` calls
*target*\'s method ``close()``. :class:`XMLParser` can be used not only for
building a tree structure. This is an example of counting the maximum depth
of an XML file::
......
......@@ -14,12 +14,13 @@ import locale
import operator
import pickle
import sys
import textwrap
import types
import unittest
import warnings
import weakref
from itertools import product
from itertools import product, islice
from test import support
from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr
......@@ -694,12 +695,17 @@ class ElementTreeTest(unittest.TestCase):
self.append(("pi", target, data))
def comment(self, data):
self.append(("comment", data))
def start_ns(self, prefix, uri):
self.append(("start-ns", prefix, uri))
def end_ns(self, prefix):
self.append(("end-ns", prefix))
builder = Builder()
parser = ET.XMLParser(target=builder)
parser.feed(data)
self.assertEqual(builder, [
('pi', 'pi', 'data'),
('comment', ' comment '),
('start-ns', '', 'namespace'),
('start', '{namespace}root'),
('start', '{namespace}element'),
('end', '{namespace}element'),
......@@ -708,8 +714,30 @@ class ElementTreeTest(unittest.TestCase):
('start', '{namespace}empty-element'),
('end', '{namespace}empty-element'),
('end', '{namespace}root'),
('end-ns', ''),
])
def test_custom_builder_only_end_ns(self):
    """Check an XMLParser target that defines end_ns() and nothing else.

    The target records every end_ns() call; after feeding the document
    the recorded calls must be exactly the three expected prefixes, in
    the order 'a', 'p', '' (the default namespace).
    """
    class Builder(list):
        def end_ns(self, prefix):
            self.append(("end-ns", prefix))

    # All three namespace declarations live on the <root> element.
    document = textwrap.dedent("""\
<?pi data?>
<!-- comment -->
<root xmlns='namespace' xmlns:p='pns' xmlns:a='ans'>
<a:element key='value'>text</a:element>
<p:element>text</p:element>tail
<empty-element/>
</root>
""")
    expected_calls = [
        ('end-ns', 'a'),
        ('end-ns', 'p'),
        ('end-ns', ''),
    ]

    target = Builder()
    parser = ET.XMLParser(target=target)
    parser.feed(document)
    self.assertEqual(target, expected_calls)
# Element.getchildren() and ElementTree.getiterator() are deprecated.
@checkwarnings(("This method will be removed in future versions. "
......@@ -1194,14 +1222,19 @@ class XMLPullParserTest(unittest.TestCase):
for i in range(0, len(data), chunk_size):
parser.feed(data[i:i+chunk_size])
def assert_events(self, parser, expected):
def assert_events(self, parser, expected, max_events=None):
self.assertEqual(
[(event, (elem.tag, elem.text))
for event, elem in parser.read_events()],
for event, elem in islice(parser.read_events(), max_events)],
expected)
def assert_event_tags(self, parser, expected):
events = parser.read_events()
def assert_event_tuples(self, parser, expected, max_events=None):
    """Assert that the raw events read from *parser* equal *expected*.

    At most *max_events* events are consumed from parser.read_events();
    None means the iterator is read until it is exhausted.
    """
    pending = parser.read_events()
    observed = [*islice(pending, max_events)]
    self.assertEqual(observed, expected)
def assert_event_tags(self, parser, expected, max_events=None):
    """Assert the (action, tag) pairs of the events read from *parser*.

    Each event is unpacked as (action, elem) and reduced to
    (action, elem.tag) before comparison with *expected*.  At most
    *max_events* events are consumed; None reads until exhaustion.
    """
    observed = []
    for action, elem in islice(parser.read_events(), max_events):
        observed.append((action, elem.tag))
    self.assertEqual(observed, expected)
......@@ -1276,6 +1309,56 @@ class XMLPullParserTest(unittest.TestCase):
self.assertEqual(list(parser.read_events()), [('end-ns', None)])
self.assertIsNone(parser.close())
def test_ns_events_start(self):
    """Pull-parse with 'start-ns', 'start' and 'end' events requested.

    'end-ns' is not requested, so only the element end event is
    expected after the closing tag.
    """
    wanted = ('start-ns', 'start', 'end')
    parser = ET.XMLPullParser(events=wanted)

    # The opening tag declares the default namespace and prefix 'p'.
    self._feed(parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
    declared = [
        ('start-ns', ('', 'abc')),
        ('start-ns', ('p', 'xyz')),
    ]
    self.assert_event_tuples(parser, declared, max_events=2)
    # The element's own start event is read after the two declarations.
    root_start = [('start', '{abc}tag')]
    self.assert_event_tags(parser, root_start, max_events=1)

    # An empty child yields a start/end pair in the default namespace.
    self._feed(parser, "<child />\n")
    child_events = [
        ('start', '{abc}child'),
        ('end', '{abc}child'),
    ]
    self.assert_event_tags(parser, child_events)

    # Close the document, then drain whatever is still queued.
    self._feed(parser, "</tag>\n")
    parser.close()
    root_end = [('end', '{abc}tag')]
    self.assert_event_tags(parser, root_end)
def test_ns_events_start_end(self):
    """Pull-parse with 'start-ns', 'start', 'end' and 'end-ns' requested.

    Same document as the start-only variant, but the closing tag must
    additionally produce one 'end-ns' event (payload None) per
    namespace declared on the root element.
    """
    wanted = ('start-ns', 'start', 'end', 'end-ns')
    parser = ET.XMLPullParser(events=wanted)

    # The opening tag declares the default namespace and prefix 'p'.
    self._feed(parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
    declared = [
        ('start-ns', ('', 'abc')),
        ('start-ns', ('p', 'xyz')),
    ]
    self.assert_event_tuples(parser, declared, max_events=2)
    root_start = [('start', '{abc}tag')]
    self.assert_event_tags(parser, root_start, max_events=1)

    # An empty child yields a start/end pair in the default namespace.
    self._feed(parser, "<child />\n")
    child_events = [
        ('start', '{abc}child'),
        ('end', '{abc}child'),
    ]
    self.assert_event_tags(parser, child_events)

    self._feed(parser, "</tag>\n")
    parser.close()
    # Read only the element end first: the 'end-ns' payloads that follow
    # are None and have no .tag, so they need the tuple-based assertion.
    root_end = [('end', '{abc}tag')]
    self.assert_event_tags(parser, root_end, max_events=1)
    undeclared = [
        ('end-ns', None),
        ('end-ns', None),
    ]
    self.assert_event_tuples(parser, undeclared)
def test_events(self):
parser = ET.XMLPullParser(events=())
self._feed(parser, "<root/>\n")
......
......@@ -1518,6 +1518,10 @@ class XMLParser:
parser.StartElementHandler = self._start
if hasattr(target, 'end'):
parser.EndElementHandler = self._end
if hasattr(target, 'start_ns'):
parser.StartNamespaceDeclHandler = self._start_ns
if hasattr(target, 'end_ns'):
parser.EndNamespaceDeclHandler = self._end_ns
if hasattr(target, 'data'):
parser.CharacterDataHandler = target.data
# miscellaneous callbacks
......@@ -1559,12 +1563,24 @@ class XMLParser:
append((event, end(tag)))
parser.EndElementHandler = handler
elif event_name == "start-ns":
def handler(prefix, uri, event=event_name, append=append):
append((event, (prefix or "", uri or "")))
# TreeBuilder does not implement .start_ns()
if hasattr(self.target, "start_ns"):
def handler(prefix, uri, event=event_name, append=append,
start_ns=self._start_ns):
append((event, start_ns(prefix, uri)))
else:
def handler(prefix, uri, event=event_name, append=append):
append((event, (prefix or '', uri or '')))
parser.StartNamespaceDeclHandler = handler
elif event_name == "end-ns":
def handler(prefix, event=event_name, append=append):
append((event, None))
# TreeBuilder does not implement .end_ns()
if hasattr(self.target, "end_ns"):
def handler(prefix, event=event_name, append=append,
end_ns=self._end_ns):
append((event, end_ns(prefix)))
else:
def handler(prefix, event=event_name, append=append):
append((event, None))
parser.EndNamespaceDeclHandler = handler
elif event_name == 'comment':
def handler(text, event=event_name, append=append, self=self):
......@@ -1595,6 +1611,12 @@ class XMLParser:
self._names[key] = name
return name
def _start_ns(self, prefix, uri):
return self.target.start_ns(prefix or '', uri or '')
def _end_ns(self, prefix):
return self.target.end_ns(prefix or '')
def _start(self, tag, attr_list):
# Handler for expat's StartElementHandler. Since ordered_attributes
# is set, the attributes are reported as a list of alternating
......
The XMLParser() in xml.etree.ElementTree provides namespace prefix context to the
parser target if it defines the callback methods "start_ns()" and/or "end_ns()".
Patch by Stefan Behnel.
......@@ -2911,6 +2911,39 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
return NULL;
}
/* Record a "start-ns" event for the given namespace declaration.
 *
 * When event collection is active (events_append is set) and a
 * start-ns event object is configured, a (prefix, uri) 2-tuple is
 * appended to the event queue via treebuilder_append_event().
 * Otherwise nothing is recorded.
 *
 * Returns None on success, or NULL if building the tuple or appending
 * the event failed.
 */
LOCAL(PyObject*)
treebuilder_handle_start_ns(TreeBuilderObject* self, PyObject* prefix, PyObject* uri)
{
    PyObject* pair;
    int status;

    /* Nothing to report unless start-ns events were requested. */
    if (!self->events_append || !self->start_ns_event_obj) {
        Py_RETURN_NONE;
    }

    pair = PyTuple_Pack(2, prefix, uri);
    if (pair == NULL) {
        return NULL;
    }

    /* Our reference to the tuple is dropped whether or not the append
       succeeded. */
    status = treebuilder_append_event(self, self->start_ns_event_obj, pair);
    Py_DECREF(pair);
    if (status < 0) {
        return NULL;
    }
    Py_RETURN_NONE;
}
/* Record an "end-ns" event.
 *
 * When event collection is active (events_append is set) and an end-ns
 * event object is configured, `prefix` is appended to the event queue
 * as the event payload via treebuilder_append_event().  Otherwise
 * nothing is recorded.
 *
 * Returns None on success, or NULL if appending the event failed.
 */
LOCAL(PyObject*)
treebuilder_handle_end_ns(TreeBuilderObject* self, PyObject* prefix)
{
    int recording = self->events_append && self->end_ns_event_obj;

    /* Short-circuit keeps the append from running when not recording. */
    if (recording &&
        treebuilder_append_event(self, self->end_ns_event_obj, prefix) < 0) {
        return NULL;
    }
    Py_RETURN_NONE;
}
/* -------------------------------------------------------------------- */
/* methods (in alphabetical order) */
......@@ -3046,6 +3079,8 @@ typedef struct {
PyObject *names;
PyObject *handle_start_ns;
PyObject *handle_end_ns;
PyObject *handle_start;
PyObject *handle_data;
PyObject *handle_end;
......@@ -3357,42 +3392,89 @@ expat_end_handler(XMLParserObject* self, const XML_Char* tag_in)
}
static void
expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix,
const XML_Char *uri)
expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix_in,
const XML_Char *uri_in)
{
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
PyObject *parcel;
PyObject* res = NULL;
PyObject* uri;
PyObject* prefix;
PyObject* stack[2];
if (PyErr_Occurred())
return;
if (!target->events_append || !target->start_ns_event_obj)
return;
if (!uri_in)
uri_in = "";
if (!prefix_in)
prefix_in = "";
if (!uri)
uri = "";
if (!prefix)
prefix = "";
if (TreeBuilder_CheckExact(self->target)) {
/* shortcut - TreeBuilder does not actually implement .start_ns() */
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
parcel = Py_BuildValue("ss", prefix, uri);
if (!parcel)
return;
treebuilder_append_event(target, target->start_ns_event_obj, parcel);
Py_DECREF(parcel);
if (target->events_append && target->start_ns_event_obj) {
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
if (!prefix)
return;
uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
if (!uri) {
Py_DECREF(prefix);
return;
}
res = treebuilder_handle_start_ns(target, prefix, uri);
Py_DECREF(uri);
Py_DECREF(prefix);
}
} else if (self->handle_start_ns) {
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
if (!prefix)
return;
uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
if (!uri) {
Py_DECREF(prefix);
return;
}
stack[0] = prefix;
stack[1] = uri;
res = _PyObject_FastCall(self->handle_start_ns, stack, 2);
Py_DECREF(uri);
Py_DECREF(prefix);
}
Py_XDECREF(res);
}
static void
expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in)
{
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
PyObject *res = NULL;
PyObject* prefix;
if (PyErr_Occurred())
return;
if (!target->events_append)
return;
if (!prefix_in)
prefix_in = "";
if (TreeBuilder_CheckExact(self->target)) {
/* shortcut - TreeBuilder does not actually implement .end_ns() */
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
if (target->events_append && target->end_ns_event_obj) {
res = treebuilder_handle_end_ns(target, Py_None);
}
} else if (self->handle_end_ns) {
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
if (!prefix)
return;
res = _PyObject_FastCall(self->handle_end_ns, &prefix, 1);
Py_DECREF(prefix);
}
treebuilder_append_event(target, target->end_ns_event_obj, Py_None);
Py_XDECREF(res);
}
static void
......@@ -3546,6 +3628,7 @@ xmlparser_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
if (self) {
self->parser = NULL;
self->target = self->entity = self->names = NULL;
self->handle_start_ns = self->handle_end_ns = NULL;
self->handle_start = self->handle_data = self->handle_end = NULL;
self->handle_comment = self->handle_pi = self->handle_close = NULL;
self->handle_doctype = NULL;
......@@ -3614,6 +3697,14 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
}
self->target = target;
self->handle_start_ns = PyObject_GetAttrString(target, "start_ns");
if (ignore_attribute_error(self->handle_start_ns)) {
return -1;
}
self->handle_end_ns = PyObject_GetAttrString(target, "end_ns");
if (ignore_attribute_error(self->handle_end_ns)) {
return -1;
}
self->handle_start = PyObject_GetAttrString(target, "start");
if (ignore_attribute_error(self->handle_start)) {
return -1;
......@@ -3645,6 +3736,12 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
/* configure parser */
EXPAT(SetUserData)(self->parser, self);
if (self->handle_start_ns || self->handle_end_ns)
EXPAT(SetNamespaceDeclHandler)(
self->parser,
(XML_StartNamespaceDeclHandler) expat_start_ns_handler,
(XML_EndNamespaceDeclHandler) expat_end_ns_handler
);
EXPAT(SetElementHandler)(
self->parser,
(XML_StartElementHandler) expat_start_handler,
......@@ -3689,6 +3786,9 @@ xmlparser_gc_traverse(XMLParserObject *self, visitproc visit, void *arg)
Py_VISIT(self->handle_end);
Py_VISIT(self->handle_data);
Py_VISIT(self->handle_start);
Py_VISIT(self->handle_start_ns);
Py_VISIT(self->handle_end_ns);
Py_VISIT(self->handle_doctype);
Py_VISIT(self->target);
Py_VISIT(self->entity);
......@@ -3712,6 +3812,8 @@ xmlparser_gc_clear(XMLParserObject *self)
Py_CLEAR(self->handle_end);
Py_CLEAR(self->handle_data);
Py_CLEAR(self->handle_start);
Py_CLEAR(self->handle_start_ns);
Py_CLEAR(self->handle_end_ns);
Py_CLEAR(self->handle_doctype);
Py_CLEAR(self->target);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment