Commit 46495182 authored by Ezio Melotti

#15156: HTMLParser now uses the new "html.entities.html5" dictionary.

parent a504a7a7
...@@ -11,10 +11,6 @@ ...@@ -11,10 +11,6 @@
This module defines four dictionaries, :data:`html5`, This module defines four dictionaries, :data:`html5`,
:data:`name2codepoint`, :data:`codepoint2name`, and :data:`entitydefs`. :data:`name2codepoint`, :data:`codepoint2name`, and :data:`entitydefs`.
:data:`entitydefs` is used to provide the :attr:`entitydefs`
attribute of the :class:`html.parser.HTMLParser` class. The definition provided
here contains all the entities defined by XHTML 1.0 that can be handled using
simple textual substitution in the Latin-1 character set (ISO-8859-1).
.. data:: html5 .. data:: html5
......
...@@ -500,7 +500,6 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -500,7 +500,6 @@ class HTMLParser(_markupbase.ParserBase):
self.error("unknown declaration: %r" % (data,)) self.error("unknown declaration: %r" % (data,))
# Internal -- helper to remove special character quoting # Internal -- helper to remove special character quoting
entitydefs = None
def unescape(self, s): def unescape(self, s):
if '&' not in s: if '&' not in s:
return s return s
...@@ -510,24 +509,23 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -510,24 +509,23 @@ class HTMLParser(_markupbase.ParserBase):
if s[0] == "#": if s[0] == "#":
s = s[1:] s = s[1:]
if s[0] in ['x','X']: if s[0] in ['x','X']:
c = int(s[1:], 16) c = int(s[1:].rstrip(';'), 16)
else: else:
c = int(s) c = int(s.rstrip(';'))
return chr(c) return chr(c)
except ValueError: except ValueError:
return '&#'+ s +';' return '&#' + s
else: else:
# Cannot use name2codepoint directly, because HTMLParser from html.entities import html5
# supports apos, which is not part of HTML 4 if s in html5:
import html.entities return html5[s]
if HTMLParser.entitydefs is None: elif s.endswith(';'):
entitydefs = HTMLParser.entitydefs = {'apos':"'"} return '&' + s
for k, v in html.entities.name2codepoint.items(): for x in range(2, len(s)):
entitydefs[k] = chr(v) if s[:x] in html5:
try: return html5[s[:x]] + s[x:]
return self.entitydefs[s] else:
except KeyError: return '&' + s
return '&'+s+';'
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
replaceEntities, s, flags=re.ASCII) replaceEntities, s, flags=re.ASCII)
...@@ -456,7 +456,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): ...@@ -456,7 +456,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", ' self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
'method="post">', [ 'method="post">', [
('starttag', 'form', ('starttag', 'form',
[('action', '/xxx.php?a=1&b=2&amp'), [('action', '/xxx.php?a=1&b=2&'),
(',', None), ('method', 'post')])]) (',', None), ('method', 'post')])])
def test_weird_chars_in_unquoted_attribute_values(self): def test_weird_chars_in_unquoted_attribute_values(self):
...@@ -541,6 +541,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): ...@@ -541,6 +541,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
self.assertEqual(p.unescape('&#0038;'),'&') self.assertEqual(p.unescape('&#0038;'),'&')
# see #12888 # see #12888
self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
# see #15156
self.assertEqual(p.unescape('&Eacuteric&Eacute;ric'
'&alphacentauri&alpha;centauri'),
'ÉricÉric&alphacentauriαcentauri')
self.assertEqual(p.unescape('&co;'), '&co;')
def test_broken_comments(self): def test_broken_comments(self):
html = ('<! not really a comment >' html = ('<! not really a comment >'
......
...@@ -76,6 +76,8 @@ Library ...@@ -76,6 +76,8 @@ Library
It is used automatically on platforms supporting the necessary os.openat() It is used automatically on platforms supporting the necessary os.openat()
and os.unlinkat() functions. Main code by Martin von Löwis. and os.unlinkat() functions. Main code by Martin von Löwis.
- Issue #15156: HTMLParser now uses the new "html.entities.html5" dictionary.
- Issue #11113: add a new "html5" dictionary containing the named character - Issue #11113: add a new "html5" dictionary containing the named character
references defined by the HTML5 standard and the equivalent Unicode references defined by the HTML5 standard and the equivalent Unicode
character(s) to the html.entities module. character(s) to the html.entities module.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment