Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
4a9ee267
Commit
4a9ee267
authored
Nov 19, 2013
by
Ezio Melotti
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
#2927: Added the unescape() function to the html module.
parent
5160da1a
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
215 additions
and
49 deletions
+215
-49
Doc/library/html.entities.rst
Doc/library/html.entities.rst
+1
-0
Doc/library/html.rst
Doc/library/html.rst
+11
-0
Lib/html/__init__.py
Lib/html/__init__.py
+113
-1
Lib/html/parser.py
Lib/html/parser.py
+5
-33
Lib/test/test_html.py
Lib/test/test_html.py
+83
-3
Lib/test/test_htmlparser.py
Lib/test/test_htmlparser.py
+0
-12
Misc/NEWS
Misc/NEWS
+2
-0
No files found.
Doc/library/html.entities.rst
View file @
4a9ee267
...
...
@@ -20,6 +20,7 @@ This module defines four dictionaries, :data:`html5`,
Note that the trailing semicolon is included in the name (e.g. ``'gt;'``),
however some of the names are accepted by the standard even without the
semicolon: in this case the name is present with and without the ``';'``.
See also :func:`html.unescape`.
.. versionadded:: 3.3
...
...
Doc/library/html.rst
View file @
4a9ee267
...
...
@@ -20,6 +20,17 @@ This module defines utilities to manipulate HTML.
.. versionadded:: 3.2
.. function:: unescape(s)
   Convert all named and numeric character references (e.g. ``&gt;``,
   ``&#62;``, ``&#x3e;``) in the string *s* to the corresponding unicode
   characters.  This function uses the rules defined by the HTML 5 standard
   for both valid and invalid character references, and the :data:`list of
   HTML 5 named character references <html.entities.html5>`.
.. versionadded:: 3.4
--------------
Submodules in the ``html`` package are:
...
...
Lib/html/__init__.py
View file @
4a9ee267
...
...
@@ -2,7 +2,12 @@
General functions for HTML manipulation.
"""
# NB: this is a candidate for a bytes/string polymorphic interface
import
re
as
_re
from
html.entities
import
html5
as
_html5
__all__
=
[
'escape'
,
'unescape'
]
def
escape
(
s
,
quote
=
True
):
"""
...
...
@@ -18,3 +23,110 @@ def escape(s, quote=True):
s
=
s
.
replace
(
'"'
,
"""
)
s
=
s
.
replace
(
'
\
'
'
,
"'"
)
return
s
# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
# Numeric character references that are not emitted as chr(num): each key is
# the referenced number and each value is the text substituted for it.
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}

# Numeric character references to these code points are dropped, i.e.
# replaced by the empty string (unless _invalid_charrefs maps them first).
_invalid_codepoints = {
    # 0x0001 to 0x0008
    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
    # 0x000E to 0x001F
    0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    # 0x007F to 0x009F
    0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    # 0xFDD0 to 0xFDEF
    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
    0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
    0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
    0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
    # others
    0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
    0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
    0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
    0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
    0x10fffe, 0x10ffff
}


def _replace_charref(s):
    """Return the replacement text for one match of ``_charref``.

    *s* is a match object whose group 1 is the reference without the
    leading ``'&'`` (e.g. ``'#62;'``, ``'#x3e'``, ``'gt;'``).

    Numeric references are resolved in this order: the ``_invalid_charrefs``
    table, surrogates and out-of-range values (U+FFFD), the
    ``_invalid_codepoints`` set (empty string), and finally ``chr(num)``.
    Named references are looked up in ``html.entities.html5``; when the whole
    name is unknown, the longest known prefix is replaced and the remaining
    characters are kept.  Unknown names are returned unchanged.
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            # 0x10FFFF has 6 hexadecimal digits
            digits, base, max_digits = s[2:], 16, 6
        else:
            # 0x10FFFF == 1114111 has 7 decimal digits
            digits, base, max_digits = s[1:], 10, 7
        digits = digits.rstrip(';').lstrip('0')
        # A value with more significant digits than 0x10FFFF is necessarily
        # out of range.  Answering here avoids handing an arbitrarily long
        # string to int(), which is slow for decimal input and raises
        # ValueError on interpreters that enforce the integer string
        # conversion length limit.
        if len(digits) > max_digits:
            return '\uFFFD'
        # All digits were zeros if nothing is left after stripping.
        num = int(digits or '0', base)
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)
    # named charref
    if s in _html5:
        return _html5[s]
    # find the longest matching name (as defined by the standard)
    for x in range(len(s) - 1, 1, -1):
        prefix = s[:x]
        if prefix in _html5:
            return _html5[prefix] + s[x:]
    # no known name: leave the text as it was
    return '&' + s


_charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')


def unescape(s):
    """
    Convert all named and numeric character references (e.g. &gt;, &#62;,
    &#x3e;) in the string s to the corresponding unicode characters.
    This function uses the rules defined by the HTML 5 standard
    for both valid and invalid character references, and the list of
    HTML 5 named character references defined in html.entities.html5.
    """
    # Fast path: without an ampersand there is nothing to replace.
    if '&' not in s:
        return s
    return _charref.sub(_replace_charref, s)
Lib/html/parser.py
View file @
4a9ee267
...
...
@@ -8,9 +8,12 @@
# and CDATA (character data -- only end tags are special).
import
_markupbase
import
re
import
warnings
import
_markupbase
from
html
import
unescape
__all__
=
[
'HTMLParser'
]
...
...
@@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase):
attrvalue
[:
1
]
==
'"'
==
attrvalue
[
-
1
:]:
attrvalue
=
attrvalue
[
1
:
-
1
]
if
attrvalue
:
attrvalue
=
self
.
unescape
(
attrvalue
)
attrvalue
=
unescape
(
attrvalue
)
attrs
.
append
((
attrname
.
lower
(),
attrvalue
))
k
=
m
.
end
()
...
...
@@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase):
def
unknown_decl
(
self
,
data
):
if
self
.
strict
:
self
.
error
(
"unknown declaration: %r"
%
(
data
,))
# Internal -- helper to remove special character quoting
def
unescape
(
self
,
s
):
if
'&'
not
in
s
:
return
s
def
replaceEntities
(
s
):
s
=
s
.
groups
()[
0
]
try
:
if
s
[
0
]
==
"#"
:
s
=
s
[
1
:]
if
s
[
0
]
in
[
'x'
,
'X'
]:
c
=
int
(
s
[
1
:].
rstrip
(
';'
),
16
)
else
:
c
=
int
(
s
.
rstrip
(
';'
))
return
chr
(
c
)
except
ValueError
:
return
'&#'
+
s
else
:
from
html.entities
import
html5
if
s
in
html5
:
return
html5
[
s
]
elif
s
.
endswith
(
';'
):
return
'&'
+
s
for
x
in
range
(
2
,
len
(
s
)):
if
s
[:
x
]
in
html5
:
return
html5
[
s
[:
x
]]
+
s
[
x
:]
else
:
return
'&'
+
s
return
re
.
sub
(
r"&(#?[xX]?(?:[0-9a-fA-F]+;|\
w{
1,32};?))"
,
replaceEntities
,
s
,
flags
=
re
.
ASCII
)
Lib/test/test_html.py
View file @
4a9ee267
...
...
@@ -16,9 +16,89 @@ class HtmlTests(unittest.TestCase):
html
.
escape
(
'
\
'
<script>"&foo;"</script>
\
'
'
,
False
),
'
\
'
<script>"&foo;"</script>
\
'
'
)
def test_unescape(self):
    """Check html.unescape() on valid, invalid and malformed references.

    The inputs are written with explicit character references (``&quot;``,
    ``&#34;``, ``&notin;`` ...) so that every check really exercises
    html.unescape() instead of comparing a string with itself.
    """
    numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
    errmsg = 'unescape(%r) should have returned %r'

    def check(text, expected):
        self.assertEqual(html.unescape(text), expected,
                         msg=errmsg % (text, expected))

    def check_num(num, expected):
        # try the number in decimal and hex, with and without the ';'
        for format in numeric_formats:
            text = format % num
            self.assertEqual(html.unescape(text), expected,
                             msg=errmsg % (text, expected))

    # check text with no character references
    check('no character references', 'no character references')
    # check & followed by invalid chars
    check('&\n&\t& &&', '&\n&\t& &&')
    # check & followed by numbers and letters
    check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')
    # check incomplete entities at the end of the string
    for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:
        check(x, x)
        check(x+';', x+';')
    # check several combinations of numeric character references,
    # possibly followed by different characters
    formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',
               '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
               '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
    for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
                         ['A', 'a', '"', '&', '\u2603', '\U00101234']):
        for s in formats:
            check(s % num, char)
            for end in [' ', 'X']:
                check((s+end) % num, char+end)
    # check invalid codepoints
    for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
        check_num(cp, '\uFFFD')
    # check more invalid codepoints
    for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
        check_num(cp, '')
    # check invalid numbers
    for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
        check_num(num, ch)
    # check small numbers
    check_num(0, '\uFFFD')
    check_num(9, '\t')
    # check a big number
    check_num(1000000000000000000, '\uFFFD')
    # check that multiple trailing semicolons are handled correctly
    for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
        check(e, '";')
    # check that semicolons in the middle don't create problems
    for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:
        check(e, '"quot;')
    # check triple adjacent charrefs
    for e in ['&quot', '&#34', '&#x22', '&#X22']:
        check(e*3, '"""')
        check((e+';')*3, '"""')
    # check that the case is respected
    for e in ['&amp', '&amp;', '&AMP', '&AMP;']:
        check(e, '&')
    for e in ['&Amp', '&Amp;']:
        check(e, e)
    # check that non-existent named entities are returned unchanged
    check('&svadilfari;', '&svadilfari;')
    # the following examples are in the html5 specs
    check('&notit', '¬it')
    check('&notit;', '¬it;')
    check('&notin', '¬in')
    check('&notin;', '∉')
    # a similar example with a long name
    check('&notReallyAnExistingNamedCharacterReference;',
          '¬ReallyAnExistingNamedCharacterReference;')
    # longest valid name
    check('&CounterClockwiseContourIntegral;', '∳')
    # check a charref that maps to two unicode chars
    check('&acE;', '\u223E\u0333')
    check('&acE', '&acE')
    # see #12888
    check('&#123; ' * 1050, '{ ' * 1050)
    # see #15156
    check('&Eacute;ric&Eacute;ric&alphacentauri&alpha;centauri',
          'ÉricÉric&alphacentauriαcentauri')
    check('&co;', '&co;')
def
test_main
():
run_unittest
(
HtmlTests
)
if
__name__
==
'__main__'
:
test_
main
()
unittest
.
main
()
Lib/test/test_htmlparser.py
View file @
4a9ee267
...
...
@@ -569,18 +569,6 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
for
html
,
expected
in
data
:
self
.
_run_check
(
html
,
expected
)
def
test_unescape_function
(
self
):
p
=
self
.
get_collector
()
self
.
assertEqual
(
p
.
unescape
(
'&#bad;'
),
'&#bad;'
)
self
.
assertEqual
(
p
.
unescape
(
'&'
),
'&'
)
# see #12888
self
.
assertEqual
(
p
.
unescape
(
'{ '
*
1050
),
'{ '
*
1050
)
# see #15156
self
.
assertEqual
(
p
.
unescape
(
'ÉricÉric'
'&alphacentauriαcentauri'
),
'ÉricÉric&alphacentauriαcentauri'
)
self
.
assertEqual
(
p
.
unescape
(
'&co;'
),
'&co;'
)
def
test_broken_comments
(
self
):
html
=
(
'<! not really a comment >'
'<! not a comment either -->'
...
...
Misc/NEWS
View file @
4a9ee267
...
...
@@ -59,6 +59,8 @@ Library
- Issue #19449: in csv's writerow, handle non-string keys when generating the
  error message that certain keys are not in the 'fieldnames' list.
- Issue #2927: Added the unescape() function to the html module.
- Issue #8402: Added the escape() function to the glob module.
- Issue #17618: Add Base85 and Ascii85 encoding/decoding to the base64 module.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment