Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
63194a77
Commit
63194a77
authored
Jul 11, 2013
by
R David Murray
Browse files
Options
Browse Files
Download
Plain Diff
Merge: #18044: Fix parsing of encoded words of the form =?utf8?q?=XX...?=
parents
f9e6672a
65171b28
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
62 additions
and
40 deletions
+62
-40
Lib/email/_header_value_parser.py
Lib/email/_header_value_parser.py
+7
-36
Lib/test/test_email/test__encoded_words.py
Lib/test/test_email/test__encoded_words.py
+5
-0
Lib/test/test_email/test__header_value_parser.py
Lib/test/test_email/test__header_value_parser.py
+9
-0
Lib/test/test_email/test_headerregistry.py
Lib/test/test_email/test_headerregistry.py
+37
-4
Misc/NEWS
Misc/NEWS
+4
-0
No files found.
Lib/email/_header_value_parser.py
View file @
63194a77
...
...
@@ -69,6 +69,7 @@ XXX: provide complete list of token types.
import
re
import
urllib
# For urllib.parse.unquote
from
string
import
hexdigits
from
collections
import
namedtuple
,
OrderedDict
from
email
import
_encoded_words
as
_ew
from
email
import
errors
...
...
@@ -391,10 +392,6 @@ class UnstructuredTokenList(TokenList):
token_type
=
'unstructured'
def
_fold
(
self
,
folded
):
if
any
(
x
.
token_type
==
'encoded-word'
for
x
in
self
):
return
self
.
_fold_encoded
(
folded
)
# Here we can have either a pure ASCII string that may or may not
# have surrogateescape encoded bytes, or a unicode string.
last_ew
=
None
for
part
in
self
.
parts
:
tstr
=
str
(
part
)
...
...
@@ -1386,35 +1383,6 @@ def _get_ptext_to_endchars(value, endchars):
pos = pos + 1
return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
def _decode_ew_run(value):
""" Decode a run of RFC2047 encoded words.
_decode_ew_run(value) -> (text, value, defects)
Scans the supplied value for a run of tokens that look like they are RFC
2047 encoded words, decodes those words into text according to RFC 2047
rules (whitespace between encoded words is discarded), and returns the text
and the remaining value (including any leading whitespace on the remaining
value), as well as a list of any defects encountered while decoding. The
input value may not have any leading whitespace.
"""
res = []
defects = []
last_ws = ''
while value:
try:
tok, ws, value = _wsp_splitter(value, 1)
except ValueError:
tok, ws, value = value, '', ''
if not (tok.startswith('
=
?
') and tok.endswith('
?
=
')):
return ''.join(res), last_ws + tok + ws + value, defects
text, charset, lang, new_defects = _ew.decode(tok)
res.append(text)
defects.extend(new_defects)
last_ws = ws
return ''.join(res), last_ws, defects
def get_fws(value):
"""FWS = 1*WSP
...
...
@@ -1440,7 +1408,8 @@ def get_encoded_word(value):
raise errors.HeaderParseError(
"expected encoded word but found {}".format(value))
remstr = ''.join(remainder)
if remstr[:2].isdigit():
if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
# The ? after the CTE was followed by an encoded word escape (=XX).
rest, *remainder = remstr.split('?=', 1)
tok = tok + '?=' + rest
if len(tok.split()) > 1:
...
...
@@ -1488,8 +1457,8 @@ def get_unstructured(value):
"""
# XXX: but what about bare CR and LF? They might signal the start or
# end of an encoded word. YAGNI for now, since ou
t
current parsers
# will never send us strings with bar
d
CR or LF.
# end of an encoded word. YAGNI for now, since ou
r
current parsers
# will never send us strings with bar
e
CR or LF.
unstructured = UnstructuredTokenList()
while value:
...
...
@@ -1501,6 +1470,8 @@ def get_unstructured(value):
try:
token, value = get_encoded_word(value)
except errors.HeaderParseError:
# XXX: Need to figure out how to register defects when
# appropriate here.
pass
else:
have_ws = True
...
...
Lib/test/test_email/test__encoded_words.py
View file @
63194a77
...
...
@@ -122,6 +122,11 @@ class TestDecode(TestEmailBase):
# XXX Should this be a new Defect instead?
defects
=
[
errors
.
CharsetError
])
def
test_q_nonascii
(
self
):
self
.
_test
(
'=?utf-8?q?=C3=89ric?='
,
'Éric'
,
charset
=
'utf-8'
)
class
TestEncodeQ
(
TestEmailBase
):
...
...
Lib/test/test_email/test__header_value_parser.py
View file @
63194a77
...
...
@@ -170,6 +170,15 @@ class TestParser(TestParserMixin, TestEmailBase):
[],
'')
def test_get_encoded_word_quopri_utf_escape_follows_cte(self):
# Issue 18044
self._test_get_x(parser.get_encoded_word,
'
=
?
utf
-
8
?
q
?
=
C3
=
89
ric
?
=
',
'
É
ric
',
'
É
ric
',
[],
'')
# get_unstructured
def _get_unst(self, value):
...
...
Lib/test/test_email/test_headerregistry.py
View file @
63194a77
...
...
@@ -123,12 +123,45 @@ class TestBaseHeaderFeatures(TestHeaderBase):
# self.assertEqual(h, value)
# self.assertDefectsEqual(h.defects, [errors.ObsoleteHeaderDefect])
def
test_RFC2047_value_decoded
(
self
):
value
=
'=?utf-8?q?this_is_a_test?='
h
=
self
.
make_header
(
'subject'
,
value
)
self
.
assertEqual
(
h
,
'this is a test'
)
@
parameterize
class
TestUnstructuredHeader
(
TestHeaderBase
):
def
string_as_value
(
self
,
source
,
decoded
,
*
args
):
l
=
len
(
args
)
defects
=
args
[
0
]
if
l
>
0
else
[]
header
=
'Subject:'
+
(
' '
if
source
else
''
)
folded
=
header
+
(
args
[
1
]
if
l
>
1
else
source
)
+
'
\
n
'
h
=
self
.
make_header
(
'Subject'
,
source
)
self
.
assertEqual
(
h
,
decoded
)
self
.
assertDefectsEqual
(
h
.
defects
,
defects
)
self
.
assertEqual
(
h
.
fold
(
policy
=
policy
.
default
),
folded
)
string_params
=
{
'rfc2047_simple_quopri'
:
(
'=?utf-8?q?this_is_a_test?='
,
'this is a test'
,
[],
'this is a test'
),
'rfc2047_gb2312_base64'
:
(
'=?gb2312?b?1eLKx9bQzsSy4srUo6E=?='
,
'
\
u8fd9
\
u662f
\
u4e2d
\
u6587
\
u6d4b
\
u8bd5
\
uff01
'
,
[],
'=?utf-8?b?6L+Z5piv5Lit5paH5rWL6K+V77yB?='
),
'rfc2047_simple_nonascii_quopri'
:
(
'=?utf-8?q?=C3=89ric?='
,
'Éric'
),
}
@
parameterize
class
TestDateHeader
(
TestHeaderBase
):
datestring
=
'Sun, 23 Sep 2001 20:10:55 -0700'
...
...
Misc/NEWS
View file @
63194a77
...
...
@@ -151,6 +151,10 @@ Core and Builtins
Library
-------
-
Issue
#
18044
:
The
new
email
header
parser
was
mis
-
parsing
encoded
words
where
an
encoded
character
immediately
followed
the
'?'
that
follows
the
CTE
character
,
resulting
in
a
decoding
failure
.
They
are
now
decoded
correctly
.
-
Issue
#
18101
:
Tcl
.
split
()
now
processes
strings
nested
in
a
tuple
as
it
does
with
byte
strings
.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment