Commit e4b0603f authored by Nicolas Delaby

Do not trust specified encoding

This patch always attempts the conversion using the specified encoding, even when it is declared as utf-8, in order to check whether that codec is actually valid for the content.
parent f6caaf1b
......@@ -333,17 +333,17 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
message = 'Conversion to base format succeeds'
if re_match is not None:
charset = re_match.group('charset')
if charset.lower() != 'utf-8':
try:
# Use encoding in html document
text_content = text_content.decode(charset).encode('utf-8')
except (UnicodeDecodeError, LookupError):
# Encoding read from document is wrong
text_content, message = guessCharsetAndConvert(self,
text_content, content_type)
else:
message = 'Conversion to base format with charset %r succeeds'\
% charset
try:
# Use encoding in html document
text_content = text_content.decode(charset).encode('utf-8')
except (UnicodeDecodeError, LookupError):
# Encoding read from document is wrong
text_content, message = guessCharsetAndConvert(self,
text_content, content_type)
else:
message = 'Conversion to base format with charset %r succeeds'\
% charset
if charset.lower() != 'utf-8':
charset = 'utf-8' # Override charset if convertion succeeds
# change charset value in html_document as well
def subCharset(matchobj):
......
......@@ -1704,6 +1704,11 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph
self.assertTrue('AZERTYY' not in safe_html)
self.assertTrue('#FFAA44' in safe_html)
filename = 'broken_html.html'
file_object = makeFileUpload(filename)
web_page.edit(file=file_object)
converted = web_page.convert('html')[1]
def test_safeHTML_impossible_conversion(self):
"""Some html are not parsable.
"""
......
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:p="urn:schemas-microsoft-com:office:powerpoint" xmlns:a="urn:schemas-microsoft-com:office:access" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" xmlns:s="uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882" xmlns:rs="urn:schemas-microsoft-com:rowset" xmlns:z="#RowsetSchema" xmlns:b="urn:schemas-microsoft-com:office:publisher" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:c="urn:schemas-microsoft-com:office:component:spreadsheet" xmlns:odc="urn:schemas-microsoft-com:office:odc" xmlns:oa="urn:schemas-microsoft-com:office:activation" xmlns:html="http://www.w3.org/TR/REC-html40" xmlns:q="http://schemas.xmlsoap.org/soap/envelope/" xmlns:rtc="http://microsoft.com/officenet/conferencing" xmlns:D="DAV:" xmlns:Repl="http://schemas.microsoft.com/repl/" xmlns:mt="http://schemas.microsoft.com/sharepoint/soap/meetings/" xmlns:x2="http://schemas.microsoft.com/office/excel/2003/xml" xmlns:ppda="http://www.passport.com/NameSpace.xsd" xmlns:ois="http://schemas.microsoft.com/sharepoint/soap/ois/" xmlns:dir="http://schemas.microsoft.com/sharepoint/soap/directory/" xmlns:ds="http://www.w3.org/2000/09/xmldsig#" xmlns:dsp="http://schemas.microsoft.com/sharepoint/dsp" xmlns:udc="http://schemas.microsoft.com/data/udc" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:sub="http://schemas.microsoft.com/sharepoint/soap/2002/1/alerts/" xmlns:ec="http://www.w3.org/2001/04/xmlenc#" xmlns:sp="http://schemas.microsoft.com/sharepoint/" xmlns:sps="http://schemas.microsoft.com/sharepoint/soap/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:udcs="http://schemas.microsoft.com/data/udc/soap" xmlns:udcxf="http://schemas.microsoft.com/data/udc/xmlfile" xmlns:udcp2p="http://schemas.microsoft.com/data/udc/parttopart" xmlns:wf="http://schemas.microsoft.com/sharepoint/soap/workflow/" 
xmlns:dsss="http://schemas.microsoft.com/office/2006/digsig-setup" xmlns:dssi="http://schemas.microsoft.com/office/2006/digsig" xmlns:mdssi="http://schemas.openxmlformats.org/package/2006/digital-signature" xmlns:mver="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns:mrels="http://schemas.openxmlformats.org/package/2006/relationships" xmlns:spwp="http://microsoft.com/sharepoint/webpartpages" xmlns:ex12t="http://schemas.microsoft.com/exchange/services/2006/types" xmlns:ex12m="http://schemas.microsoft.com/exchange/services/2006/messages" xmlns:pptsl="http://schemas.microsoft.com/sharepoint/soap/SlideLibrary/" xmlns:spsl="http://microsoft.com/webservices/SharePointPortalServer/PublishedLinksService" xmlns:Z="urn:schemas-microsoft-com:" xmlns:st="&#1;" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 12 (filtered medium)">
<!--[if !mso]><style>v\:* {behavior:url(#default#VML);}
o\:* {behavior:url(#default#VML);}
w\:* {behavior:url(#default#VML);}
.shape {behavior:url(#default#VML);}
</style><![endif]-->
<title>One-Time</title>
<style><!--
/* Font Definitions */
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:Tahoma;
panose-1:2 11 6 4 3 5 4 4 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0cm;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
{mso-style-priority:99;
color:purple;
text-decoration:underline;}
p
{mso-style-priority:99;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
p.MsoAcetate, li.MsoAcetate, div.MsoAcetate
{mso-style-priority:99;
mso-style-link:"Balloon Text Char";
margin:0cm;
margin-bottom:.0001pt;
font-size:8.0pt;
font-family:"Tahoma","sans-serif";}
p.style1, li.style1, div.style1
{mso-style-name:style1;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
p.style2, li.style2, div.style2
{mso-style-name:style2;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
p.style3, li.style3, div.style3
{mso-style-name:style3;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
p.style4, li.style4, div.style4
{mso-style-name:style4;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:18.0pt;
font-family:"Times New Roman","serif";}
span.EmailStyle22
{mso-style-type:personal-reply;
font-family:"Calibri","sans-serif";
color:#1F497D;}
span.BalloonTextChar
{mso-style-name:"Balloon Text Char";
mso-style-priority:99;
mso-style-link:"Balloon Text";
font-family:"Tahoma","sans-serif";}
.MsoChpDefault
{mso-style-type:export-only;
font-size:10.0pt;}
@page WordSection1
{size:612.0pt 792.0pt;
margin:72.0pt 72.0pt 72.0pt 72.0pt;}
div.WordSection1
{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="2050" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="EN-IE" link="blue" vlink="purple">
<div class="WordSection1">
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">n
<sup>th</sup> .<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">p;
<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">nt.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">on.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Regards,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<div>
<p class="MsoNormal" style="margin-bottom:10.0pt;line-height:115%"><span style="font-size:10.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D"><br>
</span><span style="font-size:8.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D">Ator<br>
<br>
</span><span style="font-size:10.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D"><br>
</span><span style="font-size:8.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:red">_______________________________________________</span><span style="font-size:8.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D"><br>
Cse<br>
oad<br>
<br>
e<br>
dqwodj;j;jk;lj
<img width="288" height="41" id="Picture_x0020_1" src="cid:image001.jpg@01CC7129.3570BB40"><br>
<br>
<o:p></o:p></span></p>
</div>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<div>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p class="MsoNormal"><b><span lang="EN-US" style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span lang="EN-US" style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> ni[]
<br>
<b>Sent:</b> Th:00<br>
<b>To:</b> ne<br>
<b>Subject:</b> O<o:p></o:p></span></p>
</div>
</div>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<div>
<p class="MsoNormal">One-Time<o:p></o:p></p>
</div>
<p>Thank you<o:p></o:p></p>
<div id="NewUser">
<table class="MsoNormalTable" border="0" cellspacing="0" cellpadding="0" width="600" style="width:450.0pt">
<tbody>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>r:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">04<o:p></o:p></p>
</td>
</tr>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>r:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">1<o:p></o:p></p>
</td>
</tr>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">7<o:p></o:p></p>
</td>
</tr>
<tr>
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm">
<p class="MsoNormal"><b>PaTts:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm">
<p class="MsoNormal">C0<o:p></o:p></p>
</td>
</tr>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>td:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">€3.<o:p></o:p></p>
</td>
</tr>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>Pt:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">081<o:p></o:p></p>
</td>
</tr>
<tr style="height:10.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:10.5pt"></td>
<td style="padding:0cm 0cm 0cm 0cm;height:10.5pt"></td>
</tr>
</tbody>
</table>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<div>
<p class="MsoNormal"><b>Th0.<o:p></o:p></b></p>
</div>
</div>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<div>
<p class="MsoNormal">Sho
<o:p></o:p></p>
</div>
<div>
<p class="MsoNormal"><a href="here">here</a><o:p></o:p></p>
</div>
<p class="MsoNormal" style="margin-bottom:12.0pt"><o:p>&nbsp;</o:p></p>
</div>
<FONT size=2 face=Arial>
6,000
w 10,9. Wrs cl
yofawne rs l stda ru
</FONT><A href="htsoe"><FONT size=2
face=Arial>wsr.</FONT></A>
<P><FONT size=2 face=Arial></FONT>&nbsp;</P>
<P><FONT size=2 face=Arial>Bu a <STRONG><A
href="htt/w.fces"><FONT
color=#000080>eoo</FONT></A></STRONG></FONT> </P>
<HR>
<P>Thsssent</P>
<HR>
<FONT face=Arial color=#000080 size=2>WsrMtr
24<BR></FONT>
</body>
</html>
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment