LP/#324876: tightened regex for detecting the charset

from a meta http-equiv header

LP/#324876: tightened regex for detecting the charset
from a meta http-equiv header
09892b36 · Andreas Jung · a30cc136 · 09892b36 · 09892b36
Commit 09892b36 authored Feb 04, 2009 by Andreas Jung
Show whitespace changes
Inline Side-by-side

Showing with 12 additions and 1 deletion

doc/CHANGES.txt doc/CHANGES.txt +3 -0

lib/python/Products/PageTemplates/utils.py lib/python/Products/PageTemplates/utils.py +9 -1

No files found.
--- a/doc/CHANGES.txt
+++ b/doc/CHANGES.txt
@@ -19,6 +19,9 @@ Zope Changes
    Bugs Fixed
+      - LP/#324876: tighened regex for detecting the charset
+        from a meta-equiv header
      - configure script: setting ZOPE_VERS to '2.11'
      - Acquisition wrappers now correctly proxy __iter__.

--- a/lib/python/Products/PageTemplates/utils.py
+++ b/lib/python/Products/PageTemplates/utils.py
@@ -20,7 +20,15 @@ import re
 xml_preamble_reg = re.compile(r'^<\?xml.*?encoding="(.*?)".*?\?>', re.M)
-http_equiv_reg = re.compile(r'(<meta.*?http\-equiv.*?content-type.*?>)', re.I|re.M|re.S)
+# This regular expression is defined extremely carelessly. It starts
+#  with a tag beginning with 'meta' and extends until an arbitrary
+#  'content-type' (maybe in a completely unrelated element).
+#  Tighten the expression a bit.
+#  Note that using a regular expression at all is unreliable as it does
+#  not know about e.g. HTML comments. A robust solution would need to
+#  use an HTML parser to locate the 'meta' tag.
+#http_equiv_reg = re.compile(r'(<meta.*?http\-equiv.*?content-type.*?>)', re.I|re.M|re.S)
+http_equiv_reg = re.compile(r'(<meta\s+[^>]*?http\-equiv[^>]*?content-type.*?>)', re.I|re.M|re.S)
 http_equiv_reg2 = re.compile(r'charset.*?=.*?(?P<charset>[\w\-]*)', re.I|re.M|re.S)
 def encodingFromXMLPreamble(xml):