Enhance the punycode decoder so that it can decode
unicode objects. Fix the idna codec and the tests.

Enhance the punycode decoder so that it can decode
unicode objects. Fix the idna codec and the tests.
0ac30f82 · Walter Dörwald · 1f05a3b7 · 0ac30f82 · 0ac30f82 · 0ac30f82
Commit 0ac30f82 authored May 11, 2007 by Walter Dörwald
Hide whitespace changes
Inline Side-by-side

Showing with 136 additions and 130 deletions

Lib/encodings/idna.py Lib/encodings/idna.py +23 -20

Lib/encodings/punycode.py Lib/encodings/punycode.py +4 -2

Lib/test/test_codecs.py Lib/test/test_codecs.py +109 -108

No files found.
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -7,7 +7,8 @@ from unicodedata import ucd_3_2_0 as unicodedata
 dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
 # IDNA section 5
-ace_prefix = "xn--"
+ace_prefix = b"xn--"
+sace_prefix = "xn--"
 # This assumes query strings, so AllowUnassigned is true
 def nameprep(label):
@@ -87,7 +88,7 @@ def ToASCII(label):
        raise UnicodeError("label empty or too long")
    # Step 5: Check ACE prefix
-    if label.startswith(ace_prefix):
+    if label.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")
    # Step 6: Encode with PUNYCODE
@@ -134,7 +135,7 @@ def ToUnicode(label):
    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
-    if label.lower() != label2:
+    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)
    # Step 8: return the result of step 5
@@ -143,7 +144,7 @@ def ToUnicode(label):
 ### Codec APIs
 class Codec(codecs.Codec):
-    def encode(self,input,errors='strict'):
+    def encode(self, input, errors='strict'):
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
@@ -152,19 +153,21 @@ class Codec(codecs.Codec):
        if not input:
            return b"", 0
-        result = []
+        result = b""
        labels = dots.split(input)
-        if labels and len(labels[-1])==0:
+        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
-            result.append(ToASCII(label))
+            if result:
-        # Join with U+002E
+                # Join with U+002E
-        return b".".join(result)+trailing_dot, len(input)
+                result.extend(b'.')
+            result.extend(ToASCII(label))
+        return result+trailing_dot, len(input)
-    def decode(self,input,errors='strict'):
+    def decode(self, input, errors='strict'):
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)
@@ -199,30 +202,31 @@ class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
            raise UnicodeError("unsupported error handling "+errors)
        if not input:
-            return ("", 0)
+            return (b'', 0)
        labels = dots.split(input)
-        trailing_dot = ''
+        trailing_dot = b''
        if labels:
            if not labels[-1]:
-                trailing_dot = '.'
+                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
-                    trailing_dot = '.'
+                    trailing_dot = b'.'
-        result = []
+        result = b""
        size = 0
        for label in labels:
-            result.append(ToASCII(label))
            if size:
+                # Join with U+002E
+                result.extend(b'.')
                size += 1
+            result.extend(ToASCII(label))
            size += len(label)
-        # Join with U+002E
+        result += trailing_dot
-        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
@@ -239,8 +243,7 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
            labels = dots.split(input)
        else:
            # Must be ASCII string
-            input = str(input)
+            input = str(input, "ascii")
-            str(input, "ascii")
            labels = input.split(".")
        trailing_dot = ''

--- a/Lib/encodings/punycode.py
+++ b/Lib/encodings/punycode.py
@@ -181,6 +181,8 @@ def insertion_sort(base, extended, errors):
    return base
 def punycode_decode(text, errors):
+    if isinstance(text, str):
+        text = text.encode("ascii")
    pos = text.rfind(b"-")
    if pos == -1:
        base = ""
@@ -194,11 +196,11 @@ def punycode_decode(text, errors):
 class Codec(codecs.Codec):
-    def encode(self,input,errors='strict'):
+    def encode(self, input, errors='strict'):
        res = punycode_encode(input)
        return res, len(input)
-    def decode(self,input,errors='strict'):
+    def decode(self, input, errors='strict'):
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError, "Unsupported error handling "+errors
        res = punycode_decode(input, errors)

--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -624,6 +624,7 @@ class PunycodeTest(unittest.TestCase):
    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))
+            self.assertEquals(uni, puny.decode("ascii").decode("punycode"))
 class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
@@ -676,154 +677,154 @@ class UnicodeInternalTest(unittest.TestCase):
 # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
 nameprep_tests = [
    # 3.1 Map to nothing.
-    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
+    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
-     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
+     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
-     '\xb8\x8f\xef\xbb\xbf',
+     b'\xb8\x8f\xef\xbb\xbf',
-     'foobarbaz'),
+     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
-    ('CAFE',
+    (b'CAFE',
-     'cafe'),
+     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
-    ('\xc3\x9f',
+    (b'\xc3\x9f',
-     'ss'),
+     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
-    ('\xc4\xb0',
+    (b'\xc4\xb0',
-     'i\xcc\x87'),
+     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
-    ('\xc5\x83\xcd\xba',
+    (b'\xc5\x83\xcd\xba',
-     '\xc5\x84 \xce\xb9'),
+     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
-    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
+    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
-     '\xc7\xb0 a'),
+     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
-    ('\xe1\xbe\xb7',
+    (b'\xe1\xbe\xb7',
-     '\xe1\xbe\xb6\xce\xb9'),
+     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
-    ('\xc7\xb0',
+    (b'\xc7\xb0',
-     '\xc7\xb0'),
+     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
-    ('\xce\x90',
+    (b'\xce\x90',
-     '\xce\x90'),
+     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
-    ('\xce\xb0',
+    (b'\xce\xb0',
-     '\xce\xb0'),
+     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
-    ('\xe1\xba\x96',
+    (b'\xe1\xba\x96',
-     '\xe1\xba\x96'),
+     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
-    ('\xe1\xbd\x96',
+    (b'\xe1\xbd\x96',
-     '\xe1\xbd\x96'),
+     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
-    (' ',
+    (b' ',
-     ' '),
+     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
-    ('\xc2\xa0',
+    (b'\xc2\xa0',
-     ' '),
+     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
-    ('\xe1\x9a\x80',
+    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
-    ('\xe2\x80\x80',
+    (b'\xe2\x80\x80',
-     ' '),
+     b' '),
    # 3.18 Zero Width Space U+200b.
-    ('\xe2\x80\x8b',
+    (b'\xe2\x80\x8b',
-     ''),
+     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
-    ('\xe3\x80\x80',
+    (b'\xe3\x80\x80',
-     ' '),
+     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
-    ('\x10\x7f',
+    (b'\x10\x7f',
-     '\x10\x7f'),
+     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
-    ('\xc2\x85',
+    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
-    ('\xe1\xa0\x8e',
+    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
-    ('\xef\xbb\xbf',
+    (b'\xef\xbb\xbf',
-     ''),
+     b''),
    # 3.24 Non-ASCII control character U+1D175.
-    ('\xf0\x9d\x85\xb5',
+    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
-    ('\xef\x84\xa3',
+    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
-    ('\xf3\xb1\x88\xb4',
+    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
-    ('\xf4\x8f\x88\xb4',
+    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
-    ('\xf2\x8f\xbf\xbe',
+    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
-    ('\xf4\x8f\xbf\xbf',
+    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
-    ('\xed\xbd\x82',
+    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
-    ('\xef\xbf\xbd',
+    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
-    ('\xe2\xbf\xb5',
+    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
-    ('\xcd\x81',
+    (b'\xcd\x81',
-     '\xcc\x81'),
+     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
-    ('\xe2\x80\x8e',
+    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
-    ('\xe2\x80\xaa',
+    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
-    ('\xf3\xa0\x80\x81',
+    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
-    ('\xf3\xa0\x81\x82',
+    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
-    ('foo\xd6\xbebar',
+    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
-    ('foo\xef\xb5\x90bar',
+    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
-    ('foo\xef\xb9\xb6bar',
+    (b'foo\xef\xb9\xb6bar',
-     'foo \xd9\x8ebar'),
+     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
-    ('\xd8\xa71',
+    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
-    ('\xd8\xa71\xd8\xa8',
+    (b'\xd8\xa71\xd8\xa8',
-     '\xd8\xa71\xd8\xa8'),
+     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
-    #('\xf3\xa0\x80\x82',
+    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
-    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
+    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
-     '\xaa\xce\xb0\xe2\x80\x80',
+     b'\xaa\xce\xb0\xe2\x80\x80',
-     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
+     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
-    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
+    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
-     '\x80',
+     b'\x80',
-     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
+     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
-     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
+     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
-     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
+     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
@@ -848,16 +849,16 @@ class NameprepTest(unittest.TestCase):
 class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
-        self.assertEquals(str("python.org", "idna"), "python.org")
+        self.assertEquals(str(b"python.org", "idna"), "python.org")
-        self.assertEquals(str("python.org.", "idna"), "python.org.")
+        self.assertEquals(str(b"python.org.", "idna"), "python.org.")
-        self.assertEquals(str("xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
+        self.assertEquals(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
-        self.assertEquals(str("xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
+        self.assertEquals(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
    def test_builtin_encode(self):
-        self.assertEquals("python.org".encode("idna"), "python.org")
+        self.assertEquals("python.org".encode("idna"), b"python.org")
-        self.assertEquals("python.org.".encode("idna"), "python.org.")
+        self.assertEquals("python.org.".encode("idna"), b"python.org.")
-        self.assertEquals("pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
+        self.assertEquals("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
-        self.assertEquals("pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
+        self.assertEquals("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
@@ -866,61 +867,61 @@ class IDNACodecTest(unittest.TestCase):
    def test_incremental_decode(self):
        self.assertEquals(
-            "".join(codecs.iterdecode("python.org", "idna")),
+            "".join(codecs.iterdecode((bytes(chr(c)) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEquals(
-            "".join(codecs.iterdecode("python.org.", "idna")),
+            "".join(codecs.iterdecode((bytes(chr(c)) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEquals(
-            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
+            "".join(codecs.iterdecode((bytes(chr(c)) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )
        self.assertEquals(
-            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
+            "".join(codecs.iterdecode((bytes(chr(c)) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )
        decoder = codecs.getincrementaldecoder("idna")()
-        self.assertEquals(decoder.decode("xn--xam", ), "")
+        self.assertEquals(decoder.decode(b"xn--xam", ), "")
-        self.assertEquals(decoder.decode("ple-9ta.o", ), "\xe4xample.")
+        self.assertEquals(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
-        self.assertEquals(decoder.decode("rg"), "")
+        self.assertEquals(decoder.decode(b"rg"), "")
-        self.assertEquals(decoder.decode("", True), "org")
+        self.assertEquals(decoder.decode(b"", True), "org")
        decoder.reset()
-        self.assertEquals(decoder.decode("xn--xam", ), "")
+        self.assertEquals(decoder.decode(b"xn--xam", ), "")
-        self.assertEquals(decoder.decode("ple-9ta.o", ), "\xe4xample.")
+        self.assertEquals(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
-        self.assertEquals(decoder.decode("rg."), "org.")
+        self.assertEquals(decoder.decode(b"rg."), "org.")
-        self.assertEquals(decoder.decode("", True), "")
+        self.assertEquals(decoder.decode(b"", True), "")
    def test_incremental_encode(self):
        self.assertEquals(
-            "".join(codecs.iterencode("python.org", "idna")),
+            b"".join(codecs.iterencode("python.org", "idna")),
-            "python.org"
+            b"python.org"
        )
        self.assertEquals(
-            "".join(codecs.iterencode("python.org.", "idna")),
+            b"".join(codecs.iterencode("python.org.", "idna")),
-            "python.org."
+            b"python.org."
        )
        self.assertEquals(
-            "".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
+            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
-            "xn--pythn-mua.org."
+            b"xn--pythn-mua.org."
        )
        self.assertEquals(
-            "".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
+            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
-            "xn--pythn-mua.org."
+            b"xn--pythn-mua.org."
        )
        encoder = codecs.getincrementalencoder("idna")()
-        self.assertEquals(encoder.encode("\xe4x"), "")
+        self.assertEquals(encoder.encode("\xe4x"), b"")
-        self.assertEquals(encoder.encode("ample.org"), "xn--xample-9ta.")
+        self.assertEquals(encoder.encode("ample.org"), b"xn--xample-9ta.")
-        self.assertEquals(encoder.encode("", True), "org")
+        self.assertEquals(encoder.encode("", True), b"org")
        encoder.reset()
-        self.assertEquals(encoder.encode("\xe4x"), "")
+        self.assertEquals(encoder.encode("\xe4x"), b"")
-        self.assertEquals(encoder.encode("ample.org."), "xn--xample-9ta.org.")
+        self.assertEquals(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
-        self.assertEquals(encoder.encode("", True), "")
+        self.assertEquals(encoder.encode("", True), b"")
 class CodecsModuleTest(unittest.TestCase):