# -*- coding: iso-8859-1 -*-
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
import codecs
import struct
import sys
import unittest
import warnings
14
from test import support, string_tests
15

16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that serves two deliberately broken test codecs.

    "test.unicode1" resolves to an encoder/decoder pair that return a bare
    int instead of a tuple; "test.unicode2" resolves to a pair that return
    a tuple whose first item is not a string.  Every other name yields
    None.
    """
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode

    # (codec name, 4-tuple handed back to the codec registry)
    broken_codecs = (
        ("test.unicode1", (encode1, decode1, None, None)),
        ("test.unicode2", (encode2, decode2, None, None)),
    )
    for codec_name, entry in broken_codecs:
        if encoding == codec_name:
            return entry
    return None
codecs.register(search_function)

34 35
class UnicodeTest(
    string_tests.CommonTest,
36 37
    string_tests.MixinStrUnicodeUserStringTest,
    string_tests.MixinStrUnicodeTest,
38
    ):
39
    type2test = str
40

41 42 43 44 45 46
    def setUp(self):
        # Keep a private copy of the active warning filters so that
        # tearDown() can put them back after a test has changed them.
        self.warning_filters = list(warnings.filters)

    def tearDown(self):
        # Reinstall the filter list captured by setUp() so that filters
        # added by a test (e.g. via warnings.simplefilter) do not persist.
        warnings.filters = self.warning_filters

47 48 49 50 51 52 53 54 55
    def checkequalnofix(self, result, object, methodname, *args):
        """Check that ``object.methodname(*args)`` equals *result*.

        The returned value must also have exactly the same type as
        *result*.  When the method hands back *object* itself, the call is
        repeated on a str subclass instance, which must not be returned
        unchanged.
        """
        method = getattr(object, methodname)
        realresult = method(*args)
        self.assertEqual(realresult, result)
        # assertTrue instead of the assert_ alias, which newer unittest
        # versions no longer provide.
        self.assertTrue(type(realresult) is type(result))

        # if the original is returned make sure that
        # this doesn't happen with subclasses
        if realresult is object:
            class usub(str):
                def __repr__(self):
                    return 'usub(%r)' % str.__repr__(self)
            object = usub(object)
            method = getattr(object, methodname)
            realresult = method(*args)
            self.assertEqual(realresult, result)
            self.assertTrue(object is not realresult)
64

65
    def test_literals(self):
66 67
        self.assertEqual('\xff', '\u00ff')
        self.assertEqual('\uffff', '\U0000ffff')
68 69 70
        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
71 72
        # raw strings should not have unicode escapes
        self.assertNotEquals(r"\u0020", " ")
73

Georg Brandl's avatar
Georg Brandl committed
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
    def test_ascii(self):
        # Checks ascii() on strings: escaping of backslashes, control
        # characters and quotes, the full Latin-1 range, long strings with
        # wide escapes, and rejection of a non-str __repr__ result.
        # The whole body is skipped when sys.platform starts with 'java'.
        if not sys.platform.startswith('java'):
            # Test basic sanity of ascii()
            self.assertEqual(ascii('abc'), "'abc'")
            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
            self.assertEqual(ascii('\\c'), "'\\\\c'")
            self.assertEqual(ascii('\\'), "'\\\\'")
            self.assertEqual(ascii('\n'), "'\\n'")
            self.assertEqual(ascii('\r'), "'\\r'")
            self.assertEqual(ascii('\t'), "'\\t'")
            self.assertEqual(ascii('\b'), "'\\x08'")
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'"), '''"'"''')
            self.assertEqual(ascii('"'), """'"'""")
            # Expected ascii() of chr(0)..chr(255): every character at or
            # above 0x7f is written as a \xNN escape.
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
                "\\xfe\\xff'")
            testrepr = ascii(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test ascii works on wide unicode escapes without overflow.
            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
                             ascii("\U00010000" * 39 + "\uffff" * 4096))

            # A __repr__ that returns bytes must make ascii() raise TypeError.
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, ascii, WrongRepr())

116 117 118
    def test_repr(self):
        if not sys.platform.startswith('java'):
            # Test basic sanity of repr()
119 120 121 122 123 124 125 126 127 128 129 130 131
            self.assertEqual(repr('abc'), "'abc'")
            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
            self.assertEqual(repr('ab\\'), "'ab\\\\'")
            self.assertEqual(repr('\\c'), "'\\\\c'")
            self.assertEqual(repr('\\'), "'\\\\'")
            self.assertEqual(repr('\n'), "'\\n'")
            self.assertEqual(repr('\r'), "'\\r'")
            self.assertEqual(repr('\t'), "'\\t'")
            self.assertEqual(repr('\b'), "'\\x08'")
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'"), '''"'"''')
            self.assertEqual(repr('"'), """'"'""")
132
            latin1repr = (
133
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
134 135 136 137 138
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl's avatar
Georg Brandl committed
139 140 141 142 143 144 145 146
                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
                "\xfe\xff'")
147
            testrepr = repr(''.join(map(chr, range(256))))
148
            self.assertEqual(testrepr, latin1repr)
149
            # Test repr works on wide unicode escapes without overflow.
150 151
            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
                             repr("\U00010000" * 39 + "\uffff" * 4096))
152

Georg Brandl's avatar
Georg Brandl committed
153 154 155 156 157
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, repr, WrongRepr())

158 159
    def test_iterators(self):
        # Make sure unicode objects have an __iter__ method
160 161 162 163
        it = "\u1111\u2222\u3333".__iter__()
        self.assertEqual(next(it), "\u1111")
        self.assertEqual(next(it), "\u2222")
        self.assertEqual(next(it), "\u3333")
164
        self.assertRaises(StopIteration, next, it)
165

166
    def test_count(self):
        # Run the shared count() checks, then str-specific ones that also
        # verify the result type through checkequalnofix().
        string_tests.CommonTest.test_count(self)
        self.checkequalnofix(3, 'aaa', 'count', 'a')
        self.checkequalnofix(0, 'aaa', 'count', 'b')
        # negative start / end offsets
        self.checkequalnofix(1, 'aaa', 'count', 'a', -1)
        self.checkequalnofix(3, 'aaa', 'count', 'a', -10)
        self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1)
        self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10)
178 179

    def test_find(self):
180 181 182
        self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
        self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
183

184 185
        self.assertRaises(TypeError, 'hello'.find)
        self.assertRaises(TypeError, 'hello'.find, 42)
186 187

    def test_rfind(self):
        # Run the shared rfind() checks, then str-specific ones that also
        # verify the result type through checkequalnofix().
        string_tests.CommonTest.test_rfind(self)
        self.checkequalnofix(9,  'abcdefghiabc', 'rfind', 'abc')
        # the empty pattern matches at the very end of the string
        self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '')
193 194

    def test_index(self):
        # Run the shared index() checks, then str-specific ones: index()
        # behaves like find() but raises ValueError when nothing matches.
        string_tests.CommonTest.test_index(self)
        self.checkequalnofix(0, 'abcdefghiabc', 'index',  '')
        self.checkequalnofix(3, 'abcdefghiabc', 'index',  'def')
        self.checkequalnofix(0, 'abcdefghiabc', 'index',  'abc')
        self.checkequalnofix(9, 'abcdefghiabc', 'index',  'abc', 1)
        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
        self.assertRaises(ValueError, 'abcdefghiab'.index,  'abc', 1)
        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', 8)
        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', -1)
204 205

    def test_rindex(self):
        # Run the shared rindex() checks, then str-specific ones: rindex()
        # behaves like rfind() but raises ValueError when nothing matches.
        string_tests.CommonTest.test_rindex(self)
        self.checkequalnofix(12, 'abcdefghiabc', 'rindex',  '')
        self.checkequalnofix(3,  'abcdefghiabc', 'rindex',  'def')
        self.checkequalnofix(9,  'abcdefghiabc', 'rindex',  'abc')
        self.checkequalnofix(0,  'abcdefghiabc', 'rindex',  'abc', 0, -1)

        self.assertRaises(ValueError, 'abcdefghiabc'.rindex,  'hib')
        self.assertRaises(ValueError, 'defghiabc'.rindex,  'def', 1)
        self.assertRaises(ValueError, 'defghiabc'.rindex,  'abc', 0, -1)
        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, 8)
        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, -1)
217

218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
    def test_maketrans_translate(self):
        # these work with plain translate()
        self.checkequalnofix('bbbc', 'abababc', 'translate',
                             {ord('a'): None})
        self.checkequalnofix('iiic', 'abababc', 'translate',
                             {ord('a'): None, ord('b'): ord('i')})
        self.checkequalnofix('iiix', 'abababc', 'translate',
                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
        self.checkequalnofix('c', 'abababc', 'translate',
                             {ord('a'): None, ord('b'): ''})
        self.checkequalnofix('xyyx', 'xzx', 'translate',
                             {ord('z'): 'yy'})
        # this needs maketrans()
        self.checkequalnofix('abababc', 'abababc', 'translate',
                             {'b': '<i>'})
        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
        # test alternative way of calling maketrans()
        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)

        self.assertRaises(TypeError, self.type2test.maketrans)
        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
246

247
        self.assertRaises(TypeError, 'hello'.translate)
248
        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
249 250

    def test_split(self):
        # Run the shared split() checks, then str-specific ones that also
        # verify the result type through checkequalnofix().
        string_tests.CommonTest.test_split(self)

        # multi-character separators
        self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
        self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
257 258

    def test_join(self):
        # Run the shared join() checks, then str-specific ones.
        string_tests.MixinStrUnicodeUserStringTest.test_join(self)

        # Object that only offers __str__; join() must not convert it.
        class MyWrapper:
            def __init__(self, sval): self.sval = sval
            def __str__(self): return self.sval

        # list, tuple and custom sequence arguments
        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
        # any non-str item is a TypeError
        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
277 278

    def test_replace(self):
        # Run the shared replace() checks, then str-specific ones.
        string_tests.CommonTest.test_replace(self)

        # method call forwarded from str implementation because of unicode argument
        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
        # a non-string replacement is a TypeError
        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
284

285 286 287 288
    def test_bytes_comparison(self):
        warnings.simplefilter('ignore', BytesWarning)
        self.assertEqual('abc' == b'abc', False)
        self.assertEqual('abc' != b'abc', True)
Guido van Rossum's avatar
Guido van Rossum committed
289 290
        self.assertEqual('abc' == bytearray(b'abc'), False)
        self.assertEqual('abc' != bytearray(b'abc'), True)
291

292 293
    def test_comparison(self):
        # Comparisons:
294 295 296 297 298 299 300 301 302
        self.assertEqual('abc', 'abc')
        self.assertEqual('abc', 'abc')
        self.assertEqual('abc', 'abc')
        self.assert_('abcd' > 'abc')
        self.assert_('abcd' > 'abc')
        self.assert_('abcd' > 'abc')
        self.assert_('abc' < 'abcd')
        self.assert_('abc' < 'abcd')
        self.assert_('abc' < 'abcd')
303 304 305 306 307 308

        if 0:
            # Move these tests to a Unicode collation module test...
            # Testing UTF-16 code point order comparisons...

            # No surrogates, no fixup required.
309
            self.assert_('\u0061' < '\u20ac')
310
            # Non surrogate below surrogate value, no fixup required
311
            self.assert_('\u0061' < '\ud800\udc02')
312 313 314 315 316 317

            # Non surrogate above surrogate value, fixup required
            def test_lecmp(s, s2):
                self.assert_(s < s2)

            def test_fixup(s):
318
                s2 = '\ud800\udc01'
319
                test_lecmp(s, s2)
320
                s2 = '\ud900\udc01'
321
                test_lecmp(s, s2)
322
                s2 = '\uda00\udc01'
323
                test_lecmp(s, s2)
324
                s2 = '\udb00\udc01'
325
                test_lecmp(s, s2)
326
                s2 = '\ud800\udd01'
327
                test_lecmp(s, s2)
328
                s2 = '\ud900\udd01'
329
                test_lecmp(s, s2)
330
                s2 = '\uda00\udd01'
331
                test_lecmp(s, s2)
332
                s2 = '\udb00\udd01'
333
                test_lecmp(s, s2)
334
                s2 = '\ud800\ude01'
335
                test_lecmp(s, s2)
336
                s2 = '\ud900\ude01'
337
                test_lecmp(s, s2)
338
                s2 = '\uda00\ude01'
339
                test_lecmp(s, s2)
340
                s2 = '\udb00\ude01'
341
                test_lecmp(s, s2)
342
                s2 = '\ud800\udfff'
343
                test_lecmp(s, s2)
344
                s2 = '\ud900\udfff'
345
                test_lecmp(s, s2)
346
                s2 = '\uda00\udfff'
347
                test_lecmp(s, s2)
348
                s2 = '\udb00\udfff'
349 350
                test_lecmp(s, s2)

351 352
                test_fixup('\ue000')
                test_fixup('\uff61')
353 354

        # Surrogates on both sides, no fixup required
355
        self.assert_('\ud800\udc02' < '\ud84d\udc56')
356 357

    def test_islower(self):
        # Shared islower() checks, then a str-specific one: U+1FFC must
        # not be reported as lowercase.
        string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
        self.checkequalnofix(False, '\u1FFc', 'islower')
360 361

    def test_isupper(self):
        # Shared isupper() checks, then a str-specific one: U+1FFC must
        # not be reported as uppercase.  The extra check is skipped when
        # sys.platform starts with 'java'.
        string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
        if not sys.platform.startswith('java'):
            self.checkequalnofix(False, '\u1FFc', 'isupper')
365 366

    def test_istitle(self):
        # Shared istitle() checks first.  This used to delegate to the
        # base test_title(), unlike every other test_is* override here.
        # NOTE(review): assumes the mixin defines test_istitle -- confirm.
        string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
        # U+1FFC on its own, and starting a word, counts as titlecased.
        self.checkequalnofix(True, '\u1FFc', 'istitle')
        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
370 371

    def test_isspace(self):
        # Shared isspace() checks, then non-ASCII code points: U+2000 and
        # U+200A count as whitespace, U+2014 does not.
        string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
        self.checkequalnofix(True, '\u2000', 'isspace')
        self.checkequalnofix(True, '\u200a', 'isspace')
        self.checkequalnofix(False, '\u2014', 'isspace')
376 377

    def test_isalpha(self):
        # Shared isalpha() checks, then a str-specific one: U+1FFC is
        # alphabetic.
        string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
        self.checkequalnofix(True, '\u1FFc', 'isalpha')
380 381

    def test_isdecimal(self):
382 383 384 385 386 387 388 389
        self.checkequalnofix(False, '', 'isdecimal')
        self.checkequalnofix(False, 'a', 'isdecimal')
        self.checkequalnofix(True, '0', 'isdecimal')
        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
        self.checkequalnofix(True, '0123456789', 'isdecimal')
        self.checkequalnofix(False, '0123456789a', 'isdecimal')
390

391
        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
392 393

    def test_isdigit(self):
        # Shared isdigit() checks, then non-ASCII code points: U+2460 and
        # U+0660 count as digits, U+00BC does not.
        string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
        self.checkequalnofix(True, '\u2460', 'isdigit')
        self.checkequalnofix(False, '\xbc', 'isdigit')
        self.checkequalnofix(True, '\u0660', 'isdigit')
398 399

    def test_isnumeric(self):
400 401 402 403 404 405 406 407
        self.checkequalnofix(False, '', 'isnumeric')
        self.checkequalnofix(False, 'a', 'isnumeric')
        self.checkequalnofix(True, '0', 'isnumeric')
        self.checkequalnofix(True, '\u2460', 'isnumeric')
        self.checkequalnofix(True, '\xbc', 'isnumeric')
        self.checkequalnofix(True, '\u0660', 'isnumeric')
        self.checkequalnofix(True, '0123456789', 'isnumeric')
        self.checkequalnofix(False, '0123456789a', 'isnumeric')
408

409
        self.assertRaises(TypeError, "abc".isnumeric, 42)
410

411 412 413 414 415 416 417 418 419 420 421 422
    def test_isidentifier(self):
        self.assertTrue("a".isidentifier())
        self.assertTrue("Z".isidentifier())
        self.assertTrue("_".isidentifier())
        self.assertTrue("b0".isidentifier())
        self.assertTrue("bc".isidentifier())
        self.assertTrue("b_".isidentifier())
        self.assertTrue("".isidentifier())

        self.assertFalse(" ".isidentifier())
        self.assertFalse("[".isidentifier())
        self.assertFalse("".isidentifier())
423
        self.assertFalse("0".isidentifier())
424

Georg Brandl's avatar
Georg Brandl committed
425 426 427 428
    def test_isprintable(self):
        self.assertTrue("".isprintable())
        self.assertTrue("abcdefg".isprintable())
        self.assertFalse("abcdefg\n".isprintable())
429 430 431
        # some defined Unicode character
        self.assertTrue("\u0374".isprintable())
        # undefined character
432
        self.assertFalse("\u0378".isprintable())
433
        # single surrogate character
Georg Brandl's avatar
Georg Brandl committed
434 435
        self.assertFalse("\ud800".isprintable())

436 437
    def test_contains(self):
        # Testing Unicode contains method
438 439 440 441 442 443
        self.assert_('a' in 'abdb')
        self.assert_('a' in 'bdab')
        self.assert_('a' in 'bdaba')
        self.assert_('a' in 'bdba')
        self.assert_('a' not in 'bdb')
        self.assert_('a' in 'bdba')
444 445
        self.assert_('a' in ('a',1,None))
        self.assert_('a' in (1,None,'a'))
446 447 448
        self.assert_('a' in ('a',1,None))
        self.assert_('a' in (1,None,'a'))
        self.assert_('a' not in ('x',1,'y'))
449
        self.assert_('a' not in ('x',1,None))
450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
        self.assert_('abcd' not in 'abcxxxx')
        self.assert_('ab' in 'abcd')
        self.assert_('ab' in 'abc')
        self.assert_('ab' in (1,None,'ab'))
        self.assert_('' in 'abc')
        self.assert_('' in '')
        self.assert_('' in 'abc')
        self.assert_('\0' not in 'abc')
        self.assert_('\0' in '\0abc')
        self.assert_('\0' in 'abc\0')
        self.assert_('a' in '\0abc')
        self.assert_('asdf' in 'asdf')
        self.assert_('asdf' not in 'asd')
        self.assert_('asdf' not in '')

        self.assertRaises(TypeError, "abc".__contains__)
466

467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531
    def test_format(self):
        self.assertEqual(''.format(), '')
        self.assertEqual('a'.format(), 'a')
        self.assertEqual('ab'.format(), 'ab')
        self.assertEqual('a{{'.format(), 'a{')
        self.assertEqual('a}}'.format(), 'a}')
        self.assertEqual('{{b'.format(), '{b')
        self.assertEqual('}}b'.format(), '}b')
        self.assertEqual('a{{b'.format(), 'a{b')

        # examples from the PEP:
        import datetime
        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
                         "My name is Fred")
        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
                         "My name is Fred :-{}")

        d = datetime.date(2007, 8, 18)
        self.assertEqual("The year is {0.year}".format(d),
                         "The year is 2007")

        # classes we'll use for testing
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        class D:
            def __init__(self, x):
                self.x = x
            def __format__(self, spec):
                return str(self.x)

        # class with __str__, but no __format__
        class E:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return 'E(' + self.x + ')'

        # class with __repr__, but no __format__ or __str__
        class F:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'F(' + self.x + ')'

        # class with __format__ that forwards to string, for some format_spec's
        class G:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return "string is " + self.x
            def __format__(self, format_spec):
                if format_spec == 'd':
                    return 'G(' + self.x + ')'
                return object.__format__(self, format_spec)

        # class that returns a bad type from __format__
        class H:
            def __format__(self, format_spec):
                return 1.0

532 533 534 535
        class I(datetime.date):
            def __format__(self, format_spec):
                return self.strftime(format_spec)

Eric Smith's avatar
Eric Smith committed
536 537 538 539
        class J(int):
            def __format__(self, format_spec):
                return int.__format__(self * 2, format_spec)

540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564

        self.assertEqual(''.format(), '')
        self.assertEqual('abc'.format(), 'abc')
        self.assertEqual('{0}'.format('abc'), 'abc')
        self.assertEqual('{0:}'.format('abc'), 'abc')
#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
        self.assertEqual('X{0}'.format('abc'), 'Xabc')
        self.assertEqual('{0}X'.format('abc'), 'abcX')
        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
        self.assertEqual('{0}'.format(-15), '-15')
        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
        self.assertEqual('{{'.format(), '{')
        self.assertEqual('}}'.format(), '}')
        self.assertEqual('{{}}'.format(), '{}')
        self.assertEqual('{{x}}'.format(), '{x}')
        self.assertEqual('{{{0}}}'.format(123), '{123}')
        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
        self.assertEqual('}}{{'.format(), '}{')
        self.assertEqual('}}x{{'.format(), '}x{')

565 566 567
        # weird field names
        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
568
        self.assertEqual("{0[ ]}".format({' ':3}), '3')
569

570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605
        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')

        # strings
        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
        self.assertEqual('{0:.0s}'.format('abcdef'), '')
        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
        self.assertEqual('{0:x<0s}'.format('result'), 'result')
        self.assertEqual('{0:x<5s}'.format('result'), 'result')
        self.assertEqual('{0:x<6s}'.format('result'), 'result')
        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
        self.assertEqual('{0: <7s}'.format('result'), 'result ')
        self.assertEqual('{0:<7s}'.format('result'), 'result ')
        self.assertEqual('{0:>7s}'.format('result'), ' result')
        self.assertEqual('{0:>8s}'.format('result'), '  result')
        self.assertEqual('{0:^8s}'.format('result'), ' result ')
        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)

        # format specifiers for user defined type
        self.assertEqual('{0:abc}'.format(C()), 'abc')

606
        # !r, !s and !a coercions
607 608 609 610 611 612 613
        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
614
        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
615 616
        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl's avatar
Georg Brandl committed
617
        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
618
        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
619
        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl's avatar
Georg Brandl committed
620 621
        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
622
        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
623 624 625 626 627 628 629 630 631 632 633 634

        # test fallback to object.__format__
        self.assertEqual('{0}'.format({}), '{}')
        self.assertEqual('{0}'.format([]), '[]')
        self.assertEqual('{0}'.format([1]), '[1]')
        self.assertEqual('{0}'.format(E('data')), 'E(data)')
        self.assertEqual('{0:^10}'.format(E('data')), ' E(data)  ')
        self.assertEqual('{0:^10s}'.format(E('data')), ' E(data)  ')
        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
        self.assertEqual('{0:>15s}'.format(G('data')), ' string is data')
        self.assertEqual('{0!s}'.format(G('data')), 'string is data')

635 636 637 638 639
        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
                                                       month=8,
                                                       day=27)),
                         "date: 2007-08-27")

Eric Smith's avatar
Eric Smith committed
640 641 642 643
        # test deriving from a builtin type and overriding __format__
        self.assertEqual("{0}".format(J(10)), "20")


644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660
        # string format specifiers
        self.assertEqual('{0:}'.format('a'), 'a')

        # computed format specifiers
        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')

        # test various errors
        self.assertRaises(ValueError, '{'.format)
        self.assertRaises(ValueError, '}'.format)
        self.assertRaises(ValueError, 'a{'.format)
        self.assertRaises(ValueError, 'a}'.format)
        self.assertRaises(ValueError, '{a'.format)
        self.assertRaises(ValueError, '}a'.format)
661 662 663
        self.assertRaises(IndexError, '{0}'.format)
        self.assertRaises(IndexError, '{1}'.format, 'abc')
        self.assertRaises(KeyError,   '{x}'.format)
664 665 666 667 668
        self.assertRaises(ValueError, "}{".format)
        self.assertRaises(ValueError, "{".format)
        self.assertRaises(ValueError, "}".format)
        self.assertRaises(ValueError, "abc{0:{}".format)
        self.assertRaises(ValueError, "{0".format)
669 670 671
        self.assertRaises(IndexError, "{0.}".format)
        self.assertRaises(ValueError, "{0.}".format, 0)
        self.assertRaises(IndexError, "{0[}".format)
672
        self.assertRaises(ValueError, "{0[}".format, [])
673 674
        self.assertRaises(KeyError,   "{0]}".format)
        self.assertRaises(ValueError, "{0.[]}".format, 0)
675
        self.assertRaises(ValueError, "{0..foo}".format, 0)
676 677 678 679 680 681
        self.assertRaises(ValueError, "{0[0}".format, 0)
        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
        self.assertRaises(KeyError,   "{c]}".format)
        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
        self.assertRaises(ValueError, "{0}}".format, 0)
        self.assertRaises(KeyError,   "{foo}".format, bar=3)
682
        self.assertRaises(ValueError, "{0!x}".format, 3)
683 684
        self.assertRaises(ValueError, "{0!}".format, 0)
        self.assertRaises(ValueError, "{0!rs}".format, 0)
685
        self.assertRaises(ValueError, "{!}".format)
686 687 688
        self.assertRaises(IndexError, "{:}".format)
        self.assertRaises(IndexError, "{:s}".format)
        self.assertRaises(IndexError, "{}".format)
689 690 691 692 693 694 695 696 697 698 699 700 701 702

        # can't have a replacement on the field name portion
        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)

        # exceed maximum recursion depth
        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                          0, 1, 2, 3, 4, 5, 6, 7)

        # string format spec errors
        self.assertRaises(ValueError, "{0:-s}".format, '')
        self.assertRaises(ValueError, format, "", "-")
        self.assertRaises(ValueError, "{0:=s}".format, '')

Eric Smith's avatar
Eric Smith committed
703 704 705 706
        # Alternate formatting is not supported
        self.assertRaises(ValueError, format, '', '#')
        self.assertRaises(ValueError, format, '', '#20')

707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736
    def test_format_auto_numbering(self):
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        self.assertEqual('{}'.format(10), '10')
        self.assertEqual('{:5}'.format('s'), 's    ')
        self.assertEqual('{!r}'.format('s'), "'s'")
        self.assertEqual('{._x}'.format(C(10)), '10')
        self.assertEqual('{[1]}'.format([1, 2]), '2')
        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')

        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')

        # can't mix and match numbering and auto-numbering
        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)

        # can mix and match auto-numbering and named
        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')

737
    def test_formatting(self):
        """Check %-style formatting of str objects.

        The shared checks from MixinStrUnicodeUserStringTest are run first,
        followed by the str-specific cases below.
        """
        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
        # Testing Unicode formatting strings...
        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
        # The repr-based checks are skipped when sys.platform starts with 'java'.
        if not sys.platform.startswith('java'):
            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

        # %c accepts a code point or a one-character string.
        self.assertEqual('%c' % 0x1234, '\u1234')
        self.assertEqual('%c' % 0x21483, '\U00021483')
        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
        self.assertEqual('%c' % '\U00021483', '\U00021483')
        self.assertRaises(TypeError, "%c".__mod__, "aa")

        # formatting jobs delegated from the string implementation:
        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
        self.assertEqual('...%s...' % "abc", '...abc...')
        self.assertEqual('%*s' % (5,'abc',), '  abc')
        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
        self.assertEqual('%c' % 'a', 'a')

        # %s must use the object's __str__ result, including non-ASCII text.
        class Wrapper:
            def __str__(self):
                return '\u1234'
        self.assertEqual('%s' % Wrapper(), '\u1234')

    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
    def test_format_float(self):
        """Check that '%.1f' yields '1.0' under the decorator's locale setup.

        NOTE(review): run_with_locale presumably activates one of the listed
        locales for the duration of the test -- confirm in test.support.
        """
        # should not format with a comma, but always with C locale
        self.assertEqual('1.0', '%.1f' % 1.0)

    def test_constructor(self):
        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)

        self.assertEqual(
790 791
            str('unicode remains unicode'),
            'unicode remains unicode'
792 793
        )

794
        class UnicodeSubclass(str):
795 796 797
            pass

        self.assertEqual(
798 799
            str(UnicodeSubclass('unicode subclass becomes unicode')),
            'unicode subclass becomes unicode'
800 801 802
        )

        self.assertEqual(
803 804
            str('strings are converted to unicode'),
            'strings are converted to unicode'
805 806 807 808 809 810 811 812 813
        )

        class StringCompat:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return self.x

        self.assertEqual(
814 815
            str(StringCompat('__str__ compatible objects are recognized')),
            '__str__ compatible objects are recognized'
816 817 818 819 820
        )

        # unicode(obj) is compatible to str():

        o = StringCompat('unicode(obj) is compatible to str()')
821
        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
822 823
        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

824
        for obj in (123, 123.45, 123):
825
            self.assertEqual(str(obj), str(str(obj)))
826 827 828 829 830 831 832

        # unicode(obj, encoding, error) tests (this maps to
        # PyUnicode_FromEncodedObject() at C level)

        if not sys.platform.startswith('java'):
            self.assertRaises(
                TypeError,
833 834
                str,
                'decoding unicode is not supported',
835 836 837 838 839
                'utf-8',
                'strict'
            )

        self.assertEqual(
840
            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
841
            'strings are decoded to unicode'
842 843 844 845
        )

        if not sys.platform.startswith('java'):
            self.assertEqual(
846
                str(
847
                    memoryview(b'character buffers are decoded to unicode'),
848 849 850
                    'utf-8',
                    'strict'
                ),
851
                'character buffers are decoded to unicode'
852 853
            )

854
        self.assertRaises(TypeError, str, 42, 42, 42)
855 856 857

    def test_codecs_utf7(self):
        utfTests = [
858 859 860 861 862 863 864 865 866 867 868 869
            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
            ('+', b'+-'),
            ('+-', b'+--'),
            ('+?', b'+-?'),
            ('\?', b'+AFw?'),
            ('+?', b'+-?'),
            (r'\\?', b'+AFwAXA?'),
            (r'\\\?', b'+AFwAXABc?'),
            (r'++--', b'+-+---')
870 871 872 873 874 875
        ]

        for (x, y) in utfTests:
            self.assertEqual(x.encode('utf-7'), y)

        # surrogates not supported
876
        self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
877

878
        self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd')
879

880 881 882
        # Issue #2242: crash on some Windows/MSVC versions
        self.assertRaises(UnicodeDecodeError, b'+\xc1'.decode, 'utf-7')

883
    def test_codecs_utf8(self):
        """Check UTF-8 encoding and decoding of str objects."""
        self.assertEqual(''.encode('utf-8'), b'')
        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
        # A high+low surrogate pair is expected to encode as one 4-byte
        # sequence; a lone surrogate is expected to encode as 3 bytes.
        self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82')
        self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96')
        self.assertEqual('\ud800'.encode('utf-8'), b'\xed\xa0\x80')
        self.assertEqual('\udc00'.encode('utf-8'), b'\xed\xb0\x80')
        self.assertEqual(
            ('\ud800\udc02'*1000).encode('utf-8'),
            b'\xf0\x90\x80\x82'*1000
        )
        self.assertEqual(
            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
            ' Nunstuck git und'.encode('utf-8'),
            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
        )

        # UTF-8 specific decoding tests
        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )

        # Other possible utf-8 test cases:
        # * strict decoding testing for all of the
        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8

    def test_codecs_idna(self):
        # Test whether trailing dot is preserved
924
        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
925

926 927
    def test_codecs_errors(self):
        """Check error handling in the encode and decode paths.

        The "test.unicode1" and "test.unicode2" codecs used below are the
        deliberately broken ones provided by the module-level
        search_function, which is registered with codecs.register().
        """
        # Error handling (encoding)
        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")

        # Error handling (decoding)
        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')

        # Error handling (unknown character names)
        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")

        # Error handling (truncated escape sequence)
        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")

        # Error handling (bad codec return values: not a tuple / no str)
        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
        self.assertRaises(TypeError, "hello".encode, "test.unicode2")
        # executes PyUnicode_Encode()
        import imp
        self.assertRaises(
            ImportError,
            imp.find_module,
            "non-existing module",
            ["non-existing dir"]
        )

        # Error handling (wrong arguments)
        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

        # Error handling (PyUnicode_EncodeDecimal())
        self.assertRaises(UnicodeError, int, "\u0200")

    def test_codecs(self):
        """Check basic encoding results and encode/decode round trips."""
        # Encoding
        self.assertEqual('hello'.encode('ascii'), b'hello')
        self.assertEqual('hello'.encode('utf-7'), b'hello')
        self.assertEqual('hello'.encode('utf-8'), b'hello')
        self.assertEqual('hello'.encode('utf8'), b'hello')
        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
        self.assertEqual('hello'.encode('latin-1'), b'hello')

        # Roundtrip safety for BMP (just the first 1024 chars)
        for c in range(1024):
            u = chr(c)
            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                             'utf-16-be', 'raw_unicode_escape',
                             'unicode_escape', 'unicode_internal'):
                self.assertEqual(str(u.encode(encoding),encoding), u)

        # Roundtrip safety for BMP (just the first 256 chars)
        for c in range(256):
            u = chr(c)
            for encoding in ('latin-1',):
                self.assertEqual(str(u.encode(encoding),encoding), u)

        # Roundtrip safety for BMP (just the first 128 chars)
        for c in range(128):
            u = chr(c)
            for encoding in ('ascii',):
                self.assertEqual(str(u.encode(encoding),encoding), u)

        # Roundtrip safety for non-BMP (just a few chars)
        u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
        for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                         #'raw_unicode_escape',
                         'unicode_escape', 'unicode_internal'):
            self.assertEqual(str(u.encode(encoding),encoding), u)

        # UTF-8 must be roundtrip safe for all UCS-2 code points
        # This excludes surrogates: in the full range, there would be
        # a surrogate pair (\udbff\udc00), which gets converted back
        # to a non-BMP character (\U0010fc00)
        u = ''.join(map(chr, list(range(0,0xd800)) +
                             list(range(0xe000,0x10000))))
        for encoding in ('utf-8',):
            self.assertEqual(str(u.encode(encoding),encoding), u)

    def test_codecs_charmap(self):
        # 0-127
1012
        s = bytes(range(128))
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036
        for encoding in (
            'cp037', 'cp1026',
            'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
            'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
            'cp863', 'cp865', 'cp866',
            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
            'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
            'mac_cyrillic', 'mac_latin2',

            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
            'cp1256', 'cp1257', 'cp1258',
            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
            'cp1006', 'iso8859_8',

            ### These have undefined mappings:
            #'cp424',

            ### These fail the round-trip:
            #'cp875'

            ):
1037
            self.assertEqual(str(s, encoding).encode(encoding), s)
1038 1039

        # 128-255
1040
        s = bytes(range(128, 256))
1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061
        for encoding in (
            'cp037', 'cp1026',
            'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
            'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
            'cp863', 'cp865', 'cp866',
            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
            'iso8859_2', 'iso8859_4', 'iso8859_5',
            'iso8859_9', 'koi8_r', 'latin_1',
            'mac_cyrillic', 'mac_latin2',

            ### These have undefined mappings:
            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
            #'cp1256', 'cp1257', 'cp1258',
            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
            #'iso8859_3', 'iso8859_6', 'iso8859_7',
            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

            ### These fail the round-trip:
            #'cp1006', 'cp875', 'iso8859_8',

            ):
1062
            self.assertEqual(str(s, encoding).encode(encoding), s)
1063 1064

    def test_concatenation(self):
1065 1066 1067 1068 1069
        self.assertEqual(("abc" "def"), "abcdef")
        self.assertEqual(("abc" "def"), "abcdef")
        self.assertEqual(("abc" "def"), "abcdef")
        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
1070 1071 1072 1073

    def test_printing(self):
        class BitBucket:
            def write(self, text):
1074
                pass
1075 1076

        out = BitBucket()
1077 1078 1079 1080 1081 1082 1083 1084 1085
        print('abc', file=out)
        print('abc', 'def', file=out)
        print('abc', 'def', file=out)
        print('abc', 'def', file=out)
        print('abc\n', file=out)
        print('abc\n', end=' ', file=out)
        print('abc\n', end=' ', file=out)
        print('def\n', file=out)
        print('def\n', file=out)
1086

1087
    def test_ucs4(self):
1088
        x = '\U00100000'
1089 1090 1091
        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
        self.assertEqual(x, y)

Christian Heimes's avatar
Christian Heimes committed
1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
        # FIXME
        #y = r'\U00100000'
        #x = y.encode("raw-unicode-escape").decode("raw-unicode-escape")
        #self.assertEqual(x, y)
        #y = r'\U00010000'
        #x = y.encode("raw-unicode-escape").decode("raw-unicode-escape")
        #self.assertEqual(x, y)

        #try:
        #    '\U11111111'.decode("raw-unicode-escape")
        #except UnicodeDecodeError as e:
        #    self.assertEqual(e.start, 0)
        #    self.assertEqual(e.end, 10)
        #else:
        #    self.fail("Should have raised UnicodeDecodeError")

1108 1109 1110 1111 1112 1113 1114
    def test_conversion(self):
        # Make sure __unicode__() works properly
        class Foo0:
            def __str__(self):
                return "foo"

        class Foo1:
1115
            def __str__(self):
1116
                return "foo"
1117 1118

        class Foo2(object):
1119
            def __str__(self):
1120
                return "foo"
1121 1122

        class Foo3(object):
1123
            def __str__(self):
1124 1125 1126
                return "foo"

        class Foo4(str):
1127
            def __str__(self):
1128 1129
                return "foo"

1130
        class Foo5(str):
1131
            def __str__(self):
1132 1133 1134 1135 1136 1137
                return "foo"

        class Foo6(str):
            def __str__(self):
                return "foos"

1138
            def __str__(self):
1139
                return "foou"
1140

1141
        class Foo7(str):
1142 1143
            def __str__(self):
                return "foos"
1144
            def __str__(self):
1145
                return "foou"
1146

1147
        class Foo8(str):
1148
            def __new__(cls, content=""):
1149
                return str.__new__(cls, 2*content)
1150
            def __str__(self):
1151 1152
                return self

1153
        class Foo9(str):
1154 1155 1156
            def __str__(self):
                return "not unicode"

1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
        self.assertEqual(str(Foo0()), "foo")
        self.assertEqual(str(Foo1()), "foo")
        self.assertEqual(str(Foo2()), "foo")
        self.assertEqual(str(Foo3()), "foo")
        self.assertEqual(str(Foo4("bar")), "foo")
        self.assertEqual(str(Foo5("bar")), "foo")
        self.assertEqual(str(Foo6("bar")), "foou")
        self.assertEqual(str(Foo7("bar")), "foou")
        self.assertEqual(str(Foo8("foo")), "foofoo")
        self.assertEqual(str(Foo9("foo")), "not unicode")
1167

1168 1169 1170 1171 1172 1173 1174
    def test_unicode_repr(self):
        class s1:
            def __repr__(self):
                return '\\n'

        class s2:
            def __repr__(self):
1175
                return '\\n'
1176 1177 1178 1179

        self.assertEqual(repr(s1()), '\\n')
        self.assertEqual(repr(s2()), '\\n')

1180 1181 1182 1183
    def test_expandtabs_overflows_gracefully(self):
        # This test only affects 32-bit platforms because expandtabs can only take
        # an int as the max value, not a 64-bit C long.  If expandtabs is changed
        # to take a 64-bit long, this test should apply to all platforms.
1184
        if sys.maxsize > (1 << 32) or struct.calcsize('P') != 4:
1185
            return
1186
        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
1187

1188 1189 1190 1191 1192
    def test_raiseMemError(self):
        # Ensure that the freelist contains a consistent object, even
        # when a string allocation fails with a MemoryError.
        # This used to crash the interpreter,
        # or leak references when the number was smaller.
1193 1194 1195 1196
        charwidth = 4 if sys.maxunicode >= 0x10000 else 2
        # Note: sys.maxsize is half of the actual max allocation because of
        # the signedness of Py_ssize_t.
        alloc = lambda: "a" * (sys.maxsize // charwidth * 2)
1197 1198 1199
        self.assertRaises(MemoryError, alloc)
        self.assertRaises(MemoryError, alloc)

1200

1201
def test_main():
    """Run the tests of this module through support.run_unittest()."""
    support.run_unittest(__name__)

# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()