test_unicode.py 79.9 KB
Newer Older
1 2 3 4 5 6
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

7
"""#"
8 9 10 11 12
import codecs
import struct
import sys
import unittest
import warnings
13
from test import support, string_tests
14
import _string
15

16 17 18 19
# decorator to skip tests on narrow builds
requires_wide_build = unittest.skipIf(sys.maxunicode == 65535,
                                      'requires wide build')

20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
# Error handling (bad decoder return)
def search_function(encoding):
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode
    if encoding=="test.unicode1":
        return (encode1, decode1, None, None)
    elif encoding=="test.unicode2":
        return (encode2, decode2, None, None)
    else:
        return None
codecs.register(search_function)

38 39 40
class UnicodeTest(string_tests.CommonTest,
        string_tests.MixinStrUnicodeUserStringTest,
        string_tests.MixinStrUnicodeTest):
41

42
    type2test = str
43

44 45 46 47
    def checkequalnofix(self, result, object, methodname, *args):
        method = getattr(object, methodname)
        realresult = method(*args)
        self.assertEqual(realresult, result)
48
        self.assertTrue(type(realresult) is type(result))
49 50 51 52

        # if the original is returned make sure that
        # this doesn't happen with subclasses
        if realresult is object:
53
            class usub(str):
54
                def __repr__(self):
55
                    return 'usub(%r)' % str.__repr__(self)
56 57 58 59
            object = usub(object)
            method = getattr(object, methodname)
            realresult = method(*args)
            self.assertEqual(realresult, result)
60
            self.assertTrue(object is not realresult)
61

62
    def test_literals(self):
63 64
        self.assertEqual('\xff', '\u00ff')
        self.assertEqual('\uffff', '\U0000ffff')
65 66 67
        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
68
        # raw strings should not have unicode escapes
69
        self.assertNotEqual(r"\u0020", " ")
70

Georg Brandl's avatar
Georg Brandl committed
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
    def test_ascii(self):
        if not sys.platform.startswith('java'):
            # Test basic sanity of repr()
            self.assertEqual(ascii('abc'), "'abc'")
            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
            self.assertEqual(ascii('\\c'), "'\\\\c'")
            self.assertEqual(ascii('\\'), "'\\\\'")
            self.assertEqual(ascii('\n'), "'\\n'")
            self.assertEqual(ascii('\r'), "'\\r'")
            self.assertEqual(ascii('\t'), "'\\t'")
            self.assertEqual(ascii('\b'), "'\\x08'")
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'"), '''"'"''')
            self.assertEqual(ascii('"'), """'"'""")
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
                "\\xfe\\xff'")
            testrepr = ascii(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test ascii works on wide unicode escapes without overflow.
            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
                             ascii("\U00010000" * 39 + "\uffff" * 4096))

            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, ascii, WrongRepr())

113 114 115
    def test_repr(self):
        if not sys.platform.startswith('java'):
            # Test basic sanity of repr()
116 117 118 119 120 121 122 123 124 125 126 127 128
            self.assertEqual(repr('abc'), "'abc'")
            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
            self.assertEqual(repr('ab\\'), "'ab\\\\'")
            self.assertEqual(repr('\\c'), "'\\\\c'")
            self.assertEqual(repr('\\'), "'\\\\'")
            self.assertEqual(repr('\n'), "'\\n'")
            self.assertEqual(repr('\r'), "'\\r'")
            self.assertEqual(repr('\t'), "'\\t'")
            self.assertEqual(repr('\b'), "'\\x08'")
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'"), '''"'"''')
            self.assertEqual(repr('"'), """'"'""")
129
            latin1repr = (
130
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
131 132 133 134 135
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
Georg Brandl's avatar
Georg Brandl committed
136 137 138 139 140 141 142 143
                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
                "\xfe\xff'")
144
            testrepr = repr(''.join(map(chr, range(256))))
145
            self.assertEqual(testrepr, latin1repr)
146
            # Test repr works on wide unicode escapes without overflow.
147 148
            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
                             repr("\U00010000" * 39 + "\uffff" * 4096))
149

Georg Brandl's avatar
Georg Brandl committed
150 151 152 153 154
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, repr, WrongRepr())

155 156
    def test_iterators(self):
        # Make sure unicode objects have an __iter__ method
157 158 159 160
        it = "\u1111\u2222\u3333".__iter__()
        self.assertEqual(next(it), "\u1111")
        self.assertEqual(next(it), "\u2222")
        self.assertEqual(next(it), "\u3333")
161
        self.assertRaises(StopIteration, next, it)
162

163
    def test_count(self):
164 165
        string_tests.CommonTest.test_count(self)
        # check mixed argument types
166 167 168 169 170 171 172 173 174
        self.checkequalnofix(3,  'aaa', 'count', 'a')
        self.checkequalnofix(0,  'aaa', 'count', 'b')
        self.checkequalnofix(3, 'aaa', 'count',  'a')
        self.checkequalnofix(0, 'aaa', 'count',  'b')
        self.checkequalnofix(0, 'aaa', 'count',  'b')
        self.checkequalnofix(1, 'aaa', 'count',  'a', -1)
        self.checkequalnofix(3, 'aaa', 'count',  'a', -10)
        self.checkequalnofix(2, 'aaa', 'count',  'a', 0, -1)
        self.checkequalnofix(0, 'aaa', 'count',  'a', 0, -10)
175 176

    def test_find(self):
177
        string_tests.CommonTest.test_find(self)
178 179 180
        self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
        self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
181

182 183
        self.assertRaises(TypeError, 'hello'.find)
        self.assertRaises(TypeError, 'hello'.find, 42)
184 185

    def test_rfind(self):
186 187
        string_tests.CommonTest.test_rfind(self)
        # check mixed argument types
188 189 190
        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', 'abc')
        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', '')
        self.checkequalnofix(12, 'abcdefghiabc', 'rfind',  '')
191 192

    def test_index(self):
193
        string_tests.CommonTest.test_index(self)
194 195 196 197 198 199 200 201
        self.checkequalnofix(0, 'abcdefghiabc', 'index',  '')
        self.checkequalnofix(3, 'abcdefghiabc', 'index',  'def')
        self.checkequalnofix(0, 'abcdefghiabc', 'index',  'abc')
        self.checkequalnofix(9, 'abcdefghiabc', 'index',  'abc', 1)
        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
        self.assertRaises(ValueError, 'abcdefghiab'.index,  'abc', 1)
        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', 8)
        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', -1)
202 203

    def test_rindex(self):
204
        string_tests.CommonTest.test_rindex(self)
205 206 207 208 209 210 211 212 213 214
        self.checkequalnofix(12, 'abcdefghiabc', 'rindex',  '')
        self.checkequalnofix(3,  'abcdefghiabc', 'rindex',  'def')
        self.checkequalnofix(9,  'abcdefghiabc', 'rindex',  'abc')
        self.checkequalnofix(0,  'abcdefghiabc', 'rindex',  'abc', 0, -1)

        self.assertRaises(ValueError, 'abcdefghiabc'.rindex,  'hib')
        self.assertRaises(ValueError, 'defghiabc'.rindex,  'def', 1)
        self.assertRaises(ValueError, 'defghiabc'.rindex,  'abc', 0, -1)
        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, 8)
        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, -1)
215

216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
    def test_maketrans_translate(self):
        # these work with plain translate()
        self.checkequalnofix('bbbc', 'abababc', 'translate',
                             {ord('a'): None})
        self.checkequalnofix('iiic', 'abababc', 'translate',
                             {ord('a'): None, ord('b'): ord('i')})
        self.checkequalnofix('iiix', 'abababc', 'translate',
                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
        self.checkequalnofix('c', 'abababc', 'translate',
                             {ord('a'): None, ord('b'): ''})
        self.checkequalnofix('xyyx', 'xzx', 'translate',
                             {ord('z'): 'yy'})
        # this needs maketrans()
        self.checkequalnofix('abababc', 'abababc', 'translate',
                             {'b': '<i>'})
        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
        # test alternative way of calling maketrans()
        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)

        self.assertRaises(TypeError, self.type2test.maketrans)
        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
244

245
        self.assertRaises(TypeError, 'hello'.translate)
246
        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
247 248

    def test_split(self):
249 250 251
        string_tests.CommonTest.test_split(self)

        # Mixed arguments
252 253 254
        self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
        self.checkequalnofix(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
        self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
255 256

    def test_join(self):
257 258
        string_tests.MixinStrUnicodeUserStringTest.test_join(self)

259 260 261 262
        class MyWrapper:
            def __init__(self, sval): self.sval = sval
            def __str__(self): return self.sval

263
        # mixed arguments
264 265 266 267 268 269 270
        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
271 272 273 274
        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
275 276

    def test_replace(self):
277
        string_tests.CommonTest.test_replace(self)
278 279

        # method call forwarded from str implementation because of unicode argument
280 281
        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
282

283
    def test_bytes_comparison(self):
284 285 286 287 288 289
        with support.check_warnings():
            warnings.simplefilter('ignore', BytesWarning)
            self.assertEqual('abc' == b'abc', False)
            self.assertEqual('abc' != b'abc', True)
            self.assertEqual('abc' == bytearray(b'abc'), False)
            self.assertEqual('abc' != bytearray(b'abc'), True)
290

291 292
    def test_comparison(self):
        # Comparisons:
293
        self.assertEqual('abc', 'abc')
294 295
        self.assertTrue('abcd' > 'abc')
        self.assertTrue('abc' < 'abcd')
296 297 298 299 300 301

        if 0:
            # Move these tests to a Unicode collation module test...
            # Testing UTF-16 code point order comparisons...

            # No surrogates, no fixup required.
302
            self.assertTrue('\u0061' < '\u20ac')
303
            # Non surrogate below surrogate value, no fixup required
304
            self.assertTrue('\u0061' < '\ud800\udc02')
305 306 307

            # Non surrogate above surrogate value, fixup required
            def test_lecmp(s, s2):
308
                self.assertTrue(s < s2)
309 310

            def test_fixup(s):
311
                s2 = '\ud800\udc01'
312
                test_lecmp(s, s2)
313
                s2 = '\ud900\udc01'
314
                test_lecmp(s, s2)
315
                s2 = '\uda00\udc01'
316
                test_lecmp(s, s2)
317
                s2 = '\udb00\udc01'
318
                test_lecmp(s, s2)
319
                s2 = '\ud800\udd01'
320
                test_lecmp(s, s2)
321
                s2 = '\ud900\udd01'
322
                test_lecmp(s, s2)
323
                s2 = '\uda00\udd01'
324
                test_lecmp(s, s2)
325
                s2 = '\udb00\udd01'
326
                test_lecmp(s, s2)
327
                s2 = '\ud800\ude01'
328
                test_lecmp(s, s2)
329
                s2 = '\ud900\ude01'
330
                test_lecmp(s, s2)
331
                s2 = '\uda00\ude01'
332
                test_lecmp(s, s2)
333
                s2 = '\udb00\ude01'
334
                test_lecmp(s, s2)
335
                s2 = '\ud800\udfff'
336
                test_lecmp(s, s2)
337
                s2 = '\ud900\udfff'
338
                test_lecmp(s, s2)
339
                s2 = '\uda00\udfff'
340
                test_lecmp(s, s2)
341
                s2 = '\udb00\udfff'
342 343
                test_lecmp(s, s2)

344 345
                test_fixup('\ue000')
                test_fixup('\uff61')
346 347

        # Surrogates on both sides, no fixup required
348
        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
349 350

    def test_islower(self):
351
        string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
352
        self.checkequalnofix(False, '\u1FFc', 'islower')
353 354 355 356 357 358 359 360 361
        # non-BMP, uppercase
        self.assertFalse('\U00010401'.islower())
        self.assertFalse('\U00010427'.islower())
        # non-BMP, lowercase
        self.assertTrue('\U00010429'.islower())
        self.assertTrue('\U0001044E'.islower())
        # non-BMP, non-cased
        self.assertFalse('\U0001F40D'.islower())
        self.assertFalse('\U0001F46F'.islower())
362 363

    def test_isupper(self):
364 365
        string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
        if not sys.platform.startswith('java'):
366
            self.checkequalnofix(False, '\u1FFc', 'isupper')
367 368 369 370 371 372 373 374 375
        # non-BMP, uppercase
        self.assertTrue('\U00010401'.isupper())
        self.assertTrue('\U00010427'.isupper())
        # non-BMP, lowercase
        self.assertFalse('\U00010429'.isupper())
        self.assertFalse('\U0001044E'.isupper())
        # non-BMP, non-cased
        self.assertFalse('\U0001F40D'.isupper())
        self.assertFalse('\U0001F46F'.isupper())
376 377

    def test_istitle(self):
378
        string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
379 380
        self.checkequalnofix(True, '\u1FFc', 'istitle')
        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
381

382 383 384 385 386 387 388
        # non-BMP, uppercase + lowercase
        self.assertTrue('\U00010401\U00010429'.istitle())
        self.assertTrue('\U00010427\U0001044E'.istitle())
        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
        for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
            self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))

389
    def test_isspace(self):
390
        string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
391 392 393
        self.checkequalnofix(True, '\u2000', 'isspace')
        self.checkequalnofix(True, '\u200a', 'isspace')
        self.checkequalnofix(False, '\u2014', 'isspace')
394 395 396 397 398 399 400 401 402 403
        # apparently there are no non-BMP spaces chars in Unicode 6
        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F']:
            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))

    def test_isalnum(self):
        string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
            self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
404 405

    def test_isalpha(self):
406
        string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
407
        self.checkequalnofix(True, '\u1FFc', 'isalpha')
408 409 410 411 412 413 414 415
        # non-BMP, cased
        self.assertTrue('\U00010401'.isalpha())
        self.assertTrue('\U00010427'.isalpha())
        self.assertTrue('\U00010429'.isalpha())
        self.assertTrue('\U0001044E'.isalpha())
        # non-BMP, non-cased
        self.assertFalse('\U0001F40D'.isalpha())
        self.assertFalse('\U0001F46F'.isalpha())
416 417

    def test_isdecimal(self):
418 419 420 421 422 423 424 425
        self.checkequalnofix(False, '', 'isdecimal')
        self.checkequalnofix(False, 'a', 'isdecimal')
        self.checkequalnofix(True, '0', 'isdecimal')
        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
        self.checkequalnofix(True, '0123456789', 'isdecimal')
        self.checkequalnofix(False, '0123456789a', 'isdecimal')
426

427
        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
428

429 430 431 432 433 434
        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
            self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
            self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))

435
    def test_isdigit(self):
436
        string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
437 438 439
        self.checkequalnofix(True, '\u2460', 'isdigit')
        self.checkequalnofix(False, '\xbc', 'isdigit')
        self.checkequalnofix(True, '\u0660', 'isdigit')
440

441 442 443 444 445 446
        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F', '\U00011065']:
            self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
            self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))

447
    def test_isnumeric(self):
448 449 450 451 452 453 454 455
        self.checkequalnofix(False, '', 'isnumeric')
        self.checkequalnofix(False, 'a', 'isnumeric')
        self.checkequalnofix(True, '0', 'isnumeric')
        self.checkequalnofix(True, '\u2460', 'isnumeric')
        self.checkequalnofix(True, '\xbc', 'isnumeric')
        self.checkequalnofix(True, '\u0660', 'isnumeric')
        self.checkequalnofix(True, '0123456789', 'isnumeric')
        self.checkequalnofix(False, '0123456789a', 'isnumeric')
456

457
        self.assertRaises(TypeError, "abc".isnumeric, 42)
458

459 460 461 462 463 464 465
        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                   '\U0001F40D', '\U0001F46F']:
            self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
        for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
                   '\U000104A0', '\U0001F107']:
            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))

466 467 468 469 470 471 472
    def test_isidentifier(self):
        self.assertTrue("a".isidentifier())
        self.assertTrue("Z".isidentifier())
        self.assertTrue("_".isidentifier())
        self.assertTrue("b0".isidentifier())
        self.assertTrue("bc".isidentifier())
        self.assertTrue("b_".isidentifier())
473
        self.assertTrue("µ".isidentifier())
474
        self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
475 476 477

        self.assertFalse(" ".isidentifier())
        self.assertFalse("[".isidentifier())
478
        self.assertFalse("©".isidentifier())
479
        self.assertFalse("0".isidentifier())
480

Georg Brandl's avatar
Georg Brandl committed
481 482
    def test_isprintable(self):
        self.assertTrue("".isprintable())
483
        self.assertTrue(" ".isprintable())
Georg Brandl's avatar
Georg Brandl committed
484 485
        self.assertTrue("abcdefg".isprintable())
        self.assertFalse("abcdefg\n".isprintable())
486 487 488
        # some defined Unicode character
        self.assertTrue("\u0374".isprintable())
        # undefined character
489
        self.assertFalse("\u0378".isprintable())
490
        # single surrogate character
Georg Brandl's avatar
Georg Brandl committed
491 492
        self.assertFalse("\ud800".isprintable())

493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
        self.assertTrue('\U0001F46F'.isprintable())
        self.assertFalse('\U000E0020'.isprintable())

    def test_surrogates(self):
        for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
                  'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
            self.assertTrue(s.islower())
            self.assertFalse(s.isupper())
            self.assertFalse(s.istitle())
        for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
                  'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
            self.assertFalse(s.islower())
            self.assertTrue(s.isupper())
            self.assertTrue(s.istitle())

        for meth_name in ('islower', 'isupper', 'istitle'):
            meth = getattr(str, meth_name)
            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))

        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
                          'isdecimal', 'isnumeric',
                          'isidentifier', 'isprintable'):
            meth = getattr(str, meth_name)
            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
                      'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
                      'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))


523
    @requires_wide_build
524 525 526 527
    def test_lower(self):
        string_tests.CommonTest.test_lower(self)
        self.assertEqual('\U00010427'.lower(), '\U0001044F')
        self.assertEqual('\U00010427\U00010427'.lower(),
528
                         '\U0001044F\U0001044F')
529
        self.assertEqual('\U00010427\U0001044F'.lower(),
530
                         '\U0001044F\U0001044F')
531
        self.assertEqual('X\U00010427x\U0001044F'.lower(),
532
                         'x\U0001044Fx\U0001044F')
533

534
    @requires_wide_build
535 536 537 538
    def test_upper(self):
        string_tests.CommonTest.test_upper(self)
        self.assertEqual('\U0001044F'.upper(), '\U00010427')
        self.assertEqual('\U0001044F\U0001044F'.upper(),
539
                         '\U00010427\U00010427')
540
        self.assertEqual('\U00010427\U0001044F'.upper(),
541
                         '\U00010427\U00010427')
542
        self.assertEqual('X\U00010427x\U0001044F'.upper(),
543
                         'X\U00010427X\U00010427')
544

545
    @requires_wide_build
546 547 548 549
    def test_capitalize(self):
        string_tests.CommonTest.test_capitalize(self)
        self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
        self.assertEqual('\U0001044F\U0001044F'.capitalize(),
550
                         '\U00010427\U0001044F')
551
        self.assertEqual('\U00010427\U0001044F'.capitalize(),
552
                         '\U00010427\U0001044F')
553
        self.assertEqual('\U0001044F\U00010427'.capitalize(),
554
                         '\U00010427\U0001044F')
555
        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
556
                         'X\U0001044Fx\U0001044F')
557

558
    @requires_wide_build
559 560 561 562
    def test_title(self):
        string_tests.MixinStrUnicodeUserStringTest.test_title(self)
        self.assertEqual('\U0001044F'.title(), '\U00010427')
        self.assertEqual('\U0001044F\U0001044F'.title(),
563
                         '\U00010427\U0001044F')
564
        self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
565
                         '\U00010427\U0001044F \U00010427\U0001044F')
566
        self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
567
                         '\U00010427\U0001044F \U00010427\U0001044F')
568
        self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
569
                         '\U00010427\U0001044F \U00010427\U0001044F')
570
        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
571
                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
572

573
    @requires_wide_build
574 575 576 577 578
    def test_swapcase(self):
        string_tests.CommonTest.test_swapcase(self)
        self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
        self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
        self.assertEqual('\U0001044F\U0001044F'.swapcase(),
579
                         '\U00010427\U00010427')
580
        self.assertEqual('\U00010427\U0001044F'.swapcase(),
581
                         '\U0001044F\U00010427')
582
        self.assertEqual('\U0001044F\U00010427'.swapcase(),
583
                         '\U00010427\U0001044F')
584
        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
585
                         'x\U0001044FX\U00010427')
586

587 588
    def test_contains(self):
        # Testing Unicode contains method
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614
        self.assertIn('a', 'abdb')
        self.assertIn('a', 'bdab')
        self.assertIn('a', 'bdaba')
        self.assertIn('a', 'bdba')
        self.assertNotIn('a', 'bdb')
        self.assertIn('a', 'bdba')
        self.assertIn('a', ('a',1,None))
        self.assertIn('a', (1,None,'a'))
        self.assertIn('a', ('a',1,None))
        self.assertIn('a', (1,None,'a'))
        self.assertNotIn('a', ('x',1,'y'))
        self.assertNotIn('a', ('x',1,None))
        self.assertNotIn('abcd', 'abcxxxx')
        self.assertIn('ab', 'abcd')
        self.assertIn('ab', 'abc')
        self.assertIn('ab', (1,None,'ab'))
        self.assertIn('', 'abc')
        self.assertIn('', '')
        self.assertIn('', 'abc')
        self.assertNotIn('\0', 'abc')
        self.assertIn('\0', '\0abc')
        self.assertIn('\0', 'abc\0')
        self.assertIn('a', '\0abc')
        self.assertIn('asdf', 'asdf')
        self.assertNotIn('asdf', 'asd')
        self.assertNotIn('asdf', '')
615 616

        self.assertRaises(TypeError, "abc".__contains__)
617

618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677
    def test_format(self):
        self.assertEqual(''.format(), '')
        self.assertEqual('a'.format(), 'a')
        self.assertEqual('ab'.format(), 'ab')
        self.assertEqual('a{{'.format(), 'a{')
        self.assertEqual('a}}'.format(), 'a}')
        self.assertEqual('{{b'.format(), '{b')
        self.assertEqual('}}b'.format(), '}b')
        self.assertEqual('a{{b'.format(), 'a{b')

        # examples from the PEP:
        import datetime
        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
                         "My name is Fred")
        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
                         "My name is Fred :-{}")

        d = datetime.date(2007, 8, 18)
        self.assertEqual("The year is {0.year}".format(d),
                         "The year is 2007")

        # classes we'll use for testing
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        class D:
            def __init__(self, x):
                self.x = x
            def __format__(self, spec):
                return str(self.x)

        # class with __str__, but no __format__
        class E:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return 'E(' + self.x + ')'

        # class with __repr__, but no __format__ or __str__
        class F:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'F(' + self.x + ')'

        # class with __format__ that forwards to string, for some format_spec's
        class G:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return "string is " + self.x
            def __format__(self, format_spec):
                if format_spec == 'd':
                    return 'G(' + self.x + ')'
                return object.__format__(self, format_spec)

678 679 680 681
        class I(datetime.date):
            def __format__(self, format_spec):
                return self.strftime(format_spec)

Eric Smith's avatar
Eric Smith committed
682 683 684 685
        class J(int):
            def __format__(self, format_spec):
                return int.__format__(self * 2, format_spec)

686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710

        self.assertEqual(''.format(), '')
        self.assertEqual('abc'.format(), 'abc')
        self.assertEqual('{0}'.format('abc'), 'abc')
        self.assertEqual('{0:}'.format('abc'), 'abc')
#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
        self.assertEqual('X{0}'.format('abc'), 'Xabc')
        self.assertEqual('{0}X'.format('abc'), 'abcX')
        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
        self.assertEqual('{0}'.format(-15), '-15')
        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
        self.assertEqual('{{'.format(), '{')
        self.assertEqual('}}'.format(), '}')
        self.assertEqual('{{}}'.format(), '{}')
        self.assertEqual('{{x}}'.format(), '{x}')
        self.assertEqual('{{{0}}}'.format(123), '{123}')
        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
        self.assertEqual('}}{{'.format(), '}{')
        self.assertEqual('}}x{{'.format(), '}x{')

711 712 713
        # weird field names
        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
714
        self.assertEqual("{0[ ]}".format({' ':3}), '3')
715

716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751
        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')

        # strings
        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
        self.assertEqual('{0:.0s}'.format('abcdef'), '')
        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
        self.assertEqual('{0:x<0s}'.format('result'), 'result')
        self.assertEqual('{0:x<5s}'.format('result'), 'result')
        self.assertEqual('{0:x<6s}'.format('result'), 'result')
        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
        self.assertEqual('{0: <7s}'.format('result'), 'result ')
        self.assertEqual('{0:<7s}'.format('result'), 'result ')
        self.assertEqual('{0:>7s}'.format('result'), ' result')
        self.assertEqual('{0:>8s}'.format('result'), '  result')
        self.assertEqual('{0:^8s}'.format('result'), ' result ')
        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)

        # format specifiers for user defined type
        self.assertEqual('{0:abc}'.format(C()), 'abc')

752
        # !r, !s and !a coercions
753 754 755 756 757 758 759
        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
760
        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
761 762
        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
Georg Brandl's avatar
Georg Brandl committed
763
        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
764
        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
765
        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
Georg Brandl's avatar
Georg Brandl committed
766 767
        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
768
        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
769 770 771 772 773

        # test fallback to object.__format__
        self.assertEqual('{0}'.format({}), '{}')
        self.assertEqual('{0}'.format([]), '[]')
        self.assertEqual('{0}'.format([1]), '[1]')
774

775 776 777
        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
        self.assertEqual('{0!s}'.format(G('data')), 'string is data')

778 779 780 781 782 783
        msg = 'object.__format__ with a non-empty format string is deprecated'
        with support.check_warnings((msg, PendingDeprecationWarning)):
            self.assertEqual('{0:^10}'.format(E('data')), ' E(data)  ')
            self.assertEqual('{0:^10s}'.format(E('data')), ' E(data)  ')
            self.assertEqual('{0:>15s}'.format(G('data')), ' string is data')

784 785 786 787 788
        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
                                                       month=8,
                                                       day=27)),
                         "date: 2007-08-27")

Eric Smith's avatar
Eric Smith committed
789 790 791 792
        # test deriving from a builtin type and overriding __format__
        self.assertEqual("{0}".format(J(10)), "20")


793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809
        # string format specifiers
        self.assertEqual('{0:}'.format('a'), 'a')

        # computed format specifiers
        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')

        # test various errors
        self.assertRaises(ValueError, '{'.format)
        self.assertRaises(ValueError, '}'.format)
        self.assertRaises(ValueError, 'a{'.format)
        self.assertRaises(ValueError, 'a}'.format)
        self.assertRaises(ValueError, '{a'.format)
        self.assertRaises(ValueError, '}a'.format)
810 811 812
        self.assertRaises(IndexError, '{0}'.format)
        self.assertRaises(IndexError, '{1}'.format, 'abc')
        self.assertRaises(KeyError,   '{x}'.format)
813 814 815
        self.assertRaises(ValueError, "}{".format)
        self.assertRaises(ValueError, "abc{0:{}".format)
        self.assertRaises(ValueError, "{0".format)
816 817 818
        self.assertRaises(IndexError, "{0.}".format)
        self.assertRaises(ValueError, "{0.}".format, 0)
        self.assertRaises(IndexError, "{0[}".format)
819
        self.assertRaises(ValueError, "{0[}".format, [])
820 821
        self.assertRaises(KeyError,   "{0]}".format)
        self.assertRaises(ValueError, "{0.[]}".format, 0)
822
        self.assertRaises(ValueError, "{0..foo}".format, 0)
823 824 825 826 827 828
        self.assertRaises(ValueError, "{0[0}".format, 0)
        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
        self.assertRaises(KeyError,   "{c]}".format)
        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
        self.assertRaises(ValueError, "{0}}".format, 0)
        self.assertRaises(KeyError,   "{foo}".format, bar=3)
829
        self.assertRaises(ValueError, "{0!x}".format, 3)
830 831
        self.assertRaises(ValueError, "{0!}".format, 0)
        self.assertRaises(ValueError, "{0!rs}".format, 0)
832
        self.assertRaises(ValueError, "{!}".format)
833 834 835
        self.assertRaises(IndexError, "{:}".format)
        self.assertRaises(IndexError, "{:s}".format)
        self.assertRaises(IndexError, "{}".format)
836 837 838
        big = "23098475029384702983476098230754973209482573"
        self.assertRaises(ValueError, ("{" + big + "}").format)
        self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
839

840 841 842 843
        # issue 6089
        self.assertRaises(ValueError, "{0[0]x}".format, [None])
        self.assertRaises(ValueError, "{0[0](10)}".format, [None])

844 845 846 847 848 849 850 851 852 853 854 855 856
        # can't have a replacement on the field name portion
        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)

        # exceed maximum recursion depth
        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                          0, 1, 2, 3, 4, 5, 6, 7)

        # string format spec errors
        self.assertRaises(ValueError, "{0:-s}".format, '')
        self.assertRaises(ValueError, format, "", "-")
        self.assertRaises(ValueError, "{0:=s}".format, '')

Eric Smith's avatar
Eric Smith committed
857 858 859 860
        # Alternate formatting is not supported
        self.assertRaises(ValueError, format, '', '#')
        self.assertRaises(ValueError, format, '', '#20')

861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893
    def test_format_map(self):
        self.assertEqual(''.format_map({}), '')
        self.assertEqual('a'.format_map({}), 'a')
        self.assertEqual('ab'.format_map({}), 'ab')
        self.assertEqual('a{{'.format_map({}), 'a{')
        self.assertEqual('a}}'.format_map({}), 'a}')
        self.assertEqual('{{b'.format_map({}), '{b')
        self.assertEqual('}}b'.format_map({}), '}b')
        self.assertEqual('a{{b'.format_map({}), 'a{b')

        # using mappings
        class Mapping(dict):
            def __missing__(self, key):
                return key
        self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
        self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')

        class InternalMapping:
            def __init__(self):
                self.mapping = {'a': 'hello'}
            def __getitem__(self, key):
                return self.mapping[key]
        self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')


        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec
        self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')

        # test various errors
894 895 896 897 898 899 900 901 902
        self.assertRaises(TypeError, ''.format_map)
        self.assertRaises(TypeError, 'a'.format_map)

        self.assertRaises(ValueError, '{'.format_map, {})
        self.assertRaises(ValueError, '}'.format_map, {})
        self.assertRaises(ValueError, 'a{'.format_map, {})
        self.assertRaises(ValueError, 'a}'.format_map, {})
        self.assertRaises(ValueError, '{a'.format_map, {})
        self.assertRaises(ValueError, '}a'.format_map, {})
903

904 905 906 907 908
        # issue #12579: can't supply positional params to format_map
        self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
        self.assertRaises(ValueError, '{}'.format_map, 'a')
        self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})

909 910 911 912 913 914 915 916 917 918 919 920 921 922 923
    def test_format_huge_precision(self):
        format_string = ".{}f".format(sys.maxsize + 1)
        with self.assertRaises(ValueError):
            result = format(2.34, format_string)

    def test_format_huge_width(self):
        format_string = "{}f".format(sys.maxsize + 1)
        with self.assertRaises(ValueError):
            result = format(2.34, format_string)

    def test_format_huge_item_number(self):
        format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
        with self.assertRaises(ValueError):
            result = format_string.format(2.34)

924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953
    def test_format_auto_numbering(self):
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        self.assertEqual('{}'.format(10), '10')
        self.assertEqual('{:5}'.format('s'), 's    ')
        self.assertEqual('{!r}'.format('s'), "'s'")
        self.assertEqual('{._x}'.format(C(10)), '10')
        self.assertEqual('{[1]}'.format([1, 2]), '2')
        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')

        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')

        # can't mix and match numbering and auto-numbering
        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)

        # can mix and match auto-numbering and named
        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')

954
    def test_formatting(self):
955
        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
956
        # Testing Unicode formatting strings...
957 958 959 960 961 962
        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
963
        if not sys.platform.startswith('java'):
964
            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
Georg Brandl's avatar
Georg Brandl committed
965 966
            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
967 968
        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
Walter Dörwald's avatar
Walter Dörwald committed
969

970
        self.assertEqual('%c' % 0x1234, '\u1234')
971 972 973 974
        self.assertEqual('%c' % 0x21483, '\U00021483')
        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
        self.assertEqual('%c' % '\U00021483', '\U00021483')
        self.assertRaises(TypeError, "%c".__mod__, "aa")
975
        self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
976
        self.assertRaises(TypeError, "%i".__mod__, "aa")
977 978 979

        # formatting jobs delegated from the string implementation:
        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
980 981 982 983 984 985 986 987 988 989 990 991 992 993 994
        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123},  '...abc...')
        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
        self.assertEqual('...%s...' % "abc", '...abc...')
        self.assertEqual('%*s' % (5,'abc',), '  abc')
        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
        self.assertEqual('%c' % 'a', 'a')
995 996
        class Wrapper:
            def __str__(self):
997 998
                return '\u1234'
        self.assertEqual('%s' % Wrapper(), '\u1234')
999

1000 1001 1002 1003 1004 1005 1006 1007
        # issue 3382
        NAN = float('nan')
        INF = float('inf')
        self.assertEqual('%f' % NAN, 'nan')
        self.assertEqual('%F' % NAN, 'NAN')
        self.assertEqual('%f' % INF, 'inf')
        self.assertEqual('%F' % INF, 'INF')

1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019
    @support.cpython_only
    def test_formatting_huge_precision(self):
        from _testcapi import INT_MAX
        format_string = "%.{}f".format(INT_MAX + 1)
        with self.assertRaises(ValueError):
            result = format_string % 2.34

    def test_formatting_huge_width(self):
        format_string = "%{}f".format(sys.maxsize + 1)
        with self.assertRaises(ValueError):
            result = format_string % 2.34

1020 1021
    def test_startswith_endswith_errors(self):
        for meth in ('foo'.startswith, 'foo'.endswith):
Ezio Melotti's avatar
Ezio Melotti committed
1022
            with self.assertRaises(TypeError) as cm:
1023
                meth(['f'])
Ezio Melotti's avatar
Ezio Melotti committed
1024
            exc = str(cm.exception)
1025 1026 1027
            self.assertIn('str', exc)
            self.assertIn('tuple', exc)

1028
    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
1029
    def test_format_float(self):
1030
        # should not format with a comma, but always with C locale
1031
        self.assertEqual('1.0', '%.1f' % 1.0)
1032

1033 1034 1035 1036
    def test_constructor(self):
        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)

        self.assertEqual(
1037 1038
            str('unicode remains unicode'),
            'unicode remains unicode'
1039 1040
        )

1041
        class UnicodeSubclass(str):
1042 1043 1044
            pass

        self.assertEqual(
1045 1046
            str(UnicodeSubclass('unicode subclass becomes unicode')),
            'unicode subclass becomes unicode'
1047 1048 1049
        )

        self.assertEqual(
1050 1051
            str('strings are converted to unicode'),
            'strings are converted to unicode'
1052 1053 1054 1055 1056 1057 1058 1059 1060
        )

        class StringCompat:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return self.x

        self.assertEqual(
1061 1062
            str(StringCompat('__str__ compatible objects are recognized')),
            '__str__ compatible objects are recognized'
1063 1064 1065 1066 1067
        )

        # unicode(obj) is compatible to str():

        o = StringCompat('unicode(obj) is compatible to str()')
1068
        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1069 1070
        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')

1071
        for obj in (123, 123.45, 123):
1072
            self.assertEqual(str(obj), str(str(obj)))
1073 1074 1075 1076 1077 1078 1079

        # unicode(obj, encoding, error) tests (this maps to
        # PyUnicode_FromEncodedObject() at C level)

        if not sys.platform.startswith('java'):
            self.assertRaises(
                TypeError,
1080 1081
                str,
                'decoding unicode is not supported',
1082 1083 1084 1085 1086
                'utf-8',
                'strict'
            )

        self.assertEqual(
1087
            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1088
            'strings are decoded to unicode'
1089 1090 1091 1092
        )

        if not sys.platform.startswith('java'):
            self.assertEqual(
1093
                str(
1094
                    memoryview(b'character buffers are decoded to unicode'),
1095 1096 1097
                    'utf-8',
                    'strict'
                ),
1098
                'character buffers are decoded to unicode'
1099 1100
            )

1101
        self.assertRaises(TypeError, str, 42, 42, 42)
1102 1103 1104

    def test_codecs_utf7(self):
        utfTests = [
1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115
            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
            ('+', b'+-'),
            ('+-', b'+--'),
            ('+?', b'+-?'),
            ('\?', b'+AFw?'),
            ('+?', b'+-?'),
            (r'\\?', b'+AFwAXA?'),
            (r'\\\?', b'+AFwAXABc?'),
1116 1117 1118
            (r'++--', b'+-+---'),
            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
            ('/', b'/'),
1119 1120 1121 1122 1123
        ]

        for (x, y) in utfTests:
            self.assertEqual(x.encode('utf-7'), y)

1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135
        # Unpaired surrogates are passed through
        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')

        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1136

1137
        # Issue #2242: crash on some Windows/MSVC versions
1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148
        self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')

        # Direct encoded characters
        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
        # Optional direct characters
        set_o = '!"#$%&*;<=>@[]^_`{|}'
        for c in set_d:
            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
            self.assertEqual(c.encode('ascii').decode('utf7'), c)
        for c in set_o:
            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1149

1150
    def test_codecs_utf8(self):
1151 1152
        self.assertEqual(''.encode('utf-8'), b'')
        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1153 1154 1155
        if sys.maxunicode == 65535:
            self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82')
            self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1156 1157
        self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
        self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1158 1159 1160 1161
        if sys.maxunicode == 65535:
            self.assertEqual(
                ('\ud800\udc02'*1000).encode('utf-8'),
                b'\xf0\x90\x80\x82'*1000)
1162
        self.assertEqual(
1163 1164 1165 1166 1167 1168
            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
            ' Nunstuck git und'.encode('utf-8'),
1169 1170 1171 1172 1173 1174 1175 1176 1177 1178
            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1179 1180 1181
        )

        # UTF-8 specific decoding tests
1182 1183 1184
        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1185 1186 1187 1188 1189

        # Other possible utf-8 test cases:
        # * strict decoding testing for all of the
        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8

1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342
    def test_utf8_decode_valid_sequences(self):
        sequences = [
            # single byte
            (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
            # 2 bytes
            (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
            # 3 bytes
            (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
            (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
            # 4 bytes
            (b'\xF0\x90\x80\x80', '\U00010000'),
            (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
        ]
        for seq, res in sequences:
            self.assertEqual(seq.decode('utf-8'), res)


    def test_utf8_decode_invalid_sequences(self):
        # continuation bytes in a sequence of 2, 3, or 4 bytes
        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
        # start bytes of a 2-byte sequence equivalent to codepoints < 0x7F
        invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
        # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
        invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
        invalid_start_bytes = (
            continuation_bytes + invalid_2B_seq_start_bytes +
            invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
        )

        for byte in invalid_start_bytes:
            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')

        for sb in invalid_2B_seq_start_bytes:
            for cb in continuation_bytes:
                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')

        for sb in invalid_4B_seq_start_bytes:
            for cb1 in continuation_bytes[:3]:
                for cb3 in continuation_bytes[:3]:
                    self.assertRaises(UnicodeDecodeError,
                                      (sb+cb1+b'\x80'+cb3).decode, 'utf-8')

        for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
            self.assertRaises(UnicodeDecodeError,
                              (b'\xE0'+cb+b'\x80').decode, 'utf-8')
            self.assertRaises(UnicodeDecodeError,
                              (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
        # surrogates
        for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
            self.assertRaises(UnicodeDecodeError,
                              (b'\xED'+cb+b'\x80').decode, 'utf-8')
            self.assertRaises(UnicodeDecodeError,
                              (b'\xED'+cb+b'\xBF').decode, 'utf-8')
        for cb in [bytes([x]) for x in range(0x80, 0x90)]:
            self.assertRaises(UnicodeDecodeError,
                              (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
            self.assertRaises(UnicodeDecodeError,
                              (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
        for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
            self.assertRaises(UnicodeDecodeError,
                              (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
            self.assertRaises(UnicodeDecodeError,
                              (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')

    def test_issue8271(self):
        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
        # only the start byte and the continuation byte(s) are now considered
        # invalid, instead of the number of bytes specified by the start byte.
        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
        # table 3-8, Row 2) for more information about the algorithm used.
        FFFD = '\ufffd'
        sequences = [
            # invalid start bytes
            (b'\x80', FFFD), # continuation byte
            (b'\x80\x80', FFFD*2), # 2 continuation bytes
            (b'\xc0', FFFD),
            (b'\xc0\xc0', FFFD*2),
            (b'\xc1', FFFD),
            (b'\xc1\xc0', FFFD*2),
            (b'\xc0\xc1', FFFD*2),
            # with start byte of a 2-byte sequence
            (b'\xc2', FFFD), # only the start byte
            (b'\xc2\xc2', FFFD*2), # 2 start bytes
            (b'\xc2\xc2\xc2', FFFD*3), # 2 start bytes
            (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
            # with start byte of a 3-byte sequence
            (b'\xe1', FFFD), # only the start byte
            (b'\xe1\xe1', FFFD*2), # 2 start bytes
            (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
            (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
            (b'\xe1\x80', FFFD), # only 1 continuation byte
            (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
            (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
            (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
            (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
            (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
            (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
            # with start byte of a 4-byte sequence
            (b'\xf1', FFFD), # only the start byte
            (b'\xf1\xf1', FFFD*2), # 2 start bytes
            (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
            (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
            (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
            (b'\xf1\x80', FFFD), # only 1 continuation bytes
            (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
            (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
            (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
            (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
            (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
            (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
            (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
            (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
            (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
            (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
            (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
            (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
            # with invalid start byte of a 4-byte sequence (rfc2279)
            (b'\xf5', FFFD), # only the start byte
            (b'\xf5\xf5', FFFD*2), # 2 start bytes
            (b'\xf5\x80', FFFD*2), # only 1 continuation byte
            (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
            (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
            (b'\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
            (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
            (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
            # with invalid start byte of a 5-byte sequence (rfc2279)
            (b'\xf8', FFFD), # only the start byte
            (b'\xf8\xf8', FFFD*2), # 2 start bytes
            (b'\xf8\x80', FFFD*2), # only one continuation byte
            (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
            (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
            # with invalid start byte of a 6-byte sequence (rfc2279)
            (b'\xfc', FFFD), # only the start byte
            (b'\xfc\xfc', FFFD*2), # 2 start bytes
            (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
            (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
            # invalid start byte
            (b'\xfe', FFFD),
            (b'\xfe\x80\x80', FFFD*3),
            # other sequences
            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
            (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
        ]
        for n, (seq, res) in enumerate(sequences):
            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
            self.assertEqual(seq.decode('utf-8', 'replace'), res)
            self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
            self.assertEqual(seq.decode('utf-8', 'ignore'),
                             res.replace('\uFFFD', ''))

1343 1344
    def test_codecs_idna(self):
        # Test whether trailing dot is preserved
1345
        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
1346

1347 1348
    def test_codecs_errors(self):
        # Error handling (encoding)
1349 1350
        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
1351 1352
        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
1353 1354 1355 1356
        self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
                         'Andr\202 x'.encode('ascii', errors='replace'))
        self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
                         'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
1357 1358

        # Error handling (decoding)
1359 1360 1361 1362
        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
1363 1364

        # Error handling (unknown character names)
1365
        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
1366 1367

        # Error handling (truncated escape sequence)
1368
        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
1369

1370 1371
        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
1372 1373
        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
        self.assertRaises(TypeError, "hello".encode, "test.unicode2")
1374 1375 1376 1377 1378 1379
        # executes PyUnicode_Encode()
        import imp
        self.assertRaises(
            ImportError,
            imp.find_module,
            "non-existing module",
1380
            ["non-existing dir"]
1381 1382 1383
        )

        # Error handling (wrong arguments)
1384
        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
1385

1386 1387 1388 1389 1390 1391 1392
        # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
        self.assertRaises(UnicodeError, int, "\ud800")
        self.assertRaises(UnicodeError, int, "\udf00")
        self.assertRaises(UnicodeError, float, "\ud800")
        self.assertRaises(UnicodeError, float, "\udf00")
        self.assertRaises(UnicodeError, complex, "\ud800")
        self.assertRaises(UnicodeError, complex, "\udf00")
1393 1394 1395

    def test_codecs(self):
        # Encoding
1396 1397 1398 1399 1400 1401 1402
        self.assertEqual('hello'.encode('ascii'), b'hello')
        self.assertEqual('hello'.encode('utf-7'), b'hello')
        self.assertEqual('hello'.encode('utf-8'), b'hello')
        self.assertEqual('hello'.encode('utf8'), b'hello')
        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
        self.assertEqual('hello'.encode('latin-1'), b'hello')
1403 1404

        # Roundtrip safety for BMP (just the first 1024 chars)
1405
        for c in range(1024):
1406
            u = chr(c)
1407 1408 1409
            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                             'utf-16-be', 'raw_unicode_escape',
                             'unicode_escape', 'unicode_internal'):
1410
                self.assertEqual(str(u.encode(encoding),encoding), u)
1411 1412

        # Roundtrip safety for BMP (just the first 256 chars)
1413
        for c in range(256):
1414
            u = chr(c)
1415
            for encoding in ('latin-1',):
1416
                self.assertEqual(str(u.encode(encoding),encoding), u)
1417 1418

        # Roundtrip safety for BMP (just the first 128 chars)
1419
        for c in range(128):
1420
            u = chr(c)
1421
            for encoding in ('ascii',):
1422
                self.assertEqual(str(u.encode(encoding),encoding), u)
1423 1424

        # Roundtrip safety for non-BMP (just a few chars)
1425
        u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
1426 1427 1428
        for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                         #'raw_unicode_escape',
                         'unicode_escape', 'unicode_internal'):
1429
            self.assertEqual(str(u.encode(encoding),encoding), u)
1430 1431 1432 1433 1434

        # UTF-8 must be roundtrip safe for all UCS-2 code points
        # This excludes surrogates: in the full range, there would be
        # a surrogate pair (\udbff\udc00), which gets converted back
        # to a non-BMP character (\U0010fc00)
1435 1436
        u = ''.join(map(chr, list(range(0,0xd800)) +
                             list(range(0xe000,0x10000))))
1437
        for encoding in ('utf-8',):
1438
            self.assertEqual(str(u.encode(encoding),encoding), u)
1439 1440 1441

    def test_codecs_charmap(self):
        # 0-127
1442
        s = bytes(range(128))
1443 1444
        for encoding in (
            'cp037', 'cp1026',
1445 1446
            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466
            'cp863', 'cp865', 'cp866',
            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
            'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
            'mac_cyrillic', 'mac_latin2',

            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
            'cp1256', 'cp1257', 'cp1258',
            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
            'cp1006', 'iso8859_8',

            ### These have undefined mappings:
            #'cp424',

            ### These fail the round-trip:
            #'cp875'

            ):
1467
            self.assertEqual(str(s, encoding).encode(encoding), s)
1468 1469

        # 128-255
1470
        s = bytes(range(128, 256))
1471 1472
        for encoding in (
            'cp037', 'cp1026',
1473 1474
            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491
            'cp863', 'cp865', 'cp866',
            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
            'iso8859_2', 'iso8859_4', 'iso8859_5',
            'iso8859_9', 'koi8_r', 'latin_1',
            'mac_cyrillic', 'mac_latin2',

            ### These have undefined mappings:
            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
            #'cp1256', 'cp1257', 'cp1258',
            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
            #'iso8859_3', 'iso8859_6', 'iso8859_7',
            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

            ### These fail the round-trip:
            #'cp1006', 'cp875', 'iso8859_8',

            ):
1492
            self.assertEqual(str(s, encoding).encode(encoding), s)
1493 1494

    def test_concatenation(self):
1495 1496 1497 1498 1499
        self.assertEqual(("abc" "def"), "abcdef")
        self.assertEqual(("abc" "def"), "abcdef")
        self.assertEqual(("abc" "def"), "abcdef")
        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
1500 1501 1502 1503

    def test_printing(self):
        class BitBucket:
            def write(self, text):
1504
                pass
1505 1506

        out = BitBucket()
1507 1508 1509 1510 1511 1512 1513 1514 1515
        print('abc', file=out)
        print('abc', 'def', file=out)
        print('abc', 'def', file=out)
        print('abc', 'def', file=out)
        print('abc\n', file=out)
        print('abc\n', end=' ', file=out)
        print('abc\n', end=' ', file=out)
        print('def\n', file=out)
        print('def\n', file=out)
1516

1517
    def test_ucs4(self):
1518
        x = '\U00100000'
1519 1520 1521
        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
        self.assertEqual(x, y)

1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535
        y = br'\U00100000'
        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
        self.assertEqual(x, y)
        y = br'\U00010000'
        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
        self.assertEqual(x, y)

        try:
            br'\U11111111'.decode("raw-unicode-escape")
        except UnicodeDecodeError as e:
            self.assertEqual(e.start, 0)
            self.assertEqual(e.end, 10)
        else:
            self.fail("Should have raised UnicodeDecodeError")
Christian Heimes's avatar
Christian Heimes committed
1536

1537 1538 1539 1540 1541 1542 1543
    def test_conversion(self):
        # Make sure __unicode__() works properly
        class Foo0:
            def __str__(self):
                return "foo"

        class Foo1:
1544
            def __str__(self):
1545
                return "foo"
1546 1547

        class Foo2(object):
1548
            def __str__(self):
1549
                return "foo"
1550 1551

        class Foo3(object):
1552
            def __str__(self):
1553 1554 1555
                return "foo"

        class Foo4(str):
1556
            def __str__(self):
1557 1558
                return "foo"

1559
        class Foo5(str):
1560
            def __str__(self):
1561 1562 1563 1564 1565 1566
                return "foo"

        class Foo6(str):
            def __str__(self):
                return "foos"

1567
            def __str__(self):
1568
                return "foou"
1569

1570
        class Foo7(str):
1571 1572
            def __str__(self):
                return "foos"
1573
            def __str__(self):
1574
                return "foou"
1575

1576
        class Foo8(str):
1577
            def __new__(cls, content=""):
1578
                return str.__new__(cls, 2*content)
1579
            def __str__(self):
1580 1581
                return self

1582
        class Foo9(str):
1583 1584 1585
            def __str__(self):
                return "not unicode"

1586 1587 1588 1589 1590 1591 1592 1593 1594 1595
        self.assertEqual(str(Foo0()), "foo")
        self.assertEqual(str(Foo1()), "foo")
        self.assertEqual(str(Foo2()), "foo")
        self.assertEqual(str(Foo3()), "foo")
        self.assertEqual(str(Foo4("bar")), "foo")
        self.assertEqual(str(Foo5("bar")), "foo")
        self.assertEqual(str(Foo6("bar")), "foou")
        self.assertEqual(str(Foo7("bar")), "foou")
        self.assertEqual(str(Foo8("foo")), "foofoo")
        self.assertEqual(str(Foo9("foo")), "not unicode")
1596

1597 1598 1599 1600 1601 1602 1603
    def test_unicode_repr(self):
        class s1:
            def __repr__(self):
                return '\\n'

        class s2:
            def __repr__(self):
1604
                return '\\n'
1605 1606 1607 1608

        self.assertEqual(repr(s1()), '\\n')
        self.assertEqual(repr(s2()), '\\n')

1609 1610
    def test_printable_repr(self):
        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
Martin v. Löwis's avatar
Martin v. Löwis committed
1611
        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
1612

1613 1614 1615 1616
    def test_expandtabs_overflows_gracefully(self):
        # This test only affects 32-bit platforms because expandtabs can only take
        # an int as the max value, not a 64-bit C long.  If expandtabs is changed
        # to take a 64-bit long, this test should apply to all platforms.
1617
        if sys.maxsize > (1 << 32) or struct.calcsize('P') != 4:
1618
            return
1619
        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
1620

1621 1622 1623 1624 1625
    def test_raiseMemError(self):
        # Ensure that the freelist contains a consistent object, even
        # when a string allocation fails with a MemoryError.
        # This used to crash the interpreter,
        # or leak references when the number was smaller.
1626 1627 1628 1629
        charwidth = 4 if sys.maxunicode >= 0x10000 else 2
        # Note: sys.maxsize is half of the actual max allocation because of
        # the signedness of Py_ssize_t.
        alloc = lambda: "a" * (sys.maxsize // charwidth * 2)
1630 1631 1632
        self.assertRaises(MemoryError, alloc)
        self.assertRaises(MemoryError, alloc)

1633 1634 1635 1636 1637
    def test_format_subclass(self):
        class S(str):
            def __str__(self):
                return '__str__ overridden'
        s = S('xxx')
1638 1639
        self.assertEqual("%s" % s, '__str__ overridden')
        self.assertEqual("{}".format(s), '__str__ overridden')
1640

1641
    # Test PyUnicode_FromFormat()
1642
    def test_from_format(self):
1643
        support.import_module('ctypes')
1644
        from ctypes import pythonapi, py_object, c_int
1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656
        if sys.maxunicode == 65535:
            name = "PyUnicodeUCS2_FromFormat"
        else:
            name = "PyUnicodeUCS4_FromFormat"
        _PyUnicode_FromFormat = getattr(pythonapi, name)
        _PyUnicode_FromFormat.restype = py_object

        def PyUnicode_FromFormat(format, *args):
            cargs = tuple(
                py_object(arg) if isinstance(arg, str) else arg
                for arg in args)
            return _PyUnicode_FromFormat(format, *cargs)
1657 1658

        # ascii format, non-ascii argument
1659
        text = PyUnicode_FromFormat(b'ascii\x7f=%U', 'unicode\xe9')
1660 1661
        self.assertEqual(text, 'ascii\x7f=unicode\xe9')

1662 1663
        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
        # raises an error
1664
        self.assertRaisesRegex(ValueError,
1665
            '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
1666
            'string, got a non-ASCII byte: 0xe9$',
1667
            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
1668

1669 1670 1671
        self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd')
        self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff')

1672
        # other tests
1673
        text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
1674 1675
        self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")

1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688
        text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz')
        self.assertEqual(text, 'repr=abc')

        # Test string decode from parameter of %s using utf-8.
        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
        # '\u4eba\u6c11'
        text = PyUnicode_FromFormat(b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
        self.assertEqual(text, 'repr=\u4eba\u6c11')

        #Test replace error handler.
        text = PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff')
        self.assertEqual(text, 'repr=abc\ufffd')

1689 1690
    # Test PyUnicode_AsWideChar()
    def test_aswidechar(self):
1691
        from _testcapi import unicode_aswidechar
1692
        support.import_module('ctypes')
1693 1694
        from ctypes import c_wchar, sizeof

1695
        wchar, size = unicode_aswidechar('abcdef', 2)
1696 1697
        self.assertEqual(size, 2)
        self.assertEqual(wchar, 'ab')
1698

1699
        wchar, size = unicode_aswidechar('abc', 3)
1700 1701
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc')
1702

1703
        wchar, size = unicode_aswidechar('abc', 4)
1704 1705
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')
1706

1707
        wchar, size = unicode_aswidechar('abc', 10)
1708 1709
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')
1710

1711
        wchar, size = unicode_aswidechar('abc\0def', 20)
1712 1713
        self.assertEqual(size, 7)
        self.assertEqual(wchar, 'abc\0def\0')
1714

1715 1716 1717 1718 1719 1720 1721
        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            buflen = 3
            nchar = 2
        else: # sizeof(c_wchar) == 4
            buflen = 2
            nchar = 1
1722
        wchar, size = unicode_aswidechar(nonbmp, buflen)
1723 1724
        self.assertEqual(size, nchar)
        self.assertEqual(wchar, nonbmp + '\0')
1725

1726 1727
    # Test PyUnicode_AsWideCharString()
    def test_aswidecharstring(self):
1728
        from _testcapi import unicode_aswidecharstring
1729
        support.import_module('ctypes')
1730 1731
        from ctypes import c_wchar, sizeof

1732
        wchar, size = unicode_aswidecharstring('abc')
1733 1734
        self.assertEqual(size, 3)
        self.assertEqual(wchar, 'abc\0')
1735

1736
        wchar, size = unicode_aswidecharstring('abc\0def')
1737 1738
        self.assertEqual(size, 7)
        self.assertEqual(wchar, 'abc\0def\0')
1739

1740 1741 1742 1743 1744
        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            nchar = 2
        else: # sizeof(c_wchar) == 4
            nchar = 1
1745
        wchar, size = unicode_aswidecharstring(nonbmp)
1746 1747
        self.assertEqual(size, nchar)
        self.assertEqual(wchar, nonbmp + '\0')
1748

1749

1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799
class StringModuleTest(unittest.TestCase):
    def test_formatter_parser(self):
        def parse(format):
            return list(_string.formatter_parser(format))

        formatter = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}")
        self.assertEqual(formatter, [
            ('prefix ', '2', '', 's'),
            ('xxx', '0', '^+10.3f', None),
            ('', 'obj.attr', '', 's'),
            (' ', 'z[0]', '10', 's'),
        ])

        formatter = parse("prefix {} suffix")
        self.assertEqual(formatter, [
            ('prefix ', '', '', None),
            (' suffix', None, None, None),
        ])

        formatter = parse("str")
        self.assertEqual(formatter, [
            ('str', None, None, None),
        ])

        formatter = parse("")
        self.assertEqual(formatter, [])

        formatter = parse("{0}")
        self.assertEqual(formatter, [
            ('', '0', '', None),
        ])

        self.assertRaises(TypeError, _string.formatter_parser, 1)

    def test_formatter_field_name_split(self):
        def split(name):
            items = list(_string.formatter_field_name_split(name))
            items[1] = list(items[1])
            return items
        self.assertEqual(split("obj"), ["obj", []])
        self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]])
        self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]])
        self.assertEqual(split("obj.arg[key1][key2]"), [
            "obj",
            [(True, 'arg'),
             (False, 'key1'),
             (False, 'key2'),
            ]])
        self.assertRaises(TypeError, _string.formatter_field_name_split, 1)

1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835
    def test_encode_decimal(self):
        from _testcapi import unicode_encodedecimal
        self.assertEqual(unicode_encodedecimal('123'),
                         b'123')
        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
                         b'3.14')
        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                         b' 3.14 ')
        self.assertRaises(UnicodeEncodeError,
                          unicode_encodedecimal, "123\u20ac", "strict")
        self.assertEqual(unicode_encodedecimal("123\u20ac", "replace"),
                         b'123?')
        self.assertEqual(unicode_encodedecimal("123\u20ac", "ignore"),
                         b'123')
        self.assertEqual(unicode_encodedecimal("123\u20ac", "xmlcharrefreplace"),
                         b'123&#8364;')
        self.assertEqual(unicode_encodedecimal("123\u20ac", "backslashreplace"),
                         b'123\\u20ac')
        self.assertEqual(unicode_encodedecimal("123\u20ac\N{EM SPACE}", "replace"),
                         b'123? ')
        self.assertEqual(unicode_encodedecimal("123\u20ac\u20ac", "replace"),
                         b'123??')
        self.assertEqual(unicode_encodedecimal("123\u20ac\u0660", "replace"),
                         b'123?0')

    def test_transform_decimal(self):
        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
        self.assertEqual(transform_decimal('123'),
                         '123')
        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
                         '3.14')
        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                         "\N{EM SPACE}3.14\N{EN SPACE}")
        self.assertEqual(transform_decimal('123\u20ac'),
                         '123\u20ac')

1836

1837
def test_main():
1838
    support.run_unittest(__name__)
1839 1840 1841

if __name__ == "__main__":
    test_main()