# test_codecs.py — unit tests for the codecs module.
1
from test import support
2
import unittest
3
import codecs
4
import sys, _testcapi, io
5

6 7 8 9
class Queue(object):
    """A FIFO byte queue: bytes written at one end are read from the other."""

    def __init__(self, buffer):
        # The initial buffer also fixes the buffer's type (bytes here).
        self._buffer = buffer

    def write(self, chars):
        # Append incoming bytes to the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size means "drain everything currently buffered".
        if size < 0:
            pending = self._buffer
            self._buffer = self._buffer[:0]  # empty slice preserves the type
            return pending
        pending, self._buffer = self._buffer[:size], self._buffer[size:]
        return pending

26 27
class MixInCheckStateHandling:
    """Mixin (combined with a TestCase) that checks getstate()/setstate()
    round-trips of incremental codecs at every possible split point."""

    def check_state_handling_decode(self, encoding, u, s):
        # Split the byte string at every position and verify that a decoder
        # restored from the saved state finishes the job correctly.
        for split in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check the invariant stated in the documentation for
            # IncrementalDecoder.getstate()
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previously buffered input may not produce output
                self.assertTrue(not decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # A fresh decoder primed with the saved state must decode the rest
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        # Same round-trip check, for the incremental encoder.
        for split in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)

class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Base class for stream/incremental decoding tests.

    Subclasses must set the class attribute ``encoding``.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """readline() must honor size/keepends over all line-end styles."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate("\n \r\n \r \u2028".split()):
            # BUG FIX: the original used "\3042", an octal escape yielding
            # "\304" followed by the literal "2".  The intended character is
            # U+3042 (HIRAGANA LETTER A), a non-ASCII code point that
            # exercises multi-byte encodings.
            vw.append((i*200)*"\u3042" + lineend)
            vwo.append((i*200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in "\n \r\n \r \u2028".split():
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader must yield exactly the original lines."""
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() must cope with data arriving in arbitrary pieces."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """Long lines must not confuse the reader's internal charbuffer."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Variant of test_bug1098990_a with different line lengths."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
278

279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
class UTF32Test(ReadTest):
    """Tests for the byte-order-detecting "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" encoded with a little-/big-endian BOM respectively.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # Two writes through one StreamWriter must emit exactly one BOM.
        _, _, reader, writer = codecs.lookup(self.encoding)
        sink = io.BytesIO()
        stream_writer = writer(sink)
        stream_writer.write("spam")
        stream_writer.write("spam")
        encoded = sink.getvalue()
        # exactly one BOM must be present, whichever the native byte order
        self.assertTrue(encoded == self.spamle or encoded == self.spambe)
        # the data must round-trip back through the StreamReader
        stream_reader = reader(io.BytesIO(encoded))
        self.assertEqual(stream_reader.read(), "spamspam")

    def test_badbom(self):
        # All-0xff bytes never form a valid BOM, however much data follows.
        for junk in (4*b"\xff", 8*b"\xff"):
            stream_reader = codecs.getreader(self.encoding)(io.BytesIO(junk))
            self.assertRaises(UnicodeError, stream_reader.read)

    def test_partial(self):
        # Expected cumulative output after each input byte; the first four
        # bytes are the BOM, then each code point takes four bytes.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # An odd byte is replaced/ignored according to the error handler.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        # A truncated sequence must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        # State handling must work for both byte orders.
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
class UTF32LETest(ReadTest):
    # Same battery as ReadTest, against the BOM-less little-endian codec.
    encoding = "utf-32-le"

    def test_partial(self):
        # Expected cumulative output after each input byte; every code point
        # occupies four bytes, so output only grows on each fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        # A non-BMP code point is a single 32-bit unit, least significant
        # byte first — no surrogate pairs in UTF-32.
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        # A truncated (one-byte) sequence must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438
class UTF32BETest(ReadTest):
    # Same battery as ReadTest, against the BOM-less big-endian codec.
    encoding = "utf-32-be"

    def test_partial(self):
        # Expected cumulative output after each input byte; every code point
        # occupies four bytes, so output only grows on each fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        # A non-BMP code point is a single 32-bit unit, most significant
        # byte first — no surrogate pairs in UTF-32.
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        # A truncated (one-byte) sequence must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


447 448
class UTF16Test(ReadTest):
    """Tests for the byte-order-detecting "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" encoded with a little-/big-endian BOM respectively.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two writes through one StreamWriter must emit exactly one BOM.
        _, _, reader, writer = codecs.lookup(self.encoding)
        sink = io.BytesIO()
        stream_writer = writer(sink)
        stream_writer.write("spam")
        stream_writer.write("spam")
        encoded = sink.getvalue()
        # exactly one BOM must be present, whichever the native byte order
        self.assertTrue(encoded == self.spamle or encoded == self.spambe)
        # the data must round-trip back through the StreamReader
        stream_reader = reader(io.BytesIO(encoded))
        self.assertEqual(stream_reader.read(), "spamspam")

    def test_badbom(self):
        # 0xffff is not a valid BOM, however much data follows it.
        for junk in (b"\xff\xff", b"\xff\xff\xff\xff"):
            stream_reader = codecs.getreader(self.encoding)(io.BytesIO(junk))
            self.assertRaises(UnicodeError, stream_reader.read)

    def test_partial(self):
        # Expected cumulative output after each input byte; first two bytes
        # are the BOM, then two bytes per BMP code point.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # An odd byte is replaced/ignored according to the error handler.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        # A truncated sequence must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        # State handling must work for both byte orders.
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'
        encoded = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(encoded)
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
522

523 524
class UTF16LETest(ReadTest):
    # Same battery as ReadTest, against the BOM-less little-endian codec.
    encoding = "utf-16-le"

    def test_partial(self):
        # Expected cumulative output after each input byte; two bytes per
        # BMP code point, so output grows on every second byte.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A truncated (one-byte) sequence must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)

    def test_nonbmp(self):
        # Non-BMP code points are encoded as a little-endian surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

551 552
class UTF16BETest(ReadTest):
    # Same battery as ReadTest, against the BOM-less big-endian codec.
    encoding = "utf-16-be"

    def test_partial(self):
        # Expected cumulative output after each input byte; two bytes per
        # BMP code point, so output grows on every second byte.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A truncated (one-byte) sequence must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)

    def test_nonbmp(self):
        # Non-BMP code points are encoded as a big-endian surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")

579 580
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # Expected cumulative output after each input byte; the UTF-8
        # sequence lengths here are 1, 2, 2, 3 and 3 bytes respectively.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        # Sample covering all UTF-8 sequence lengths (1 to 4 bytes).
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Surrogates are invalid in strict UTF-8, on both codec directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # Each error handler deals with a lone surrogate in its own way.
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # "surrogatepass" round-trips lone surrogates through CESU-8-style
        # three-byte sequences.
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
625

626 627 628
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # "a+-b" encodes to b"a+--b": the literal "+" is escaped as "+-",
        # so decoding stalls while a shift sequence is still open.
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
640 641 642 643

class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() entry point."""

    def test_errors(self):
        # A lone half of a UTF-16 code unit must fail in strict mode.
        self.assertRaises(UnicodeDecodeError,
                          codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any arguments is a usage error.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):
    # NOTE(review): readbuffer_encode() wraps the legacy read-buffer C API;
    # the set of accepted argument types is version-specific.

    def test_array(self):
        # Any object exposing the buffer interface (e.g. array.array) must
        # be accepted and returned as raw bytes plus its length.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        # An empty string yields an empty byte buffer and length 0.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing or non-buffer arguments are usage errors.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

665 666 667 668 669
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        # The initial U+FEFF encodes the 3-byte signature BOM, which is
        # consumed silently; a second U+FEFF must survive as a character.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the leading BOM as well.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Reading a BOM-prefixed stream in chunks of any size must strip
        # exactly one BOM and reproduce the original text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Same as test_stream_bom but without a BOM: nothing may be stripped.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # Empty input must yield an empty result with 0 bytes consumed.
        # NOTE(review): this expects escape_decode() to accept str and
        # return str, which is version-specific behavior — verify against
        # the running interpreter.
        self.assertEqual(codecs.escape_decode(""), ("", 0))
748

749 750
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Write through an EncodedFile that recodes between two encodings,
        # then close it without reading anything back.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

758 759 760
# Punycode reference vectors, taken verbatim from RFC 3492 (Appendix A).
# Each entry pairs a Unicode string with its expected Punycode encoding.
# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check: every vector must be a (unicode, punycode-bytes) pair.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Compare case-insensitively: some of the reference encodings
            # use upper case while our encoder emits only lower case, and
            # lowering just the reference would not help because some of
            # the input characters are themselves upper case.
            produced = str(uni.encode("punycode"), "ascii")
            reference = str(puny, "ascii")
            self.assertEqual(produced.lower(), reference.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Round-tripping through ASCII text must decode identically.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))

885 886 887 888 889 890
class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            valid = [
                (b"\x00\x10\xff\xff", "\U0010ffff"),
                (b"\x00\x00\x01\x01", "\U00000101"),
                (b"", ""),
            ]
            invalid = [
                b"\x7f\xff\xff\xff",
                b"\x80\x00\x00\x00",
                b"\x81\x00\x00\x00",
                b"\x00",
                b"\x00\x00\x00\x00\x00",
            ]
            for raw, expected in valid:
                if sys.byteorder == "little":
                    raw = bytes(reversed(raw))
                self.assertEqual(expected, raw.decode("unicode_internal"))
            for raw in invalid:
                if sys.byteorder == "little":
                    raw = bytes(reversed(raw))
                self.assertRaises(UnicodeDecodeError, raw.decode,
                    "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError as ex:
                # The error must pinpoint the offending four-byte unit.
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            # A registered "ignore" handler must drop the bogus unit.
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
            self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder("a")[1], 1)
        self.assertEqual(encoder("\xe9\u0142")[1], 2)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)

942 943 944
# Nameprep (RFC 3491) test vectors as UTF-8 byte strings: (input, expected).
# An expected value of None means the input must be rejected; a (None, None)
# entry marks a vector that is intentionally skipped.
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Skipped vector.
                continue
            # The vectors are stored as UTF-8 byte strings.
            source = str(raw, "utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters.
                self.assertRaises(UnicodeError, nameprep, source)
            else:
                wanted = str(expected, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(source), wanted)
                except Exception as e:
                    raise support.TestFailed("Test 3.%d: %s" % (index+1, str(e)))
1114

1115 1116
class IDNACodecTest(unittest.TestCase):
    """Checks for the built-in "idna" codec: one-shot, stream and
    incremental (iterencode/iterdecode and the Incremental* classes)."""

    def test_builtin_decode(self):
        for raw, expected in [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]:
            self.assertEqual(str(raw, "idna"), expected)

    def test_builtin_encode(self):
        for text, expected in [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]:
            self.assertEqual(text.encode("idna"), expected)

    def test_stream(self):
        # After the stream is exhausted, further reads yield "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # iterdecode() fed one byte at a time must reassemble the name.
        for raw, expected in [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]:
            pieces = (bytes([byte]) for byte in raw)
            self.assertEqual("".join(codecs.iterdecode(pieces, "idna")), expected)

        # An incremental decoder only emits a label once its trailing dot
        # has been seen (or the input is finalized).
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        for text, expected in [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]:
            self.assertEqual(b"".join(codecs.iterencode(text, "idna")), expected)

        # Same label-buffering behaviour for the incremental encoder.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
1190

1191 1192 1193
class CodecsModuleTest(unittest.TestCase):
    """Exercise the convenience functions exported by the codecs module."""

    def _check_lookup_func(self, func):
        # Every codec lookup helper must reject a missing argument and
        # an unknown codec name.
        self.assertRaises(TypeError, func)
        self.assertRaises(LookupError, func, "__spam__")

    def test_decode(self):
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        # UTF-8 is the default encoding.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        # UTF-8 is the default encoding.
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self._check_lookup_func(codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self._check_lookup_func(codecs.getencoder)

    def test_getdecoder(self):
        self._check_lookup_func(codecs.getdecoder)

    def test_getreader(self):
        self._check_lookup_func(codecs.getreader)

    def test_getwriter(self):
        self._check_lookup_func(codecs.getwriter)
1232

1233 1234 1235 1236
class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        # UTF-8 encoding of two Hangul syllables separated by a newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        wrapped = self.reader(self.stream)
        self.assertEqual(wrapped.readlines(), ['\ud55c\n', '\uae00'])
1242

1243 1244 1245
class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        # Reading recodes from the file encoding (utf-8) to the data
        # encoding (utf-16-le).
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing recodes from the data encoding (utf-8) to the file
        # encoding (latin1).
        sink = io.BytesIO()
        ef = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
1254

1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275
# Every Unicode codec shipped with Python that the generic round-trip tests
# below should cover.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" exists only on Windows builds.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
1372

1373
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    def test_basics(self):
        sample = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered canonical name must match the alias up to
            # '-'/'_' normalization (with two documented exceptions).
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # One-shot encode/decode must round-trip and consume everything.
            (encoded, size) = codecs.getencoder(encoding)(sample)
            self.assertEqual(size, len(sample), "%r != %r (encoding=%r)" % (size, len(sample), encoding))
            (decoded, size) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, sample, "%r != %r (encoding=%r)" % (decoded, sample, encoding))

            if encoding not in broken_unicode_with_streams:
                # Stream writer/reader round-trip, fed one item at a time.
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for ch in sample:
                    writer.write(ch)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk

                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for byte in encodedresult:
                    q.write(bytes([byte]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, sample, "%r != %r (encoding=%r)" % (decodedresult, sample, encoding))

            if encoding not in broken_incremental_coders:
                # Incremental coders fetched via both the Python and the
                # C API, plus iterencode()/iterdecode().
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # Python API incremental round-trip.
                    encodedresult = b""
                    for ch in sample:
                        encodedresult += encoder.encode(ch)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for byte in encodedresult:
                        decodedresult += decoder.decode(bytes([byte]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, sample, "%r != %r (encoding=%r)" % (decodedresult, sample, encoding))

                    # C API incremental round-trip.
                    encodedresult = b""
                    for ch in sample:
                        encodedresult += cencoder.encode(ch)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for byte in encodedresult:
                        decodedresult += cdecoder.decode(bytes([byte]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, sample, "%r != %r (encoding=%r)" % (decodedresult, sample, encoding))

                    # iterencode()/iterdecode() round-trip.
                    result = "".join(codecs.iterdecode(codecs.iterencode(sample, encoding), encoding))
                    self.assertEqual(result, sample, "%r != %r (encoding=%r)" % (result, sample, encoding))

                    # ... and the empty-string edge case.
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # Incremental coders constructed with an errors argument.
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(ch) for ch in sample)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([byte])) for byte in encodedresult)
                        self.assertEqual(decodedresult, sample, "%r != %r (encoding=%r)" % (decodedresult, sample, encoding))

                        encodedresult = b"".join(cencoder.encode(ch) for ch in sample)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([byte])) for byte in encodedresult)
                        self.assertEqual(decodedresult, sample, "%r != %r (encoding=%r)" % (decodedresult, sample, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        sample = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(sample.encode(encoding)))
            for _ in range(5):
                # seek() must reset the internal codec state and buffers.
                reader.seek(0, 0)
                self.assertEqual(reader.read(), sample)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        sample = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, sample, sample.encode(encoding))
                self.check_state_handling_encode(encoding, sample, sample.encode(encoding))

1506 1507
class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        # With a str mapping, each input byte indexes into the map string.
        decode = codecs.charmap_decode

        # A complete map decodes every byte strictly.
        self.assertEqual(
            decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # A byte past the end of the map — or mapped to U+FFFE — is an
        # error: "replace" substitutes U+FFFD ...
        self.assertEqual(
            decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )
        self.assertEqual(
            decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # ... while "ignore" simply drops the offending byte.
        self.assertEqual(
            decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )
        self.assertEqual(
            decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # An empty map with "ignore" consumes all input, yielding nothing.
        allbytes = bytes(range(256))
        self.assertEqual(
            decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

1539 1540
class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        # EncodedFile used as a context manager transcodes the underlying
        # utf-8 bytes to latin-1 bytes on read.
        stream = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(stream, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # StreamReaderWriter used as a context manager decodes the
        # underlying utf-8 bytes to str on read.
        stream = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(stream, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")

class TypesTest(unittest.TestCase):
    def test_decode_unicode(self):
        # Most decoders operate on bytes only and must reject str input.
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs is only available on Windows builds.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decode in decoders:
            self.assertRaises(TypeError, decode, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

class SurrogateEscapeTest(unittest.TestCase):

    def _check(self, encoding, raw, text):
        # Helper: undecodable bytes round-trip through lone surrogates —
        # raw decodes to text, and text encodes back to raw.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373: latin-1 must encode escaped surrogates back to bytes.
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")

class BomTest(unittest.TestCase):
    def test_seek0(self):
        # A BOM-emitting codec must write its BOM exactly once per stream:
        # once at the start, again after an explicit seek(0), and never
        # after a seek() to a non-zero position.
        payload = "1234567890"
        encodings = ("utf-16",
                     "utf-16-le",
                     "utf-16-be",
                     "utf-32",
                     "utf-32-le",
                     "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.write(payload)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.write(payload[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.writer.write(payload[0])
                self.assertNotEqual(stream.writer.tell(), 0)
                stream.writer.seek(0)
                stream.writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.write(payload)
                stream.seek(stream.tell())
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.writer.write(payload)
                stream.writer.seek(stream.writer.tell())
                stream.writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)

# Bytes-to-bytes transform codecs that are always available.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]
# The compression-based codecs are only usable when their optional
# extension module is importable.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings.append("zlib_codec")
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")

class TransformCodecTest(unittest.TestCase):

    def test_basics(self):
        # Round-trip all 256 byte values through the generic
        # encoder/decoder interface of every transform codec.
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            encoded, size = codecs.getencoder(encoding)(binput)
            self.assertEqual(size, len(binput))
            decoded, size = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(size, len(encoded))
            self.assertEqual(decoded, binput)

    def test_read(self):
        # StreamReader.read() must decode the transformed data back.
        for encoding in bytes_transform_encodings:
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # readline() is checked for the codecs whose encoded form is
        # line-oriented (uu and zlib output is excluded here).
        for encoding in bytes_transform_encodings:
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            self.assertEqual(reader.readline(), b"\x80")

def test_main():
    """Run every test case class defined in this module."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)

# Allow running this test file directly from the command line.
if __name__ == "__main__":
    test_main()