Commit e8906fe1 authored by Martin v. Löwis

Implement PEP 3120.

parent d23fd01e
# This file is marked as binary in the CVS, to prevent MacCVS from recoding it.
import unittest
from test import test_support
class PEP3120Test(unittest.TestCase):
def test_pep3120(self):
self.assertEqual(
"Питон".encode("utf-8"),
b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd'
)
self.assertEqual(
"\П".encode("utf-8"),
b'\\\xd0\x9f'
)
def test_badsyntax(self):
try:
import test.badsyntax_pep3120
except SyntaxError as msg:
self.assert_(str(msg).find("Non-UTF-8 code starting with") >= 0)
else:
self.fail("expected exception didn't occur")
def test_main():
    """Run every test in PEP3120Test through the shared test_support runner."""
    test_support.run_unittest(PEP3120Test)
# Allow the file to be executed directly as a script.
if __name__ == "__main__":
    test_main()
...@@ -26,6 +26,8 @@ TO DO ...@@ -26,6 +26,8 @@ TO DO
Core and Builtins Core and Builtins
----------------- -----------------
- PEP 3120: Change default source encoding to UTF-8.
- PEP 3123: Use proper C inheritance for PyObject. - PEP 3123: Use proper C inheritance for PyObject.
- Removed the __oct__ and __hex__ special methods and added a bin() - Removed the __oct__ and __hex__ special methods and added a bin()
......
...@@ -444,6 +444,34 @@ static void fp_ungetc(int c, struct tok_state *tok) { ...@@ -444,6 +444,34 @@ static void fp_ungetc(int c, struct tok_state *tok) {
ungetc(c, tok->fp); ungetc(c, tok->fp);
} }
/* Check whether the bytes at s start a well-formed UTF-8 sequence
   (RFC 3629).  s must point into a NUL-terminated buffer.

   Return the number of bytes forming the sequence (1 to 4) if it is
   well-formed, 0 if not.  A NUL byte counts as a one-byte sequence.

   Overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code
   points above U+10FFFF are rejected.  Bytes are examined strictly
   left to right and scanning stops at the first invalid one, so a
   sequence truncated by the terminating NUL never causes a read
   beyond the terminator. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    /* Allowed range of the first continuation byte; narrowed below
       for the lead bytes that need it. */
    unsigned char lo = 0x80;
    unsigned char hi = 0xBF;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC2)
        /* 0x80-0xBF: stray continuation byte;
           0xC0-0xC1: could only start an overlong form of U+0000..U+007F */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0) {
        expected = 2;
        if (*s == 0xE0)
            /* below 0xA0 would be an overlong form of U+0000..U+07FF */
            lo = 0xA0;
        else if (*s == 0xED)
            /* above 0x9F would encode a surrogate */
            hi = 0x9F;
    }
    else if (*s < 0xF5) {
        expected = 3;
        if (*s == 0xF0)
            /* below 0x90 would be an overlong form of U+0000..U+FFFF */
            lo = 0x90;
        else if (*s == 0xF4)
            /* above 0x8F would exceed U+10FFFF */
            hi = 0x8F;
    }
    else
        /* 0xF5-0xFF never occur in UTF-8 */
        return 0;

    /* Forward scan: a terminating NUL fails the range test and ends
       the scan before any later byte is touched. */
    if (s[1] < lo || s[1] > hi)
        return 0;
    for (i = 2; i <= expected; i++)
        if (s[i] < 0x80 || s[i] > 0xBF)
            return 0;
    return expected + 1;
}
/* Read a line of input from TOK. Determine encoding /* Read a line of input from TOK. Determine encoding
if necessary. */ if necessary. */
...@@ -478,12 +506,13 @@ decoding_fgets(char *s, int size, struct tok_state *tok) ...@@ -478,12 +506,13 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
} }
} }
#ifndef PGEN #ifndef PGEN
/* The default encoding is ASCII, so make sure we don't have any /* The default encoding is UTF-8, so make sure we don't have any
non-ASCII bytes in it. */ non-UTF-8 sequences in it. */
if (line && !tok->encoding) { if (line && !tok->encoding) {
unsigned char *c; unsigned char *c;
for (c = (unsigned char *)line; *c; c++) int length;
if (*c > 127) { for (c = (unsigned char *)line; *c; c += length)
if (!(length = valid_utf8(c))) {
badchar = *c; badchar = *c;
break; break;
} }
...@@ -493,7 +522,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok) ...@@ -493,7 +522,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
/* Need to add 1 to the line number, since this line /* Need to add 1 to the line number, since this line
has not been counted, yet. */ has not been counted, yet. */
sprintf(buf, sprintf(buf,
"Non-ASCII character '\\x%.2x' " "Non-UTF-8 code starting with '\\x%.2x' "
"in file %.200s on line %i, " "in file %.200s on line %i, "
"but no encoding declared; " "but no encoding declared; "
"see http://www.python.org/peps/pep-0263.html for details", "see http://www.python.org/peps/pep-0263.html for details",
......
...@@ -203,7 +203,8 @@ PyAST_FromNode(const node *n, PyCompilerFlags *flags, const char *filename, ...@@ -203,7 +203,8 @@ PyAST_FromNode(const node *n, PyCompilerFlags *flags, const char *filename,
c.c_encoding = STR(n); c.c_encoding = STR(n);
n = CHILD(n, 0); n = CHILD(n, 0);
} else { } else {
c.c_encoding = NULL; /* PEP 3120 */
c.c_encoding = "utf-8";
} }
c.c_arena = arena; c.c_arena = arena;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment