Commit 00f1e3f5 authored by Martin v. Löwis's avatar Martin v. Löwis

Patch #534304: Implement phase 1 of PEP 263.

parent a729daf2
......@@ -7,11 +7,14 @@ chapter describes how the lexical analyzer breaks a file into tokens.
\index{parser}
\index{token}
Python uses the 7-bit \ASCII{} character set for program text and string
literals. 8-bit characters may be used in string literals and comments
but their interpretation is platform dependent; the proper way to
insert 8-bit characters in string literals is by using octal or
hexadecimal escape sequences.
Python uses the 7-bit \ASCII{} character set for program text.
\versionadded[An encoding declaration can be used to indicate that
string literals and comments use an encoding different from ASCII.]{2.3}
For compatibility with older versions, Python only issues a warning if it
finds 8-bit characters; those warnings should be corrected by either
declaring an explicit encoding, or by using escape sequences if those
bytes represent binary data rather than characters.
The run-time character set depends on the I/O devices connected to the
program but is generally a superset of \ASCII.
......@@ -69,6 +72,37 @@ Comments are ignored by the syntax; they are not tokens.
\index{hash character}
\subsection{Encoding declarations\label{encodings}}
If a comment in the first or second line of the Python script matches
the regular expression "coding[=:]\s*([-\w.]+)", this comment is
processed as an encoding declaration; the first group of this
expression names the encoding of the source code file. The recommended
forms of this expression are
\begin{verbatim}
# -*- coding: <encoding-name> -*-
\end{verbatim}
which is recognized also by GNU Emacs, and
\begin{verbatim}
# vim:fileencoding=<encoding-name>
\end{verbatim}
which is recognized by Bram Moolenaar's Vim. In addition, if the first
bytes of the file are the UTF-8 signature ($'\xef\xbb\xbf'$), the
declared file encoding is UTF-8 (this is supported, among others, by
Microsoft's notepad.exe).
If an encoding is declared, the encoding name must be recognized by
Python. % XXX there should be a list of supported encodings.
The encoding is used for all lexical analysis, in particular to find
the end of a string, and to interpret the contents of Unicode literals.
String literals are converted to Unicode for syntactical analysis,
then converted back to their original encoding before interpretation
starts.
\subsection{Explicit line joining\label{explicit-joining}}
Two or more physical lines may be joined into logical lines using
......
......@@ -102,3 +102,6 @@ list_for: 'for' exprlist 'in' testlist_safe [list_iter]
list_if: 'if' test [list_iter]
testlist1: test (',' test)*
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
......@@ -25,6 +25,7 @@ extern "C" {
#define E_OVERFLOW 19 /* Node had too many children */
#define E_TOODEEP 20 /* Too many indentation levels */
#define E_DEDENT 21 /* No matching outer block for dedent */
#define E_DECODE 22 /* Error in decoding into Unicode */
#ifdef __cplusplus
}
......
......@@ -65,3 +65,4 @@
#define list_for 320
#define list_if 321
#define testlist1 322
#define encoding_decl 323
......@@ -190,15 +190,15 @@ POBJS= \
Parser/node.o \
Parser/parser.o \
Parser/parsetok.o \
Parser/tokenizer.o \
Parser/bitset.o \
Parser/metagrammar.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/tokenizer.o
PGOBJS= \
Objects/obmalloc.o \
Python/mysnprintf.o \
Parser/tokenizer_pgen.o \
Parser/firstsets.o \
Parser/grammar.o \
Parser/pgen.o \
......@@ -434,6 +434,8 @@ Parser/grammar.o: $(srcdir)/Parser/grammar.c \
$(srcdir)/Include/grammar.h
Parser/metagrammar.o: $(srcdir)/Parser/metagrammar.c
Parser/tokenizer_pgen.o: $(srcdir)/Parser/tokenizer.c
Python/compile.o Python/symtable.o: $(GRAMMAR_H)
......
......@@ -6,6 +6,8 @@ Type/class unification and new-style classes
Core and builtins
- Encoding declarations (PEP 263, phase 1) have been implemented.
- list.sort() has a new implementation. While cross-platform results
may vary, and in data-dependent ways, this is much faster on many
kinds of partially ordered lists than the previous implementation,
......
......@@ -8,6 +8,7 @@
#include "parser.h"
#include "parsetok.h"
#include "errcode.h"
#include "graminit.h"
int Py_TabcheckFlag;
......@@ -45,8 +46,8 @@ PyParser_ParseStringFlagsFilename(char *s, char *filename,
return NULL;
}
tok->filename = filename ? filename : "<string>";
if (Py_TabcheckFlag || Py_VerboseFlag) {
tok->filename = filename ? filename : "<string>";
tok->altwarning = (tok->filename != NULL);
if (Py_TabcheckFlag >= 2)
tok->alterror++;
......@@ -78,8 +79,8 @@ PyParser_ParseFileFlags(FILE *fp, char *filename, grammar *g, int start,
err_ret->error = E_NOMEM;
return NULL;
}
tok->filename = filename;
if (Py_TabcheckFlag || Py_VerboseFlag) {
tok->filename = filename;
tok->altwarning = (filename != NULL);
if (Py_TabcheckFlag >= 2)
tok->alterror++;
......@@ -185,6 +186,13 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
err_ret->text[len] = '\0';
}
}
} else if (tok->encoding != NULL) {
node* r = PyNode_New(encoding_decl);
r->n_str = tok->encoding;
r->n_nchildren = 1;
r->n_child = n;
tok->encoding = NULL;
n = r;
}
PyTokenizer_Free(tok);
......
This diff is collapsed.
......@@ -4,6 +4,7 @@
extern "C" {
#endif
#include "object.h"
/* Tokenizer interface */
......@@ -38,6 +39,16 @@ struct tok_state {
int alterror; /* Issue error if alternate tabs don't match */
int alttabsize; /* Alternate tab spacing */
int altindstack[MAXINDENT]; /* Stack of alternate indents */
/* Stuff for PEP 0263 */
int decoding_state; /* -1:decoding, 0:init, 1:raw */
int decoding_erred; /* whether erred in decoding */
int read_coding_spec; /* whether 'coding:...' has been read */
int issued_encoding_warning; /* whether non-ASCII warning was issued */
char *encoding;
PyObject *decoding_readline; /* codecs.open(...).readline */
PyObject *decoding_buffer;
const char* enc;
const char* str;
};
extern struct tok_state *PyTokenizer_FromString(char *);
......
#define PGEN
#include "tokenizer.c"
......@@ -485,6 +485,7 @@ struct compiling {
int c_closure; /* Is nested w/freevars? */
struct symtable *c_symtable; /* pointer to module symbol table */
PyFutureFeatures *c_future; /* pointer to module's __future__ */
char *c_encoding; /* source encoding (a borrowed reference) */
};
static int
......@@ -1181,6 +1182,23 @@ parsenumber(struct compiling *co, char *s)
}
}
/* Consume the maximal run of non-ASCII (high-bit-set) bytes starting at
   *sPtr, decode that run as UTF-8, and re-encode the result into
   `encoding`.  On return *sPtr points just past the consumed bytes.
   Returns a new string object holding the re-encoded bytes, or NULL
   (with an exception set) if decoding or encoding fails. */
static PyObject *
decode_utf8(char **sPtr, char *end, char* encoding)
{
PyObject *u, *v;
char *s, *t;
t = s = *sPtr;
/* while (s < end && *s != '\\') s++; */ /* inefficient for u".." */
/* Advance s over every byte with the high bit set; ASCII bytes
   (high bit clear) terminate the run.  *s & 0x80 tests the high bit
   regardless of whether `char` is signed. */
while (s < end && (*s & 0x80)) s++;
*sPtr = s;
/* Decode the run [t, s) from UTF-8 to a Unicode object... */
u = PyUnicode_DecodeUTF8(t, s - t, NULL);
if (u == NULL)
return NULL;
/* ...then encode it into the requested target encoding (callers in
   parsestr pass either "utf-16-be" or the source file's declared
   encoding). */
v = PyUnicode_AsEncodedString(u, encoding, NULL);
Py_DECREF(u);
return v;
}
static PyObject *
parsestr(struct compiling *com, char *s)
{
......@@ -1193,6 +1211,8 @@ parsestr(struct compiling *com, char *s)
int first = *s;
int quote = first;
int rawmode = 0;
char* encoding = ((com == NULL) ? NULL : com->c_encoding);
int need_encoding;
int unicode = 0;
if (isalpha(quote) || quote == '_') {
......@@ -1230,28 +1250,101 @@ parsestr(struct compiling *com, char *s)
}
#ifdef Py_USING_UNICODE
if (unicode || Py_UnicodeFlag) {
PyObject *u, *w;
if (encoding == NULL) {
buf = s;
u = NULL;
} else if (strcmp(encoding, "iso-8859-1") == 0) {
buf = s;
u = NULL;
} else {
/* "\XX" may become "\u005c\uHHLL" (12 bytes) */
u = PyString_FromStringAndSize((char *)NULL, len * 4);
if (u == NULL)
return NULL;
p = buf = PyString_AsString(u);
end = s + len;
while (s < end) {
if (*s == '\\') {
*p++ = *s++;
if (*s & 0x80) {
strcpy(p, "u005c");
p += 5;
}
}
if (*s & 0x80) { /* XXX inefficient */
char *r;
int rn, i;
w = decode_utf8(&s, end, "utf-16-be");
if (w == NULL) {
Py_DECREF(u);
return NULL;
}
r = PyString_AsString(w);
rn = PyString_Size(w);
assert(rn % 2 == 0);
for (i = 0; i < rn; i += 2) {
sprintf(p, "\\u%02x%02x",
r[i + 0] & 0xFF,
r[i + 1] & 0xFF);
p += 6;
}
Py_DECREF(w);
} else {
*p++ = *s++;
}
}
len = p - buf;
}
if (rawmode)
v = PyUnicode_DecodeRawUnicodeEscape(
s, len, NULL);
v = PyUnicode_DecodeRawUnicodeEscape(buf, len, NULL);
else
v = PyUnicode_DecodeUnicodeEscape(
s, len, NULL);
v = PyUnicode_DecodeUnicodeEscape(buf, len, NULL);
Py_XDECREF(u);
if (v == NULL)
PyErr_SyntaxLocation(com->c_filename, com->c_lineno);
return v;
}
#endif
if (rawmode || strchr(s, '\\') == NULL)
return PyString_FromStringAndSize(s, len);
v = PyString_FromStringAndSize((char *)NULL, len);
need_encoding = (encoding != NULL &&
strcmp(encoding, "utf-8") != 0 &&
strcmp(encoding, "iso-8859-1") != 0);
if (rawmode || strchr(s, '\\') == NULL) {
if (need_encoding) {
PyObject* u = PyUnicode_DecodeUTF8(s, len, NULL);
if (u == NULL)
return NULL;
v = PyUnicode_AsEncodedString(u, encoding, NULL);
Py_DECREF(u);
return v;
} else {
return PyString_FromStringAndSize(s, len);
}
}
v = PyString_FromStringAndSize((char *)NULL, /* XXX 4 is enough? */
need_encoding ? len * 4 : len);
if (v == NULL)
return NULL;
p = buf = PyString_AsString(v);
end = s + len;
while (s < end) {
if (*s != '\\') {
*p++ = *s++;
ORDINAL:
if (need_encoding && (*s & 0x80)) {
char *r;
int rn;
PyObject* w = decode_utf8(&s, end, encoding);
if (w == NULL)
return NULL;
r = PyString_AsString(w);
rn = PyString_Size(w);
memcpy(p, r, rn);
p += rn;
Py_DECREF(w);
} else {
*p++ = *s++;
}
continue;
}
s++;
......@@ -1320,8 +1413,8 @@ parsestr(struct compiling *com, char *s)
#endif
default:
*p++ = '\\';
*p++ = s[-1];
break;
s--;
goto ORDINAL;
}
}
_PyString_Resize(&v, (int)(p - buf));
......@@ -4149,6 +4242,12 @@ jcompile(node *n, char *filename, struct compiling *base,
PyCodeObject *co;
if (!com_init(&sc, filename))
return NULL;
if (TYPE(n) == encoding_decl) {
sc.c_encoding = STR(n);
n = CHILD(n, 0);
} else {
sc.c_encoding = NULL;
}
if (base) {
sc.c_private = base->c_private;
sc.c_symtable = base->c_symtable;
......@@ -4157,6 +4256,10 @@ jcompile(node *n, char *filename, struct compiling *base,
|| (sc.c_symtable->st_cur->ste_type == TYPE_FUNCTION))
sc.c_nested = 1;
sc.c_flags |= base->c_flags & PyCF_MASK;
if (base->c_encoding != NULL) {
assert(sc.c_encoding == NULL);
sc.c_encoding = base->c_encoding;
}
} else {
sc.c_private = NULL;
sc.c_future = PyNode_Future(n, filename);
......
......@@ -1463,7 +1463,17 @@ static state states_66[2] = {
{1, arcs_66_0},
{2, arcs_66_1},
};
static dfa dfas[67] = {
static arc arcs_67_0[1] = {
{12, 1},
};
static arc arcs_67_1[1] = {
{0, 1},
};
static state states_67[2] = {
{1, arcs_67_0},
{1, arcs_67_1},
};
static dfa dfas[68] = {
{256, "single_input", 0, 3, states_0,
"\004\030\001\000\000\000\124\360\213\011\162\000\002\000\140\210\244\005\001"},
{257, "file_input", 0, 2, states_1,
......@@ -1598,8 +1608,10 @@ static dfa dfas[67] = {
"\000\000\000\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\000"},
{322, "testlist1", 0, 2, states_66,
"\000\020\001\000\000\000\000\000\000\000\000\000\002\000\140\210\244\005\000"},
{323, "encoding_decl", 0, 2, states_67,
"\000\020\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"},
};
static label labels[148] = {
static label labels[149] = {
{0, "EMPTY"},
{256, 0},
{4, 0},
......@@ -1748,10 +1760,11 @@ static label labels[148] = {
{318, 0},
{319, 0},
{321, 0},
{323, 0},
};
grammar _PyParser_Grammar = {
67,
68,
dfas,
{148, labels},
{149, labels},
256
};
......@@ -1221,6 +1221,7 @@ static void
err_input(perrdetail *err)
{
PyObject *v, *w, *errtype;
PyObject* u = NULL;
char *msg = NULL;
errtype = PyExc_SyntaxError;
v = Py_BuildValue("(ziiz)", err->filename,
......@@ -1272,12 +1273,24 @@ err_input(perrdetail *err)
errtype = PyExc_IndentationError;
msg = "too many levels of indentation";
break;
case E_DECODE: { /* XXX */
PyThreadState* tstate = PyThreadState_Get();
PyObject* value = tstate->curexc_value;
if (value != NULL) {
u = PyObject_Repr(value);
if (u != NULL) {
msg = PyString_AsString(u);
break;
}
}
}
default:
fprintf(stderr, "error=%d\n", err->error);
msg = "unknown parsing error";
break;
}
w = Py_BuildValue("(sO)", msg, v);
Py_XDECREF(u);
Py_XDECREF(v);
PyErr_SetObject(errtype, w);
Py_XDECREF(w);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment