Commit 00f1e3f5 authored by Martin v. Löwis's avatar Martin v. Löwis

Patch #534304: Implement phase 1 of PEP 263.

parent a729daf2
......@@ -7,11 +7,14 @@ chapter describes how the lexical analyzer breaks a file into tokens.
\index{parser}
\index{token}
Python uses the 7-bit \ASCII{} character set for program text and string
literals. 8-bit characters may be used in string literals and comments
but their interpretation is platform dependent; the proper way to
insert 8-bit characters in string literals is by using octal or
hexadecimal escape sequences.
Python uses the 7-bit \ASCII{} character set for program text.
\versionadded[An encoding declaration can be used to indicate that
string literals and comments use an encoding different from ASCII.]{2.3}
For compatibility with older versions, Python only issues a warning if it
finds 8-bit characters; those warnings should be corrected by either
declaring an explicit encoding, or by using escape sequences if those
bytes represent binary data rather than characters.
The run-time character set depends on the I/O devices connected to the
program but is generally a superset of \ASCII.
......@@ -69,6 +72,37 @@ Comments are ignored by the syntax; they are not tokens.
\index{hash character}
\subsection{Encoding declarations\label{encodings}}
If a comment in the first or second line of the Python script matches
the regular expression "coding[=:]\s*([-\w.]+)", this comment is
processed as an encoding declaration; the first group of this
expression names the encoding of the source code file. The recommended
forms of this expression are
\begin{verbatim}
# -*- coding: <encoding-name> -*-
\end{verbatim}
which is recognized also by GNU Emacs, and
\begin{verbatim}
# vim:fileencoding=<encoding-name>
\end{verbatim}
which is recognized by Bram Moolenaar's Vim. In addition, if the first
bytes of the file are the UTF-8 signature ($'\xef\xbb\xbf'$), the
declared file encoding is UTF-8 (this is supported, among others, by
Microsoft's notepad.exe).
If an encoding is declared, the encoding name must be recognized by
Python. % XXX there should be a list of supported encodings.
The encoding is used for all lexical analysis, in particular to find
the end of a string, and to interpret the contents of Unicode literals.
String literals are converted to Unicode for syntactical analysis,
then converted back to their original encoding before interpretation
starts.
\subsection{Explicit line joining\label{explicit-joining}}
Two or more physical lines may be joined into logical lines using
......
......@@ -102,3 +102,6 @@ list_for: 'for' exprlist 'in' testlist_safe [list_iter]
list_if: 'if' test [list_iter]
testlist1: test (',' test)*
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
......@@ -25,6 +25,7 @@ extern "C" {
#define E_OVERFLOW 19 /* Node had too many children */
#define E_TOODEEP 20 /* Too many indentation levels */
#define E_DEDENT 21 /* No matching outer block for dedent */
#define E_DECODE 22 /* Error in decoding into Unicode */
#ifdef __cplusplus
}
......
......@@ -65,3 +65,4 @@
#define list_for 320
#define list_if 321
#define testlist1 322
#define encoding_decl 323
......@@ -190,15 +190,15 @@ POBJS= \
Parser/node.o \
Parser/parser.o \
Parser/parsetok.o \
Parser/tokenizer.o \
Parser/bitset.o \
Parser/metagrammar.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/tokenizer.o
PGOBJS= \
Objects/obmalloc.o \
Python/mysnprintf.o \
Parser/tokenizer_pgen.o \
Parser/firstsets.o \
Parser/grammar.o \
Parser/pgen.o \
......@@ -434,6 +434,8 @@ Parser/grammar.o: $(srcdir)/Parser/grammar.c \
$(srcdir)/Include/grammar.h
Parser/metagrammar.o: $(srcdir)/Parser/metagrammar.c
Parser/tokenizer_pgen.o: $(srcdir)/Parser/tokenizer.c
Python/compile.o Python/symtable.o: $(GRAMMAR_H)
......
......@@ -6,6 +6,8 @@ Type/class unification and new-style classes
Core and builtins
- Encoding declarations (PEP 263, phase 1) have been implemented.
- list.sort() has a new implementation. While cross-platform results
may vary, and in data-dependent ways, this is much faster on many
kinds of partially ordered lists than the previous implementation,
......
......@@ -8,6 +8,7 @@
#include "parser.h"
#include "parsetok.h"
#include "errcode.h"
#include "graminit.h"
int Py_TabcheckFlag;
......@@ -45,8 +46,8 @@ PyParser_ParseStringFlagsFilename(char *s, char *filename,
return NULL;
}
tok->filename = filename ? filename : "<string>";
if (Py_TabcheckFlag || Py_VerboseFlag) {
tok->filename = filename ? filename : "<string>";
tok->altwarning = (tok->filename != NULL);
if (Py_TabcheckFlag >= 2)
tok->alterror++;
......@@ -78,8 +79,8 @@ PyParser_ParseFileFlags(FILE *fp, char *filename, grammar *g, int start,
err_ret->error = E_NOMEM;
return NULL;
}
tok->filename = filename;
if (Py_TabcheckFlag || Py_VerboseFlag) {
tok->filename = filename;
tok->altwarning = (filename != NULL);
if (Py_TabcheckFlag >= 2)
tok->alterror++;
......@@ -185,6 +186,13 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
err_ret->text[len] = '\0';
}
}
} else if (tok->encoding != NULL) {
node* r = PyNode_New(encoding_decl);
r->n_str = tok->encoding;
r->n_nchildren = 1;
r->n_child = n;
tok->encoding = NULL;
n = r;
}
PyTokenizer_Free(tok);
......
This diff is collapsed.
......@@ -4,6 +4,7 @@
extern "C" {
#endif
#include "object.h"
/* Tokenizer interface */
......@@ -38,6 +39,16 @@ struct tok_state {
int alterror; /* Issue error if alternate tabs don't match */
int alttabsize; /* Alternate tab spacing */
int altindstack[MAXINDENT]; /* Stack of alternate indents */
/* Stuff for PEP 0263 */
int decoding_state; /* -1:decoding, 0:init, 1:raw */
int decoding_erred; /* whether erred in decoding */
int read_coding_spec; /* whether 'coding:...' has been read */
int issued_encoding_warning; /* whether non-ASCII warning was issued */
char *encoding;
PyObject *decoding_readline; /* codecs.open(...).readline */
PyObject *decoding_buffer;
const char* enc;
const char* str;
};
extern struct tok_state *PyTokenizer_FromString(char *);
......
#define PGEN
#include "tokenizer.c"
......@@ -485,6 +485,7 @@ struct compiling {
int c_closure; /* Is nested w/freevars? */
struct symtable *c_symtable; /* pointer to module symbol table */
PyFutureFeatures *c_future; /* pointer to module's __future__ */
char *c_encoding; /* source encoding (a borrowed reference) */
};
static int
......@@ -1181,6 +1182,23 @@ parsenumber(struct compiling *co, char *s)
}
}
/* Consume the maximal run of non-ASCII (high-bit-set) bytes starting at
   *sPtr, decode that run as UTF-8, and re-encode the result into
   `encoding`.  On return *sPtr points just past the consumed bytes.
   Returns a new string object holding the re-encoded bytes, or NULL
   (with an exception set) if decoding or encoding fails. */
static PyObject *
decode_utf8(char **sPtr, char *end, char* encoding)
{
PyObject *u, *v;
char *s, *t;
t = s = *sPtr;
/* while (s < end && *s != '\\') s++; */ /* inefficient for u".." */
/* Advance s over every byte with the high bit set; ASCII bytes
   (high bit clear) terminate the run.  *s & 0x80 tests the high bit
   regardless of whether `char` is signed. */
while (s < end && (*s & 0x80)) s++;
*sPtr = s;
/* Decode the run [t, s) from UTF-8 to a Unicode object... */
u = PyUnicode_DecodeUTF8(t, s - t, NULL);
if (u == NULL)
return NULL;
/* ...then encode it into the requested target encoding (callers in
   parsestr pass either "utf-16-be" or the source file's declared
   encoding). */
v = PyUnicode_AsEncodedString(u, encoding, NULL);
Py_DECREF(u);
return v;
}
static PyObject *
parsestr(struct compiling *com, char *s)
{
......@@ -1193,6 +1211,8 @@ parsestr(struct compiling *com, char *s)
int first = *s;
int quote = first;
int rawmode = 0;
char* encoding = ((com == NULL) ? NULL : com->c_encoding);
int need_encoding;
int unicode = 0;
if (isalpha(quote) || quote == '_') {
......@@ -1230,28 +1250,101 @@ parsestr(struct compiling *com, char *s)
}
#ifdef Py_USING_UNICODE
if (unicode || Py_UnicodeFlag) {
PyObject *u, *w;
if (encoding == NULL) {
buf = s;
u = NULL;
} else if (strcmp(encoding, "iso-8859-1") == 0) {
buf = s;
u = NULL;
} else {
/* "\XX" may become "\u005c\uHHLL" (12 bytes) */
u = PyString_FromStringAndSize((char *)NULL, len * 4);
if (u == NULL)
return NULL;
p = buf = PyString_AsString(u);
end = s + len;
while (s < end) {
if (*s == '\\') {
*p++ = *s++;
if (*s & 0x80) {
strcpy(p, "u005c");
p += 5;
}
}
if (*s & 0x80) { /* XXX inefficient */
char *r;
int rn, i;
w = decode_utf8(&s, end, "utf-16-be");
if (w == NULL) {
Py_DECREF(u);
return NULL;
}
r = PyString_AsString(w);
rn = PyString_Size(w);
assert(rn % 2 == 0);
for (i = 0; i < rn; i += 2) {
sprintf(p, "\\u%02x%02x",
r[i + 0] & 0xFF,
r[i + 1] & 0xFF);
p += 6;
}
Py_DECREF(w);
} else {
*p++ = *s++;
}
}
len = p - buf;
}
if (rawmode)
v = PyUnicode_DecodeRawUnicodeEscape(
s, len, NULL);
v = PyUnicode_DecodeRawUnicodeEscape(buf, len, NULL);
else
v = PyUnicode_DecodeUnicodeEscape(
s, len, NULL);
v = PyUnicode_DecodeUnicodeEscape(buf, len, NULL);
Py_XDECREF(u);
if (v == NULL)
PyErr_SyntaxLocation(com->c_filename, com->c_lineno);
return v;
}
#endif
if (rawmode || strchr(s, '\\') == NULL)
return PyString_FromStringAndSize(s, len);
v = PyString_FromStringAndSize((char *)NULL, len);
need_encoding = (encoding != NULL &&
strcmp(encoding, "utf-8") != 0 &&
strcmp(encoding, "iso-8859-1") != 0);
if (rawmode || strchr(s, '\\') == NULL) {
if (need_encoding) {
PyObject* u = PyUnicode_DecodeUTF8(s, len, NULL);
if (u == NULL)
return NULL;
v = PyUnicode_AsEncodedString(u, encoding, NULL);
Py_DECREF(u);
return v;
} else {
return PyString_FromStringAndSize(s, len);
}
}
v = PyString_FromStringAndSize((char *)NULL, /* XXX 4 is enough? */
need_encoding ? len * 4 : len);
if (v == NULL)
return NULL;
p = buf = PyString_AsString(v);
end = s + len;
while (s < end) {
if (*s != '\\') {
*p++ = *s++;
ORDINAL:
if (need_encoding && (*s & 0x80)) {
char *r;
int rn;
PyObject* w = decode_utf8(&s, end, encoding);
if (w == NULL)
return NULL;
r = PyString_AsString(w);
rn = PyString_Size(w);
memcpy(p, r, rn);
p += rn;
Py_DECREF(w);
} else {
*p++ = *s++;
}
continue;
}
s++;
......@@ -1320,8 +1413,8 @@ parsestr(struct compiling *com, char *s)
#endif
default:
*p++ = '\\';
*p++ = s[-1];
break;
s--;
goto ORDINAL;
}
}
_PyString_Resize(&v, (int)(p - buf));
......@@ -4149,6 +4242,12 @@ jcompile(node *n, char *filename, struct compiling *base,
PyCodeObject *co;
if (!com_init(&sc, filename))
return NULL;
if (TYPE(n) == encoding_decl) {
sc.c_encoding = STR(n);
n = CHILD(n, 0);
} else {
sc.c_encoding = NULL;
}
if (base) {
sc.c_private = base->c_private;
sc.c_symtable = base->c_symtable;
......@@ -4157,6 +4256,10 @@ jcompile(node *n, char *filename, struct compiling *base,
|| (sc.c_symtable->st_cur->ste_type == TYPE_FUNCTION))
sc.c_nested = 1;
sc.c_flags |= base->c_flags & PyCF_MASK;
if (base->c_encoding != NULL) {
assert(sc.c_encoding == NULL);
sc.c_encoding = base->c_encoding;
}
} else {
sc.c_private = NULL;
sc.c_future = PyNode_Future(n, filename);
......
......@@ -1463,7 +1463,17 @@ static state states_66[2] = {
{1, arcs_66_0},
{2, arcs_66_1},
};
static dfa dfas[67] = {
static arc arcs_67_0[1] = {
{12, 1},
};
static arc arcs_67_1[1] = {
{0, 1},
};
static state states_67[2] = {
{1, arcs_67_0},
{1, arcs_67_1},
};
static dfa dfas[68] = {
{256, "single_input", 0, 3, states_0,
"\004\030\001\000\000\000\124\360\213\011\162\000\002\000\140\210\244\005\001"},
{257, "file_input", 0, 2, states_1,
......@@ -1598,8 +1608,10 @@ static dfa dfas[67] = {
"\000\000\000\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\000"},
{322, "testlist1", 0, 2, states_66,
"\000\020\001\000\000\000\000\000\000\000\000\000\002\000\140\210\244\005\000"},
{323, "encoding_decl", 0, 2, states_67,
"\000\020\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"},
};
static label labels[148] = {
static label labels[149] = {
{0, "EMPTY"},
{256, 0},
{4, 0},
......@@ -1748,10 +1760,11 @@ static label labels[148] = {
{318, 0},
{319, 0},
{321, 0},
{323, 0},
};
grammar _PyParser_Grammar = {
67,
68,
dfas,
{148, labels},
{149, labels},
256
};
......@@ -1221,6 +1221,7 @@ static void
err_input(perrdetail *err)
{
PyObject *v, *w, *errtype;
PyObject* u = NULL;
char *msg = NULL;
errtype = PyExc_SyntaxError;
v = Py_BuildValue("(ziiz)", err->filename,
......@@ -1272,12 +1273,24 @@ err_input(perrdetail *err)
errtype = PyExc_IndentationError;
msg = "too many levels of indentation";
break;
case E_DECODE: { /* XXX */
PyThreadState* tstate = PyThreadState_Get();
PyObject* value = tstate->curexc_value;
if (value != NULL) {
u = PyObject_Repr(value);
if (u != NULL) {
msg = PyString_AsString(u);
break;
}
}
}
default:
fprintf(stderr, "error=%d\n", err->error);
msg = "unknown parsing error";
break;
}
w = Py_BuildValue("(sO)", msg, v);
Py_XDECREF(u);
Py_XDECREF(v);
PyErr_SetObject(errtype, w);
Py_XDECREF(w);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment