Commit 47383403 authored by Martin v. Löwis's avatar Martin v. Löwis

Implement PEP 3131. Add isidentifier to str.

parent 32c4ac01
...@@ -653,6 +653,11 @@ is at least one character, false otherwise. ...@@ -653,6 +653,11 @@ is at least one character, false otherwise.
For 8-bit strings, this method is locale-dependent. For 8-bit strings, this method is locale-dependent.
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}[str]{isidentifier}{}
Return True if S is a valid identifier according
to the language definition.
\end{methoddesc}
\begin{methoddesc}[str]{islower}{} \begin{methoddesc}[str]{islower}{}
Return true if all cased characters in the string are lowercase and Return true if all cased characters in the string are lowercase and
there is at least one cased character, false otherwise. there is at least one cased character, false otherwise.
......
...@@ -29,6 +29,7 @@ extern "C" { ...@@ -29,6 +29,7 @@ extern "C" {
#define E_EOFS 23 /* EOF in triple-quoted string */ #define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */ #define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */
#define E_IDENTIFIER 26 /* Invalid characters in identifier */
#ifdef __cplusplus #ifdef __cplusplus
} }
......
...@@ -182,6 +182,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; ...@@ -182,6 +182,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS2_Join # define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Partition PyUnicodeUCS2_Partition # define PyUnicode_Partition PyUnicodeUCS2_Partition
# define PyUnicode_RPartition PyUnicodeUCS2_RPartition # define PyUnicode_RPartition PyUnicodeUCS2_RPartition
...@@ -268,6 +269,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; ...@@ -268,6 +269,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS4_Join # define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Partition PyUnicodeUCS4_Partition # define PyUnicode_Partition PyUnicodeUCS4_Partition
# define PyUnicode_RPartition PyUnicodeUCS4_RPartition # define PyUnicode_RPartition PyUnicodeUCS4_RPartition
...@@ -1250,6 +1252,10 @@ PyAPI_FUNC(int) PyUnicode_Contains( ...@@ -1250,6 +1252,10 @@ PyAPI_FUNC(int) PyUnicode_Contains(
PyObject *element /* Element string */ PyObject *element /* Element string */
); );
/* Checks whether argument is a valid identifier. */
PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
/* Externally visible for str.strip(unicode) */ /* Externally visible for str.strip(unicode) */
PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
PyUnicodeObject *self, PyUnicodeObject *self,
......
# -*- coding: utf-8 -*-
€ = 2
# -*- coding: utf-8 -*-
import unittest
from test import test_support
class PEP3131Test(unittest.TestCase):
def test_valid(self):
class T:
ä = 1
µ = 2 # this is a compatibility character
= 3
self.assertEquals(getattr(T, "\xe4"), 1)
self.assertEquals(getattr(T, "\u03bc"), 2)
self.assertEquals(getattr(T, '\u87d2'), 3)
def test_invalid(self):
try:
from test import badsyntax_3131
except SyntaxError as s:
self.assertEquals(str(s),
"invalid character in identifier (badsyntax_3131.py, line 2)")
else:
self.fail("expected exception didn't occur")
def test_main():
    """Run PEP3131Test through test_support.run_unittest."""
    test_support.run_unittest(PEP3131Test)
if __name__=="__main__":
test_main()
...@@ -313,6 +313,19 @@ class UnicodeTest( ...@@ -313,6 +313,19 @@ class UnicodeTest(
self.assertRaises(TypeError, "abc".isnumeric, 42) self.assertRaises(TypeError, "abc".isnumeric, 42)
def test_isidentifier(self):
self.assertTrue("a".isidentifier())
self.assertTrue("Z".isidentifier())
self.assertTrue("_".isidentifier())
self.assertTrue("b0".isidentifier())
self.assertTrue("bc".isidentifier())
self.assertTrue("b_".isidentifier())
self.assertTrue("".isidentifier())
self.assertFalse(" ".isidentifier())
self.assertFalse("[".isidentifier())
self.assertFalse("".isidentifier())
def test_contains(self): def test_contains(self):
# Testing Unicode contains method # Testing Unicode contains method
self.assert_('a' in 'abdb') self.assert_('a' in 'abdb')
......
...@@ -26,6 +26,8 @@ TO DO ...@@ -26,6 +26,8 @@ TO DO
Core and Builtins Core and Builtins
----------------- -----------------
- PEP 3131: Support non-ASCII identifiers.
- PEP 3120: Change default encoding to UTF-8. - PEP 3120: Change default encoding to UTF-8.
- PEP 3123: Use proper C inheritance for PyObject. - PEP 3123: Use proper C inheritance for PyObject.
......
...@@ -227,7 +227,8 @@ int unicode_resize(register PyUnicodeObject *unicode, ...@@ -227,7 +227,8 @@ int unicode_resize(register PyUnicodeObject *unicode,
} }
/* We allocate one more byte to make sure the string is /* We allocate one more byte to make sure the string is
Ux0000 terminated -- XXX is this needed ? Ux0000 terminated; some code (e.g. new_identifier)
relies on that.
XXX This allocator could further be enhanced by assuring that the XXX This allocator could further be enhanced by assuring that the
free list never reduces its size below 1. free list never reduces its size below 1.
...@@ -6679,6 +6680,47 @@ unicode_isnumeric(PyUnicodeObject *self) ...@@ -6679,6 +6680,47 @@ unicode_isnumeric(PyUnicodeObject *self)
return PyBool_FromLong(1); return PyBool_FromLong(1);
} }
/* Return 1 if the unicode object `self` is a valid identifier, else 0.

   The empty string is never an identifier.  Following PEP 3131, the
   first character must be in XID_Start and every later character in
   XID_Continue.  For ASCII this matches the 2.x rules (letters and
   underscore first, then letters, digits and underscore), with one
   addition: LOW LINE has to be accepted explicitly as a first
   character because XID_Start does not contain it. */
int
PyUnicode_IsIdentifier(PyObject *self)
{
    const Py_UNICODE *cur = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
    const Py_UNICODE *const stop = cur + PyUnicode_GET_SIZE(self);

    /* Nothing to inspect: empty strings are rejected up front. */
    if (cur == stop)
        return 0;

    /* First character: underscore or anything in XID_Start. */
    if (*cur != 0x5F /* LOW LINE */ && !_PyUnicode_IsXidStart(*cur))
        return 0;

    /* Every remaining character must be in XID_Continue. */
    while (++cur < stop) {
        if (!_PyUnicode_IsXidContinue(*cur))
            return 0;
    }
    return 1;
}
PyDoc_STRVAR(isidentifier__doc__,
"S.isidentifier() -> bool\n\
\n\
Return True if S is a valid identifier according\n\
to the language definition.");

/* Method implementation behind isidentifier__doc__: converts the 0/1
   result of PyUnicode_IsIdentifier() into a Python bool object. */
static PyObject*
unicode_isidentifier(PyObject *self)
{
    const int valid = PyUnicode_IsIdentifier(self);

    return PyBool_FromLong(valid);
}
PyDoc_STRVAR(join__doc__, PyDoc_STRVAR(join__doc__,
"S.join(sequence) -> unicode\n\ "S.join(sequence) -> unicode\n\
\n\ \n\
...@@ -7714,6 +7756,7 @@ static PyMethodDef unicode_methods[] = { ...@@ -7714,6 +7756,7 @@ static PyMethodDef unicode_methods[] = {
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
{"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0 #if 0
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
......
...@@ -21,13 +21,15 @@ ...@@ -21,13 +21,15 @@
#define is_potential_identifier_start(c) (\ #define is_potential_identifier_start(c) (\
(c >= 'a' && c <= 'z')\ (c >= 'a' && c <= 'z')\
|| (c >= 'A' && c <= 'Z')\ || (c >= 'A' && c <= 'Z')\
|| c == '_') || c == '_'\
|| (c >= 128))
#define is_potential_identifier_char(c) (\ #define is_potential_identifier_char(c) (\
(c >= 'a' && c <= 'z')\ (c >= 'a' && c <= 'z')\
|| (c >= 'A' && c <= 'Z')\ || (c >= 'A' && c <= 'Z')\
|| (c >= '0' && c <= '9')\ || (c >= '0' && c <= '9')\
|| c == '_') || c == '_'\
|| (c >= 128))
extern char *PyOS_Readline(FILE *, FILE *, char *); extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n; /* Return malloc'ed string including trailing \n;
...@@ -1070,6 +1072,19 @@ indenterror(struct tok_state *tok) ...@@ -1070,6 +1072,19 @@ indenterror(struct tok_state *tok)
return 0; return 0;
} }
#ifdef PGEN
/* When PGEN is defined the check is compiled out: every candidate
   identifier is accepted. */
#define verify_identifier(s,e) 1
#else
/* Verify that the identifier follows PEP 3131.

   `start` and `end` delimit the UTF-8 bytes of the candidate token
   (`end` is one past the last byte).  Returns 1 if the decoded text is
   a valid identifier and 0 otherwise.

   If the bytes cannot be decoded (PyUnicode_DecodeUTF8 returns NULL),
   the token is treated as an invalid identifier: the pending exception
   is cleared and 0 is returned.  Without this check the NULL would be
   dereferenced by PyUnicode_IsIdentifier and Py_DECREF. */
static int
verify_identifier(char *start, char *end)
{
    PyObject *s = PyUnicode_DecodeUTF8(start, end-start, NULL);
    int result;

    if (s == NULL) {
        /* The caller reports its own error for a 0 result, so do not
           leave a stale exception behind. */
        PyErr_Clear();
        return 0;
    }
    result = PyUnicode_IsIdentifier(s);
    Py_DECREF(s);
    return result;
}
#endif
/* Get next token, after space stripping etc. */ /* Get next token, after space stripping etc. */
...@@ -1077,7 +1092,7 @@ static int ...@@ -1077,7 +1092,7 @@ static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end) tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{ {
register int c; register int c;
int blankline; int blankline, nonascii;
*p_start = *p_end = NULL; *p_start = *p_end = NULL;
nextline: nextline:
...@@ -1195,6 +1210,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) ...@@ -1195,6 +1210,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
} }
/* Identifier (most frequent token!) */ /* Identifier (most frequent token!) */
nonascii = 0;
if (is_potential_identifier_start(c)) { if (is_potential_identifier_start(c)) {
/* Process r"", u"" and ur"" */ /* Process r"", u"" and ur"" */
switch (c) { switch (c) {
...@@ -1214,9 +1230,16 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) ...@@ -1214,9 +1230,16 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
break; break;
} }
while (is_potential_identifier_char(c)) { while (is_potential_identifier_char(c)) {
if (c >= 128)
nonascii = 1;
c = tok_nextc(tok); c = tok_nextc(tok);
} }
tok_backup(tok, c); tok_backup(tok, c);
if (nonascii &&
!verify_identifier(tok->start, tok->cur)) {
tok->done = E_IDENTIFIER;
return ERRORTOKEN;
}
*p_start = tok->start; *p_start = tok->start;
*p_end = tok->cur; *p_end = tok->cur;
return NAME; return NAME;
......
...@@ -47,8 +47,27 @@ static PyObject *parsestrplus(struct compiling *, const node *n, ...@@ -47,8 +47,27 @@ static PyObject *parsestrplus(struct compiling *, const node *n,
#define COMP_SETCOMP 2 #define COMP_SETCOMP 2
static identifier static identifier
new_identifier(const char* n, PyArena *arena) { new_identifier(const char* n, PyArena *arena)
{
PyObject* id = PyUnicode_DecodeUTF8(n, strlen(n), NULL); PyObject* id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
Py_UNICODE *u = PyUnicode_AS_UNICODE(id);
/* Check whether there are non-ASCII characters in the
identifier; if so, normalize to NFKC. */
for (; *u; u++) {
if (*u >= 128) {
PyObject *m = PyImport_ImportModule("unicodedata");
PyObject *id2;
if (!m)
return NULL;
id2 = PyObject_CallMethod(m, "normalize", "sO", "NFKC", id);
Py_DECREF(m);
if (!id2)
return NULL;
Py_DECREF(id);
id = id2;
break;
}
}
PyUnicode_InternInPlace(&id); PyUnicode_InternInPlace(&id);
PyArena_AddPyObject(arena, id); PyArena_AddPyObject(arena, id);
return id; return id;
......
...@@ -1530,6 +1530,10 @@ err_input(perrdetail *err) ...@@ -1530,6 +1530,10 @@ err_input(perrdetail *err)
case E_LINECONT: case E_LINECONT:
msg = "unexpected character after line continuation character"; msg = "unexpected character after line continuation character";
break; break;
case E_IDENTIFIER:
msg = "invalid character in identifier";
break;
default: default:
fprintf(stderr, "error=%d\n", err->error); fprintf(stderr, "error=%d\n", err->error);
msg = "unknown parsing error"; msg = "unknown parsing error";
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment