Commit 96ec934e authored by Yury Selivanov's avatar Yury Selivanov

Issue #24619: Simplify async/await tokenization.

This commit simplifies async/await tokenization in tokenizer.c,
tokenize.py & lib2to3/tokenize.py.  The previous solution was to keep
a stack of async-def & def blocks, whereas the new approach is simply
to remember the position of the outermost async-def block.

This change won't bring any parsing performance improvements, but
it makes the code much easier to read and validate.
parent f315c1c0
...@@ -366,10 +366,11 @@ def generate_tokens(readline): ...@@ -366,10 +366,11 @@ def generate_tokens(readline):
contline = None contline = None
indents = [0] indents = [0]
# 'stashed' and 'ctx' are used for async/await parsing # 'stashed' and 'async_*' are used for async/await parsing
stashed = None stashed = None
ctx = [('sync', 0)] async_def = False
in_async = 0 async_def_indent = 0
async_def_nl = False
while 1: # loop over lines in stream while 1: # loop over lines in stream
try: try:
...@@ -438,15 +439,18 @@ def generate_tokens(readline): ...@@ -438,15 +439,18 @@ def generate_tokens(readline):
("<tokenize>", lnum, pos, line)) ("<tokenize>", lnum, pos, line))
indents = indents[:-1] indents = indents[:-1]
cur_indent = indents[-1] if async_def and async_def_indent >= indents[-1]:
while len(ctx) > 1 and ctx[-1][1] >= cur_indent: async_def = False
if ctx[-1][0] == 'async': async_def_nl = False
in_async -= 1 async_def_indent = 0
assert in_async >= 0
ctx.pop()
yield (DEDENT, '', (lnum, pos), (lnum, pos), line) yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
if async_def and async_def_nl and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
else: # continued statement else: # continued statement
if not line: if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0)) raise TokenError("EOF in multi-line statement", (lnum, 0))
...@@ -466,10 +470,13 @@ def generate_tokens(readline): ...@@ -466,10 +470,13 @@ def generate_tokens(readline):
newline = NEWLINE newline = NEWLINE
if parenlev > 0: if parenlev > 0:
newline = NL newline = NL
elif async_def:
async_def_nl = True
if stashed: if stashed:
yield stashed yield stashed
stashed = None stashed = None
yield (newline, token, spos, epos, line) yield (newline, token, spos, epos, line)
elif initial == '#': elif initial == '#':
assert not token.endswith("\n") assert not token.endswith("\n")
if stashed: if stashed:
...@@ -508,7 +515,7 @@ def generate_tokens(readline): ...@@ -508,7 +515,7 @@ def generate_tokens(readline):
yield (STRING, token, spos, epos, line) yield (STRING, token, spos, epos, line)
elif initial in namechars: # ordinary name elif initial in namechars: # ordinary name
if token in ('async', 'await'): if token in ('async', 'await'):
if in_async: if async_def:
yield (ASYNC if token == 'async' else AWAIT, yield (ASYNC if token == 'async' else AWAIT,
token, spos, epos, line) token, spos, epos, line)
continue continue
...@@ -523,15 +530,13 @@ def generate_tokens(readline): ...@@ -523,15 +530,13 @@ def generate_tokens(readline):
and stashed[0] == NAME and stashed[0] == NAME
and stashed[1] == 'async'): and stashed[1] == 'async'):
ctx.append(('async', indents[-1])) async_def = True
in_async += 1 async_def_indent = indents[-1]
yield (ASYNC, stashed[1], yield (ASYNC, stashed[1],
stashed[2], stashed[3], stashed[2], stashed[3],
stashed[4]) stashed[4])
stashed = None stashed = None
else:
ctx.append(('sync', indents[-1]))
if stashed: if stashed:
yield stashed yield stashed
......
...@@ -67,10 +67,32 @@ class TestAsyncAwait(GrammarTest): ...@@ -67,10 +67,32 @@ class TestAsyncAwait(GrammarTest):
await x await x
""") """)
self.validate("""async def foo():
def foo(): pass
def foo(): pass
await x
""")
self.validate("""async def foo(): return await a""")
self.validate("""def foo():
def foo(): pass
async def foo(): await x
""")
self.invalid_syntax("await x") self.invalid_syntax("await x")
self.invalid_syntax("""def foo(): self.invalid_syntax("""def foo():
await x""") await x""")
self.invalid_syntax("""def foo():
def foo(): pass
async def foo(): pass
await x
""")
def test_async_var(self): def test_async_var(self):
self.validate("""async = 1""") self.validate("""async = 1""")
self.validate("""await = 1""") self.validate("""await = 1""")
......
...@@ -330,6 +330,7 @@ class AsyncBadSyntaxTest(unittest.TestCase): ...@@ -330,6 +330,7 @@ class AsyncBadSyntaxTest(unittest.TestCase):
async def f(): async def f():
async def g(): pass async def g(): pass
await z await z
await = 1
self.assertTrue(inspect.iscoroutinefunction(f)) self.assertTrue(inspect.iscoroutinefunction(f))
......
...@@ -840,6 +840,79 @@ Async/await extension: ...@@ -840,6 +840,79 @@ Async/await extension:
OP ')' (1, 19) (1, 20) OP ')' (1, 19) (1, 20)
OP ':' (1, 20) (1, 21) OP ':' (1, 20) (1, 21)
AWAIT 'await' (1, 22) (1, 27) AWAIT 'await' (1, 22) (1, 27)
>>> dump_tokens('''def f():
...
... def baz(): pass
... async def bar(): pass
...
... await = 2''')
ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'def' (1, 0) (1, 3)
NAME 'f' (1, 4) (1, 5)
OP '(' (1, 5) (1, 6)
OP ')' (1, 6) (1, 7)
OP ':' (1, 7) (1, 8)
NEWLINE '\\n' (1, 8) (1, 9)
NL '\\n' (2, 0) (2, 1)
INDENT ' ' (3, 0) (3, 2)
NAME 'def' (3, 2) (3, 5)
NAME 'baz' (3, 6) (3, 9)
OP '(' (3, 9) (3, 10)
OP ')' (3, 10) (3, 11)
OP ':' (3, 11) (3, 12)
NAME 'pass' (3, 13) (3, 17)
NEWLINE '\\n' (3, 17) (3, 18)
ASYNC 'async' (4, 2) (4, 7)
NAME 'def' (4, 8) (4, 11)
NAME 'bar' (4, 12) (4, 15)
OP '(' (4, 15) (4, 16)
OP ')' (4, 16) (4, 17)
OP ':' (4, 17) (4, 18)
NAME 'pass' (4, 19) (4, 23)
NEWLINE '\\n' (4, 23) (4, 24)
NL '\\n' (5, 0) (5, 1)
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (7, 0) (7, 0)
>>> dump_tokens('''async def f():
...
... def baz(): pass
... async def bar(): pass
...
... await = 2''')
ENCODING 'utf-8' (0, 0) (0, 0)
ASYNC 'async' (1, 0) (1, 5)
NAME 'def' (1, 6) (1, 9)
NAME 'f' (1, 10) (1, 11)
OP '(' (1, 11) (1, 12)
OP ')' (1, 12) (1, 13)
OP ':' (1, 13) (1, 14)
NEWLINE '\\n' (1, 14) (1, 15)
NL '\\n' (2, 0) (2, 1)
INDENT ' ' (3, 0) (3, 2)
NAME 'def' (3, 2) (3, 5)
NAME 'baz' (3, 6) (3, 9)
OP '(' (3, 9) (3, 10)
OP ')' (3, 10) (3, 11)
OP ':' (3, 11) (3, 12)
NAME 'pass' (3, 13) (3, 17)
NEWLINE '\\n' (3, 17) (3, 18)
ASYNC 'async' (4, 2) (4, 7)
NAME 'def' (4, 8) (4, 11)
NAME 'bar' (4, 12) (4, 15)
OP '(' (4, 15) (4, 16)
OP ')' (4, 16) (4, 17)
OP ':' (4, 17) (4, 18)
NAME 'pass' (4, 19) (4, 23)
NEWLINE '\\n' (4, 23) (4, 24)
NL '\\n' (5, 0) (5, 1)
AWAIT 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (7, 0) (7, 0)
""" """
from test import support from test import support
......
...@@ -498,10 +498,11 @@ def _tokenize(readline, encoding): ...@@ -498,10 +498,11 @@ def _tokenize(readline, encoding):
contline = None contline = None
indents = [0] indents = [0]
# 'stashed' and 'ctx' are used for async/await parsing # 'stashed' and 'async_*' are used for async/await parsing
stashed = None stashed = None
ctx = [('sync', 0)] async_def = False
in_async = 0 async_def_indent = 0
async_def_nl = False
if encoding is not None: if encoding is not None:
if encoding == "utf-8-sig": if encoding == "utf-8-sig":
...@@ -579,15 +580,18 @@ def _tokenize(readline, encoding): ...@@ -579,15 +580,18 @@ def _tokenize(readline, encoding):
("<tokenize>", lnum, pos, line)) ("<tokenize>", lnum, pos, line))
indents = indents[:-1] indents = indents[:-1]
cur_indent = indents[-1] if async_def and async_def_indent >= indents[-1]:
while len(ctx) > 1 and ctx[-1][1] >= cur_indent: async_def = False
if ctx[-1][0] == 'async': async_def_nl = False
in_async -= 1 async_def_indent = 0
assert in_async >= 0
ctx.pop()
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
if async_def and async_def_nl and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
else: # continued statement else: # continued statement
if not line: if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0)) raise TokenError("EOF in multi-line statement", (lnum, 0))
...@@ -609,8 +613,13 @@ def _tokenize(readline, encoding): ...@@ -609,8 +613,13 @@ def _tokenize(readline, encoding):
if stashed: if stashed:
yield stashed yield stashed
stashed = None stashed = None
yield TokenInfo(NL if parenlev > 0 else NEWLINE, if parenlev > 0:
token, spos, epos, line) yield TokenInfo(NL, token, spos, epos, line)
else:
yield TokenInfo(NEWLINE, token, spos, epos, line)
if async_def:
async_def_nl = True
elif initial == '#': elif initial == '#':
assert not token.endswith("\n") assert not token.endswith("\n")
if stashed: if stashed:
...@@ -644,7 +653,7 @@ def _tokenize(readline, encoding): ...@@ -644,7 +653,7 @@ def _tokenize(readline, encoding):
yield TokenInfo(STRING, token, spos, epos, line) yield TokenInfo(STRING, token, spos, epos, line)
elif initial.isidentifier(): # ordinary name elif initial.isidentifier(): # ordinary name
if token in ('async', 'await'): if token in ('async', 'await'):
if in_async: if async_def:
yield TokenInfo( yield TokenInfo(
ASYNC if token == 'async' else AWAIT, ASYNC if token == 'async' else AWAIT,
token, spos, epos, line) token, spos, epos, line)
...@@ -660,15 +669,13 @@ def _tokenize(readline, encoding): ...@@ -660,15 +669,13 @@ def _tokenize(readline, encoding):
and stashed.type == NAME and stashed.type == NAME
and stashed.string == 'async'): and stashed.string == 'async'):
ctx.append(('async', indents[-1])) async_def = True
in_async += 1 async_def_indent = indents[-1]
yield TokenInfo(ASYNC, stashed.string, yield TokenInfo(ASYNC, stashed.string,
stashed.start, stashed.end, stashed.start, stashed.end,
stashed.line) stashed.line)
stashed = None stashed = None
else:
ctx.append(('sync', indents[-1]))
if stashed: if stashed:
yield stashed yield stashed
......
...@@ -31,12 +31,6 @@ ...@@ -31,12 +31,6 @@
|| c == '_'\ || c == '_'\
|| (c >= 128)) || (c >= 128))
/* The following DEFTYPE* flags are used in 'tok_state->deftypestack',
and should be removed in 3.7, when async/await are regular
keywords. */
#define DEFTYPE_ASYNC 1
#define DEFTYPE_HAS_NL 2
extern char *PyOS_Readline(FILE *, FILE *, const char *); extern char *PyOS_Readline(FILE *, FILE *, const char *);
/* Return malloc'ed string including trailing \n; /* Return malloc'ed string including trailing \n;
empty malloc'ed string for EOF; empty malloc'ed string for EOF;
...@@ -133,12 +127,6 @@ tok_new(void) ...@@ -133,12 +127,6 @@ tok_new(void)
tok->indent = 0; tok->indent = 0;
tok->indstack[0] = 0; tok->indstack[0] = 0;
tok->def = 0;
tok->defstack[0] = 0;
tok->deftypestack[0] = 0;
tok->def_async_behind = 0;
tok->def_in_async = 0;
tok->atbol = 1; tok->atbol = 1;
tok->pendin = 0; tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL; tok->prompt = tok->nextprompt = NULL;
...@@ -159,6 +147,11 @@ tok_new(void) ...@@ -159,6 +147,11 @@ tok_new(void)
tok->decoding_readline = NULL; tok->decoding_readline = NULL;
tok->decoding_buffer = NULL; tok->decoding_buffer = NULL;
#endif #endif
tok->async_def = 0;
tok->async_def_indent = 0;
tok->async_def_nl = 0;
return tok; return tok;
} }
...@@ -1350,11 +1343,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) ...@@ -1350,11 +1343,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
int c; int c;
int blankline, nonascii; int blankline, nonascii;
int tok_len;
struct tok_state ahead_tok;
char *ahead_tok_start = NULL, *ahead_top_end = NULL;
int ahead_tok_kind;
*p_start = *p_end = NULL; *p_start = *p_end = NULL;
nextline: nextline:
tok->start = NULL; tok->start = NULL;
...@@ -1442,16 +1430,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) ...@@ -1442,16 +1430,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (tok->pendin != 0) { if (tok->pendin != 0) {
if (tok->pendin < 0) { if (tok->pendin < 0) {
tok->pendin++; tok->pendin++;
while (tok->def && tok->defstack[tok->def] >= tok->indent) {
if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
tok->def_in_async--;
assert(tok->def_in_async >= 0);
}
tok->def--;
assert(tok->def >= 0);
}
return DEDENT; return DEDENT;
} }
else { else {
...@@ -1460,20 +1438,19 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) ...@@ -1460,20 +1438,19 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
} }
} }
if (!blankline && tok->level == 0 if (tok->async_def
&& tok->def && tok->deftypestack[tok->def] & DEFTYPE_HAS_NL && !blankline
&& tok->defstack[tok->def] >= tok->indent) && tok->level == 0
/* There was a NEWLINE after ASYNC DEF,
so we're past the signature. */
&& tok->async_def_nl
/* Current indentation level is less than where
the async function was defined */
&& tok->async_def_indent >= tok->indent)
{ {
/* The top function on the stack did have a NEWLINE tok->async_def = 0;
token, but didn't have an INDENT. That means that tok->async_def_indent = 0;
it's a one-line function and it should now be removed tok->async_def_nl = 0;
from the stack. */
if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
tok->def_in_async--;
assert(tok->def_in_async >= 0);
}
tok->def--;
assert(tok->def >= 0);
} }
again: again:
...@@ -1528,38 +1505,27 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) ...@@ -1528,38 +1505,27 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start; *p_start = tok->start;
*p_end = tok->cur; *p_end = tok->cur;
tok_len = tok->cur - tok->start; /* async/await parsing block. */
if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) { if (tok->cur - tok->start == 5) {
/* The current token is 'def'. */ /* Current token length is 5. */
if (tok->def + 1 >= MAXINDENT) { if (tok->async_def) {
tok->done = E_TOODEEP; /* We're inside an 'async def' function. */
tok->cur = tok->inp; if (memcmp(tok->start, "async", 5) == 0)
return ERRORTOKEN; return ASYNC;
if (memcmp(tok->start, "await", 5) == 0)
return AWAIT;
} }
else if (memcmp(tok->start, "async", 5) == 0) {
/* The current token is 'async'.
Look ahead one token.*/
/* Advance defs stack. */ struct tok_state ahead_tok;
tok->def++; char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
tok->defstack[tok->def] = tok->indent; int ahead_tok_kind;
if (tok->def_async_behind) {
/* The previous token was 'async'. */
tok->def_async_behind = 0;
tok->deftypestack[tok->def] = DEFTYPE_ASYNC;
tok->def_in_async++;
}
else {
/* This is a regular function (not async def). */
tok->deftypestack[tok->def] = 0;
}
}
else if (tok_len == 5) {
if (memcmp(tok->start, "async", 5) == 0) {
/* The current token is 'async'. */
memcpy(&ahead_tok, tok, sizeof(ahead_tok)); memcpy(&ahead_tok, tok, sizeof(ahead_tok));
/* Try to look ahead one token. */
ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
&ahead_top_end); &ahead_tok_end);
if (ahead_tok_kind == NAME if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3 && ahead_tok.cur - ahead_tok.start == 3
...@@ -1567,22 +1533,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) ...@@ -1567,22 +1533,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
{ {
/* The next token is going to be 'def', so instead of /* The next token is going to be 'def', so instead of
returning 'async' NAME token, we return ASYNC. */ returning 'async' NAME token, we return ASYNC. */
tok->def_async_behind = 1; tok->async_def_indent = tok->indent;
tok->async_def = 1;
return ASYNC; return ASYNC;
} }
else if (tok->def_in_async)
{
/* We're inside an 'async def' function, so we treat
'async' token as ASYNC, instead of NAME. */
return ASYNC;
}
}
else if (memcmp(tok->start, "await", 5) == 0 && tok->def_in_async)
{
/* We're inside an 'async def' function, so we treat
'await' token as AWAIT, instead of NAME. */
return AWAIT;
} }
} }
...@@ -1597,12 +1551,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) ...@@ -1597,12 +1551,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start; *p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */ *p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0; tok->cont_line = 0;
if (tok->def) { if (tok->async_def) {
/* Mark the top function on the stack that it had /* We're somewhere inside an 'async def' function, and
at least one NEWLINE. That will help us to we've encountered a NEWLINE after its signature. */
distinguish one-line functions from functions tok->async_def_nl = 1;
with multiple statements. */
tok->deftypestack[tok->def] |= DEFTYPE_HAS_NL;
} }
return NEWLINE; return NEWLINE;
} }
......
...@@ -66,21 +66,12 @@ struct tok_state { ...@@ -66,21 +66,12 @@ struct tok_state {
const char* str; const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */ const char* input; /* Tokenizer's newline translated copy of the string. */
/* `def*` fields are for parsing async/await in a backwards compatible /* async/await related fields; can be removed in 3.7 when async and await
way. They should be removed in 3.7, when they will become become normal keywords. */
regular constants. See PEP 492 for more details. */ int async_def; /* =1 if tokens are inside an 'async def' body. */
int defstack[MAXINDENT]; /* Stack of funcs & indents where they int async_def_indent; /* Indentation level of the outermost 'async def'. */
were defined. */ int async_def_nl; /* =1 if the outermost 'async def' had at least one
int deftypestack[MAXINDENT]; /* Stack of func flags, see DEFTYPE_* NEWLINE token after it. */
constants. */
int def; /* Length of stack of func types/flags. */
int def_async_behind; /* 1 if there was an 'async' token before
a 'def' token. */
int def_in_async; /* Counter of how deep 'async def's
are nested. If greater than 0,
we are somewhere in an 'async def'
body, so 'async' and 'await' should
be parsed as keywords.*/
}; };
extern struct tok_state *PyTokenizer_FromString(const char *, int); extern struct tok_state *PyTokenizer_FromString(const char *, int);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment