Commit c4ef4896 authored by Ammar Askar, committed by Tal Einat

bpo-33899: Make tokenize module mirror "end-of-file is end-of-line" behavior (GH-7891)

Most of the change involves fixing up the test suite, which previously assumed
that no NEWLINE token would be emitted if the input didn't end in a newline.

Contributed by Ammar Askar.
parent 3c8aae9f
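
A minimal sketch (not part of the commit itself) of the new behavior, modeled on the
test_implicit_newline test added below: when the input lacks a trailing newline, the
token stream now ends with a synthetic NEWLINE followed by ENDMARKER.

from io import BytesIO
from tokenize import tokenize, NEWLINE, ENDMARKER

# "x" has no trailing newline; the tokenizer now inserts an implicit
# NEWLINE token (with an empty string) right before the ENDMARKER.
tokens = list(tokenize(BytesIO(b"x").readline))
assert tokens[-2].type == NEWLINE
assert tokens[-1].type == ENDMARKER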
Lib/test/test_tokenize.py

 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer, generate_tokens)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
 from io import BytesIO, StringIO
 import unittest
 from unittest import TestCase, mock
@@ -11,27 +12,51 @@ import os
 import token

+# Converts a source string into a list of textual representation
+# of the tokens such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Ignore the new line on the last line if the input lacks one
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+    return result
+
 class TokenizeTest(TestCase):
     # Tests for the tokenize module.

     # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.

     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
         f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())

+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)
@@ -922,14 +947,9 @@ async def f():
 class GenerateTokensTest(TokenizeTest):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
         f = StringIO(s)
-        for type, token, start, end, line in generate_tokens(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
         self.assertEqual(result, expected.rstrip().splitlines())
@@ -1022,8 +1042,8 @@ class Test_Tokenize(TestCase):
             else:
                 return b''
-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1039,8 +1059,8 @@ class Test_Tokenize(TestCase):
             else:
                 return b''
-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
@@ -1351,18 +1371,21 @@ class TestTokenize(TestCase):
         # Test that 500 consequent, one-line defs is OK
         toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE

     def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
         self.assertEqual(tok_name[tokens[0].exact_type],
                          tok_name[ENCODING])
         for i in range(num_optypes):
             self.assertEqual(tok_name[tokens[i + 1].exact_type],
                              tok_name[optypes[i]])
         self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.NEWLINE])
+        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                          tok_name[token.ENDMARKER])

     def test_exact_type(self):
@@ -1515,7 +1538,7 @@ class TestRoundtrip(TestCase):
         self.check_roundtrip("if x == 1:\n"
                              "    print(x)\n")
         self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")

         # Some people use different formatting conventions, which makes
         # untokenize a little trickier. Note that this test involves trailing
Lib/tokenize.py
@@ -492,8 +492,15 @@ def _tokenize(readline, encoding):
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
+    last_line = b''
+    line = b''
     while True:                                # loop over lines in stream
         try:
+            # We capture the value of the line variable here because
+            # readline uses the empty string '' to signal end of input,
+            # hence `line` itself will always be overwritten at the end
+            # of this loop.
+            last_line = line
             line = readline()
         except StopIteration:
             line = b''
@@ -648,6 +655,9 @@ def _tokenize(readline, encoding):
                                    (lnum, pos), (lnum, pos+1), line)
                 pos += 1

+    # Add an implicit NEWLINE if the input doesn't end in one
+    if last_line and last_line[-1] not in '\r\n':
+        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
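
For a concrete reading of the position arithmetic above (an illustration, not part of the
diff): for the one-line input "x = 1" with no trailing newline, last_line ends up as
"x = 1" and lnum has advanced past the last real line, so the synthetic NEWLINE is placed
one column past the end of line 1.

from io import BytesIO
from tokenize import tokenize, NEWLINE

tokens = list(tokenize(BytesIO(b"x = 1").readline))
implicit_newline = tokens[-2]
# (lnum - 1, len(last_line)) .. (lnum - 1, len(last_line) + 1) == (1, 5) .. (1, 6)
assert implicit_newline.type == NEWLINE
assert (implicit_newline.start, implicit_newline.end) == ((1, 5), (1, 6))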
NEWS entry:
Tokenize module now implicitly emits a NEWLINE when provided with input that
does not have a trailing new line. This behavior now matches what the C
tokenizer does internally. Contributed by Ammar Askar.
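
As a quick illustration of the "matches the C tokenizer" point (a sketch, not part of the
commit): with the implicit NEWLINE, sources with and without a trailing newline now
produce the same sequence of token types; only the NEWLINE token's string differs
('\n' versus '').

from io import BytesIO
from tokenize import tokenize, tok_name

def token_types(source: bytes):
    # Collect the token type names for a byte string of source code.
    return [tok_name[tok.type] for tok in tokenize(BytesIO(source).readline)]

# Both yield ['ENCODING', 'NAME', 'OP', 'NUMBER', 'NEWLINE', 'ENDMARKER'].
assert token_types(b"x = 1\n") == token_types(b"x = 1")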