Commit a1f45ec7 authored by Tal Einat, committed by Benjamin Peterson

bpo-33899: Revert tokenize module adding an implicit final NEWLINE (GH-10072)

This reverts commit 7829bba4.
parent 56a4a3aa
from test import test_support from test import test_support
from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP, NEWLINE, from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP,
STRING, ENDMARKER, tok_name, Untokenizer, tokenize) STRING, ENDMARKER, tok_name, Untokenizer, tokenize)
from StringIO import StringIO from StringIO import StringIO
import os import os
from unittest import TestCase from unittest import TestCase
# Converts a source string into a list of textual representations
# of its tokens, such as:
# ` NAME 'if' (1, 0) (1, 2)`
# to make writing tests easier.
def stringify_tokens_from_source(token_generator, source_string):
    """Render tokens as fixed-width text rows for easy comparison in tests.

    token_generator yields 5-tuples (type, string, start, end, line) and
    source_string is the text that was tokenized.  Returns a list with one
    formatted row per token.  Iteration stops at the first ENDMARKER, which
    is not included.  When source_string does not end with a line break, a
    NEWLINE token that ends on the last source line is skipped as well.
    """
    result = []
    num_lines = len(source_string.splitlines())
    # Guard the index: an empty source has no last character to inspect,
    # and it has no final line that could be missing its line break.
    missing_trailing_nl = (bool(source_string) and
                           source_string[-1] not in '\r\n')
    for tok_type, token, start, end, _line in token_generator:
        if tok_type == ENDMARKER:
            break
        # Ignore the new line on the last line if the input lacks one
        if missing_trailing_nl and tok_type == NEWLINE and end[0] == num_lines:
            continue
        # Explicit mapping instead of locals(): only these four keys are
        # referenced by the format string.
        result.append(" %(type)-10.10s %(token)-13.13r %(start)s %(end)s" %
                      {"type": tok_name[tok_type], "token": token,
                       "start": start, "end": end})
    return result
class TokenizeTest(TestCase): class TokenizeTest(TestCase):
# Tests for the tokenize module. # Tests for the tokenize module.
# The tests can be really simple. Given a small fragment of source # The tests can be really simple. Given a small fragment of source
# code, print out a table with tokens. The ENDMARKER, ENCODING and # code, print out a table with tokens. The ENDMARKER is omitted for
# final NEWLINE are omitted for brevity. # brevity.
def check_tokenize(self, s, expected): def check_tokenize(self, s, expected):
# Format the tokens in s in a table format. # Format the tokens in s in a table format.
# The ENDMARKER is omitted.
result = []
f = StringIO(s) f = StringIO(s)
result = stringify_tokens_from_source(generate_tokens(f.readline), s) for type, token, start, end, line in generate_tokens(f.readline):
if type == ENDMARKER:
break
type = tok_name[type]
result.append(" %(type)-10.10s %(token)-13.13r %(start)s %(end)s" %
locals())
self.assertEqual(result, self.assertEqual(result,
expected.rstrip().splitlines()) expected.rstrip().splitlines())
def test_implicit_newline(self):
# Make sure that the tokenizer puts in an implicit NEWLINE
# when the input lacks a trailing new line.
f = StringIO("x")
tokens = list(generate_tokens(f.readline))
self.assertEqual(tokens[-2][0], NEWLINE)
self.assertEqual(tokens[-1][0], ENDMARKER)
def test_basic(self): def test_basic(self):
self.check_tokenize("1 + 1", """\ self.check_tokenize("1 + 1", """\
...@@ -638,7 +616,7 @@ class TestRoundtrip(TestCase): ...@@ -638,7 +616,7 @@ class TestRoundtrip(TestCase):
self.check_roundtrip("if x == 1:\n" self.check_roundtrip("if x == 1:\n"
" print x\n") " print x\n")
self.check_roundtrip("# This is a comment\n" self.check_roundtrip("# This is a comment\n"
"# This also\n") "# This also")
# Some people use different formatting conventions, which makes # Some people use different formatting conventions, which makes
# untokenize a little trickier. Note that this test involves trailing # untokenize a little trickier. Note that this test involves trailing
......
...@@ -306,15 +306,8 @@ def generate_tokens(readline): ...@@ -306,15 +306,8 @@ def generate_tokens(readline):
contline = None contline = None
indents = [0] indents = [0]
last_line = b''
line = b''
while 1: # loop over lines in stream while 1: # loop over lines in stream
try: try:
# We capture the value of the line variable here because
# readline uses the empty string '' to signal end of input,
# hence `line` itself will always be overwritten at the end
# of this loop.
last_line = line
line = readline() line = readline()
except StopIteration: except StopIteration:
line = '' line = ''
...@@ -444,9 +437,6 @@ def generate_tokens(readline): ...@@ -444,9 +437,6 @@ def generate_tokens(readline):
(lnum, pos), (lnum, pos+1), line) (lnum, pos), (lnum, pos+1), line)
pos += 1 pos += 1
# Add an implicit NEWLINE if the input doesn't end in one
if last_line and last_line[-1] not in '\r\n':
yield (NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
for indent in indents[1:]: # pop remaining indent levels for indent in indents[1:]: # pop remaining indent levels
yield (DEDENT, '', (lnum, 0), (lnum, 0), '') yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
......
Tokenize module now implicitly emits a NEWLINE when provided with input that
does not have a trailing new line. This behavior now matches what the C
tokenizer does internally. Contributed by Ammar Askar.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment