Commit c56b17bd authored by Thomas Kluyver, committed by Carol Willing

bpo-12486: Document tokenize.generate_tokens() as public API (#6957)

* Document tokenize.generate_tokens()

* Add news file

* Add test for generate_tokens

* Document behaviour around ENCODING token

* Add generate_tokens to __all__
parent c2745d2d
@@ -57,6 +57,16 @@ The primary entry point is a :term:`generator`:
:func:`.tokenize` determines the source encoding of the file by looking for a
UTF-8 BOM or encoding cookie, according to :pep:`263`.
.. function:: generate_tokens(readline)

   Tokenize a source reading unicode strings instead of bytes.

   Like :func:`.tokenize`, the *readline* argument is a callable returning
   a single line of input. However, :func:`generate_tokens` expects *readline*
   to return a str object rather than bytes.

   The result is an iterator yielding named tuples, exactly like
   :func:`.tokenize`. It does not yield an :data:`~token.ENCODING` token.
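   As a minimal usage sketch (the sample source string and printed fields are
   illustrative, not part of this patch), :func:`generate_tokens` can be driven
   by the ``readline`` method of an :class:`io.StringIO`::

       import io
       import tokenize

       # generate_tokens() takes a readline callable that yields str lines,
       # so a StringIO over the source text works directly.
       source = "x = 3.14\n"
       for tok in tokenize.generate_tokens(io.StringIO(source).readline):
           print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)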
All constants from the :mod:`token` module are also exported from
:mod:`tokenize`.
@@ -79,7 +89,8 @@ write back the modified script.
positions) may change.

It returns bytes, encoded using the :data:`~token.ENCODING` token, which
is the first token sequence output by :func:`.tokenize`. If there is no
encoding token in the input, it returns a str instead.
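A rough round-trip sketch of that distinction (the sample source and variable
names are illustrative)::

    import io
    import tokenize

    source = "x = 1 + 2\n"

    # tokenize() reads bytes and emits an ENCODING token first, so
    # untokenize() on its output comes back as bytes.
    byte_toks = list(tokenize.tokenize(io.BytesIO(source.encode("utf-8")).readline))
    print(type(tokenize.untokenize(byte_toks)))  # <class 'bytes'>

    # generate_tokens() reads str and emits no ENCODING token, so
    # untokenize() on its output comes back as a str.
    str_toks = list(tokenize.generate_tokens(io.StringIO(source).readline))
    print(type(tokenize.untokenize(str_toks)))   # <class 'str'>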
:func:`.tokenize` needs to detect the encoding of source files it tokenizes. The
...
from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens)
from io import BytesIO, StringIO
import unittest
from unittest import TestCase, mock
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
@@ -919,6 +919,19 @@ async def f():
DEDENT '' (7, 0) (7, 0)
""")
class GenerateTokensTest(TokenizeTest):
    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER is omitted.
        result = []
        f = StringIO(s)
        for type, token, start, end, line in generate_tokens(f.readline):
            if type == ENDMARKER:
                break
            type = tok_name[type]
            result.append(f"    {type:10} {token!r:13} {start} {end}")
        self.assertEqual(result, expected.rstrip().splitlines())
def decistmt(s):
    result = []
...
@@ -37,7 +37,7 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token
@@ -653,9 +653,12 @@ def _tokenize(readline, encoding):
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)

def main():
...
:func:`tokenize.generate_tokens` is now documented as a public API to
tokenize unicode strings. It was previously present but undocumented.