Commit fc869632 authored by Jelle Zijlstra

f strings: initial parsing work

Parses f-strings into ExprNodes, but with no support for compiling them. Some
known bugs are marked as TODOs.
parent 8953bd8d
......@@ -2866,6 +2866,32 @@ class RawCNameExprNode(ExprNode):
pass
#-------------------------------------------------------------------
#
# F-strings
#
#-------------------------------------------------------------------
class JoinedStrNode(ExprNode):
# F-strings
#
# values [UnicodeNode|FormattedValueNode] Substrings of the f-string
#
subexprs = ['values']
class FormattedValueNode(ExprNode):
# {}-delimited portions of an f-string
#
# value ExprNode The expression itself
# conversion_char str or None Type conversion (!s, !r, !a, or none)
# format_spec JoinedStrNode or None Format string passed to __format__
subexprs = ['value', 'format_spec']
conversion_chars = 'sra'
#-------------------------------------------------------------------
#
# Parallel nodes (cython.parallel.thread(savailable|id))
......
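For orientation (not part of the diff): CPython's own AST uses the same two node shapes, so a quick ast.parse() call shows the structure that JoinedStrNode and FormattedValueNode are meant to mirror. Illustration only; requires Python 3.6+, and the dumped node names vary slightly across Python versions.

import ast

# Illustration only, not Cython code: inspect CPython's JoinedStr/FormattedValue
tree = ast.parse('f"x={x!r:>10} done"', mode='eval')
print(ast.dump(tree.body))
# Roughly:
# JoinedStr(values=[Constant('x='),
#                   FormattedValue(value=Name('x'), conversion=114,   # 114 == ord('r')
#                                  format_spec=JoinedStr(values=[Constant('>10')])),
#                   Constant(' done')])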
......@@ -7,7 +7,7 @@ from __future__ import absolute_import
raw_prefixes = "rR"
bytes_prefixes = "bB"
string_prefixes = "uU" + bytes_prefixes
string_prefixes = "fFuU" + bytes_prefixes
char_prefixes = "cC"
any_string_prefix = raw_prefixes + string_prefixes + char_prefixes
IDENT = 'IDENT'
......@@ -40,8 +40,8 @@ def make_lexicon():
fltconst = (decimal_fract + Opt(exponent)) | (decimal + exponent)
imagconst = (intconst | fltconst) + Any("jJ")
beginstring = Opt(Any(string_prefixes) + Opt(Any(raw_prefixes)) |
Any(raw_prefixes) + Opt(Any(bytes_prefixes)) |
# invalid combinations of prefixes are caught in p_string_literal
beginstring = Opt(Rep(Any(string_prefixes + raw_prefixes)) |
Any(char_prefixes)
) + (Str("'") | Str('"') | Str("'''") | Str('"""'))
two_oct = octdigit + octdigit
......
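A standalone sketch of the relaxed prefix scanning (plain re, not Cython's Plex lexicon): any run of string/raw prefix characters, or a single c/C, is now accepted before the quote, and invalid combinations are only rejected later in p_string_literal.

import re

# Sketch only: mimics the new beginstring pattern, not the actual lexicon
string_prefixes = "fFuUbB"
raw_prefixes = "rR"
char_prefixes = "cC"
beginstring = re.compile(
    "(?:[%s%s]*|[%s])('''|\"\"\"|'|\")" % (string_prefixes, raw_prefixes, char_prefixes))

for candidate in ("rb'", "Rf'", "bf'", "ff'", "c'"):
    print(candidate, bool(beginstring.match(candidate)))
# All five now pass the lexer; 'bf' and 'ff' are rejected later in
# p_string_literal ("cannot be combined" / "Duplicate string prefix character").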
......@@ -15,12 +15,13 @@ cython.declare(Nodes=object, ExprNodes=object, EncodedString=object,
re=object, _unicode=object, _bytes=object,
partial=object, reduce=object, _IS_PY3=cython.bint)
from io import StringIO
import re
import sys
from unicodedata import lookup as lookup_unicodechar
from functools import partial, reduce
from .Scanning import PyrexScanner, FileSourceDescriptor
from .Scanning import PyrexScanner, FileSourceDescriptor, StringSourceDescriptor
from . import Nodes
from . import ExprNodes
from . import Builtin
......@@ -693,8 +694,12 @@ def p_atom(s):
return ExprNodes.UnicodeNode(pos, value = unicode_value, bytes_value = bytes_value)
elif kind == 'b':
return ExprNodes.BytesNode(pos, value = bytes_value)
else:
elif kind == 'f':
return ExprNodes.JoinedStrNode(pos, values = unicode_value)
elif kind == '':
return ExprNodes.StringNode(pos, value = bytes_value, unicode_value = unicode_value)
else:
s.error("invalid string kind '%s'" % kind)
elif sy == 'IDENT':
name = s.systring
s.next()
......@@ -788,29 +793,44 @@ def wrap_compile_time_constant(pos, value):
def p_cat_string_literal(s):
# A sequence of one or more adjacent string literals.
# Returns (kind, bytes_value, unicode_value)
# where kind in ('b', 'c', 'u', '')
# where kind in ('b', 'c', 'u', 'f', '')
pos = s.position()
kind, bytes_value, unicode_value = p_string_literal(s)
if kind == 'c' or s.sy != 'BEGIN_STRING':
return kind, bytes_value, unicode_value
bstrings, ustrings = [bytes_value], [unicode_value]
bstrings, ustrings, positions = [bytes_value], [unicode_value], [pos]
bytes_value = unicode_value = None
while s.sy == 'BEGIN_STRING':
pos = s.position()
next_kind, next_bytes_value, next_unicode_value = p_string_literal(s)
if next_kind == 'c':
error(pos, "Cannot concatenate char literal with another string or char literal")
continue
elif next_kind != kind:
error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" %
(kind, next_kind))
else:
bstrings.append(next_bytes_value)
ustrings.append(next_unicode_value)
# concatenating f-strings and normal strings is allowed and leads to an f-string
if {kind, next_kind} == {'f', 'u'}:
kind = 'f'
else:
error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" %
(kind, next_kind))
continue
bstrings.append(next_bytes_value)
ustrings.append(next_unicode_value)
positions.append(pos)
# join and rewrap the partial literals
if kind in ('b', 'c', '') or kind == 'u' and None not in bstrings:
# Py3 enforced unicode literals are parsed as bytes/unicode combination
bytes_value = bytes_literal(StringEncoding.join_bytes(bstrings), s.source_encoding)
if kind in ('u', ''):
unicode_value = EncodedString( u''.join([ u for u in ustrings if u is not None ]) )
if kind == 'f':
unicode_value = []
for u, pos in zip(ustrings, positions):
if isinstance(u, list):
unicode_value += u
else:
# non-f-string concatenated into the f-string
unicode_value.append(ExprNodes.UnicodeNode(pos, value = EncodedString(u)))
return kind, bytes_value, unicode_value
def p_opt_string_literal(s, required_type='u'):
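For reference, CPython already allows mixing plain and f-string literals in implicit concatenation, which is the behaviour the {kind, next_kind} == {'f', 'u'} merge above reproduces. A quick standalone check (Python 3.6+, illustration only):

import ast

# Illustration only: adjacent plain and f-string literals form one f-string
x = 42
print("value=" f"{x}" "!")                        # value=42!
node = ast.parse('u"a" f"{x}" "b"', mode='eval').body
print(type(node).__name__)                        # JoinedStr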
......@@ -833,36 +853,52 @@ def check_for_non_ascii_characters(string):
def p_string_literal(s, kind_override=None):
# A single string or char literal. Returns (kind, bvalue, uvalue)
# where kind in ('b', 'c', 'u', ''). The 'bvalue' is the source
# where kind in ('b', 'c', 'u', 'f', ''). The 'bvalue' is the source
# code byte sequence of the string literal, 'uvalue' is the
# decoded Unicode string. Either of the two may be None depending
# on the 'kind' of string, only unprefixed strings have both
# representations.
# representations. In f-strings, the uvalue is a list of the Unicode
# strings and f-string expressions that make up the f-string.
# s.sy == 'BEGIN_STRING'
pos = s.position()
is_raw = False
is_python3_source = s.context.language_level >= 3
has_non_ascii_literal_characters = False
kind = s.systring[:1].lower()
if kind == 'r':
# Py3 allows both 'br' and 'rb' as prefix
if s.systring[1:2].lower() == 'b':
kind = 'b'
else:
kind = ''
is_raw = True
elif kind in 'ub':
is_raw = s.systring[1:2].lower() == 'r'
elif kind != 'c':
kind_string = s.systring.rstrip('"\'').lower()
if len(set(kind_string)) != len(kind_string):
s.error('Duplicate string prefix character')
if 'b' in kind_string and 'u' in kind_string:
s.error('String prefixes b and u cannot be combined')
if 'b' in kind_string and 'f' in kind_string:
s.error('String prefixes b and f cannot be combined')
if 'u' in kind_string and 'f' in kind_string:
s.error('String prefixes u and f cannot be combined')
is_raw = 'r' in kind_string
if 'c' in kind_string:
# this should never happen, since the lexer does not allow combining c
# with other prefix characters
if len(kind_string) != 1:
s.error('Invalid string prefix for character literal')
kind = 'c'
elif 'f' in kind_string:
kind = 'f' # u is ignored
elif 'b' in kind_string:
kind = 'b'
elif 'u' in kind_string:
kind = 'u'
else:
kind = ''
if kind == '' and kind_override is None and Future.unicode_literals in s.context.future_directives:
chars = StringEncoding.StrLiteralBuilder(s.source_encoding)
kind = 'u'
else:
if kind_override is not None and kind_override in 'ub':
kind = kind_override
if kind == 'u':
if kind in ('u', 'f'): # f-strings are scanned exactly like Unicode literals, but are parsed further later
chars = StringEncoding.UnicodeLiteralBuilder()
elif kind == '':
chars = StringEncoding.StrLiteralBuilder(s.source_encoding)
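A standalone sketch of the prefix rules enforced above (an illustrative helper, not the Cython implementation; error messages copied from the diff, and the special handling of the 'c' prefix is omitted):

def classify_prefix(kind_string):
    # Sketch only: kind_string is the lowercased prefix, e.g. 'rf' from rf"..."
    if len(set(kind_string)) != len(kind_string):
        raise ValueError('Duplicate string prefix character')
    for a, b in (('b', 'u'), ('b', 'f'), ('u', 'f')):
        if a in kind_string and b in kind_string:
            raise ValueError('String prefixes %s and %s cannot be combined' % (a, b))
    is_raw = 'r' in kind_string
    for kind in 'cfbu':            # same precedence as the checks above
        if kind in kind_string:
            return kind, is_raw
    return '', is_raw

print(classify_prefix('rf'))       # ('f', True)
print(classify_prefix('br'))       # ('b', True)
print(classify_prefix(''))         # ('', False)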
......@@ -873,7 +909,7 @@ def p_string_literal(s, kind_override=None):
s.next()
sy = s.sy
systr = s.systring
#print "p_string_literal: sy =", sy, repr(s.systring) ###
# print "p_string_literal: sy =", sy, repr(s.systring) ###
if sy == 'CHARS':
chars.append(systr)
if is_python3_source and not has_non_ascii_literal_characters and check_for_non_ascii_characters(systr):
......@@ -943,14 +979,142 @@ def p_string_literal(s, kind_override=None):
bytes_value, unicode_value = chars.getstrings()
if is_python3_source and has_non_ascii_literal_characters:
# Python 3 forbids literal non-ASCII characters in byte strings
if kind != 'u':
if kind not in ('u', 'f'):
s.error("bytes can only contain ASCII literal characters.",
pos=pos, fatal=False)
bytes_value = None
if kind == 'f':
unicode_value = p_f_string(s, unicode_value, pos)
s.next()
return (kind, bytes_value, unicode_value)
def p_f_string(s, unicode_value, pos):
# Parses a PEP 498 f-string literal into a list of nodes. Nodes are either UnicodeNodes
# or FormattedValueNodes.
values = []
i = 0
size = len(unicode_value)
current_literal_start = 0
while i < size:
c = unicode_value[i]
if c == '}':
s.error("single '}' encountered in format string")
elif c == '{':
# double { escapes it
if i + 1 < size and unicode_value[i + 1] == '{':
i += 2
else:
encoded_str = EncodedString(unicode_value[current_literal_start:i])
values.append(ExprNodes.UnicodeNode(pos, value = encoded_str))
i, expr_node = p_f_string_expr(s, unicode_value, pos, i + 1)
current_literal_start = i
values.append(expr_node)
else:
i += 1
encoded_str = EncodedString(unicode_value[current_literal_start:])
values.append(ExprNodes.UnicodeNode(pos, value = encoded_str))
print "F-STRING VALUES", values
return values
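A standalone sketch of the same scan (simplified: no error handling and no treatment of doubled braces), just to show how the literal text and the replacement fields are sliced apart before p_f_string_expr takes over:

def split_fstring(text):
    # Simplified illustration of the p_f_string slicing, not the real parser
    parts, literal_start, i = [], 0, 0
    while i < len(text):
        if text[i] == '{':
            parts.append(('literal', text[literal_start:i]))
            end = text.index('}', i)        # p_f_string_expr does the real work here
            parts.append(('expr', text[i + 1:end]))
            i = literal_start = end + 1
        else:
            i += 1
    parts.append(('literal', text[literal_start:]))
    return parts

print(split_fstring("a{b!r}c{d}"))
# [('literal', 'a'), ('expr', 'b!r'), ('literal', 'c'), ('expr', 'd'), ('literal', '')]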
def p_f_string_expr(s, unicode_value, pos, starting_index):
# Parses a {}-delimited expression inside an f-string. Returns a FormattedValueNode
# and the index in the string that follows the expression.
i = starting_index
size = len(unicode_value)
conversion_char = None
format_spec_str = None
nested_depth = 0
while True:
if i >= size:
s.error("missing '}' in format string expression")
c = unicode_value[i]
# TODO strings
if c in '{[(':
nested_depth += 1
elif nested_depth != 0 and c in '}])':
nested_depth -= 1
elif c == '#':
s.error("format string cannot include #")
elif nested_depth == 0 and c in '!:}':
# allow != as a special case
if c == '!' and i + 1 < size and unicode_value[i + 1] == '=':
i += 2
continue
terminal_char = c
break
i += 1
# the expression is parsed as if it is surrounded by parentheses
expr_str = u'(%s)' % unicode_value[starting_index:i]
if terminal_char == '!':
i += 1
if i >= size:
s.error("invalid conversion char at end of string")
conversion_char = unicode_value[i]
i += 1
if i >= size:
s.error("invalid conversion char at end of string")
terminal_char = unicode_value[i]
if terminal_char == ':':
nested_depth = 0
start_format_spec = i + 1
while True:
if i >= size:
s.error("missing '}' in format specifier")
c = unicode_value[i]
if c == '{':
if nested_depth >= 1:
s.error("nesting of '{' in format specifier is not allowed")
nested_depth += 1
elif c == '}' and nested_depth == 0:
terminal_char = c
break
elif c == '}':
nested_depth -= 1
i += 1
format_spec_str = unicode_value[start_format_spec:i]
if terminal_char != '}':
s.error("missing '}' in format string expression'")
print('expr=%r, conversion_char=%r, format_spec=%r' % (expr_str, conversion_char, format_spec_str))
# parse the expression
name = 'format string expression'
code_source = StringSourceDescriptor(name, expr_str)
buf = StringIO(expr_str)
# scanner = PyrexScanner(buf, code_source, source_encoding = encoding,
# scope = scope, context = context, initial_pos = initial_pos)
scanner = PyrexScanner(buf, code_source, parent_scanner=s) # TODO other params
expr = p_testlist(scanner) # TODO is testlist right here?
# validate the conversion char
if conversion_char is not None and conversion_char not in ExprNodes.FormattedValueNode.conversion_chars:
s.error("invalid conversion character '%s'" % conversion_char)
# the format spec is itself treated like an f-string
if format_spec_str is not None:
format_spec = p_f_string(s, format_spec_str, pos)
else:
format_spec = None
return i + 1, ExprNodes.FormattedValueNode(
s.position(), value = expr, conversion_char = conversion_char,
format_spec = format_spec)
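And a simplified standalone illustration of the three pieces the function extracts from a single replacement field (no nesting, quoting, or != handling; the real scan above is more careful):

def split_replacement_field(field):
    # Sketch only: assumes the field has the plain form expr['!'conv][':'spec]
    expr, _, spec = field.partition(':')
    expr, _, conversion = expr.partition('!')
    return u'(%s)' % expr, conversion or None, spec or None

print(split_replacement_field('obj.name!r:>10'))
# ('(obj.name)', 'r', '>10') -- expression source, conversion char, format spec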
# since PEP 448:
# list_display ::= "[" [listmaker] "]"
# listmaker ::= (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
......@@ -3271,6 +3435,7 @@ def p_ignorable_statement(s):
def p_doc_string(s):
if s.sy == 'BEGIN_STRING':
pos = s.position()
# TODO: should this support f-strings?
kind, bytes_result, unicode_result = p_cat_string_literal(s)
s.expect_newline("Syntax error in doc string", ignore_semicolon=True)
if kind in ('u', ''):
......