Commit 6f6e1b10 authored by Stefan Behnel

Merge branch 'f_string' into f_strings_2

parents dec1d449 c6a07689
......@@ -2954,6 +2954,115 @@ class RawCNameExprNode(ExprNode):
pass
#-------------------------------------------------------------------
#
# F-strings
#
#-------------------------------------------------------------------
class JoinedStrNode(ExprNode):
    # F-strings
    #
    # values   [UnicodeNode|FormattedValueNode]   Substrings of the f-string
    #
    type = py_object_type
    subexprs = ['values']

    def analyse_types(self, env):
        # Analyse all parts first, then coerce each of them to a Python object:
        # the generated code stores the parts in a Python list.
        self.values = [v.analyse_types(env) for v in self.values]
        self.values = [v.coerce_to_pyobject(env) for v in self.values]
        self.is_temp = 1
        if len(self.values) == 1:
            # this is not uncommon because f-string format specs are parsed into JoinedStrNodes
            return self.values[0]
        else:
            return self

    def generate_result_code(self, code):
        # Emits a C block that collects the parts in a temporary list and
        # joins them with the empty unicode string as separator.
        list_var = Naming.quick_temp_cname
        num_items = len(self.values)
        code.putln('{')
        code.putln('PyObject *%s = PyList_New(%s); %s' % (
            list_var,
            num_items,
            code.error_goto_if_null(list_var, self.pos)))
        code.put_gotref(list_var)
        for i, value in enumerate(self.values):
            # the list slot takes over one reference to the value, so add one
            # and hand it over before storing the item
            code.put_incref(value.result(), value.ctype())
            code.put_giveref(value.py_result())
            code.putln('PyList_SET_ITEM(%s, %s, %s);' % (list_var, i, value.py_result()))
        # The temporary list is released before the error check so that it is
        # not leaked when the join fails.  The check must look at the join
        # result: the list pointer is known to be non-NULL at this point (and
        # has just been released), so testing it can never detect a failure.
        code.putln('%s = PyUnicode_Join(%s, %s); __Pyx_DECREF(%s); %s' % (
            self.result(),
            Naming.empty_unicode,
            list_var,
            list_var,
            code.error_goto_if_null(self.result(), self.pos)))
        code.put_gotref(self.py_result())
        code.putln('}')
class FormattedValueNode(ExprNode):
    # {}-delimited portions of an f-string
    #
    # value           ExprNode                The expression itself
    # conversion_char str or None             Type conversion (!s, !r, !a, or none)
    # format_spec     JoinedStrNode or None   Format string passed to __format__
    subexprs = ['value', 'format_spec']
    conversion_chars = 'sra'
    type = py_object_type

    def analyse_types(self, env):
        # Both the value and the (optional) format spec are passed to C-API
        # functions as Python objects, so both are coerced after analysis.
        value = self.value.analyse_types(env)
        format_spec = None
        if self.format_spec is not None:
            format_spec = self.format_spec.analyse_types(env)
        self.value = value.coerce_to_pyobject(env)
        if format_spec is not None:
            format_spec = format_spec.coerce_to_pyobject(env)
        self.format_spec = format_spec
        self.is_temp = True
        return self

    def generate_result_code(self, code):
        # Emits a C block that optionally applies the !s/!r/!a conversion and
        # then formats the (converted) value with the format spec.
        value_result = self.value.py_result()
        conversion_result = Naming.quick_temp_cname
        if self.format_spec is None:
            # no format spec given: format with the empty string
            format_spec_result = Naming.empty_unicode
        else:
            format_spec_result = self.format_spec.py_result()

        if self.conversion_char == 's':
            fn = 'PyObject_Str'
        elif self.conversion_char == 'r':
            fn = 'PyObject_Repr'
        elif self.conversion_char == 'a':
            fn = 'PyObject_ASCII'
        else:
            fn = None

        code.putln('{')
        if fn is not None:
            code.putln('PyObject *%s = %s(%s); %s' % (
                conversion_result,
                fn,
                value_result,
                code.error_goto_if_null(conversion_result, self.pos)
            ))
            # The conversion function returns a new reference that this block
            # owns.  It has to be released once the value was formatted,
            # otherwise every converted value leaks one object.
            code.put_gotref(conversion_result)
            decref_line = '__Pyx_DECREF(%s);' % conversion_result
        else:
            # no conversion: borrow the reference held by the value subexpression
            code.putln('PyObject *%s = %s;' % (conversion_result, value_result))
            decref_line = ''
        # the release is emitted before the error check so that it also runs
        # when formatting fails
        code.putln("%s = PyObject_Format(%s, %s); %s %s" % (
            self.result(),
            conversion_result,
            format_spec_result,
            decref_line,
            code.error_goto_if_null(self.result(), self.pos)))
        code.put_gotref(self.py_result())
        code.putln('}')
#-------------------------------------------------------------------
#
# Parallel nodes (cython.parallel.thread(savailable|id))
......
......@@ -7,7 +7,7 @@ from __future__ import absolute_import
raw_prefixes = "rR"
bytes_prefixes = "bB"
string_prefixes = "uU" + bytes_prefixes
string_prefixes = "fFuU" + bytes_prefixes
char_prefixes = "cC"
any_string_prefix = raw_prefixes + string_prefixes + char_prefixes
IDENT = 'IDENT'
......@@ -40,8 +40,8 @@ def make_lexicon():
fltconst = (decimal_fract + Opt(exponent)) | (decimal + exponent)
imagconst = (intconst | fltconst) + Any("jJ")
beginstring = Opt(Any(string_prefixes) + Opt(Any(raw_prefixes)) |
Any(raw_prefixes) + Opt(Any(bytes_prefixes)) |
# invalid combinations of prefixes are caught in p_string_literal
beginstring = Opt(Rep(Any(string_prefixes + raw_prefixes)) |
Any(char_prefixes)
) + (Str("'") | Str('"') | Str("'''") | Str('"""'))
two_oct = octdigit + octdigit
......
......@@ -695,6 +695,7 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
code.putln('static PyObject *%s;' % Naming.builtins_cname)
code.putln('static PyObject *%s;' % Naming.empty_tuple)
code.putln('static PyObject *%s;' % Naming.empty_bytes)
code.putln('static PyObject *%s;' % Naming.empty_unicode)
if Options.pre_import is not None:
code.putln('static PyObject *%s;' % Naming.preimport_cname)
code.putln('static int %s;' % Naming.lineno_cname)
......@@ -2123,6 +2124,8 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
Naming.empty_tuple, code.error_goto_if_null(Naming.empty_tuple, self.pos)))
code.putln("%s = PyBytes_FromStringAndSize(\"\", 0); %s" % (
Naming.empty_bytes, code.error_goto_if_null(Naming.empty_bytes, self.pos)))
code.putln("%s = PyUnicode_FromStringAndSize(\"\", 0); %s" % (
Naming.empty_unicode, code.error_goto_if_null(Naming.empty_unicode, self.pos)))
for ext_type in ('CyFunction', 'FusedFunction', 'Coroutine', 'Generator', 'StopAsyncIteration'):
code.putln("#ifdef __Pyx_%s_USED" % ext_type)
......
......@@ -96,6 +96,7 @@ gilstate_cname = pyrex_prefix + "state"
skip_dispatch_cname = pyrex_prefix + "skip_dispatch"
empty_tuple = pyrex_prefix + "empty_tuple"
empty_bytes = pyrex_prefix + "empty_bytes"
empty_unicode = pyrex_prefix + "empty_unicode"
print_function = pyrex_prefix + "print"
print_function_kwargs = pyrex_prefix + "print_kwargs"
cleanup_cname = pyrex_prefix + "module_cleanup"
......
......@@ -15,12 +15,13 @@ cython.declare(Nodes=object, ExprNodes=object, EncodedString=object,
re=object, _unicode=object, _bytes=object,
partial=object, reduce=object, _IS_PY3=cython.bint)
from io import StringIO
import re
import sys
from unicodedata import lookup as lookup_unicodechar
from functools import partial, reduce
from .Scanning import PyrexScanner, FileSourceDescriptor
from .Scanning import PyrexScanner, FileSourceDescriptor, StringSourceDescriptor
from . import Nodes
from . import ExprNodes
from . import Builtin
......@@ -693,8 +694,12 @@ def p_atom(s):
return ExprNodes.UnicodeNode(pos, value = unicode_value, bytes_value = bytes_value)
elif kind == 'b':
return ExprNodes.BytesNode(pos, value = bytes_value)
else:
elif kind == 'f':
return ExprNodes.JoinedStrNode(pos, values = unicode_value)
elif kind == '':
return ExprNodes.StringNode(pos, value = bytes_value, unicode_value = unicode_value)
else:
s.error("invalid string kind '%s'" % kind)
elif sy == 'IDENT':
name = s.systring
s.next()
......@@ -788,29 +793,44 @@ def wrap_compile_time_constant(pos, value):
def p_cat_string_literal(s):
# NOTE(review): this span comes from a diff view that shows removed and added
# lines together, without +/- markers and without indentation.  The notes below
# mark the lines that appear to be superseded pre-change versions -- confirm
# against the repository before relying on them.
# A sequence of one or more adjacent string literals.
# Returns (kind, bytes_value, unicode_value)
# NOTE(review): the next comment line looks like the pre-change wording; the
# line after it adds the 'f' kind.
# where kind in ('b', 'c', 'u', '')
# where kind in ('b', 'c', 'u', 'f', '')
pos = s.position()
kind, bytes_value, unicode_value = p_string_literal(s)
# a char literal or a single literal is returned unchanged
if kind == 'c' or s.sy != 'BEGIN_STRING':
return kind, bytes_value, unicode_value
# NOTE(review): the next assignment looks like the pre-change version; the one
# after it additionally records the position of each literal.
bstrings, ustrings = [bytes_value], [unicode_value]
bstrings, ustrings, positions = [bytes_value], [unicode_value], [pos]
bytes_value = unicode_value = None
# collect all directly following string literals
while s.sy == 'BEGIN_STRING':
pos = s.position()
next_kind, next_bytes_value, next_unicode_value = p_string_literal(s)
if next_kind == 'c':
error(pos, "Cannot concatenate char literal with another string or char literal")
continue
elif next_kind != kind:
# NOTE(review): the following five lines (error call, else branch and the two
# appends) look like the pre-change handling of mismatching kinds.
error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" %
(kind, next_kind))
else:
bstrings.append(next_bytes_value)
ustrings.append(next_unicode_value)
# concatenating f strings and normal strings is allowed and leads to an f string
if {kind, next_kind} == {'f', 'u'} or {kind, next_kind} == {'f', ''}:
kind = 'f'
else:
error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" %
(kind, next_kind))
continue
# remember the accepted literal together with the position it started at
bstrings.append(next_bytes_value)
ustrings.append(next_unicode_value)
positions.append(pos)
# join and rewrap the partial literals
if kind in ('b', 'c', '') or kind == 'u' and None not in bstrings:
# Py3 enforced unicode literals are parsed as bytes/unicode combination
bytes_value = bytes_literal(StringEncoding.join_bytes(bstrings), s.source_encoding)
if kind in ('u', ''):
unicode_value = EncodedString( u''.join([ u for u in ustrings if u is not None ]) )
if kind == 'f':
# the result of an f-string is a flat list of nodes: list values are
# appended item by item, any other value is wrapped in a UnicodeNode that
# carries the position of its literal
unicode_value = []
for u, pos in zip(ustrings, positions):
if isinstance(u, list):
unicode_value += u
else:
# non-f-string concatenated into the f-string
unicode_value.append(ExprNodes.UnicodeNode(pos, value = EncodedString(u)))
return kind, bytes_value, unicode_value
def p_opt_string_literal(s, required_type='u'):
......@@ -833,36 +853,52 @@ def check_for_non_ascii_characters(string):
def p_string_literal(s, kind_override=None):
# A single string or char literal. Returns (kind, bvalue, uvalue)
# where kind in ('b', 'c', 'u', ''). The 'bvalue' is the source
# where kind in ('b', 'c', 'u', 'f', ''). The 'bvalue' is the source
# code byte sequence of the string literal, 'uvalue' is the
# decoded Unicode string. Either of the two may be None depending
# on the 'kind' of string, only unprefixed strings have both
# representations.
# representations. In f-strings, the uvalue is a list of the Unicode
# strings and f-string expressions that make up the f-string.
# s.sy == 'BEGIN_STRING'
pos = s.position()
is_raw = False
is_python3_source = s.context.language_level >= 3
has_non_ascii_literal_characters = False
kind = s.systring[:1].lower()
if kind == 'r':
# Py3 allows both 'br' and 'rb' as prefix
if s.systring[1:2].lower() == 'b':
kind = 'b'
else:
kind = ''
is_raw = True
elif kind in 'ub':
is_raw = s.systring[1:2].lower() == 'r'
elif kind != 'c':
kind_string = s.systring.rstrip('"\'').lower()
if len(set(kind_string)) != len(kind_string):
s.error('Duplicate string prefix character')
if 'b' in kind_string and 'u' in kind_string:
s.error('String prefixes b and u cannot be combined')
if 'b' in kind_string and 'f' in kind_string:
s.error('String prefixes b and f cannot be combined')
if 'u' in kind_string and 'f' in kind_string:
s.error('String prefixes u and f cannot be combined')
is_raw = 'r' in kind_string
if 'c' in kind_string:
# this should never happen, since the lexer does not allow combining c
# with other prefix characters
if len(kind_string) != 1:
s.error('Invalid string prefix for character literal')
kind = 'c'
elif 'f' in kind_string:
kind = 'f' # u is ignored
elif 'b' in kind_string:
kind = 'b'
elif 'u' in kind_string:
kind = 'u'
else:
kind = ''
if kind == '' and kind_override is None and Future.unicode_literals in s.context.future_directives:
chars = StringEncoding.StrLiteralBuilder(s.source_encoding)
kind = 'u'
else:
if kind_override is not None and kind_override in 'ub':
kind = kind_override
if kind == 'u':
if kind in {'u', 'f'}: # f-strings are scanned exactly like Unicode literals, but are parsed further later
chars = StringEncoding.UnicodeLiteralBuilder()
elif kind == '':
chars = StringEncoding.StrLiteralBuilder(s.source_encoding)
......@@ -873,7 +909,7 @@ def p_string_literal(s, kind_override=None):
s.next()
sy = s.sy
systr = s.systring
#print "p_string_literal: sy =", sy, repr(s.systring) ###
# print "p_string_literal: sy =", sy, repr(s.systring) ###
if sy == 'CHARS':
chars.append(systr)
if is_python3_source and not has_non_ascii_literal_characters and check_for_non_ascii_characters(systr):
......@@ -901,7 +937,7 @@ def p_string_literal(s, kind_override=None):
else:
s.error("Invalid hex escape '%s'" % systr,
fatal=False)
elif c in u'NUu' and kind in ('u', ''): # \uxxxx, \Uxxxxxxxx, \N{...}
elif c in u'NUu' and kind in ('u', 'f', ''): # \uxxxx, \Uxxxxxxxx, \N{...}
chrval = -1
if c == u'N':
try:
......@@ -943,14 +979,156 @@ def p_string_literal(s, kind_override=None):
bytes_value, unicode_value = chars.getstrings()
if is_python3_source and has_non_ascii_literal_characters:
# Python 3 forbids literal non-ASCII characters in byte strings
if kind != 'u':
if kind not in ('u', 'f'):
s.error("bytes can only contain ASCII literal characters.",
pos=pos, fatal=False)
bytes_value = None
if kind == 'f':
unicode_value = p_f_string(s, unicode_value, pos)
s.next()
return (kind, bytes_value, unicode_value)
def p_f_string(s, unicode_value, pos):
    # Splits the decoded text of a PEP 498 f-string literal into a list of nodes:
    # UnicodeNodes for the literal text and FormattedValueNodes for the
    # {}-delimited expressions.  All UnicodeNodes are created at position 'pos'.
    nodes = []
    end = len(unicode_value)
    literal_start = 0
    index = 0
    while index < end:
        char = unicode_value[index]
        if char != '{' and char != '}':
            # plain character, stays part of the current literal section
            index += 1
            continue

        if index + 1 < end and unicode_value[index + 1] == char:
            # doubled brace: the literal section ends with a single brace,
            # the second one is skipped
            text = EncodedString(unicode_value[literal_start:index + 1])
            nodes.append(ExprNodes.UnicodeNode(pos, value = text))
            index += 2
            literal_start = index
        elif char == '}':
            s.error("single '}' encountered in format string")
        else:
            # opening brace: close the literal section and parse the expression
            text = EncodedString(unicode_value[literal_start:index])
            nodes.append(ExprNodes.UnicodeNode(pos, value = text))
            index, expr_node = p_f_string_expr(s, unicode_value, pos, index + 1)
            literal_start = index
            nodes.append(expr_node)

    # whatever follows the last expression (possibly nothing) is literal text
    text = EncodedString(unicode_value[literal_start:end])
    nodes.append(ExprNodes.UnicodeNode(pos, value = text))
    return nodes
def p_f_string_expr(s, unicode_value, pos, starting_index):
    # Parses a {}-delimited expression inside an f-string. Returns a FormattedValueNode
    # and the index in the string that follows the expression.
    #
    # s               scanner of the enclosing source; used to report errors and as
    #                 parent scanner when the expression text is parsed
    # unicode_value   decoded text of the f-string that contains the expression
    # pos             position that is passed on to the nodes of the format spec
    # starting_index  index of the first character after the opening '{'
    #
    # Returns the tuple (index after the closing '}', FormattedValueNode).
    # Malformed input is reported through s.error().
    i = starting_index
    size = len(unicode_value)
    conversion_char = None
    format_spec_str = u''

    # Find the end of the expression: the first '!', ':' or '}' that is neither
    # inside a string literal nor inside brackets.
    nested_depth = 0
    quote_char = None
    in_triple_quotes = False
    while True:
        if i >= size:
            s.error("missing '}' in format string expression")
        c = unicode_value[i]

        if quote_char is not None:
            # inside a string literal
            if c == '\\':
                # skip the character that follows the backslash
                i += 1
            elif c == quote_char:
                if in_triple_quotes:
                    if i + 2 < size and unicode_value[i + 1] == c and unicode_value[i + 2] == c:
                        in_triple_quotes = False
                        quote_char = None
                        i += 2
                else:
                    quote_char = None
        elif c in '\'"':
            quote_char = c
            if i + 2 < size and unicode_value[i + 1] == c and unicode_value[i + 2] == c:
                in_triple_quotes = True
                i += 2
        elif c in '{[(':
            nested_depth += 1
        elif nested_depth != 0 and c in '}])':
            nested_depth -= 1
        elif c == '#':
            s.error("format string cannot include #")
        elif nested_depth == 0 and c in '!:}':
            # allow != as a special case
            if c == '!' and i + 1 < size and unicode_value[i + 1] == '=':
                i += 1
                continue

            terminal_char = c
            break
        i += 1

    expr_text = unicode_value[starting_index:i]
    if not expr_text.strip():
        # PEP 498 does not allow empty replacement fields.  Without this check,
        # the parentheses added below would turn "{}" into "()", which parses
        # as an empty tuple instead of being rejected.
        s.error("empty expression not allowed in f-string")

    # the expression is parsed as if it is surrounded by parentheses
    expr_str = u'(%s)' % expr_text

    if terminal_char == '!':
        # a single conversion character follows, then ':' or '}'
        i += 1
        if i >= size:
            s.error("invalid conversion char at end of string")
        conversion_char = unicode_value[i]
        i += 1
        if i >= size:
            s.error("invalid conversion char at end of string")
        terminal_char = unicode_value[i]

    if terminal_char == ':':
        # the format spec runs up to the '}' that is not part of a nested {}-field
        nested_depth = 0
        start_format_spec = i + 1
        while True:
            if i >= size:
                s.error("missing '}' in format specifier")
            c = unicode_value[i]
            if c == '{':
                if nested_depth >= 1:
                    s.error("nesting of '{' in format specifier is not allowed")
                nested_depth += 1
            elif c == '}' and nested_depth == 0:
                terminal_char = c
                break
            elif c == '}':
                nested_depth -= 1
            i += 1

        format_spec_str = unicode_value[start_format_spec:i]

    if terminal_char != '}':
        s.error("missing '}' in format string expression'")

    # parse the expression
    name = 'format string expression'
    code_source = StringSourceDescriptor(name, expr_str)
    buf = StringIO(expr_str)
    scanner = PyrexScanner(buf, code_source, parent_scanner=s, source_encoding=s.source_encoding)
    expr = p_testlist(scanner)  # TODO is testlist right here?

    # validate the conversion char
    if conversion_char is not None and conversion_char not in ExprNodes.FormattedValueNode.conversion_chars:
        s.error("invalid conversion character '%s'" % conversion_char)

    # the format spec is itself treated like an f-string
    if format_spec_str is not None:
        format_spec = ExprNodes.JoinedStrNode(pos, values = p_f_string(s, format_spec_str, pos))
    else:
        format_spec = None

    return i + 1, ExprNodes.FormattedValueNode(
        s.position(), value = expr, conversion_char = conversion_char,
        format_spec = format_spec)
# since PEP 448:
# list_display ::= "[" [listmaker] "]"
# listmaker ::= (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment