Merge branch 'f_string' into f_strings_2

6f6e1b10 · Stefan Behnel · dec1d449 · c6a07689 · 6f6e1b10 · 6f6e1b10
Commit 6f6e1b10 authored Mar 21, 2016 by Stefan Behnel
5 changed files
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -2954,6 +2954,115 @@ class RawCNameExprNode(ExprNode):
        pass


+#-------------------------------------------------------------------
+#
+#  F-strings
+#
+#-------------------------------------------------------------------
+
+
+class JoinedStrNode(ExprNode):
+    # F-strings
+    #
+    # values [UnicodeNode|FormattedValueNode] Substrings of the f-string
+    #
+    # At runtime, the values are collected in a temporary list which is then
+    # joined using the empty unicode string (Naming.empty_unicode) as separator.
+    #
+    type = py_object_type
+
+    subexprs = ['values']
+
+    def analyse_types(self, env):
+        # Analyse all parts and coerce them to Python objects so that they
+        # can be stored in the list that gets joined.
+        self.values = [v.analyse_types(env) for v in self.values]
+        self.values = [v.coerce_to_pyobject(env) for v in self.values]
+        self.is_temp = 1
+        if len(self.values) == 1:
+            # this is not uncommon because f-string format specs are parsed into JoinedStrNodes
+            return self.values[0]
+        else:
+            return self
+
+    def generate_result_code(self, code):
+        # Emits a C block that fills a temporary list with the part values
+        # and joins it into the result string.
+        list_var = Naming.quick_temp_cname
+        num_items = len(self.values)
+
+        code.putln('{')
+        code.putln('PyObject *%s = PyList_New(%s); %s' % (
+            list_var,
+            num_items,
+            code.error_goto_if_null(list_var, self.pos)))
+        code.put_gotref(list_var)
+        for i, value in enumerate(self.values):
+            # PyList_SET_ITEM() steals a reference, so give it a new one
+            code.put_incref(value.result(), value.ctype())
+            code.put_giveref(value.py_result())
+            code.putln('PyList_SET_ITEM(%s, %s, %s);' % (list_var, i, value.py_result()))
+        # The list is released right after joining, so the NULL check has to
+        # test the join result and not the (already released) list variable.
+        code.putln('%s = PyUnicode_Join(%s, %s); __Pyx_DECREF(%s); %s' % (
+            self.result(),
+            Naming.empty_unicode,
+            list_var,
+            list_var,
+            code.error_goto_if_null(self.result(), self.pos)))
+        code.put_gotref(self.py_result())
+        code.putln('}')
+
+
+class FormattedValueNode(ExprNode):
+    # {}-delimited portions of an f-string
+    #
+    # value           ExprNode                The expression itself
+    # conversion_char str or None             Type conversion (!s, !r, !a, or none)
+    # format_spec     JoinedStrNode or None   Format string passed to __format__
+    #
+    # The generated code optionally converts the value with the C-API function
+    # selected by conversion_char and then passes the outcome to
+    # PyObject_Format() together with the format spec.
+    subexprs = ['value', 'format_spec']
+
+    conversion_chars = 'sra'
+    type = py_object_type
+
+    def analyse_types(self, env):
+        # Both the value and the format spec are used as Python objects
+        # by the C-API calls emitted in generate_result_code().
+        value = self.value.analyse_types(env)
+        format_spec = self.format_spec.analyse_types(env)
+        self.value = value.coerce_to_pyobject(env)
+        self.format_spec = format_spec.coerce_to_pyobject(env)
+        self.is_temp = True
+        return self
+
+    def generate_result_code(self, code):
+        # Emits a C block that applies the optional conversion and formats
+        # the (converted) value into the result.
+        value_result = self.value.py_result()
+        conversion_result = Naming.quick_temp_cname
+        format_spec_result = self.format_spec.py_result()
+        if self.conversion_char == 's':
+            fn = 'PyObject_Str'
+        elif self.conversion_char == 'r':
+            fn = 'PyObject_Repr'
+        elif self.conversion_char == 'a':
+            fn = 'PyObject_ASCII'
+        else:
+            fn = None
+
+        code.putln('{')
+
+        if fn is not None:
+            # The conversion function returns a new reference that is owned
+            # by this block and must be released again after formatting.
+            code.putln('PyObject *%s = %s(%s); %s' % (
+                conversion_result,
+                fn,
+                value_result,
+                code.error_goto_if_null(conversion_result, self.pos)
+            ))
+            code.put_gotref(conversion_result)
+            decref_line = '__Pyx_DECREF(%s);' % conversion_result
+        else:
+            # No conversion: only borrow the reference held by the value node,
+            # so there is nothing to release here.
+            code.putln('PyObject *%s = %s;' % (conversion_result, value_result))
+            decref_line = ''
+
+        # The owned conversion result is released before the error check so
+        # that it does not leak when PyObject_Format() fails.
+        code.putln("%s = PyObject_Format(%s, %s); %s %s" % (
+            self.result(),
+            conversion_result,
+            format_spec_result,
+            decref_line,
+            code.error_goto_if_null(self.result(), self.pos)))
+        code.put_gotref(self.py_result())
+        code.putln('}')
+
+
 #-------------------------------------------------------------------
 #
 #  Parallel nodes (cython.parallel.thread(savailable|id))

--- a/Cython/Compiler/Lexicon.py
+++ b/Cython/Compiler/Lexicon.py
@@ -7,7 +7,7 @@ from __future__ import absolute_import

 raw_prefixes = "rR"
 bytes_prefixes = "bB"
-string_prefixes = "uU" + bytes_prefixes
+string_prefixes = "fFuU" + bytes_prefixes
 char_prefixes = "cC"
 any_string_prefix = raw_prefixes + string_prefixes + char_prefixes
 IDENT = 'IDENT'
@@ -40,8 +40,8 @@ def make_lexicon():
    fltconst = (decimal_fract + Opt(exponent)) | (decimal + exponent)
    imagconst = (intconst | fltconst) + Any("jJ")

-    beginstring = Opt(Any(string_prefixes) + Opt(Any(raw_prefixes)) |
-                      Any(raw_prefixes) + Opt(Any(bytes_prefixes)) |
+    # invalid combinations of prefixes are caught in p_string_literal
+    beginstring = Opt(Rep(Any(string_prefixes + raw_prefixes)) |
                      Any(char_prefixes)
                      ) + (Str("'") | Str('"') | Str("'''") | Str('"""'))
    two_oct = octdigit + octdigit

--- a/Cython/Compiler/ModuleNode.py
+++ b/Cython/Compiler/ModuleNode.py
@@ -695,6 +695,7 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
        code.putln('static PyObject *%s;' % Naming.builtins_cname)
        code.putln('static PyObject *%s;' % Naming.empty_tuple)
        code.putln('static PyObject *%s;' % Naming.empty_bytes)
+        code.putln('static PyObject *%s;' % Naming.empty_unicode)
        if Options.pre_import is not None:
            code.putln('static PyObject *%s;' % Naming.preimport_cname)
        code.putln('static int %s;' % Naming.lineno_cname)
@@ -2123,6 +2124,8 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
            Naming.empty_tuple, code.error_goto_if_null(Naming.empty_tuple, self.pos)))
        code.putln("%s = PyBytes_FromStringAndSize(\"\", 0); %s" % (
            Naming.empty_bytes, code.error_goto_if_null(Naming.empty_bytes, self.pos)))
+        code.putln("%s = PyUnicode_FromStringAndSize(\"\", 0); %s" % (
+            Naming.empty_unicode, code.error_goto_if_null(Naming.empty_unicode, self.pos)))

        for ext_type in ('CyFunction', 'FusedFunction', 'Coroutine', 'Generator', 'StopAsyncIteration'):
            code.putln("#ifdef __Pyx_%s_USED" % ext_type)

--- a/Cython/Compiler/Naming.py
+++ b/Cython/Compiler/Naming.py
@@ -96,6 +96,7 @@ gilstate_cname   = pyrex_prefix + "state"
 skip_dispatch_cname = pyrex_prefix + "skip_dispatch"
 empty_tuple      = pyrex_prefix + "empty_tuple"
 empty_bytes      = pyrex_prefix + "empty_bytes"
+empty_unicode	 = pyrex_prefix + "empty_unicode"
 print_function   = pyrex_prefix + "print"
 print_function_kwargs   = pyrex_prefix + "print_kwargs"
 cleanup_cname    = pyrex_prefix + "module_cleanup"

--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -15,12 +15,13 @@ cython.declare(Nodes=object, ExprNodes=object, EncodedString=object,
               re=object, _unicode=object, _bytes=object,
               partial=object, reduce=object, _IS_PY3=cython.bint)

+from io import StringIO
 import re
 import sys
 from unicodedata import lookup as lookup_unicodechar
 from functools import partial, reduce

-from .Scanning import PyrexScanner, FileSourceDescriptor
+from .Scanning import PyrexScanner, FileSourceDescriptor, StringSourceDescriptor
 from . import Nodes
 from . import ExprNodes
 from . import Builtin
@@ -693,8 +694,12 @@ def p_atom(s):
            return ExprNodes.UnicodeNode(pos, value = unicode_value, bytes_value = bytes_value)
        elif kind == 'b':
            return ExprNodes.BytesNode(pos, value = bytes_value)
-        else:
+        elif kind == 'f':
+            return ExprNodes.JoinedStrNode(pos, values = unicode_value)
+        elif kind == '':
            return ExprNodes.StringNode(pos, value = bytes_value, unicode_value = unicode_value)
+        else:
+            s.error("invalid string kind '%s'" % kind)
    elif sy == 'IDENT':
        name = s.systring
        s.next()
@@ -788,29 +793,44 @@ def wrap_compile_time_constant(pos, value):
 def p_cat_string_literal(s):
    # A sequence of one or more adjacent string literals.
    # Returns (kind, bytes_value, unicode_value)
-    # where kind in ('b', 'c', 'u', '')
+    # where kind in ('b', 'c', 'u', 'f', '')
+    pos = s.position()
    kind, bytes_value, unicode_value = p_string_literal(s)
    if kind == 'c' or s.sy != 'BEGIN_STRING':
        return kind, bytes_value, unicode_value
-    bstrings, ustrings = [bytes_value], [unicode_value]
+    bstrings, ustrings, positions = [bytes_value], [unicode_value], [pos]
    bytes_value = unicode_value = None
    while s.sy == 'BEGIN_STRING':
        pos = s.position()
        next_kind, next_bytes_value, next_unicode_value = p_string_literal(s)
        if next_kind == 'c':
            error(pos, "Cannot concatenate char literal with another string or char literal")
+            continue
        elif next_kind != kind:
-            error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" %
-                  (kind, next_kind))
-        else:
-            bstrings.append(next_bytes_value)
-            ustrings.append(next_unicode_value)
+            # concatenating f strings and normal strings is allowed and leads to an f string
+            if {kind, next_kind} == {'f', 'u'} or {kind, next_kind} == {'f', ''}:
+                kind = 'f'
+            else:
+                error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" %
+                      (kind, next_kind))
+                continue
+        bstrings.append(next_bytes_value)
+        ustrings.append(next_unicode_value)
+        positions.append(pos)
    # join and rewrap the partial literals
    if kind in ('b', 'c', '') or kind == 'u' and None not in bstrings:
        # Py3 enforced unicode literals are parsed as bytes/unicode combination
        bytes_value = bytes_literal(StringEncoding.join_bytes(bstrings), s.source_encoding)
    if kind in ('u', ''):
        unicode_value = EncodedString( u''.join([ u for u in ustrings if u is not None ]) )
+    if kind == 'f':
+        unicode_value = []
+        for u, pos in zip(ustrings, positions):
+            if isinstance(u, list):
+                unicode_value += u
+            else:
+                # non-f-string concatenated into the f-string
+                unicode_value.append(ExprNodes.UnicodeNode(pos, value = EncodedString(u)))
    return kind, bytes_value, unicode_value

 def p_opt_string_literal(s, required_type='u'):
@@ -833,36 +853,52 @@ def check_for_non_ascii_characters(string):

 def p_string_literal(s, kind_override=None):
    # A single string or char literal.  Returns (kind, bvalue, uvalue)
-    # where kind in ('b', 'c', 'u', '').  The 'bvalue' is the source
+    # where kind in ('b', 'c', 'u', 'f', '').  The 'bvalue' is the source
    # code byte sequence of the string literal, 'uvalue' is the
    # decoded Unicode string.  Either of the two may be None depending
    # on the 'kind' of string, only unprefixed strings have both
-    # representations.
+    # representations. In f-strings, the uvalue is a list of the Unicode
+    # strings and f-string expressions that make up the f-string.

    # s.sy == 'BEGIN_STRING'
    pos = s.position()
    is_raw = False
    is_python3_source = s.context.language_level >= 3
    has_non_ascii_literal_characters = False
-    kind = s.systring[:1].lower()
-    if kind == 'r':
-        # Py3 allows both 'br' and 'rb' as prefix
-        if s.systring[1:2].lower() == 'b':
-            kind = 'b'
-        else:
-            kind = ''
-        is_raw = True
-    elif kind in 'ub':
-        is_raw = s.systring[1:2].lower() == 'r'
-    elif kind != 'c':
+    kind_string = s.systring.rstrip('"\'').lower()
+    if len(set(kind_string)) != len(kind_string):
+        s.error('Duplicate string prefix character')
+    if 'b' in kind_string and 'u' in kind_string:
+        s.error('String prefixes b and u cannot be combined')
+    if 'b' in kind_string and 'f' in kind_string:
+        s.error('String prefixes b and f cannot be combined')
+    if 'u' in kind_string and 'f' in kind_string:
+        s.error('String prefixes u and f cannot be combined')
+
+    is_raw = 'r' in kind_string
+
+    if 'c' in kind_string:
+        # this should never happen, since the lexer does not allow combining c
+        # with other prefix characters
+        if len(kind_string) != 1:
+            s.error('Invalid string prefix for character literal')
+        kind = 'c'
+    elif 'f' in kind_string:
+        kind = 'f'  # u is ignored
+    elif 'b' in kind_string:
+        kind = 'b'
+    elif 'u' in kind_string:
+        kind = 'u'
+    else:
        kind = ''
+
    if kind == '' and kind_override is None and Future.unicode_literals in s.context.future_directives:
        chars = StringEncoding.StrLiteralBuilder(s.source_encoding)
        kind = 'u'
    else:
        if kind_override is not None and kind_override in 'ub':
            kind = kind_override
-        if kind == 'u':
+        if kind in {'u', 'f'}:  # f-strings are scanned exactly like Unicode literals, but are parsed further later
            chars = StringEncoding.UnicodeLiteralBuilder()
        elif kind == '':
            chars = StringEncoding.StrLiteralBuilder(s.source_encoding)
@@ -873,7 +909,7 @@ def p_string_literal(s, kind_override=None):
        s.next()
        sy = s.sy
        systr = s.systring
-        #print "p_string_literal: sy =", sy, repr(s.systring) ###
+        # print "p_string_literal: sy =", sy, repr(s.systring) ###
        if sy == 'CHARS':
            chars.append(systr)
            if is_python3_source and not has_non_ascii_literal_characters and check_for_non_ascii_characters(systr):
@@ -901,7 +937,7 @@ def p_string_literal(s, kind_override=None):
                    else:
                        s.error("Invalid hex escape '%s'" % systr,
                                fatal=False)
-                elif c in u'NUu' and kind in ('u', ''):   # \uxxxx, \Uxxxxxxxx, \N{...}
+                elif c in u'NUu' and kind in ('u', 'f', ''):   # \uxxxx, \Uxxxxxxxx, \N{...}
                    chrval = -1
                    if c == u'N':
                        try:
@@ -943,14 +979,156 @@ def p_string_literal(s, kind_override=None):
        bytes_value, unicode_value = chars.getstrings()
        if is_python3_source and has_non_ascii_literal_characters:
            # Python 3 forbids literal non-ASCII characters in byte strings
-            if kind != 'u':
+            if kind not in ('u', 'f'):
                s.error("bytes can only contain ASCII literal characters.",
                        pos=pos, fatal=False)
            bytes_value = None
+    if kind == 'f':
+        unicode_value = p_f_string(s, unicode_value, pos)
    s.next()
    return (kind, bytes_value, unicode_value)


+def p_f_string(s, unicode_value, pos):
+    # Splits the Unicode content of a PEP 498 f-string literal into a list of
+    # nodes: UnicodeNodes for literal text and the nodes returned by
+    # p_f_string_expr() for {}-delimited expressions.  A doubled brace
+    # ('{{' or '}}') contributes a single literal brace to the text.
+    parts = []
+    length = len(unicode_value)
+    literal_start = 0
+    index = 0
+    while index < length:
+        ch = unicode_value[index]
+        if ch != '{' and ch != '}':
+            # plain character, stays part of the current literal section
+            index += 1
+            continue
+
+        if index + 1 < length and unicode_value[index + 1] == ch:
+            # doubled brace: the literal section ends with the first brace,
+            # the second one is skipped
+            text = EncodedString(unicode_value[literal_start:index + 1])
+            parts.append(ExprNodes.UnicodeNode(pos, value = text))
+            index += 2
+            literal_start = index
+        elif ch == '}':
+            s.error("single '}' encountered in format string")
+        else:
+            # single '{': close the literal section and parse the expression
+            text = EncodedString(unicode_value[literal_start:index])
+            parts.append(ExprNodes.UnicodeNode(pos, value = text))
+            index, expr_node = p_f_string_expr(s, unicode_value, pos, index + 1)
+            literal_start = index
+            parts.append(expr_node)
+
+    # whatever follows the last brace is the trailing literal section
+    text = EncodedString(unicode_value[literal_start:])
+    parts.append(ExprNodes.UnicodeNode(pos, value = text))
+    return parts
+
+
+def p_f_string_expr(s, unicode_value, pos, starting_index):
+    # Parses a {}-delimited expression inside an f-string.
+    #
+    # 'unicode_value' is the content of the f-string and 'starting_index' the
+    # index of the first character after the opening '{'.
+    #
+    # Returns a tuple (next_index, FormattedValueNode) where 'next_index' is
+    # the index in the string that follows the closing '}' of the expression.
+    i = starting_index
+    size = len(unicode_value)
+    conversion_char = None
+    format_spec_str = u''
+
+    nested_depth = 0
+    quote_char = None
+    in_triple_quotes = False
+
+    # Step 1: find the end of the expression, i.e. the first '!', ':' or '}'
+    # that is neither inside of a string literal nor inside of brackets.
+    while True:
+        if i >= size:
+            s.error("missing '}' in format string expression")
+        c = unicode_value[i]
+
+        if quote_char is not None:
+            # inside of a string literal: only look for its end
+            if c == '\\':
+                # skip the escaped character
+                i += 1
+            elif c == quote_char:
+                if in_triple_quotes:
+                    if i + 2 < size and unicode_value[i + 1] == c and unicode_value[i + 2] == c:
+                        in_triple_quotes = False
+                        quote_char = None
+                        i += 2
+                else:
+                    quote_char = None
+        elif c in '\'"':
+            quote_char = c
+            if i + 2 < size and unicode_value[i + 1] == c and unicode_value[i + 2] == c:
+                in_triple_quotes = True
+                i += 2
+        elif c in '{[(':
+            nested_depth += 1
+        elif nested_depth != 0 and c in '}])':
+            nested_depth -= 1
+        elif c == '#':
+            s.error("format string cannot include #")
+        elif nested_depth == 0 and c in '!:}':
+            # allow != as a special case
+            if c == '!' and i + 1 < size and unicode_value[i + 1] == '=':
+                i += 1
+                continue
+
+            terminal_char = c
+            break
+        i += 1
+
+    # the expression is parsed as if it is surrounded by parentheses
+    expr_str = u'(%s)' % unicode_value[starting_index:i]
+
+    # Step 2: optional conversion, a single character following the '!'
+    if terminal_char == '!':
+        i += 1
+        if i >= size:
+            s.error("invalid conversion char at end of string")
+
+        conversion_char = unicode_value[i]
+
+        i += 1
+        if i >= size:
+            s.error("invalid conversion char at end of string")
+        terminal_char = unicode_value[i]
+
+    # Step 3: optional format spec, everything between the ':' and the
+    # closing '}', which may contain one level of nested {}-expressions
+    if terminal_char == ':':
+        nested_depth = 0
+        start_format_spec = i + 1
+        while True:
+            if i >= size:
+                s.error("missing '}' in format specifier")
+            c = unicode_value[i]
+            if c == '{':
+                if nested_depth >= 1:
+                    s.error("nesting of '{' in format specifier is not allowed")
+                nested_depth += 1
+            elif c == '}' and nested_depth == 0:
+                terminal_char = c
+                break
+            elif c == '}':
+                nested_depth -= 1
+            i += 1
+
+        format_spec_str = unicode_value[start_format_spec:i]
+
+    if terminal_char != '}':
+        s.error("missing '}' in format string expression")
+
+    # parse the expression
+    name = 'format string expression'
+    code_source = StringSourceDescriptor(name, expr_str)
+    buf = StringIO(expr_str)
+    scanner = PyrexScanner(buf, code_source, parent_scanner=s, source_encoding=s.source_encoding)
+    expr = p_testlist(scanner)  # TODO is testlist right here?
+
+    # validate the conversion char
+    if conversion_char is not None and conversion_char not in ExprNodes.FormattedValueNode.conversion_chars:
+        s.error("invalid conversion character '%s'" % conversion_char)
+
+    # the format spec is itself treated like an f-string
+    # (it is always a string here, an absent format spec is the empty string)
+    format_spec = ExprNodes.JoinedStrNode(pos, values = p_f_string(s, format_spec_str, pos))
+
+    return i + 1, ExprNodes.FormattedValueNode(
+        s.position(), value = expr, conversion_char = conversion_char,
+        format_spec = format_spec)
+
+
 # since PEP 448:
 # list_display  ::=     "[" [listmaker] "]"
 # listmaker     ::=     (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )