Commit 171d69cb authored by Stefan Behnel

implement proper encoding support for new str literals

parent 52e6c71b
......@@ -304,13 +304,15 @@ class StringConst(object):
self.escaped_value = StringEncoding.escape_byte_string(byte_string)
self.py_strings = None
def get_py_string_const(self, encoding, identifier=None):
def get_py_string_const(self, encoding, identifier=None, is_str=False):
py_strings = self.py_strings
text = self.text
if encoding is not None:
encoding = encoding.upper()
key = (bool(identifier), encoding)
is_str = identifier or bool(is_str)
key = (is_str, encoding)
if py_strings is not None and key in py_strings:
py_string = py_strings[key]
else:
......@@ -333,11 +335,11 @@ class StringConst(object):
pystring_cname = "%s%s%s_%s" % (
prefix,
is_unicode and 'u' or 'b',
identifier and 'i' or '',
is_str and 's' or '',
self.cname[len(Naming.const_prefix):])
py_string = PyStringConst(
pystring_cname, is_unicode, bool(identifier), intern)
pystring_cname, encoding, is_unicode, is_str, intern)
self.py_strings[key] = py_string
return py_string
......@@ -346,14 +348,16 @@ class PyStringConst(object):
"""Global info about a Python string constant held by GlobalState.
"""
# cname string
# unicode boolean
# encoding string
# intern boolean
# identifier boolean
# is_unicode boolean
# is_str boolean
def __init__(self, cname, is_unicode, identifier=False, intern=False):
def __init__(self, cname, encoding, is_unicode, is_str=False, intern=False):
self.cname = cname
self.identifier = identifier
self.unicode = is_unicode
self.encoding = encoding
self.is_str = is_str
self.is_unicode = is_unicode
self.intern = intern
def __lt__(self, other):
......@@ -550,10 +554,10 @@ class GlobalState(object):
c = self.new_string_const(text, byte_string)
return c
def get_py_string_const(self, text, identifier=None):
def get_py_string_const(self, text, identifier=None, is_str=False):
# return a Python string constant, creating a new one if necessary
c_string = self.get_string_const(text)
py_string = c_string.get_py_string_const(text.encoding, identifier)
py_string = c_string.get_py_string_const(text.encoding, identifier, is_str)
return py_string
def new_string_const(self, text, byte_string):
......@@ -601,7 +605,7 @@ class GlobalState(object):
def add_cached_builtin_decl(self, entry):
if Options.cache_builtins:
if self.should_declare(entry.cname, entry):
interned_cname = self.get_py_string_const(entry.name, True).cname
interned_cname = self.intern_identifier(entry.name).cname
self.put_pyobject_decl(entry)
w = self.parts['cached_builtins']
w.putln('%s = __Pyx_GetName(%s, %s); if (!%s) %s' % (
......@@ -649,18 +653,26 @@ class GlobalState(object):
w.putln("static __Pyx_StringTabEntry %s[] = {" %
Naming.stringtab_cname)
for c_cname, _, py_string in py_strings:
if not py_string.is_str or not py_string.encoding or \
py_string.encoding in ('ASCII', 'USASCII', 'US-ASCII',
'UTF8', 'UTF-8'):
encoding = '0'
else:
encoding = '"%s"' % py_string.encoding.lower()
decls_writer.putln(
"static PyObject *%s;" % py_string.cname)
w.putln(
"{&%s, %s, sizeof(%s), %d, %d, %d}," % (
"{&%s, %s, sizeof(%s), %s, %d, %d, %d}," % (
py_string.cname,
c_cname,
c_cname,
py_string.unicode,
py_string.intern,
py_string.identifier
encoding,
py_string.is_unicode,
py_string.is_str,
py_string.intern
))
w.putln("{0, 0, 0, 0, 0, 0}")
w.putln("{0, 0, 0, 0, 0, 0, 0}")
w.putln("};")
init_globals = self.parts['init_globals']
......@@ -894,8 +906,8 @@ class CCodeWriter(object):
def get_string_const(self, text):
return self.globalstate.get_string_const(text).cname
def get_py_string_const(self, text, identifier=None):
return self.globalstate.get_py_string_const(text, identifier).cname
def get_py_string_const(self, text, identifier=None, is_str=False):
return self.globalstate.get_py_string_const(text, identifier, is_str).cname
def get_argument_default_const(self, type):
return self.globalstate.get_py_const(type).cname
......@@ -904,7 +916,7 @@ class CCodeWriter(object):
return self.get_py_string_const(text)
def intern_identifier(self, text):
return self.get_py_string_const(text, True)
return self.get_py_string_const(text, identifier=True)
# code generation
......
......@@ -801,6 +801,10 @@ class FloatNode(ConstNode):
class BytesNode(ConstNode):
# A char* or bytes literal
#
# value BytesLiteral
type = PyrexTypes.c_char_ptr_type
def compile_time_value(self, denv):
......@@ -899,27 +903,32 @@ class StringNode(PyConstNode):
# A Python str object, i.e. a byte string in Python 2.x and a
# unicode string in Python 3.x
#
# Can be coerced to a BytesNode (and thus to C types), but not to
# a UnicodeNode.
#
# value BytesLiteral
# is_identifier boolean
type = Builtin.str_type
is_identifier = False
def coerce_to(self, dst_type, env):
if dst_type is Builtin.str_type:
return self
if dst_type is not py_object_type and dst_type is not Builtin.str_type:
# if dst_type is Builtin.bytes_type:
# # special case: bytes = 'str literal'
# return BytesNode(self.pos, value=self.value)
if not dst_type.is_pyobject:
return BytesNode(self.pos, value=self.value).coerce_to(dst_type, env)
if dst_type is not py_object_type:
self.check_for_coercion_error(dst_type, fail=True)
# this will be a unicode string in Py3, so make sure we can decode it
try:
self.value.decode(self.value.encoding)
except UnicodeDecodeError:
error(self.pos, "String decoding as '%s' failed. Consider using a byte string or unicode string explicitly, or adjust the source code encoding." % self.value.encoding)
return self
def generate_evaluation_code(self, code):
self.result_code = code.get_py_string_const(self.value, True)
self.result_code = code.get_py_string_const(
self.value, identifier=self.is_identifier, is_str=True)
def get_constant_c_result_code(self):
return None
......@@ -931,6 +940,12 @@ class StringNode(PyConstNode):
return self.value
class IdentifierStringNode(StringNode):
# A special str value that represents an identifier (bytes in Py2,
# unicode in Py3).
#
# The parser creates this node for keyword argument names and for the
# module/imported names of import statements.  The only difference from
# StringNode is the is_identifier flag, which
# StringNode.generate_evaluation_code() forwards as 'identifier' when it
# requests the Python string constant.
#
# is_identifier    boolean    always True for this class
is_identifier = True
class LongNode(AtomicExprNode):
# Python long integer literal
#
......
......@@ -4757,7 +4757,7 @@ utility_function_predeclarations = \
#define INLINE
#endif
typedef struct {PyObject **p; char *s; long n; char is_unicode; char intern; char is_identifier;} __Pyx_StringTabEntry; /*proto*/
typedef struct {PyObject **p; char *s; const long n; const char* encoding; const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /*proto*/
"""
......@@ -5518,7 +5518,7 @@ impl = """
static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
while (t->p) {
#if PY_MAJOR_VERSION < 3
if (t->is_unicode && (!t->is_identifier)) {
if (t->is_unicode) {
*t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
} else if (t->intern) {
*t->p = PyString_InternFromString(t->s);
......@@ -5526,10 +5526,14 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
*t->p = PyString_FromStringAndSize(t->s, t->n - 1);
}
#else /* Python 3+ has unicode identifiers */
if (t->is_identifier || (t->is_unicode && t->intern)) {
if (t->is_unicode | t->is_str) {
if (t->intern) {
*t->p = PyUnicode_InternFromString(t->s);
} else if (t->is_unicode) {
} else if (t->encoding) {
*t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
} else {
*t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
}
} else {
*t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
}
......
......@@ -348,7 +348,7 @@ def p_call(s, function):
s.error("Expected an identifier before '='",
pos = arg.pos)
encoded_name = EncodedString(arg.name)
keyword = ExprNodes.StringNode(arg.pos, value = encoded_name)
keyword = ExprNodes.IdentifierStringNode(arg.pos, value = encoded_name)
arg = p_simple_expr(s)
keyword_args.append((keyword, arg))
else:
......@@ -1128,14 +1128,14 @@ def p_import_statement(s):
else:
if as_name and "." in dotted_name:
name_list = ExprNodes.ListNode(pos, args = [
ExprNodes.StringNode(pos, value = EncodedString("*"))])
ExprNodes.IdentifierStringNode(pos, value = EncodedString("*"))])
else:
name_list = None
stat = Nodes.SingleAssignmentNode(pos,
lhs = ExprNodes.NameNode(pos,
name = as_name or target_name),
rhs = ExprNodes.ImportNode(pos,
module_name = ExprNodes.StringNode(
module_name = ExprNodes.IdentifierStringNode(
pos, value = dotted_name),
name_list = name_list))
stats.append(stat)
......@@ -1193,7 +1193,7 @@ def p_from_import_statement(s, first_statement = 0):
for (name_pos, name, as_name, kind) in imported_names:
encoded_name = EncodedString(name)
imported_name_strings.append(
ExprNodes.StringNode(name_pos, value = encoded_name))
ExprNodes.IdentifierStringNode(name_pos, value = encoded_name))
items.append(
(name,
ExprNodes.NameNode(name_pos,
......@@ -1203,7 +1203,7 @@ def p_from_import_statement(s, first_statement = 0):
dotted_name = EncodedString(dotted_name)
return Nodes.FromImportStatNode(pos,
module = ExprNodes.ImportNode(dotted_name_pos,
module_name = ExprNodes.StringNode(pos, value = dotted_name),
module_name = ExprNodes.IdentifierStringNode(pos, value = dotted_name),
name_list = import_list),
items = items)
......@@ -1713,7 +1713,7 @@ def p_positional_and_keyword_args(s, end_sy_set, type_positions=(), type_keyword
parsed_type = True
else:
arg = p_simple_expr(s)
keyword_node = ExprNodes.StringNode(
keyword_node = ExprNodes.IdentifierStringNode(
arg.pos, value = EncodedString(ident))
keyword_args.append((keyword_node, arg))
was_keyword = True
......
# -*- coding: latin-1 -*-
# Module doctest: checks that plain 'str' literals from this Latin-1 encoded
# source are str objects with the expected length and value, whether they are
# module globals, concatenated at runtime, concatenated as literals, or held
# in a typed 'cdef str' variable.
#
# NOTE(review): the expected lengths below (6 for 's' and 'typed()', 9 for
# both concatenations) do not match the literals as they appear in this text
# ('ao' has 2 characters, '' has 0).  Presumably the non-ASCII Latin-1
# characters were dropped when this listing was extracted -- restore them
# from the original file before running the test.
__doc__ = (u"""
>>> a == 'abc'
True
>>> isinstance(a, str)
True
>>> isinstance(s, str)
True
>>> len(s)
6
>>> s == 'ao'
True
>>> isinstance(add(), str)
True
>>> len(add())
9
>>> add() == 'abcao'
True
>>> isinstance(add_literal(), str)
True
>>> len(add_literal())
9
>>> add_literal() == 'abcao'
True
>>> isinstance(typed(), str)
True
>>> len(typed())
6
>>> typed() == ''
True
"""
# recoding/escaping is required to properly pass the literals to doctest
).encode('unicode_escape').decode('ASCII')
# Fixtures used by the doctests and functions of this module:
#   a  - plain ASCII str literal
#   s  - str literal (see NOTE(review) above about its lost characters)
#   u  - unicode literal; not referenced by any doctest or function visible here
#   S  - str literal stored in a variable declared as 'cdef str'
a = 'abc'
s = 'ao'
u = u'ao'
cdef str S = ''
# Return the runtime concatenation of the module-level values 'a' and 's'.
# The module doctest expects a str of length 9.
def add():
return a+s
# Return the concatenation of two str literals written inline (the same
# values as 'a' and 's', but without going through the module globals).
# The module doctest expects a str of length 9.
def add_literal():
return 'abc' + 'ao'
# Return the module-level variable 'S', which is declared as 'cdef str'.
# The module doctest expects a str of length 6.
def typed():
return S
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment