Commit 171d69cb authored by Stefan Behnel

implement proper encoding support for new str literals

parent 52e6c71b
...@@ -304,13 +304,15 @@ class StringConst(object): ...@@ -304,13 +304,15 @@ class StringConst(object):
self.escaped_value = StringEncoding.escape_byte_string(byte_string) self.escaped_value = StringEncoding.escape_byte_string(byte_string)
self.py_strings = None self.py_strings = None
def get_py_string_const(self, encoding, identifier=None): def get_py_string_const(self, encoding, identifier=None, is_str=False):
py_strings = self.py_strings py_strings = self.py_strings
text = self.text text = self.text
if encoding is not None: if encoding is not None:
encoding = encoding.upper() encoding = encoding.upper()
key = (bool(identifier), encoding) is_str = identifier or bool(is_str)
key = (is_str, encoding)
if py_strings is not None and key in py_strings: if py_strings is not None and key in py_strings:
py_string = py_strings[key] py_string = py_strings[key]
else: else:
...@@ -333,11 +335,11 @@ class StringConst(object): ...@@ -333,11 +335,11 @@ class StringConst(object):
pystring_cname = "%s%s%s_%s" % ( pystring_cname = "%s%s%s_%s" % (
prefix, prefix,
is_unicode and 'u' or 'b', is_unicode and 'u' or 'b',
identifier and 'i' or '', is_str and 's' or '',
self.cname[len(Naming.const_prefix):]) self.cname[len(Naming.const_prefix):])
py_string = PyStringConst( py_string = PyStringConst(
pystring_cname, is_unicode, bool(identifier), intern) pystring_cname, encoding, is_unicode, is_str, intern)
self.py_strings[key] = py_string self.py_strings[key] = py_string
return py_string return py_string
...@@ -346,14 +348,16 @@ class PyStringConst(object): ...@@ -346,14 +348,16 @@ class PyStringConst(object):
"""Global info about a Python string constant held by GlobalState. """Global info about a Python string constant held by GlobalState.
""" """
# cname string # cname string
# unicode boolean # encoding string
# intern boolean # intern boolean
# identifier boolean # is_unicode boolean
# is_str boolean
def __init__(self, cname, is_unicode, identifier=False, intern=False): def __init__(self, cname, encoding, is_unicode, is_str=False, intern=False):
self.cname = cname self.cname = cname
self.identifier = identifier self.encoding = encoding
self.unicode = is_unicode self.is_str = is_str
self.is_unicode = is_unicode
self.intern = intern self.intern = intern
def __lt__(self, other): def __lt__(self, other):
...@@ -550,10 +554,10 @@ class GlobalState(object): ...@@ -550,10 +554,10 @@ class GlobalState(object):
c = self.new_string_const(text, byte_string) c = self.new_string_const(text, byte_string)
return c return c
def get_py_string_const(self, text, identifier=None): def get_py_string_const(self, text, identifier=None, is_str=False):
# return a Python string constant, creating a new one if necessary # return a Python string constant, creating a new one if necessary
c_string = self.get_string_const(text) c_string = self.get_string_const(text)
py_string = c_string.get_py_string_const(text.encoding, identifier) py_string = c_string.get_py_string_const(text.encoding, identifier, is_str)
return py_string return py_string
def new_string_const(self, text, byte_string): def new_string_const(self, text, byte_string):
...@@ -601,7 +605,7 @@ class GlobalState(object): ...@@ -601,7 +605,7 @@ class GlobalState(object):
def add_cached_builtin_decl(self, entry): def add_cached_builtin_decl(self, entry):
if Options.cache_builtins: if Options.cache_builtins:
if self.should_declare(entry.cname, entry): if self.should_declare(entry.cname, entry):
interned_cname = self.get_py_string_const(entry.name, True).cname interned_cname = self.intern_identifier(entry.name).cname
self.put_pyobject_decl(entry) self.put_pyobject_decl(entry)
w = self.parts['cached_builtins'] w = self.parts['cached_builtins']
w.putln('%s = __Pyx_GetName(%s, %s); if (!%s) %s' % ( w.putln('%s = __Pyx_GetName(%s, %s); if (!%s) %s' % (
...@@ -649,18 +653,26 @@ class GlobalState(object): ...@@ -649,18 +653,26 @@ class GlobalState(object):
w.putln("static __Pyx_StringTabEntry %s[] = {" % w.putln("static __Pyx_StringTabEntry %s[] = {" %
Naming.stringtab_cname) Naming.stringtab_cname)
for c_cname, _, py_string in py_strings: for c_cname, _, py_string in py_strings:
if not py_string.is_str or not py_string.encoding or \
py_string.encoding in ('ASCII', 'USASCII', 'US-ASCII',
'UTF8', 'UTF-8'):
encoding = '0'
else:
encoding = '"%s"' % py_string.encoding.lower()
decls_writer.putln( decls_writer.putln(
"static PyObject *%s;" % py_string.cname) "static PyObject *%s;" % py_string.cname)
w.putln( w.putln(
"{&%s, %s, sizeof(%s), %d, %d, %d}," % ( "{&%s, %s, sizeof(%s), %s, %d, %d, %d}," % (
py_string.cname, py_string.cname,
c_cname, c_cname,
c_cname, c_cname,
py_string.unicode, encoding,
py_string.intern, py_string.is_unicode,
py_string.identifier py_string.is_str,
py_string.intern
)) ))
w.putln("{0, 0, 0, 0, 0, 0}") w.putln("{0, 0, 0, 0, 0, 0, 0}")
w.putln("};") w.putln("};")
init_globals = self.parts['init_globals'] init_globals = self.parts['init_globals']
...@@ -894,8 +906,8 @@ class CCodeWriter(object): ...@@ -894,8 +906,8 @@ class CCodeWriter(object):
def get_string_const(self, text): def get_string_const(self, text):
return self.globalstate.get_string_const(text).cname return self.globalstate.get_string_const(text).cname
def get_py_string_const(self, text, identifier=None): def get_py_string_const(self, text, identifier=None, is_str=False):
return self.globalstate.get_py_string_const(text, identifier).cname return self.globalstate.get_py_string_const(text, identifier, is_str).cname
def get_argument_default_const(self, type): def get_argument_default_const(self, type):
return self.globalstate.get_py_const(type).cname return self.globalstate.get_py_const(type).cname
...@@ -904,7 +916,7 @@ class CCodeWriter(object): ...@@ -904,7 +916,7 @@ class CCodeWriter(object):
return self.get_py_string_const(text) return self.get_py_string_const(text)
def intern_identifier(self, text): def intern_identifier(self, text):
return self.get_py_string_const(text, True) return self.get_py_string_const(text, identifier=True)
# code generation # code generation
......
...@@ -801,6 +801,10 @@ class FloatNode(ConstNode): ...@@ -801,6 +801,10 @@ class FloatNode(ConstNode):
class BytesNode(ConstNode): class BytesNode(ConstNode):
# A char* or bytes literal
#
# value BytesLiteral
type = PyrexTypes.c_char_ptr_type type = PyrexTypes.c_char_ptr_type
def compile_time_value(self, denv): def compile_time_value(self, denv):
...@@ -899,27 +903,32 @@ class StringNode(PyConstNode): ...@@ -899,27 +903,32 @@ class StringNode(PyConstNode):
# A Python str object, i.e. a byte string in Python 2.x and a # A Python str object, i.e. a byte string in Python 2.x and a
# unicode string in Python 3.x # unicode string in Python 3.x
# #
# Can be coerced to a BytesNode (and thus to C types), but not to # value BytesLiteral
# a UnicodeNode. # is_identifier boolean
#
# value BytesLiteral
type = Builtin.str_type type = Builtin.str_type
is_identifier = False
def coerce_to(self, dst_type, env): def coerce_to(self, dst_type, env):
if dst_type is Builtin.str_type: if dst_type is not py_object_type and dst_type is not Builtin.str_type:
return self # if dst_type is Builtin.bytes_type:
# if dst_type is Builtin.bytes_type: # # special case: bytes = 'str literal'
# # special case: bytes = 'str literal' # return BytesNode(self.pos, value=self.value)
# return BytesNode(self.pos, value=self.value) if not dst_type.is_pyobject:
if not dst_type.is_pyobject: return BytesNode(self.pos, value=self.value).coerce_to(dst_type, env)
return BytesNode(self.pos, value=self.value).coerce_to(dst_type, env)
if dst_type is not py_object_type:
self.check_for_coercion_error(dst_type, fail=True) self.check_for_coercion_error(dst_type, fail=True)
# this will be a unicode string in Py3, so make sure we can decode it
try:
self.value.decode(self.value.encoding)
except UnicodeDecodeError:
error(self.pos, "String decoding as '%s' failed. Consider using a byte string or unicode string explicitly, or adjust the source code encoding." % self.value.encoding)
return self return self
def generate_evaluation_code(self, code): def generate_evaluation_code(self, code):
self.result_code = code.get_py_string_const(self.value, True) self.result_code = code.get_py_string_const(
self.value, identifier=self.is_identifier, is_str=True)
def get_constant_c_result_code(self): def get_constant_c_result_code(self):
return None return None
...@@ -931,6 +940,12 @@ class StringNode(PyConstNode): ...@@ -931,6 +940,12 @@ class StringNode(PyConstNode):
return self.value return self.value
class IdentifierStringNode(StringNode):
# A special str value that represents an identifier (bytes in Py2,
# unicode in Py3).
is_identifier = True
class LongNode(AtomicExprNode): class LongNode(AtomicExprNode):
# Python long integer literal # Python long integer literal
# #
......
...@@ -4757,7 +4757,7 @@ utility_function_predeclarations = \ ...@@ -4757,7 +4757,7 @@ utility_function_predeclarations = \
#define INLINE #define INLINE
#endif #endif
typedef struct {PyObject **p; char *s; long n; char is_unicode; char intern; char is_identifier;} __Pyx_StringTabEntry; /*proto*/ typedef struct {PyObject **p; char *s; const long n; const char* encoding; const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /*proto*/
""" """
...@@ -5518,7 +5518,7 @@ impl = """ ...@@ -5518,7 +5518,7 @@ impl = """
static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
while (t->p) { while (t->p) {
#if PY_MAJOR_VERSION < 3 #if PY_MAJOR_VERSION < 3
if (t->is_unicode && (!t->is_identifier)) { if (t->is_unicode) {
*t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
} else if (t->intern) { } else if (t->intern) {
*t->p = PyString_InternFromString(t->s); *t->p = PyString_InternFromString(t->s);
...@@ -5526,10 +5526,14 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { ...@@ -5526,10 +5526,14 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
*t->p = PyString_FromStringAndSize(t->s, t->n - 1); *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
} }
#else /* Python 3+ has unicode identifiers */ #else /* Python 3+ has unicode identifiers */
if (t->is_identifier || (t->is_unicode && t->intern)) { if (t->is_unicode | t->is_str) {
*t->p = PyUnicode_InternFromString(t->s); if (t->intern) {
} else if (t->is_unicode) { *t->p = PyUnicode_InternFromString(t->s);
*t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); } else if (t->encoding) {
*t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
} else {
*t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
}
} else { } else {
*t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
} }
......
...@@ -348,7 +348,7 @@ def p_call(s, function): ...@@ -348,7 +348,7 @@ def p_call(s, function):
s.error("Expected an identifier before '='", s.error("Expected an identifier before '='",
pos = arg.pos) pos = arg.pos)
encoded_name = EncodedString(arg.name) encoded_name = EncodedString(arg.name)
keyword = ExprNodes.StringNode(arg.pos, value = encoded_name) keyword = ExprNodes.IdentifierStringNode(arg.pos, value = encoded_name)
arg = p_simple_expr(s) arg = p_simple_expr(s)
keyword_args.append((keyword, arg)) keyword_args.append((keyword, arg))
else: else:
...@@ -1128,14 +1128,14 @@ def p_import_statement(s): ...@@ -1128,14 +1128,14 @@ def p_import_statement(s):
else: else:
if as_name and "." in dotted_name: if as_name and "." in dotted_name:
name_list = ExprNodes.ListNode(pos, args = [ name_list = ExprNodes.ListNode(pos, args = [
ExprNodes.StringNode(pos, value = EncodedString("*"))]) ExprNodes.IdentifierStringNode(pos, value = EncodedString("*"))])
else: else:
name_list = None name_list = None
stat = Nodes.SingleAssignmentNode(pos, stat = Nodes.SingleAssignmentNode(pos,
lhs = ExprNodes.NameNode(pos, lhs = ExprNodes.NameNode(pos,
name = as_name or target_name), name = as_name or target_name),
rhs = ExprNodes.ImportNode(pos, rhs = ExprNodes.ImportNode(pos,
module_name = ExprNodes.StringNode( module_name = ExprNodes.IdentifierStringNode(
pos, value = dotted_name), pos, value = dotted_name),
name_list = name_list)) name_list = name_list))
stats.append(stat) stats.append(stat)
...@@ -1193,7 +1193,7 @@ def p_from_import_statement(s, first_statement = 0): ...@@ -1193,7 +1193,7 @@ def p_from_import_statement(s, first_statement = 0):
for (name_pos, name, as_name, kind) in imported_names: for (name_pos, name, as_name, kind) in imported_names:
encoded_name = EncodedString(name) encoded_name = EncodedString(name)
imported_name_strings.append( imported_name_strings.append(
ExprNodes.StringNode(name_pos, value = encoded_name)) ExprNodes.IdentifierStringNode(name_pos, value = encoded_name))
items.append( items.append(
(name, (name,
ExprNodes.NameNode(name_pos, ExprNodes.NameNode(name_pos,
...@@ -1203,7 +1203,7 @@ def p_from_import_statement(s, first_statement = 0): ...@@ -1203,7 +1203,7 @@ def p_from_import_statement(s, first_statement = 0):
dotted_name = EncodedString(dotted_name) dotted_name = EncodedString(dotted_name)
return Nodes.FromImportStatNode(pos, return Nodes.FromImportStatNode(pos,
module = ExprNodes.ImportNode(dotted_name_pos, module = ExprNodes.ImportNode(dotted_name_pos,
module_name = ExprNodes.StringNode(pos, value = dotted_name), module_name = ExprNodes.IdentifierStringNode(pos, value = dotted_name),
name_list = import_list), name_list = import_list),
items = items) items = items)
...@@ -1713,7 +1713,7 @@ def p_positional_and_keyword_args(s, end_sy_set, type_positions=(), type_keyword ...@@ -1713,7 +1713,7 @@ def p_positional_and_keyword_args(s, end_sy_set, type_positions=(), type_keyword
parsed_type = True parsed_type = True
else: else:
arg = p_simple_expr(s) arg = p_simple_expr(s)
keyword_node = ExprNodes.StringNode( keyword_node = ExprNodes.IdentifierStringNode(
arg.pos, value = EncodedString(ident)) arg.pos, value = EncodedString(ident))
keyword_args.append((keyword_node, arg)) keyword_args.append((keyword_node, arg))
was_keyword = True was_keyword = True
......
# -*- coding: latin-1 -*-
# Module doctests: they check that `a`, `s`, and the results of add(),
# add_literal() and typed() are all `str` instances with the expected
# lengths and values.
__doc__ = (u"""
>>> a == 'abc'
True
>>> isinstance(a, str)
True
>>> isinstance(s, str)
True
>>> len(s)
6
>>> s == 'ao'
True
>>> isinstance(add(), str)
True
>>> len(add())
9
>>> add() == 'abcao'
True
>>> isinstance(add_literal(), str)
True
>>> len(add_literal())
9
>>> add_literal() == 'abcao'
True
>>> isinstance(typed(), str)
True
>>> len(typed())
6
>>> typed() == ''
True
"""
# The docstring is written as a unicode literal; it is escaped with
# 'unicode_escape' and decoded back as ASCII because recoding/escaping is
# required to properly pass the literals to doctest.
).encode('unicode_escape').decode('ASCII')
# Module-level values exercised by the doctests in __doc__.
a = 'abc'
# NOTE(review): the doctests expect len(s) == 6, which this literal as shown
# does not satisfy -- presumably non-ASCII (latin-1) characters were lost when
# this text was extracted; verify against the original file.
s = 'ao'
# Explicit unicode literal; not referenced elsewhere in the visible file.
u = u'ao'
# Statically typed as `str` via cdef; returned by typed().
# NOTE(review): the doctests expect len(typed()) == 6 -- same caveat as `s`.
cdef str S = ''
# Return the concatenation of the module-level values `a` and `s`.
def add():
return a+s
# Return the concatenation of two str literals written inline, mirroring
# add() but without going through the module-level names.
def add_literal():
return 'abc' + 'ao'
# Return the module-level `S`, which is declared with `cdef str`.
def typed():
return S
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment