Commit bf558ecf authored by Stefan Behnel

support redundant parsing of string literals as unicode *and* bytes string,...

support redundant parsing of string literals as unicode *and* bytes string, fix 'str' literal assignments to char* targets when using Future.unicode_literals
parent 184482af
...@@ -961,8 +961,10 @@ class BytesNode(ConstNode): ...@@ -961,8 +961,10 @@ class BytesNode(ConstNode):
class UnicodeNode(PyConstNode): class UnicodeNode(PyConstNode):
# A Python unicode object # A Python unicode object
# #
# value EncodedString # value EncodedString
# bytes_value BytesLiteral the literal parsed as bytes string ('-3' unicode literals only)
bytes_value = None
type = unicode_type type = unicode_type
def coerce_to(self, dst_type, env): def coerce_to(self, dst_type, env):
...@@ -975,6 +977,9 @@ class UnicodeNode(PyConstNode): ...@@ -975,6 +977,9 @@ class UnicodeNode(PyConstNode):
int_value = ord(self.value) int_value = ord(self.value)
return IntNode(self.pos, value=int_value, constant_result=int_value) return IntNode(self.pos, value=int_value, constant_result=int_value)
elif not dst_type.is_pyobject: elif not dst_type.is_pyobject:
if dst_type.is_string and self.bytes_value is not None:
# special case: '-3' enforced unicode literal used in a C char* context
return BytesNode(self.pos, value=self.bytes_value).coerce_to(dst_type, env)
error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.") error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.")
elif dst_type is not py_object_type: elif dst_type is not py_object_type:
if not self.check_for_coercion_error(dst_type): if not self.check_for_coercion_error(dst_type):
...@@ -1015,11 +1020,13 @@ class StringNode(PyConstNode): ...@@ -1015,11 +1020,13 @@ class StringNode(PyConstNode):
# A Python str object, i.e. a byte string in Python 2.x and a # A Python str object, i.e. a byte string in Python 2.x and a
# unicode string in Python 3.x # unicode string in Python 3.x
# #
# value BytesLiteral or EncodedString # value BytesLiteral
# unicode_value EncodedString
# is_identifier boolean # is_identifier boolean
type = str_type type = str_type
is_identifier = None is_identifier = None
unicode_value = None
def coerce_to(self, dst_type, env): def coerce_to(self, dst_type, env):
if dst_type is not py_object_type and not str_type.subtype_of(dst_type): if dst_type is not py_object_type and not str_type.subtype_of(dst_type):
......
...@@ -593,15 +593,15 @@ def p_atom(s): ...@@ -593,15 +593,15 @@ def p_atom(s):
s.next() s.next()
return ExprNodes.ImagNode(pos, value = value) return ExprNodes.ImagNode(pos, value = value)
elif sy == 'BEGIN_STRING': elif sy == 'BEGIN_STRING':
kind, value = p_cat_string_literal(s) kind, bytes_value, unicode_value = p_cat_string_literal(s)
if kind == 'c': if kind == 'c':
return ExprNodes.CharNode(pos, value = value) return ExprNodes.CharNode(pos, value = bytes_value)
elif kind == 'u': elif kind == 'u':
return ExprNodes.UnicodeNode(pos, value = value) return ExprNodes.UnicodeNode(pos, value = unicode_value, bytes_value = bytes_value)
elif kind == 'b': elif kind == 'b':
return ExprNodes.BytesNode(pos, value = value) return ExprNodes.BytesNode(pos, value = bytes_value)
else: else:
return ExprNodes.StringNode(pos, value = value) return ExprNodes.StringNode(pos, value = bytes_value, unicode_value = unicode_value)
elif sy == 'IDENT': elif sy == 'IDENT':
name = EncodedString( s.systring ) name = EncodedString( s.systring )
s.next() s.next()
...@@ -642,38 +642,53 @@ def p_name(s, name): ...@@ -642,38 +642,53 @@ def p_name(s, name):
def p_cat_string_literal(s): def p_cat_string_literal(s):
# A sequence of one or more adjacent string literals. # A sequence of one or more adjacent string literals.
# Returns (kind, value) where kind in ('b', 'c', 'u', '') # Returns (kind, bytes_value, unicode_value)
kind, value = p_string_literal(s) # where kind in ('b', 'c', 'u', '')
if s.sy != 'BEGIN_STRING': kind, bytes_value, unicode_value = p_string_literal(s)
return kind, value if kind == 'c' or s.sy != 'BEGIN_STRING':
if kind != 'c': return kind, bytes_value, unicode_value
strings = [value] bstrings, ustrings = [bytes_value], [unicode_value]
while s.sy == 'BEGIN_STRING': bytes_value = unicode_value = None
pos = s.position() while s.sy == 'BEGIN_STRING':
next_kind, next_value = p_string_literal(s) pos = s.position()
if next_kind == 'c': next_kind, next_bytes_value, next_unicode_value = p_string_literal(s)
error(pos, "Cannot concatenate char literal with another string or char literal") if next_kind == 'c':
elif next_kind != kind: error(pos, "Cannot concatenate char literal with another string or char literal")
error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" % elif next_kind != kind:
(kind, next_kind)) error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" %
else: (kind, next_kind))
strings.append(next_value)
if kind == 'u':
value = EncodedString( u''.join(strings) )
else: else:
value = BytesLiteral( StringEncoding.join_bytes(strings) ) bstrings.append(next_bytes_value)
value.encoding = s.source_encoding ustrings.append(next_unicode_value)
return kind, value # join and rewrap the partial literals
if kind in ('b', 'c', '') or kind == 'u' and bstrings[0] is not None:
def p_opt_string_literal(s): # Py3 enforced unicode literals are parsed as bytes/unicode combination
bytes_value = BytesLiteral( StringEncoding.join_bytes([ b for b in bstrings if b is not None ]) )
bytes_value.encoding = s.source_encoding
if kind in ('u', ''):
unicode_value = EncodedString( u''.join([ u for u in ustrings if u is not None ]) )
return kind, bytes_value, unicode_value
def p_opt_string_literal(s, required_type='u'):
if s.sy == 'BEGIN_STRING': if s.sy == 'BEGIN_STRING':
return p_string_literal(s) kind, bytes_value, unicode_value = p_string_literal(s, required_type)
if required_type == 'u':
return unicode_value
elif required_type == 'b':
return bytes_value
else:
s.error("internal parser configuration error")
else: else:
return None return None
def p_string_literal(s, kind_override=None): def p_string_literal(s, kind_override=None):
# A single string or char literal. # A single string or char literal. Returns (kind, bvalue, uvalue)
# Returns (kind, value) where kind in ('b', 'c', 'u', '') # where kind in ('b', 'c', 'u', ''). The 'bvalue' is the source
# code byte sequence of the string literal, 'uvalue' is the
# decoded Unicode string. Either of the two may be None depending
# on the 'kind' of string, only unprefixed strings have both
# representations.
# s.sy == 'BEGIN_STRING' # s.sy == 'BEGIN_STRING'
pos = s.position() pos = s.position()
is_raw = 0 is_raw = 0
...@@ -685,15 +700,18 @@ def p_string_literal(s, kind_override=None): ...@@ -685,15 +700,18 @@ def p_string_literal(s, kind_override=None):
is_raw = s.systring[1:2].lower() == 'r' is_raw = s.systring[1:2].lower() == 'r'
elif kind != 'c': elif kind != 'c':
kind = '' kind = ''
if Future.unicode_literals in s.context.future_directives: if kind == '' and kind_override is None and Future.unicode_literals in s.context.future_directives:
if kind == '': chars = StringEncoding.StrLiteralBuilder(s.source_encoding)
kind = 'u' kind = 'u'
if kind_override is not None and kind_override in 'ub': else:
kind = kind_override if kind_override is not None and kind_override in 'ub':
if kind == 'u': kind = kind_override
chars = StringEncoding.UnicodeLiteralBuilder() if kind == 'u':
else: chars = StringEncoding.UnicodeLiteralBuilder()
chars = StringEncoding.BytesLiteralBuilder(s.source_encoding) elif kind == '':
chars = StringEncoding.StrLiteralBuilder(s.source_encoding)
else:
chars = StringEncoding.BytesLiteralBuilder(s.source_encoding)
while 1: while 1:
s.next() s.next()
sy = s.sy sy = s.sy
...@@ -723,20 +741,18 @@ def p_string_literal(s, kind_override=None): ...@@ -723,20 +741,18 @@ def p_string_literal(s, kind_override=None):
StringEncoding.char_from_escape_sequence(systr)) StringEncoding.char_from_escape_sequence(systr))
elif c == u'\n': elif c == u'\n':
pass pass
elif c in u'Uux': elif c == u'x':
if kind == 'u' or c == 'x': chars.append_charval( int(systr[2:], 16) )
elif c in u'Uu':
if kind in ('u', ''):
chrval = int(systr[2:], 16) chrval = int(systr[2:], 16)
if chrval > 1114111: # sys.maxunicode: if chrval > 1114111: # sys.maxunicode:
s.error("Invalid unicode escape '%s'" % systr, s.error("Invalid unicode escape '%s'" % systr,
pos = pos) pos = pos)
elif chrval > 65535:
warning(s.position(),
"Unicode characters above 65535 are not "
"necessarily portable across Python installations", 1)
chars.append_charval(chrval)
else: else:
# unicode escapes in plain byte strings are not unescaped # unicode escapes in plain byte strings are not unescaped
chars.append(systr) chrval = None
chars.append_uescape(chrval, systr)
else: else:
chars.append(u'\\' + systr[1:]) chars.append(u'\\' + systr[1:])
elif sy == 'NEWLINE': elif sy == 'NEWLINE':
...@@ -750,14 +766,14 @@ def p_string_literal(s, kind_override=None): ...@@ -750,14 +766,14 @@ def p_string_literal(s, kind_override=None):
"Unexpected token %r:%r in string literal" % "Unexpected token %r:%r in string literal" %
(sy, s.systring)) (sy, s.systring))
if kind == 'c': if kind == 'c':
value = chars.getchar() unicode_value = None
if len(value) != 1: bytes_value = chars.getchar()
error(pos, u"invalid character literal: %r" % value) if len(bytes_value) != 1:
error(pos, u"invalid character literal: %r" % bytes_value)
else: else:
value = chars.getstring() bytes_value, unicode_value = chars.getstrings()
s.next() s.next()
#print "p_string_literal: value =", repr(value) ### return (kind, bytes_value, unicode_value)
return kind, value
# list_display ::= "[" [listmaker] "]" # list_display ::= "[" [listmaker] "]"
# listmaker ::= expression ( comp_for | ( "," expression )* [","] ) # listmaker ::= expression ( comp_for | ( "," expression )* [","] )
...@@ -1447,10 +1463,10 @@ def p_except_clause(s): ...@@ -1447,10 +1463,10 @@ def p_except_clause(s):
def p_include_statement(s, ctx): def p_include_statement(s, ctx):
pos = s.position() pos = s.position()
s.next() # 'include' s.next() # 'include'
_, include_file_name = p_string_literal(s) unicode_include_file_name = p_string_literal(s, 'u')[2]
s.expect_newline("Syntax error in include statement") s.expect_newline("Syntax error in include statement")
if s.compile_time_eval: if s.compile_time_eval:
include_file_name = include_file_name.decode(s.source_encoding) include_file_name = unicode_include_file_name
include_file_path = s.context.find_include_file(include_file_name, pos) include_file_path = s.context.find_include_file(include_file_name, pos)
if include_file_path: if include_file_path:
s.included_files.append(include_file_name) s.included_files.append(include_file_name)
...@@ -1986,10 +2002,9 @@ def p_sign_and_longness(s): ...@@ -1986,10 +2002,9 @@ def p_sign_and_longness(s):
return signed, longness return signed, longness
def p_opt_cname(s): def p_opt_cname(s):
literal = p_opt_string_literal(s) literal = p_opt_string_literal(s, 'u')
if literal: if literal is not None:
_, cname = literal cname = EncodedString(literal)
cname = EncodedString(cname)
cname.encoding = s.source_encoding cname.encoding = s.source_encoding
else: else:
cname = None cname = None
...@@ -2300,11 +2315,11 @@ def p_cdef_extern_block(s, pos, ctx): ...@@ -2300,11 +2315,11 @@ def p_cdef_extern_block(s, pos, ctx):
if s.sy == '*': if s.sy == '*':
s.next() s.next()
else: else:
_, include_file = p_string_literal(s) include_file = p_string_literal(s, 'u')[2]
ctx = ctx(cdef_flag = 1, visibility = 'extern') ctx = ctx(cdef_flag = 1, visibility = 'extern')
if s.systring == "namespace": if s.systring == "namespace":
s.next() s.next()
ctx.namespace = p_string_literal(s, kind_override='u')[1] ctx.namespace = p_string_literal(s, 'u')[2]
if p_nogil(s): if p_nogil(s):
ctx.nogil = 1 ctx.nogil = 1
body = p_suite(s, ctx) body = p_suite(s, ctx)
...@@ -2677,17 +2692,16 @@ def p_property_decl(s): ...@@ -2677,17 +2692,16 @@ def p_property_decl(s):
def p_doc_string(s): def p_doc_string(s):
if s.sy == 'BEGIN_STRING': if s.sy == 'BEGIN_STRING':
pos = s.position() pos = s.position()
kind, result = p_cat_string_literal(s) kind, bytes_result, unicode_result = p_cat_string_literal(s)
if s.sy != 'EOF': if s.sy != 'EOF':
s.expect_newline("Syntax error in doc string") s.expect_newline("Syntax error in doc string")
if kind != 'u': if kind in ('u', ''):
# warning(pos, "Python 3 requires docstrings to be unicode strings") return unicode_result
if kind == 'b': warning(pos, "Python 3 requires docstrings to be unicode strings")
result.encoding = None # force a unicode string return bytes_result
return result
else: else:
return None return None
def p_code(s, level=None): def p_code(s, level=None):
body = p_statement_list(s, Ctx(level = level), first_statement = 1) body = p_statement_list(s, Ctx(level = level), first_statement = 1)
if s.sy != 'EOF': if s.sy != 'EOF':
......
...@@ -44,9 +44,15 @@ class UnicodeLiteralBuilder(object): ...@@ -44,9 +44,15 @@ class UnicodeLiteralBuilder(object):
def append_charval(self, char_number): def append_charval(self, char_number):
self.chars.append( unichr(char_number) ) self.chars.append( unichr(char_number) )
def append_uescape(self, char_number, escape_string):
self.append_charval(char_number)
def getstring(self): def getstring(self):
return EncodedString(u''.join(self.chars)) return EncodedString(u''.join(self.chars))
def getstrings(self):
return (None, self.getstring())
class BytesLiteralBuilder(object): class BytesLiteralBuilder(object):
"""Assemble a byte string or char value. """Assemble a byte string or char value.
...@@ -64,6 +70,9 @@ class BytesLiteralBuilder(object): ...@@ -64,6 +70,9 @@ class BytesLiteralBuilder(object):
def append_charval(self, char_number): def append_charval(self, char_number):
self.chars.append( unichr(char_number).encode('ISO-8859-1') ) self.chars.append( unichr(char_number).encode('ISO-8859-1') )
def append_uescape(self, char_number, escape_string):
self.append(escape_string)
def getstring(self): def getstring(self):
# this *must* return a byte string! # this *must* return a byte string!
s = BytesLiteral(join_bytes(self.chars)) s = BytesLiteral(join_bytes(self.chars))
...@@ -74,6 +83,32 @@ class BytesLiteralBuilder(object): ...@@ -74,6 +83,32 @@ class BytesLiteralBuilder(object):
# this *must* return a byte string! # this *must* return a byte string!
return self.getstring() return self.getstring()
def getstrings(self):
return (self.getstring(), None)
class StrLiteralBuilder(object):
    """Collect one string literal in two forms at the same time.

    Every piece handed to this builder is forwarded to an internal
    bytes builder and an internal unicode builder, so that both
    representations can be retrieved together via getstrings().
    """

    def __init__(self, target_encoding):
        # Only the bytes-side builder is given the target encoding.
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Add the given characters to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Add the character with the given number to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape sequence.

        The bytes representation receives the escape sequence text
        as-is, while the unicode representation receives the character
        identified by char_number.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the pair (bytes string, unicode string) built so far."""
        bytes_literal = self._bytes.getstring()
        unicode_literal = self._unicode.getstring()
        return (bytes_literal, unicode_literal)
class EncodedString(_unicode): class EncodedString(_unicode):
# unicode string subclass to keep track of the original encoding. # unicode string subclass to keep track of the original encoding.
# 'encoding' is None for unicode strings and the source encoding # 'encoding' is None for unicode strings and the source encoding
......
...@@ -7,6 +7,8 @@ if sys.version_info[0] >= 3: ...@@ -7,6 +7,8 @@ if sys.version_info[0] >= 3:
True True
>>> isinstance(u, str) >>> isinstance(u, str)
True True
>>> isinstance(b, bytes)
True
""" """
else: else:
__doc__ = u""" __doc__ = u"""
...@@ -14,6 +16,11 @@ else: ...@@ -14,6 +16,11 @@ else:
True True
>>> isinstance(u, unicode) >>> isinstance(u, unicode)
True True
>>> isinstance(b, bytes)
True
""" """
u = "test" u = "test"
cdef char* s = "bytes test"
b = s
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment