Commit f09701b0 authored by Kirill Smelkov's avatar Kirill Smelkov

strconv: New package + expose .quote() there

Move quote implementation from gcompat to strconv. Make quote work on both
unicode|bytes string input and produce the same output type. Preserve qq
(still remaining in gcompat) to always produce str, since that is used
in prints.
parent 94bae8e8
...@@ -134,6 +134,9 @@ without escaping printable UTF-8 characters:: ...@@ -134,6 +134,9 @@ without escaping printable UTF-8 characters::
`qq` accepts both `str` and `bytes` (`unicode` and `str` on Python2) `qq` accepts both `str` and `bytes` (`unicode` and `str` on Python2)
and also any other type that can be converted to `str`. and also any other type that can be converted to `str`.
Package `golang.strconv` provides direct access to conversion routines, for
example `strconv.quote`.
Benchmarking and testing Benchmarking and testing
------------------------ ------------------------
......
...@@ -19,109 +19,26 @@ ...@@ -19,109 +19,26 @@
# See https://www.nexedi.com/licensing for rationale and options. # See https://www.nexedi.com/licensing for rationale and options.
"""Package gcompat provides Go-compatibility layer for Python""" """Package gcompat provides Go-compatibility layer for Python"""
import six, unicodedata from golang import strconv
from six.moves import range as xrange import six
# qq is substitute for %q, which is missing in python. # qq is substitute for %q, which is missing in python.
# #
# (python's automatic escape uses smartquotes quoting with either ' or "). # (python's automatic escape uses smartquotes quoting with either ' or ").
#
# like %s, %q automatically converts its argument to string.
def qq(obj): def qq(obj):
# go: like %s, %q automatically converts to string # make sure obj is text | bytes
decode_utf8 = False # py2: unicode | str
if isinstance(obj, bytes): # py2: str py3: bytes # py3: str | bytes
if six.PY3: if not isinstance(obj, (six.text_type, six.binary_type)):
decode_utf8 = True
elif not isinstance(obj, six.text_type): # py2: unicode py3: str
obj = str(obj) obj = str(obj)
if isinstance(obj, six.text_type): # py2: unicode py3: str -> bytes qobj = strconv.quote(obj)
obj = obj.encode('UTF-8')
decode_utf8 = True
qobj = _quote(obj)
if decode_utf8: # `printf('%s', qq(obj))` should work. For this make sure qobj is always a
# str - not bytes under py3 (if it was bytes it will print e.g. as b'...')
if six.PY3 and isinstance(qobj, bytes):
qobj = qobj.decode('UTF-8') qobj = qobj.decode('UTF-8')
return qobj
# _quote quotes bytes string into valid "..." bytes string always quoted with ".
def _quote(s):
outv = []
emit = outv.append
i = 0
while i < len(s):
c = s[i:i+1]
# fast path - ASCII only
if ord(c) < 0x80:
if c in b'\\"':
emit(b'\\'+c)
# printable ASCII
elif b' ' <= c <= b'\x7e':
emit(c)
# non-printable ASCII
elif c == b'\t':
emit(br'\t')
elif c == b'\n':
emit(br'\n')
elif c == b'\r':
emit(br'\r')
# everything else is non-printable
else:
emit(br'\x%02x' % ord(c))
i += 1
# slow path - full UTF-8 decoding + unicodedata
else:
r, size = _utf8_decode_rune(s[i:])
isize = i + size
# decode error - just emit raw byte as escaped
if r == _rune_error:
emit(br'\x%02x' % ord(c))
# printable utf-8 characters go as is
elif unicodedata.category(r)[0] in _printable_cat0:
emit(s[i:isize])
# everything else goes in numeric byte escapes return qobj
else:
for j in xrange(i, isize):
emit(br'\x%02x' % ord(s[j:j+1]))
i = isize
return b'"' + b''.join(outv) + b'"'
_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols
_rune_error = u'\uFFFD' # unicode replacement character
# _utf8_decode_rune decodes next UTF8-character from byte string s.
#
# _utf8_decode_rune(s) -> (r, size)
def _utf8_decode_rune(s):
if len(s) == 0:
return '', 0
l = min(len(s), 4) # max size of an UTF-8 encoded character
while l > 0:
try:
r = s[:l].decode('utf-8', 'strict')
except UnicodeDecodeError:
l -= 1
continue
if len(r) == 1:
return r, l
l -= 1
continue
# invalid UTF-8
return _rune_error, 1
...@@ -18,41 +18,4 @@ ...@@ -18,41 +18,4 @@
# See COPYING file for full licensing terms. # See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options. # See https://www.nexedi.com/licensing for rationale and options.
from golang.gcompat import qq # qq is tested as part of strconv.
from six import int2byte as bchr
from six.moves import range as xrange
def byterange(start, stop):
    """Return a bytestring made of the byte values start, start+1, ..., stop-1.

    An empty bytestring is returned when start >= stop.
    """
    # build the result in one pass: repeated `b += ...` re-copies everything
    # accumulated so far on every iteration.
    # bytes(bytearray(...)) gives str on py2 and bytes on py3.
    return bytes(bytearray(range(start, stop)))
def test_qq():
    """Check qq() output against a table of (input, expected) pairs.

    Expected values are listed without the surrounding double quotes; they
    are added before comparison.
    """
    cases = (
        # in                want without leading/trailing "
        ('',                r""),

        (byterange(0,32),   r'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'),

        ('\'',              r"'"),
        ('"',               r"\""),
        ('ab c\ndef',       r"ab c\ndef"),
        ('a\'c\ndef',       r"a'c\ndef"),
        ('a\"c\ndef',       r"a\"c\ndef"),
        (u'a\"c\ndef',      u"a\\\"c\\ndef"),
        (b'a\"c\ndef',      r'a\"c\ndef'),
        ('привет\nмир',     r"привет\nмир"),
        (u'привет\nмир',    u"привет\\nмир"),

        # invalid utf-8
        (b"\xd0a",          r"\xd0a"),

        # non-printable utf-8
        (u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087", u"\\x7f\\xc2\\x80\\xc2\\x81\\xc2\\x82\\xc2\\x83\\xc2\\x84\\xc2\\x85\\xc2\\x86\\xc2\\x87"),
    )

    for src, body in cases:
        # add lead/trail "
        assert qq(src) == '"' + body + '"'
# -*- coding: utf-8 -*-
# Copyright (C) 2018 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""Package strconv provides Go-compatible string conversions"""
import six, unicodedata, codecs
from six.moves import range as xrange
# _bstr converts str/unicode/bytes s to UTF-8 encoded bytestring.
#
# TypeError is raised if type(s) is not one of the above.
def _bstr(s): # -> sbytes, wasunicode
wasunicode = False
if isinstance(s, bytes): # py2: str py3: bytes
pass
elif isinstance(s, six.text_type): # py2: unicode py3: str
wasunicode = True
else:
raise TypeError("_bstr: invalid type %s", type(s))
if wasunicode: # py2: unicode py3: str -> bytes
s = s.encode('UTF-8')
return s, wasunicode
# quote quotes unicode|bytes string into valid "..." unicode|bytes string always quoted with ".
def quote(s):
    """Quote `s` into a "..." string literal of the same kind as the input.

    bytes input gives a bytes result; unicode input gives a unicode result.
    The input is converted with `_bstr`, which raises TypeError for any other
    type.
    """
    raw, wasunicode = _bstr(s)
    quoted = _quote(raw)
    # _quote always works on, and returns, bytes - convert back for text input
    return quoted.decode('UTF-8') if wasunicode else quoted
def _quote(s):
assert isinstance(s, bytes)
outv = []
emit = outv.append
i = 0
while i < len(s):
c = s[i:i+1]
# fast path - ASCII only
if ord(c) < 0x80:
if c in b'\\"':
emit(b'\\'+c)
# printable ASCII
elif b' ' <= c <= b'\x7e':
emit(c)
# non-printable ASCII
elif c == b'\t':
emit(br'\t')
elif c == b'\n':
emit(br'\n')
elif c == b'\r':
emit(br'\r')
# everything else is non-printable
else:
emit(br'\x%02x' % ord(c))
i += 1
# slow path - full UTF-8 decoding + unicodedata
else:
r, size = _utf8_decode_rune(s[i:])
isize = i + size
# decode error - just emit raw byte as escaped
if r == _rune_error:
emit(br'\x%02x' % ord(c))
# printable utf-8 characters go as is
elif unicodedata.category(r)[0] in _printable_cat0:
emit(s[i:isize])
# everything else goes in numeric byte escapes
else:
for j in xrange(i, isize):
emit(br'\x%02x' % ord(s[j:j+1]))
i = isize
return b'"' + b''.join(outv) + b'"'
_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols
_rune_error = u'\uFFFD' # unicode replacement character
# _utf8_decode_rune decodes next UTF8-character from byte string s.
#
# _utf8_decode_rune(s) -> (r, size)
def _utf8_decode_rune(s):
assert isinstance(s, bytes)
if len(s) == 0:
return '', 0
l = min(len(s), 4) # max size of an UTF-8 encoded character
while l > 0:
try:
r = s[:l].decode('utf-8', 'strict')
except UnicodeDecodeError:
l -= 1
continue
if len(r) == 1:
return r, l
l -= 1
continue
# invalid UTF-8
return _rune_error, 1
# -*- coding: utf-8 -*-
# Copyright (C) 2018 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
from golang.strconv import quote
from golang.gcompat import qq
from six import int2byte as bchr, PY3
from six.moves import range as xrange
def byterange(start, stop):
    """Return a bytestring made of the byte values start, start+1, ..., stop-1.

    An empty bytestring is returned when start >= stop.
    """
    # build the result in one pass: repeated `b += ...` re-copies everything
    # accumulated so far on every iteration.
    # bytes(bytearray(...)) gives str on py2 and bytes on py3.
    return bytes(bytearray(range(start, stop)))
def asstr(s):
    """Return `s` as text when it is bytes under Python 3; otherwise unchanged.

    Under Python 3 bytes are decoded as UTF-8. Under Python 2, and for
    non-bytes input, `s` is returned as is.
    """
    needs_decode = PY3 and isinstance(s, bytes)
    return s.decode('utf-8') if needs_decode else s
def test_quote():
    """Check quote() and qq() against a table of (input, quoted) pairs.

    Quoted values are listed without the surrounding double quotes. Every
    case is checked twice: as written, and with both input and expectation
    converted to the complementary unicode/bytes type. The second check is
    skipped for bytes inputs that are not valid UTF-8.
    """
    cases = (
        # in                quoted without leading/trailing "
        ('',                r""),

        (byterange(0,32),   br'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'),

        ('\'',              r"'"),
        ('"',               r"\""),
        ('ab c\ndef',       r"ab c\ndef"),
        ('a\'c\ndef',       r"a'c\ndef"),
        ('a\"c\ndef',       r"a\"c\ndef"),
        (u'a\"c\ndef',      u"a\\\"c\\ndef"),
        (b'a\"c\ndef',      br'a\"c\ndef'),
        ('привет\nмир',     r"привет\nмир"),
        (u'привет\nмир',    u"привет\\nмир"),

        # invalid utf-8
        (b"\xd0a",          br"\xd0a"),

        # non-printable utf-8
        (u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087", u"\\x7f\\xc2\\x80\\xc2\\x81\\xc2\\x82\\xc2\\x83\\xc2\\x84\\xc2\\x85\\xc2\\x86\\xc2\\x87"),
    )

    for src, body in cases:
        # quote(in) == quoted; the delimiter has to be of the same type as body
        dquote = '"' if not isinstance(body, bytes) else b'"'
        want = dquote + body + dquote   # add lead/trail "
        assert quote(src) == want

        # qq always gives str
        assert qq(src) == asstr(want)

        # also check how it works on complementary unicode/bytes input type
        if not isinstance(src, bytes):
            # src was unicode
            src = src.encode('utf-8')
            want = want.encode('utf-8')
        else:
            try:
                src = src.decode('utf-8')
            except UnicodeDecodeError:
                # some inputs are not valid UTF-8
                continue
            want = want.decode('utf-8')

        assert quote(src) == want

        # qq always gives str
        assert qq(src) == asstr(want)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment