gcompat: Teach qq to accept both str and unicode + emit printable UTF-8 as is

This patch started as a way to teach qq to also work on python3. However, the differences in str / unicode and in escapes between py2 and py3 quickly popped up, and it became easier to just handle the whole escaping logic myself. The implementation is based on go123@c0bbd06e, and a byproduct of the manual handling is that we now don't escape printable UTF-8 characters.

gcompat: Teach qq to accept both str and unicode + emit printable UTF-8 as is
This patch started as a way to teach qq to also work on python3. However, the differences in str / unicode and in escapes between py2 and py3 quickly popped up, and it became easier to just handle the whole escaping logic myself. The implementation is based on go123@c0bbd06e, and a byproduct of the manual handling is that we now don't escape printable UTF-8 characters.
02dddb97 · Kirill Smelkov · 812e7ed7 · 02dddb97 · 02dddb97 · 02dddb97
Commit 02dddb97 authored Jul 02, 2018 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 128 additions and 22 deletions

golang/gcompat.py golang/gcompat.py +97 -14

golang/gcompat_test.py golang/gcompat_test.py +29 -8

setup.py setup.py +2 -0

No files found.
--- a/golang/gcompat.py
+++ b/golang/gcompat.py
@@ -19,26 +19,109 @@
 # See https://www.nexedi.com/licensing for rationale and options.
 """Package gcompat provides Go-compatibility layer for Python"""

+import six, unicodedata
+from six.moves import range as xrange
+
 # qq is substitute for %q, which is missing in python.
 #
 # (python's automatic escape uses smartquotes quoting with either ' or ").
 def qq(obj):
     # go: like %s, %q automatically converts to string
-    if not isinstance(obj, basestring):
+    #
+    # The quoting itself is done by _quote on bytes. decode_utf8 records
+    # whether the quoted bytes have to be decoded back to text before return:
+    #
+    #   - bytes in  (py2 str):      returned as is, without decoding;
+    #   - bytes in  (py3 bytes):    result is decoded;
+    #   - text in   (py2 unicode / py3 str): encoded to UTF-8 for _quote,
+    #                               result is decoded;
+    #   - anything else:            converted with str() first, then handled
+    #                               as bytes or text according to what str() gave.
+    decode_utf8 = False
+    if isinstance(obj, bytes):                      # py2: str      py3: bytes
+        if six.PY3:
+            decode_utf8 = True
+    elif not isinstance(obj, six.text_type):        # py2: unicode  py3: str
         obj = str(obj)
-    return _quote(obj)

-# _quote quotes string into valid "..." string always quoted with ".
+    # _quote works only on bytes - convert text to UTF-8 for it.
+    if isinstance(obj, six.text_type):              # py2: unicode  py3: str    -> bytes
+        obj = obj.encode('UTF-8')
+        decode_utf8 = True
+
+    qobj = _quote(obj)
+
+    if decode_utf8:
+        qobj = qobj.decode('UTF-8')
+    return qobj
+
+
+# _quote quotes bytes string into valid "..." bytes string always quoted with ".
 def _quote(s):
-    # TODO also accept unicode as input.
-    # TODO output printable UTF-8 characters as-is, but escape non-printable UTF-8 and invalid UTF-8 bytes.
+    # s must be a bytes string; the result is a bytes string too.
+    #
+    # " and \ are backslash-escaped, \t \n \r get their short escapes,
+    # other printable ASCII and printable UTF-8 characters are emitted as is,
+    # and every other byte is emitted as \xNN.
     outv = []
-    # we don't want ' to be escaped
-    for _ in s.split("'"):
-        # this escape almost everything except " character
-        # NOTE string_escape does not do smartquotes and always uses ' for quoting
-        # (repr(str) is the same except it does smartquoting picking ' or " automatically)
-        q = _.encode("string_escape")
-        q = q.replace('"', r'\"')
-        outv.append(q)
-    return '"' + "'".join(outv) + '"'
+    emit = outv.append
+    i = 0
+    while i < len(s):
+        # 1-byte slice, not s[i]: slicing gives bytes both on py2 and py3
+        c = s[i:i+1]
+        # fast path - ASCII only
+        if ord(c) < 0x80:
+            if c in b'\\"':
+                emit(b'\\'+c)
+
+            # printable ASCII
+            elif b' ' <= c <= b'\x7e':
+                emit(c)
+
+            # non-printable ASCII
+            elif c == b'\t':
+                emit(br'\t')
+            elif c == b'\n':
+                emit(br'\n')
+            elif c == b'\r':
+                emit(br'\r')
+
+            # everything else is non-printable
+            else:
+                emit(br'\x%02x' % ord(c))
+
+            i += 1
+
+        # slow path - full UTF-8 decoding + unicodedata
+        else:
+            # an UTF-8 character takes at most 4 bytes, so pass only those
+            # instead of copying the whole tail of s for every character.
+            r, size = _utf8_decode_rune(s[i:i+4])
+            isize = i + size
+
+            # decode error - just emit raw byte as escaped.
+            #
+            # size is checked too: a correctly encoded U+FFFD decodes to the
+            # same rune as the error marker, but with size=3. Without the
+            # check such input would lose its 2nd and 3rd bytes.
+            if r == _rune_error and size == 1:
+                emit(br'\x%02x' % ord(c))
+
+            # printable utf-8 characters go as is
+            elif unicodedata.category(r)[0] in _printable_cat0:
+                emit(s[i:isize])
+
+            # everything else goes in numeric byte escapes
+            else:
+                for j in xrange(i, isize):
+                    emit(br'\x%02x' % ord(s[j:j+1]))
+
+            i = isize
+
+    return b'"' + b''.join(outv) + b'"'
+
+
+_printable_cat0 = frozenset(['L', 'N', 'P', 'S'])   # letters, numbers, punctuation, symbols
+
+_rune_error = u'\uFFFD' # unicode replacement character
+
+# _utf8_decode_rune decodes next UTF8-character from byte string s.
+#
+# _utf8_decode_rune(s) -> (r, size)
+def _utf8_decode_rune(s):
+    # nothing to decode
+    if not s:
+        return '', 0
+
+    # Try the longest candidate prefix first (an UTF-8 encoded character
+    # takes at most 4 bytes) and shrink it until the prefix decodes to
+    # exactly one character.
+    for n in range(min(len(s), 4), 0, -1):
+        try:
+            r = s[:n].decode('utf-8', 'strict')
+        except UnicodeDecodeError:
+            continue
+
+        if len(r) == 1:
+            return r, n
+
+    # invalid UTF-8
+    return _rune_error, 1
--- a/golang/gcompat_test.py
+++ b/golang/gcompat_test.py
@@ -19,17 +19,38 @@
 # See https://www.nexedi.com/licensing for rationale and options.

 from golang.gcompat import qq
+from six import int2byte as bchr
+from six.moves import range as xrange
+
+def byterange(start, stop):
+    # byterange returns bytes made of every byte value in [start, stop).
+    return b"".join(bchr(i) for i in xrange(start, stop))

 def test_qq():
    testv = (
-        # in            want without leading/trailing "
-        ('',            r""),
-        ('\'',          r"'"),
-        ('"',           r"\""),
-        ('abc\ndef',    r"abc\ndef"),
-        ('a\'c\ndef',   r"a'c\ndef"),
-        ('a\"c\ndef',   r"a\"c\ndef"),
-        # ('привет',      r"привет"),       TODO
+        # in                want without leading/trailing "
+        ('',                r""),
+
+        (byterange(0,32),   r'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'),
+
+        ('\'',              r"'"),
+        ('"',               r"\""),
+        ('ab c\ndef',       r"ab c\ndef"),
+        ('a\'c\ndef',       r"a'c\ndef"),
+        ('a\"c\ndef',       r"a\"c\ndef"),
+        (u'a\"c\ndef',      u"a\\\"c\\ndef"),
+        (b'a\"c\ndef',      r'a\"c\ndef'),
+        ('привет\nмир',     r"привет\nмир"),
+        (u'привет\nмир',    u"привет\\nмир"),
+
+        # invalid utf-8
+        (b"\xd0a",          r"\xd0a"),
+
+        # non-printable utf-8
+        (u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087", u"\\x7f\\xc2\\x80\\xc2\\x81\\xc2\\x82\\xc2\\x83\\xc2\\x84\\xc2\\x85\\xc2\\x86\\xc2\\x87"),
    )

    for tin, twant in testv:

--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,8 @@ setup(

    packages    = find_packages(),

+    install_requires = ['six'],
+
    extras_require = {
                  'test': ['pytest'],
    },