strconv += unquote(), unquote_next()

These are functions to decode quoted strings that were produced by strconv.quote().

strconv += unquote(), unquote_next()
These are functions to decode quoted strings that were produced by strconv.quote().
ed6b7895 · Kirill Smelkov · f09701b0 · ed6b7895 · ed6b7895 · ed6b7895
Commit ed6b7895 authored Dec 10, 2018 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 119 additions and 2 deletions

README.rst README.rst +1 -1

golang/strconv.py golang/strconv.py +87 -0

golang/strconv_test.py golang/strconv_test.py +31 -1

No files found.
--- a/README.rst
+++ b/README.rst
@@ -135,7 +135,7 @@ without escaping printable UTF-8 characters::
 and also any other type that can be converted to `str`.

 Package `golang.strconv` provides direct access to conversion routines, for
-example `strconv.quote`.
+example `strconv.quote` and `strconv.unquote`.


 Benchmarking and testing

--- a/golang/strconv.py
+++ b/golang/strconv.py
@@ -102,6 +102,93 @@ def _quote(s):
    return b'"' + b''.join(outv) + b'"'


+# unquote decodes unicode|byte string that was produced by quote.
+#
+# ValueError is raised if there are quoting syntax errors.
+def unquote(s):
+    us, tail = unquote_next(s)
+    if len(tail) != 0:
+        raise ValueError('non-empty tail after closing "')
+    return us
+
+# unquote_next decodes next unicode|byte string that was produced by quote.
+#
+# it returns -> (unquoted(s), tail-after-")
+#
+# ValueError is raised if there are quoting syntax errors.
+def unquote_next(s):
+    s, wasunicode = _bstr(s)
+    us, tail = _unquote_next(s)
+    if wasunicode:
+        us = us.decode('UTF-8')
+        tail = tail.decode('UTF-8')
+    return us, tail
+
+def _unquote_next(s):
+    assert isinstance(s, bytes)
+
+    if len(s) == 0 or s[0:0+1] != b'"':
+        raise ValueError('no starting "')
+
+    outv = []
+    emit= outv.append
+
+    s = s[1:]
+    while 1:
+        r, width = _utf8_decode_rune(s)
+        if width == 0:
+            raise ValueError('no closing "')
+
+        if r == u'"':
+            s = s[1:]
+            break
+
+        # regular UTF-8 character
+        if r != u'\\':
+            emit(s[:width])
+            s = s[width:]
+            continue
+
+        if len(s) < 2:
+            raise ValueError('unexpected EOL after \\')
+
+        c = s[1:1+1]
+
+        # \<c> -> <c>   ; c = \ "
+        if c in b'\\"':
+            emit(c)
+            s = s[2:]
+            continue
+
+        if c == b't':
+            emit(b'\t')
+            s = s[2:]
+            continue
+
+        if c == b'n':
+            emit(b'\n')
+            s = s[2:]
+            continue
+
+        if c == b'r':
+            emit(b'\r')
+            s = s[2:]
+            continue
+
+        if c == b'x':   # hex   XXX also handle octals?
+            if len(s) < 2+2:
+                raise ValueError('unexpected EOL after \\x')
+
+            b = codecs.decode(s[2:2+2], 'hex')
+            emit(b)
+            s = s[2+2:]
+            continue
+
+        raise ValueError('invalid escape \\%s' % chr(ord(c[0:0+1])))
+
+    return b''.join(outv), s
+
+
 _printable_cat0 = frozenset(['L', 'N', 'P', 'S'])   # letters, numbers, punctuation, symbols

 _rune_error = u'\uFFFD' # unicode replacement character

--- a/golang/strconv_test.py
+++ b/golang/strconv_test.py
@@ -18,11 +18,12 @@
 # See COPYING file for full licensing terms.
 # See https://www.nexedi.com/licensing for rationale and options.

-from golang.strconv import quote
+from golang.strconv import quote, unquote, unquote_next
 from golang.gcompat import qq

 from six import int2byte as bchr, PY3
 from six.moves import range as xrange
+from pytest import raises

 def byterange(start, stop):
    b = b""
@@ -62,10 +63,16 @@ def test_quote():

    for tin, tquoted in testv:
        # quote(in) == quoted
+        # in = unquote(quoted)
        q = b'"' if isinstance(tquoted, bytes) else '"'
+        tail = b'123' if isinstance(tquoted, bytes) else '123'
        tquoted = q + tquoted + q   # add lead/trail "

        assert quote(tin) == tquoted
+        assert unquote(tquoted) == tin
+        assert unquote_next(tquoted) == (tin, type(tin)())
+        assert unquote_next(tquoted + tail) == (tin, tail)
+        raises(ValueError, 'unquote(tquoted + tail)')

        # qq always gives str
        assert qq(tin) == asstr(tquoted)
@@ -78,12 +85,35 @@ def test_quote():
                # some inputs are not valid UTF-8
                continue
            tquoted = tquoted.decode('utf-8')
+            tail = tail.decode('utf-8')
        else:
            # tin was unicode
            tin = tin.encode('utf-8')
            tquoted = tquoted.encode('utf-8')
+            tail = tail.encode('utf-8')

        assert quote(tin) == tquoted
+        assert unquote(tquoted) == tin
+        assert unquote_next(tquoted) == (tin, type(tin)())
+        assert unquote_next(tquoted + tail) == (tin, tail)
+        raises(ValueError, 'unquote(tquoted + tail)')

        # qq always gives str
        assert qq(tin) == asstr(tquoted)
+
+
+def test_unquote_bad():
+    testv = (
+        # in            error
+        ('x"zzz"',      'no starting "'),
+        ('"zzz',        'no closing "'),
+        ('"\\',         'unexpected EOL after \\'),
+        ('"\\x',        'unexpected EOL after \\x'),
+        ('"\\x0',       'unexpected EOL after \\x'),
+        ('"\\z"',       'invalid escape \\z'),
+    )
+
+    for tin, err in testv:
+        with raises(ValueError) as exc:
+            unquote(tin)
+        assert exc.value.args == (err,)