Commit c0bbd06e authored by Kirill Smelkov's avatar Kirill Smelkov

xfmt: Qpy & friends to quote string the way Python would do

This is sometimes needed for checking a program's output bit-for-bit when, on
the Python side, repr(x), `x` or %r is used for output.
parent 1aa677c8
// Copyright (C) 2017 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Open Source Initiative approved licenses and Convey
// the resulting work. Corresponding source of such a combination shall include
// the source code for all other software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// quoting by Python rules
package xfmt
import (
"strconv"
"unicode/utf8"
"lab.nexedi.com/kirr/go123/mem"
"lab.nexedi.com/kirr/go123/xbytes"
)
// AppendQuotePy appends to buf Python quoting of s
func AppendQuotePy(buf []byte, s string) []byte {
return AppendQuotePyBytes(buf, mem.Bytes(s))
}
// AppendQuotePyBytes appends to buf Python quoting of b and returns the
// extended buffer.
//
// Quoting rules:
//
//   - the quote character is ' unless b contains ' and does not contain ",
//     in which case " is used;
//   - backslash and the chosen quote character are backslash-escaped;
//   - printable ASCII is emitted as is;
//   - \t, \n and \r are emitted as such; other non-printable ASCII bytes
//     are emitted as \xHH;
//   - bytes that do not form valid UTF-8 are emitted as \xHH one by one;
//   - valid UTF-8 sequences of printable runes are emitted as is, and those
//     of non-printable runes are emitted as \xHH for every byte.
func AppendQuotePyBytes(buf, b []byte) []byte {
	// smartquotes: choose ' or " as quoting character
	// https://github.com/python/cpython/blob/v2.7.13-116-g1aa1803b3d/Objects/stringobject.c#L947
	quote := byte('\'')
	if xbytes.ContainsByte(b, '\'') && !xbytes.ContainsByte(b, '"') {
		quote = '"'
	}

	buf = append(buf, quote)

	// i is advanced inside the loop body: by 1 on the ASCII fast path and
	// by the decoded sequence length on the UTF-8 slow path.
	for i := 0; i < len(b); {
		c := b[i]
		switch {
		// fast path - ASCII only - trying to avoid UTF-8 decoding
		case c < utf8.RuneSelf:
			switch {
			case c == '\\' || c == quote:
				buf = append(buf, '\\', c)

			case ' ' <= c && c <= '\x7e':
				// printable ASCII
				buf = append(buf, c)

			// below: non-printable ASCII

			// NOTE python converts to \<letter> only \t \n \r (not e.g. \v)
			// https://github.com/python/cpython/blob/v2.7.13-116-g1aa1803b3d/Objects/stringobject.c#L963
			case c == '\t':
				buf = append(buf, `\t`...)
			case c == '\n':
				buf = append(buf, `\n`...)
			case c == '\r':
				buf = append(buf, `\r`...)

			default:
				// NOTE c < ' ' or c == '\x7f' (the only non-printable ASCII character > space) here
				// we already converted to \<letter> what python represents as such above
				// everything else goes in numeric byte escapes
				// (hexdigits is defined elsewhere in this package)
				buf = append(buf, '\\', 'x', hexdigits[c>>4], hexdigits[c&0xf])
			}

			i++

		// slow path - full UTF-8 decoding
		default:
			r, size := utf8.DecodeRune(b[i:])
			isize := i + size

			switch {
			case r == utf8.RuneError && size == 1:
				// decode error - just emit raw byte as escaped
				//
				// The size check matters: a correctly encoded U+FFFD also
				// decodes to utf8.RuneError, but with size 3. Treating it
				// as a decode error would emit only its first byte and
				// silently drop the other two. With the check, such input
				// goes to the branches below, which cover all size bytes.
				buf = append(buf, '\\', 'x', hexdigits[c>>4], hexdigits[c&0xf])

			case strconv.IsPrint(r):
				// printable utf-8 characters go as is
				buf = append(buf, b[i:isize]...)

			default:
				// everything else goes in numeric byte escapes
				for j := i; j < isize; j++ {
					buf = append(buf, '\\', 'x', hexdigits[b[j]>>4], hexdigits[b[j]&0xf])
				}
			}

			i = isize
		}
	}

	buf = append(buf, quote)
	return buf
}
// Qpy appends to the buffer string s quoted the way Python would quote it.
//
// The receiver is returned so that calls can be chained.
func (b *Buffer) Qpy(s string) *Buffer {
	quoted := AppendQuotePy(*b, s)
	*b = quoted
	return b
}
// Qpyb appends to the buffer bytes x quoted the way Python would quote them.
//
// The receiver is returned so that calls can be chained.
func (b *Buffer) Qpyb(x []byte) *Buffer {
	quoted := AppendQuotePyBytes(*b, x)
	*b = quoted
	return b
}
// Copyright (C) 2017 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Open Source Initiative approved licenses and Convey
// the resulting work. Corresponding source of such a combination shall include
// the source code for all other software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
package xfmt
import (
"testing"
)
// byterange returns a []byte holding every byte value in the half-open
// interval [start, stop), in increasing order.
//
// The result is empty (but non-nil) when start >= stop.
func byterange(start, stop byte) []byte {
	out := make([]byte, 0, stop-start)
	for c := start; c < stop; c++ {
		out = append(out, c)
	}
	return out
}
// pyQuoteTestv is the table of test cases shared by TestPyQuote and
// BenchmarkPyQuote: in is the input string and quoted is the output
// AppendQuotePy is expected to produce for it.
var pyQuoteTestv = []struct {in, quoted string} {
// empty
{``, `''`},
// special characters: all bytes below space; only \t \n \r get letter escapes
{string(byterange(0, 32)), `'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'`},
// " vs ': " is used for quoting only when input has ' and has no "
{`hello world`, `'hello world'`},
{`hello ' world`, `"hello ' world"`},
{`hello ' " world`, `'hello \' " world'`},
// \ is escaped as \\
{`hello \ world`, `'hello \\ world'`},
// utf-8
// XXX python escapes non-ascii, but since FileStorage cannot
// commit such strings we take the freedom and output them as
// readable.
//{`привет мир`, `'\xd0\xbf\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82 \xd0\xbc\xd0\xb8\xd1\x80'`},
{`привет мир`, `'привет мир'`},
// invalid utf-8: the offending byte is escaped, following bytes are processed normally
{"\xd0a", `'\xd0a'`},
// non-printable utf-8: every byte of the encoded rune is escaped
{"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087", `'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87'`},
}
// TestPyQuote verifies that AppendQuotePy produces the expected quoting for
// every entry of pyQuoteTestv.
func TestPyQuote(t *testing.T) {
	// one scratch buffer is reused, truncated, for all test cases
	scratch := make([]byte, 0)
	for _, tc := range pyQuoteTestv {
		scratch = AppendQuotePy(scratch[:0], tc.in)
		if have := string(scratch); have != tc.quoted {
			t.Errorf("pyQuote(%q) ->\nhave: %s\nwant: %s", tc.in, have, tc.quoted)
		}
	}
}
// BenchmarkPyQuote measures AppendQuotePy by quoting every input of
// pyQuoteTestv once per benchmark iteration.
func BenchmarkPyQuote(b *testing.B) {
	// one scratch buffer is reused, truncated, across all iterations
	scratch := make([]byte, 0)
	for n := 0; n < b.N; n++ {
		for _, tc := range pyQuoteTestv {
			scratch = AppendQuotePy(scratch[:0], tc.in)
		}
	}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment