Commit f62fe97f authored by Kirill Smelkov's avatar Kirill Smelkov Committed by Kamil Kisiel

Fix STRING encoding/decoding

STRING is a text-based opcode used at protocol 0; its argument is a quoted,
\-escaped string that runs until EOL.

- for encoding we must not use Go's %q, since it emits \u and \U escapes
  when it sees the corresponding bytes; Python does not interpret \u or \U
  in string literals, so the data received on the Python side would be
  different.

- for decoding we must explicitly implement Python's 'string-escape'
  codec decoding which is used by Python's pickle for STRING opcode
  argument.

Updates: https://github.com/kisielk/og-rek/issues/48
parent 18004fbd
......@@ -295,7 +295,10 @@ func (e *Encoder) encodeString(s string) error {
// protocol 0: STRING
// XXX Python uses both ' and " for quoting - we quote with " only.
// XXX -> use https://godoc.org/lab.nexedi.com/kirr/go123/xfmt#AppendQuotePy ?
return e.emitf("%c%q\n", opString, s)
//
// don't use %q - that will use \u and \U in quoting which python won't
// interpret when decoding string literals.
return e.emitf("%c%s\n", opString, pyquote(s))
}
// encodeUnicode emits UTF-8 encoded string s as unicode pickle object.
......
......@@ -636,11 +636,6 @@ func (d *Decoder) reduce() error {
return nil
}
// decodeStringEscape is a placeholder for string-escape decoding: for now it
// returns the bytes of b converted to a string, without interpreting escapes.
func decodeStringEscape(b []byte) string {
	// TODO
	raw := string(b)
	return raw
}
// Push a string
func (d *Decoder) loadString() error {
line, err := d.readLine()
......@@ -666,7 +661,12 @@ func (d *Decoder) loadString() error {
return io.ErrUnexpectedEOF
}
d.push(decodeStringEscape(line[1 : len(line)-1]))
s, err := pydecodeStringEscape(string(line[1 : len(line)-1]))
if err != nil {
return err
}
d.push(s)
return nil
}
......
......@@ -261,6 +261,16 @@ var tests = []TestEntry{
// TODO BINUNICODE8
// str with many control characters at P0
// this exercises escape-based STRING coding
X(`str('\x80ми\nр\r\u2028\\u1234\\U00004321') # text escape`, "\x80ми\nр\r\u2028\\u1234\\U00004321",
P0("S\"\\x80ми\\\\r\\xe2\\x80\\xa8\\\\u1234\\\\U00004321\"\n."),
I("S\"\\x80ми\\\\r\\xe2\\x80\\xa8\\u1234\\U00004321\"\n.")), // \u and \U not decoded
X(`str("hel'lo")`, "hel'lo", I("S'hel'lo'\n.")), // non-escaped ' inside '-quotes
X(`str("hel\"lo")`, "hel\"lo", I("S\"hel\"lo\"\n.")), // non-escaped " inside "-quotes
X("dict({})", make(map[interface{}]interface{}),
P0("(d."), // MARK + DICT
......
package ogórek
import (
"fmt"
"strconv"
"unicode/utf8"
)
......@@ -10,8 +11,11 @@ import (
// We need to avoid \u and friends, since for regular strings Python translates
// \u to \\u, not an UTF-8 character.
//
// We must use Python - not Go - quoting, when emitting text strings with
// STRING opcode.
//
// Dumping strings in a way that is possible to copy/paste into Python and use
// pickletools.dis and pickle.loads there to verify a pickle is handy.
// pickletools.dis and pickle.loads there to verify a pickle is also handy.
func pyquote(s string) string {
const hexdigits = "0123456789abcdef"
out := make([]byte, 0, len(s))
......@@ -55,3 +59,83 @@ func pyquote(s string) string {
return "\"" + string(out) + "\""
}
// pydecodeStringEscape decodes input according to "string-escape" Python codec.
//
// The codec is essentially defined here:
// https://github.com/python/cpython/blob/v2.7.15-198-g69d0bc1430d/Objects/stringobject.c#L600
//
// Handled escapes:
//
//	\ LF              -> removed (line continuation)
//	\\ \' \"          -> the character after the backslash
//	\b \f \t \n \r \v \a -> corresponding control byte
//	\o \oo \ooo       -> byte with that octal value (1 to 3 digits)
//	\xhh              -> byte with that hex value (exactly 2 digits)
//
// Any other \c is left as is (backslash kept, c processed normally); in
// particular \u and \U are NOT decoded. Non-escaped bytes, including invalid
// UTF-8, are copied through unchanged.
//
// An error (strconv.ErrSyntax) is returned for a trailing lone backslash and
// for a malformed \x escape.
func pydecodeStringEscape(s string) (string, error) {
	out := make([]byte, 0, len(s))

loop:
	for {
		r, width := utf8.DecodeRuneInString(s)
		if width == 0 {
			break
		}

		// regular UTF-8 character (or an invalid byte, which decodes
		// as RuneError with width=1 and is copied verbatim)
		if r != '\\' {
			out = append(out, s[:width]...)
			s = s[width:]
			continue
		}

		// trailing backslash
		if len(s) < 2 {
			return "", strconv.ErrSyntax
		}

		switch c := s[1]; c {
		// \ LF -> just skip
		case '\n':
			s = s[2:]
			continue loop

		// \\ -> \
		//
		// \' \" (yes, both quotes are allowed to be escaped).
		//
		// also: both quotes are allowed to be _unescaped_ - e.g. Python
		// unpickles "S'hel'lo'\n." as "hel'lo".
		case '\\', '\'', '"':
			out = append(out, c)
			s = s[2:]
			continue loop

		// octals: Python accepts 1, 2 or 3 octal digits and stores the
		// result truncated to a byte. We cannot use strconv.UnquoteChar
		// here: it insists on exactly 3 digits and rejects values > 0377.
		case '0', '1', '2', '3', '4', '5', '6', '7':
			v := 0
			n := 1 // index of the next candidate digit; s[0] is '\\'
			for n < len(s) && n <= 3 && '0' <= s[n] && s[n] <= '7' {
				v = v<<3 | int(s[n]-'0')
				n++
			}
			out = append(out, byte(v))
			s = s[n:]
			continue loop

		// escapes we hand over to UnquoteChar (NOTE no \u \U for strings)
		case 'b', 'f', 't', 'n', 'r', 'v', 'a': // control characters
		case 'x': // hex

		// \c (any character without special meaning) -> \ and proceed with C
		default:
			out = append(out, '\\')
			s = s[1:] // not skipping c
			continue loop
		}

		// s starts with a good/known string escape prefix -> reuse unquoteChar.
		r, _, tail, err := strconv.UnquoteChar(s, 0)
		if err != nil {
			return "", err
		}

		// all above escapes must produce single byte. This way we can
		// append it directly, not play rune -> string UTF-8 encoding
		// games (which break on e.g. "\x80" -> "\u0080" (= "\xc2\x80")).
		c := byte(r)
		if r != rune(c) {
			panic(fmt.Sprintf("pydecode: string-escape: non-byte escaped rune %q (% x ; from %q)",
				r, r, s))
		}

		out = append(out, c)
		s = tail
	}

	return string(out), nil
}
package ogórek
import (
"testing"
)
// CodecTestCase is a single input / expected-output pair used to test a
// coder or decoder.
//
// Applying the transformation function under test to in must yield out.
type CodecTestCase struct {
	in  string
	out string
}
// testCodec runs transform on the input of every case in testv and reports,
// via t.Errorf, each case where transform fails or yields something other
// than the expected output. All cases are always visited.
func testCodec(t *testing.T, transform func(in string) (string, error), testv []CodecTestCase) {
	for i := range testv {
		tc := testv[i]
		got, err := transform(tc.in)
		switch {
		case err != nil:
			t.Errorf("%q -> error: %s", tc.in, err)
		case got != tc.out:
			t.Errorf("%q -> unexpected:\nhave: %q\nwant: %q", tc.in, got, tc.out)
		}
	}
}
func TestPyDecodeStringEscape(t *testing.T) {
testCodec(t, pydecodeStringEscape, []CodecTestCase{
{`hello`, "hello"},
{"hello\\\nworld", "helloworld"},
{`\\`, `\`},
{`\'\"`, `'"`},
{`\b\f\t\n\r\v\a`, "\b\f\t\n\r\v\a"},
{`\000\001\376\377`, "\000\001\376\377"},
{`\x00\x01\x7f\x80\xfe\xff`, "\x00\x01\x7f\x80\xfe\xff"},
// vvv stays as is
{`\u1234\U00001234\c`, `\u1234\U00001234\c`},
})
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment