Commit 9daf6a2a authored by Kirill Smelkov, committed by Kamil Kisiel

Fix UNICODE decoding

UNICODE is a text-based opcode used at protocol 0; it is followed by its
'raw-unicode-escape' encoded argument, which extends till EOL.

- for decoding we must explicitly implement the decoding side of Python's
  'raw-unicode-escape' codec, which Python's pickle uses for the UNICODE
  argument.

Updates and hopefully fixes: https://github.com/kisielk/og-rek/issues/48
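
As an illustration, a minimal end-to-end decode of such a protocol-0 UNICODE
pickle through the public ogórek API (a sketch only; the input bytes mirror
the test case added below):

package main

import (
	"bytes"
	"fmt"

	ogórek "github.com/kisielk/og-rek"
)

func main() {
	// UNICODE ('V') opcode, raw-unicode-escape argument up to '\n', then STOP ('.').
	// Only \uXXXX / \UXXXXXXXX are escapes here; \n, \r, \x.. stay literal text,
	// while the raw '\r' byte before the terminating '\n' is preserved.
	pickle := "V\\u043c\\u0438\\u0440\\n\\r\\x00\r\n."

	d := ogórek.NewDecoder(bytes.NewBufferString(pickle))
	v, err := d.Decode()
	if err != nil {
		panic(err)
	}
	fmt.Printf("%q\n", v) // "мир\\n\\r\\x00\r"
}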
parent f62fe97f
......@@ -711,36 +711,16 @@ func (d *Decoder) loadShortBinString() error {
func (d *Decoder) loadUnicode() error {
line, err := d.readLine()
if err != nil {
return err
}
sline := string(line)
d.buf.Reset()
d.buf.Grow(len(line)) // approximation
for len(sline) > 0 {
var r rune
var err error
for len(sline) > 0 && sline[0] == '\'' {
d.buf.WriteByte(sline[0])
sline = sline[1:]
}
if len(sline) == 0 {
break
}
r, _, sline, err = unquoteChar(sline, '\'')
if err != nil {
return err
}
d.buf.WriteRune(r)
}
if len(sline) > 0 {
return fmt.Errorf("characters remaining after loadUnicode operation: %s", sline)
text, err := pydecodeRawUnicodeEscape(string(line))
if err != nil {
return err
}
d.push(d.buf.String())
d.push(text)
return nil
}
......
......@@ -235,10 +235,6 @@ var tests = []TestEntry{
I("S'abc'\np0\n."),
I("S'abc'\n.")),
// TODO: reenable after we fix string escape decoding (https://github.com/kisielk/og-rek/issues/48)
// X(`unicode('abc\r')`, "abc\r",
// I("Vabc\r\n.")),
X("unicode('日本語')", "日本語",
P0("S\"日本語\"\n."), // STRING
P12("U\x09日本語."), // SHORT_BINSTRING
......@@ -261,8 +257,8 @@ var tests = []TestEntry{
// TODO BINUNICODE8
// str with many control characters at P0
// this exercises escape-based STRING coding
// str/unicode with many control characters at P0
// this exercises escape-based STRING/UNICODE coding
X(`str('\x80ми\nр\r\u2028\\u1234\\U00004321') # text escape`, "\x80ми\nр\r\u2028\\u1234\\U00004321",
P0("S\"\\x80ми\\\\r\\xe2\\x80\\xa8\\\\u1234\\\\U00004321\"\n."),
......@@ -272,6 +268,12 @@ var tests = []TestEntry{
X(`str("hel\"lo")`, "hel\"lo", I("S\"hel\"lo\"\n.")), // non-escaped " inside "-quotes
X(`unicode(r'мир\n\r\x00'+'\r') # text escape`, `мир\n\r\x00`+"\r",
I("V\\u043c\\u0438\\u0440\\n\\r\\x00" + // only \u and \U are decoded - not \n \r ...
"\r" + // raw \r - ok, not lost
"\n.")),
X("dict({})", make(map[interface{}]interface{}),
P0("(d."), // MARK + DICT
P1_("}."), // EMPTY_DICT
......
......@@ -139,3 +139,56 @@ loop:
return string(out), nil
}
// pydecodeRawUnicodeEscape decodes input according to "raw-unicode-escape" Python codec.
//
// The codec is essentially defined here:
// https://github.com/python/cpython/blob/v2.7.15-198-g69d0bc1430d/Objects/unicodeobject.c#L3204
func pydecodeRawUnicodeEscape(s string) (string, error) {
out := make([]rune, 0, len(s))
loop:
for nescape := 0; len(s) > 0; {
c := s[0]
// non-escape bytes are interpreted as unicode ordinals
if c != '\\' {
out = append(out, rune(c))
s = s[1:]
nescape = 0
continue
}
nescape++
		// \u and \U are only interpreted if the number of leading backslashes is odd.
if nescape % 2 == 0 || len(s) < 2 {
out = append(out, '\\')
s = s[1:]
continue
}
switch c = s[1]; c {
// \c (anything - including \\ - not \u or \U)
default:
out = append(out, '\\')
s = s[1:] // not skipping c
continue loop
// escapes we handle (NOTE no \n \r \x etc here)
case 'u', 'U': // unicode escapes
}
// here we have \u or \U escapes. Process it via UnquoteChar,
// similarly to string-escape.
r, _, tail, err := strconv.UnquoteChar(s, 0)
if err != nil {
return "", err
}
out = append(out, r)
s = tail
nescape = 0
}
return string(out), nil // encoded to UTF-8
}
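
To make the odd/even backslash rule above concrete, a throwaway test-style
snippet (a hypothetical file, shown only for illustration; it has to live in
the ogórek package because pydecodeRawUnicodeEscape is unexported):

package ogórek

import "testing"

func TestRawUnicodeEscapeExamples(t *testing.T) { // illustrative only, not part of the commit
	cases := map[string]string{
		`\u1234`:   "\u1234",       // odd number of leading backslashes -> escape decoded
		`\\u1234`:  `\\u1234`,      // even number -> stays literal
		`\n\x41`:   `\n\x41`,       // \n and \x are not escapes in this codec
		"\x80\xfe": "\u0080\u00fe", // raw bytes -> same-valued code points
	}
	for in, want := range cases {
		have, err := pydecodeRawUnicodeEscape(in)
		if err != nil || have != want {
			t.Errorf("%q: have %q, %v; want %q", in, have, err, want)
		}
	}
}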
......@@ -39,3 +39,25 @@ func TestPyDecodeStringEscape(t *testing.T) {
{`\u1234\U00001234\c`, `\u1234\U00001234\c`},
})
}
func TestPyDecodeRawUnicodeEscape(t *testing.T) {
testCodec(t, pydecodeRawUnicodeEscape, []CodecTestCase{
{`hello`, "hello"},
{"\x00\x01\x80\xfe\xff", "\u0000\u0001\u0080\u00fe\u00ff"},
{`\`, `\`},
{`\\`, `\\`},
{`\\\`, `\\\`},
{`\\\\`, `\\\\`},
{`\u1234\U00004321`, "\u1234\U00004321"},
{`\\u1234\\U00004321`, `\\u1234\\U00004321`},
{`\\\u1234\\\U00004321`, "\\\\\u1234\\\\\U00004321"},
{`\\\\u1234\\\\U00004321`, `\\\\u1234\\\\U00004321`},
{`\\\\\u1234\\\\\U00004321`, "\\\\\\\\\u1234\\\\\\\\\U00004321"},
// vvv stays as is
{"hello\\\nworld", "hello\\\nworld"},
{`\'\"`, `\'\"`},
{`\b\f\t\n\r\v\a`, `\b\f\t\n\r\v\a`},
{`\000\001\376\377`, `\000\001\376\377`},
{`\x00\x01\x7f\x80\xfe\xff`, `\x00\x01\x7f\x80\xfe\xff`},
})
}