Commit 9daf6a2a authored by Kirill Smelkov, committed by Kamil Kisiel

Fix UNICODE decoding

UNICODE is a text-based opcode: it is used at protocol 0 and is followed by
a 'raw-unicode-escape' encoded argument that runs until EOL.
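
For illustration (not part of this commit), the sketch below shows what such a
pickle looks like on the wire and how it is expected to decode via the public
ogórek API; the exact byte stream is an assumption modeled on Python 2's
pickle.dumps(u'мир', 0):

    package main

    import (
    	"bytes"
    	"fmt"

    	ogórek "github.com/kisielk/og-rek"
    )

    func main() {
    	// Protocol 0 UNICODE: 'V', then the raw-unicode-escape encoded text,
    	// then '\n' (EOL), and finally '.' (STOP).
    	// u'мир' encodes to \u043c\u0438\u0440 under raw-unicode-escape.
    	data := []byte("V\\u043c\\u0438\\u0440\n.")

    	obj, err := ogórek.NewDecoder(bytes.NewReader(data)).Decode()
    	fmt.Printf("%q %v\n", obj, err) // expected: "мир" <nil>
    }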

- For decoding we must explicitly implement Python's 'raw-unicode-escape'
  codec, which Python's pickle uses for the UNICODE argument (see the sketch
  below).
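
A sketch of the codec semantics (again hypothetical, input bytes modeled on
the tests added below): only \uXXXX and \UXXXXXXXX escapes are interpreted,
other backslash sequences such as \n or \x.. stay verbatim, bare bytes are
taken as Latin-1 ordinals, and a raw \r before the terminating \n must not be
lost:

    package main

    import (
    	"bytes"
    	"fmt"

    	ogórek "github.com/kisielk/og-rek"
    )

    func main() {
    	// UNICODE payload mixing a real \u escape, a verbatim \n sequence,
    	// a bare high byte and a raw CR right before the terminating LF.
    	data := []byte("V\\u043cir\\n\x80\r\n.")

    	obj, err := ogórek.NewDecoder(bytes.NewReader(data)).Decode()
    	// expected: "мir\\n\u0080\r" <nil> - only \u043c gets decoded
    	fmt.Printf("%q %v\n", obj, err)
    }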

Updates and hopefully fixes: https://github.com/kisielk/og-rek/issues/48
parent f62fe97f
@@ -711,36 +711,16 @@ func (d *Decoder) loadShortBinString() error {
 func (d *Decoder) loadUnicode() error {
 	line, err := d.readLine()
 	if err != nil {
 		return err
 	}
-	sline := string(line)
-
-	d.buf.Reset()
-	d.buf.Grow(len(line)) // approximation
-
-	for len(sline) > 0 {
-		var r rune
-		var err error
-		for len(sline) > 0 && sline[0] == '\'' {
-			d.buf.WriteByte(sline[0])
-			sline = sline[1:]
-		}
-		if len(sline) == 0 {
-			break
-		}
-		r, _, sline, err = unquoteChar(sline, '\'')
-		if err != nil {
-			return err
-		}
-		d.buf.WriteRune(r)
-	}
-	if len(sline) > 0 {
-		return fmt.Errorf("characters remaining after loadUnicode operation: %s", sline)
-	}
-	d.push(d.buf.String())
+
+	text, err := pydecodeRawUnicodeEscape(string(line))
+	if err != nil {
+		return err
+	}
+
+	d.push(text)
 	return nil
 }
...
@@ -235,10 +235,6 @@ var tests = []TestEntry{
 		I("S'abc'\np0\n."),
 		I("S'abc'\n.")),
 
-	// TODO: reenable after we fix string escape decoding (https://github.com/kisielk/og-rek/issues/48)
-	// X(`unicode('abc\r')`, "abc\r",
-	//   I("Vabc\r\n.")),
-
 	X("unicode('日本語')", "日本語",
 		P0("S\"日本語\"\n."),  // STRING
 		P12("U\x09日本語."),   // SHORT_BINSTRING
@@ -261,8 +257,8 @@ var tests = []TestEntry{
 	// TODO BINUNICODE8
 
-	// str with many control characters at P0
-	// this exercises escape-based STRING coding
+	// str/unicode with many control characters at P0
+	// this exercises escape-based STRING/UNICODE coding
 	X(`str('\x80ми\nр\r\u2028\\u1234\\U00004321') # text escape`, "\x80ми\nр\r\u2028\\u1234\\U00004321",
 		P0("S\"\\x80ми\\\\r\\xe2\\x80\\xa8\\\\u1234\\\\U00004321\"\n."),
@@ -272,6 +268,12 @@ var tests = []TestEntry{
 	X(`str("hel\"lo")`, "hel\"lo", I("S\"hel\"lo\"\n.")), // non-escaped " inside "-quotes
 
+	X(`unicode(r'мир\n\r\x00'+'\r') # text escape`, `мир\n\r\x00`+"\r",
+		I("V\\u043c\\u0438\\u0440\\n\\r\\x00" + // only \u and \U are decoded - not \n \r ...
+			"\r" + // raw \r - ok, not lost
+			"\n.")),
+
 	X("dict({})", make(map[interface{}]interface{}),
 		P0("(d."), // MARK + DICT
 		P1_("}."), // EMPTY_DICT
...
@@ -139,3 +139,56 @@ loop:
 	return string(out), nil
 }
+
+// pydecodeRawUnicodeEscape decodes input according to "raw-unicode-escape" Python codec.
+//
+// The codec is essentially defined here:
+// https://github.com/python/cpython/blob/v2.7.15-198-g69d0bc1430d/Objects/unicodeobject.c#L3204
+func pydecodeRawUnicodeEscape(s string) (string, error) {
+	out := make([]rune, 0, len(s))
+
+loop:
+	for nescape := 0; len(s) > 0; {
+		c := s[0]
+
+		// non-escape bytes are interpreted as unicode ordinals
+		if c != '\\' {
+			out = append(out, rune(c))
+			s = s[1:]
+			nescape = 0
+			continue
+		}
+		nescape++
+
+		// \u are only interpreted if N(leading \) is odd.
+		if nescape%2 == 0 || len(s) < 2 {
+			out = append(out, '\\')
+			s = s[1:]
+			continue
+		}
+
+		switch c = s[1]; c {
+		// \c (anything - including \\ - not \u or \U)
+		default:
+			out = append(out, '\\')
+			s = s[1:] // not skipping c
+			continue loop
+
+		// escapes we handle (NOTE no \n \r \x etc here)
+		case 'u', 'U': // unicode escapes
+		}
+
+		// here we have \u or \U escapes. Process it via UnquoteChar,
+		// similarly to string-escape.
+		r, _, tail, err := strconv.UnquoteChar(s, 0)
+		if err != nil {
+			return "", err
+		}
+
+		out = append(out, r)
+		s = tail
+		nescape = 0
+	}
+
+	return string(out), nil // encoded to UTF-8
+}
@@ -39,3 +39,25 @@ func TestPyDecodeStringEscape(t *testing.T) {
 		{`\u1234\U00001234\c`, `\u1234\U00001234\c`},
 	})
 }
+
+func TestPyDecodeRawUnicodeEscape(t *testing.T) {
+	testCodec(t, pydecodeRawUnicodeEscape, []CodecTestCase{
+		{`hello`, "hello"},
+		{"\x00\x01\x80\xfe\xff", "\u0000\u0001\u0080\u00fe\u00ff"},
+		{`\`, `\`},
+		{`\\`, `\\`},
+		{`\\\`, `\\\`},
+		{`\\\\`, `\\\\`},
+		{`\u1234\U00004321`, "\u1234\U00004321"},
+		{`\\u1234\\U00004321`, `\\u1234\\U00004321`},
+		{`\\\u1234\\\U00004321`, "\\\\\u1234\\\\\U00004321"},
+		{`\\\\u1234\\\\U00004321`, `\\\\u1234\\\\U00004321`},
+		{`\\\\\u1234\\\\\U00004321`, "\\\\\\\\\u1234\\\\\\\\\U00004321"},
+
+		// vvv stays as is
+		{"hello\\\nworld", "hello\\\nworld"},
+		{`\'\"`, `\'\"`},
+		{`\b\f\t\n\r\v\a`, `\b\f\t\n\r\v\a`},
+		{`\000\001\376\377`, `\000\001\376\377`},
+		{`\x00\x01\x7f\x80\xfe\xff`, `\x00\x01\x7f\x80\xfe\xff`},
+	})
+}