Add support for Python bytes

In Python, bytes is an immutable, read-only array of bytes. It is also hashable, and so it differs from Go's []byte in that it can be used as a dict key. Thus the closest approximation for Python bytes in Go is some type derived from Go's string - it will be different from string and at the same time will inherit from string its immutability property and the ability to be used as a map key. So

- add ogórek.Bytes type to represent Python bytes
- add support to decode BINBYTES* pickle opcodes (these are protocol 3 opcodes)
- add support to encode ogórek.Bytes via those BINBYTES* opcodes
- for protocols <= 2, where there are no opcodes to directly represent bytes, adopt the same approach as Python - by pickling bytes as

      _codecs.encode(byt.decode('latin1'), 'latin1')

  this way unpickling it on Python3 will give bytes, while unpickling it on Python2 will give str:

      In [1]: sys.version
      Out[1]: '3.6.6 (default, Jun 27 2018, 14:44:17) \n[GCC 8.1.0]'

      In [2]: byt = b'\x01\x02\x03'

      In [3]: _codecs.encode(byt.decode('latin1'), 'latin1')
      Out[3]: b'\x01\x02\x03'

      ---

      In [1]: sys.version
      Out[1]: '2.7.15+ (default, Aug 31 2018, 11:56:52) \n[GCC 8.2.0]'

      In [2]: byt = b'\x01\x02\x03'

      In [3]: _codecs.encode(byt.decode('latin1'), 'latin1')
      Out[3]: '\x01\x02\x03'

- correspondingly teach decoder to recognize particular calls to _codecs.encode as being representation for bytes and decode it appropriately.
- since we now have to emit byt.decode('latin1') as UNICODE - add, so far internal, `type unicode(string)` that instructs ogórek encoder to always emit the string with UNICODE opcodes (regular string is encoded to unicode pickle object only for protocol >= 3).
- For []byte encoding preserve the current status - even though dispatching in Encoder.encode changes, the end result is the same - []byte was and stays currently encoded as just regular string.
  This was added in 555efd8f "first draft of dumb pickle encoder", and even though that might not be a good choice, changing it is a topic for another patch.

Add support for Python bytes
In Python, bytes is an immutable, read-only array of bytes. It is also hashable, and so it differs from Go's []byte in that it can be used as a dict key. Thus the closest approximation for Python bytes in Go is some type derived from Go's string - it will be different from string and at the same time will inherit from string its immutability property and the ability to be used as a map key. So

- add ogórek.Bytes type to represent Python bytes
- add support to decode BINBYTES* pickle opcodes (these are protocol 3 opcodes)
- add support to encode ogórek.Bytes via those BINBYTES* opcodes
- for protocols <= 2, where there are no opcodes to directly represent bytes, adopt the same approach as Python - by pickling bytes as

      _codecs.encode(byt.decode('latin1'), 'latin1')

  this way unpickling it on Python3 will give bytes, while unpickling it on Python2 will give str:

      In [1]: sys.version
      Out[1]: '3.6.6 (default, Jun 27 2018, 14:44:17) \n[GCC 8.1.0]'

      In [2]: byt = b'\x01\x02\x03'

      In [3]: _codecs.encode(byt.decode('latin1'), 'latin1')
      Out[3]: b'\x01\x02\x03'

      ---

      In [1]: sys.version
      Out[1]: '2.7.15+ (default, Aug 31 2018, 11:56:52) \n[GCC 8.2.0]'

      In [2]: byt = b'\x01\x02\x03'

      In [3]: _codecs.encode(byt.decode('latin1'), 'latin1')
      Out[3]: '\x01\x02\x03'

- correspondingly teach decoder to recognize particular calls to _codecs.encode as being representation for bytes and decode it appropriately.
- since we now have to emit byt.decode('latin1') as UNICODE - add, so far internal, `type unicode(string)` that instructs ogórek encoder to always emit the string with UNICODE opcodes (regular string is encoded to unicode pickle object only for protocol >= 3).
- For []byte encoding preserve the current status - even though dispatching in Encoder.encode changes, the end result is the same - []byte was and stays currently encoded as just regular string.
  This was added in 555efd8f "first draft of dumb pickle encoder", and even though that might not be a good choice, changing it is a topic for another patch.
2fe0e876 · Kirill Smelkov · 619b90f8 · 2fe0e876 · 2fe0e876 · 2fe0e876
Commit 2fe0e876 authored Sep 27, 2018 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 167 additions and 8 deletions

encode.go encode.go +48 -4

ogorek.go ogorek.go +105 -4

ogorek_test.go ogorek_test.go +14 -0

No files found.
--- a/encode.go
+++ b/encode.go
@@ -13,6 +13,10 @@ import (

 const highestProtocol = 4 // highest protocol version we support generating

+// unicode is a string that always encodes as a unicode pickle object.
+// (a regular string encodes to a unicode pickle object only for protocol >= 3)
+type unicode string
+
 type TypeError struct {
 	typ string
 }
@@ -111,10 +115,17 @@ func (e *Encoder) encode(rv reflect.Value) error {
 	case reflect.Uint8, reflect.Uint64, reflect.Uint, reflect.Uint32, reflect.Uint16:
 		return e.encodeInt(reflect.Uint, int64(rv.Uint()))
 	case reflect.String:
-		return e.encodeString(rv.String())
+		switch rv.Interface().(type) {
+		case unicode:
+			return e.encodeUnicode(rv.String())
+		case Bytes:
+			return e.encodeBytes(Bytes(rv.String()))
+		default:
+			return e.encodeString(rv.String())
+		}
 	case reflect.Array, reflect.Slice:
 		if rv.Type().Elem().Kind() == reflect.Uint8 {
-			return e.encodeBytes(rv.Bytes())
+			return e.encodeString(string(rv.Bytes()))
 		} else if _, ok := rv.Interface().(Tuple); ok {
 			return e.encodeTuple(rv.Interface().(Tuple))
 		} else {
@@ -258,8 +269,41 @@ func (e *Encoder) encodeBool(b bool) error {
 	return err
 }

-func (e *Encoder) encodeBytes(byt []byte) error {
-	return e.encodeString(string(byt))
+func (e *Encoder) encodeBytes(byt Bytes) error {
+	l := len(byt)
+
+	// protocol >= 3  ->  BINBYTES*
+	if e.config.Protocol >= 3 {
+		if l < 256 {
+			err := e.emit(opShortBinbytes, byte(l))
+			if err != nil {
+				return err
+			}
+		} else {
+			var b = [1+4]byte{opBinbytes}
+
+			binary.LittleEndian.PutUint32(b[1:], uint32(l))
+			err := e.emitb(b[:])
+			if err != nil {
+				return err
+			}
+		}
+
+		return e.emits(string(byt))
+	}
+
+	// protocol 0..2 -> emit as `_codecs.encode(byt.decode('latin1'), 'latin1')`
+	// (as python3 does)
+	rlatin1 := make([]rune, len(byt))
+	for i := 0; i < l; i++ {
+		rlatin1[i] = rune(byt[i]) // decode as latin1
+	}
+	ulatin1 := unicode(rlatin1) // -> UTF8
+
+	return e.encodeCall(&Call{
+		Callable: Class{Module: "_codecs", Name: "encode"},
+		Args:     Tuple{ulatin1, "latin1"},
+	})
 }

 func (e *Encoder) encodeString(s string) error {

--- a/ogorek.go
+++ b/ogorek.go
@@ -126,6 +126,9 @@ type None struct{}
 // Tuple is a representation of Python's tuple.
 type Tuple []interface{}

+// Bytes represents Python's bytes.
+type Bytes string
+
 // Decoder is a decoder for pickle streams.
 type Decoder struct {
 	r      *bufio.Reader
@@ -285,6 +288,10 @@ loop:
 			err = d.loadSetItems()
 		case opBinfloat:
 			err = d.binFloat()
+		case opBinbytes:
+			err = d.loadBinBytes()
+		case opShortBinbytes:
+			err = d.loadShortBinBytes()
 		case opFrame:
 			err = d.loadFrame()
 		case opShortBinUnicode:
@@ -632,8 +639,41 @@ func (d *Decoder) reduce() error {
 	if !ok {
 		return fmt.Errorf("pickle: reduce: invalid class: %T", xclass)
 	}
-	d.push(Call{Callable: class, Args: args})
-	return nil
+
+	// try to handle the call.
+	// If the call is unknown - represent it symbolically with Call{...} .
+	err := d.handleCall(class, args)
+	if err == errCallNotHandled {
+		d.push(Call{Callable: class, Args: args})
+		err = nil
+	}
+
+	return err
+}
+
+// errCallNotHandled is an internal error via which handleCall signals that it did
+// not handle the call.
+var errCallNotHandled = errors.New("handleCall: call not handled")
+
+// handleCall translates known python calls to appropriate Go objects.
+//
+// for example _codecs.encode(..., 'latin1') is handled as conversion to Bytes.
+func (d *Decoder) handleCall(class Class, argv Tuple) error {
+	// for protocols <= 2 Python3 encodes bytes as `_codecs.encode(byt.decode('latin1'), 'latin1')`
+	if class.Module == "_codecs" && class.Name == "encode" &&
+		len(argv) == 2 && argv[1] == "latin1" {
+
+		// bytes as latin1-decoded unicode
+		data, err := decodeLatin1Bytes(argv[0])
+		if err != nil {
+			return fmt.Errorf("_codecs.encode: %s", err)
+		}
+
+		d.push(Bytes(data))
+		return nil
+	}
+
+	return errCallNotHandled
 }

 // Push a string
@@ -670,7 +710,9 @@ func (d *Decoder) loadString() error {
 	return nil
 }

-func (d *Decoder) loadBinString() error {
+// bufLoadBinBytes decodes `len(LE32) [len]data` into d.buf .
+// it serves loadBin{String,Bytes}.
+func (d *Decoder) bufLoadBinBytes() error {
 	var b [4]byte
 	_, err := io.ReadFull(d.r, b[:])
 	if err != nil {
@@ -689,11 +731,30 @@ func (d *Decoder) loadBinString() error {
 	if err != nil {
 		return err
 	}
+	return nil
+}
+
+func (d *Decoder) loadBinString() error {
+	err := d.bufLoadBinBytes()
+	if err != nil {
+		return err
+	}
 	d.push(d.buf.String())
 	return nil
 }

-func (d *Decoder) loadShortBinString() error {
+func (d *Decoder) loadBinBytes() error {
+	err := d.bufLoadBinBytes()
+	if err != nil {
+		return err
+	}
+	d.push(Bytes(d.buf.Bytes()))
+	return nil
+}
+
+// bufLoadShortBinBytes decodes `len(U8) [len]data` into d.buf .
+// it serves loadShortBin{String,Bytes} .
+func (d *Decoder) bufLoadShortBinBytes() error {
 	b, err := d.r.ReadByte()
 	if err != nil {
 		return err
@@ -705,10 +766,27 @@ func (d *Decoder) loadShortBinString() error {
 	if err != nil {
 		return err
 	}
+	return nil
+}
+
+func (d *Decoder) loadShortBinString() error {
+	err := d.bufLoadShortBinBytes()
+	if err != nil {
+		return err
+	}
 	d.push(d.buf.String())
 	return nil
 }

+func (d *Decoder) loadShortBinBytes() error {
+	err := d.bufLoadShortBinBytes()
+	if err != nil {
+		return err
+	}
+	d.push(Bytes(d.buf.Bytes()))
+	return nil
+}
+
 func (d *Decoder) loadUnicode() error {
 	line, err := d.readLine()
 	if err != nil {
@@ -1200,3 +1278,26 @@ func decodeLong(data string) (*big.Int, error) {
 	}
 	return decoded, nil
 }
+
+// decodeLatin1Bytes tries to decode bytes from arg assuming it is latin1-encoded unicode.
+//
+// Python uses such representation of bytes for protocols <= 2 - where there are
+// no BINBYTES* opcodes.
+func decodeLatin1Bytes(arg interface{}) ([]byte, error) {
+	// bytes as latin1-decoded unicode
+	ulatin1, ok := arg.(string)
+	if !ok {
+		return nil, fmt.Errorf("latin1: arg must be string, not %T", arg)
+	}
+
+	data := make([]byte, 0, len(ulatin1))
+	for _, r := range ulatin1 {
+		if r >= 0x100 {
+			return nil, fmt.Errorf("latin1: cannot encode %q", r)
+		}
+
+		data = append(data, byte(r))
+	}
+
+	return data, nil
+}
--- a/ogorek_test.go
+++ b/ogorek_test.go
@@ -274,6 +274,20 @@ var tests = []TestEntry{
 			"\n.")),


+	X(`bytes(b"hello\nмир\x01")`, Bytes("hello\nмир\x01"),
+		// GLOBAL + MARK + UNICODE + STRING + TUPLE + REDUCE
+		P0("c_codecs\nencode\n(Vhello\\u000aмир\x01\nS\"latin1\"\ntR."),
+
+		// GLOBAL + MARK + BINUNICODE + SHORT_BINSTRING + TUPLE + REDUCE
+		P1("c_codecs\nencode\n(X\x13\x00\x00\x00hello\n\xc3\x90\xc2\xbc\xc3\x90\xc2\xb8\xc3\x91\xc2\x80\x01U\x06latin1tR."),
+
+		// GLOBAL + BINUNICODE + SHORT_BINSTRING + TUPLE2 + REDUCE
+		P2("c_codecs\nencode\nX\x13\x00\x00\x00hello\n\xc3\x90\xc2\xbc\xc3\x90\xc2\xb8\xc3\x91\xc2\x80\x01U\x06latin1\x86R."),
+
+		P3_("C\x0dhello\nмир\x01."),            // SHORT_BINBYTES
+		I("B\x0d\x00\x00\x00hello\nмир\x01.")), // BINBYTES
+
+
 	X("dict({})", make(map[interface{}]interface{}),
 		P0("(d."), // MARK + DICT
 		P1_("}."), // EMPTY_DICT