Commit 2fe0e876 authored by Kirill Smelkov's avatar Kirill Smelkov

Add support for Python bytes

In Python, bytes is an immutable, read-only array of bytes. It is
also hashable and so differs from Go's []byte in that it can be
used as a dict key. Thus the closest approximation for Python bytes in Go
is some type derived from Go's string - it will be distinct from string
and at the same time will inherit from string its immutability
and its ability to be used as a map key. So

- add ogórek.Bytes type to represent Python bytes
- add support to decode BINBYTES* pickle opcodes (these are protocol 3 opcodes)
- add support to encode ogórek.Bytes via those BINBYTES* opcodes
- for protocols <= 2, where there are no opcodes to directly represent
  bytes, adopt the same approach as Python - by pickling bytes as

	_codecs.encode(byt.decode('latin1'), 'latin1')

  this way unpickling it on Python3 will give bytes, while unpickling it
  on Python2 will give str:

	In [1]: sys.version
	Out[1]: '3.6.6 (default, Jun 27 2018, 14:44:17) \n[GCC 8.1.0]'

	In [2]: byt = b'\x01\x02\x03'

	In [3]: _codecs.encode(byt.decode('latin1'), 'latin1')
	Out[3]: b'\x01\x02\x03'

  ---

	In [1]: sys.version
	Out[1]: '2.7.15+ (default, Aug 31 2018, 11:56:52) \n[GCC 8.2.0]'

	In [2]: byt = b'\x01\x02\x03'

	In [3]: _codecs.encode(byt.decode('latin1'), 'latin1')
	Out[3]: '\x01\x02\x03'

- correspondingly teach the decoder to recognize such calls to
  _codecs.encode as the representation of bytes and decode them
  appropriately.

- since we now have to emit byt.decode('latin1') as UNICODE - add, so
  far internal, `type unicode(string)` that instructs ogórek encoder to
  always emit the string with UNICODE opcodes (regular string is encoded
  to unicode pickle object only for protocol >= 3).

- For []byte encoding preserve the current status - even though
  dispatching in Encoder.encode changes, the end result is the same -
  []byte was and stays currently encoded as just regular string.

  This was added in 555efd8f "first draft of dumb pickle encoder", and
  even though that might not be a good choice, changing it is a topic for
  another patch.
parent 619b90f8
......@@ -13,6 +13,10 @@ import (
const highestProtocol = 4 // highest protocol version we support generating
// unicode is string that always encodes as unicode pickle object.
// (regular string encodes to unicode pickle object only for protocol >= 3)
type unicode string
type TypeError struct {
typ string
}
......@@ -111,10 +115,17 @@ func (e *Encoder) encode(rv reflect.Value) error {
case reflect.Uint8, reflect.Uint64, reflect.Uint, reflect.Uint32, reflect.Uint16:
return e.encodeInt(reflect.Uint, int64(rv.Uint()))
case reflect.String:
return e.encodeString(rv.String())
switch rv.Interface().(type) {
case unicode:
return e.encodeUnicode(rv.String())
case Bytes:
return e.encodeBytes(Bytes(rv.String()))
default:
return e.encodeString(rv.String())
}
case reflect.Array, reflect.Slice:
if rv.Type().Elem().Kind() == reflect.Uint8 {
return e.encodeBytes(rv.Bytes())
return e.encodeString(string(rv.Bytes()))
} else if _, ok := rv.Interface().(Tuple); ok {
return e.encodeTuple(rv.Interface().(Tuple))
} else {
......@@ -258,8 +269,41 @@ func (e *Encoder) encodeBool(b bool) error {
return err
}
func (e *Encoder) encodeBytes(byt []byte) error {
return e.encodeString(string(byt))
// encodeBytes emits byt as a Python bytes object.
//
// For protocol >= 3 the data is written with a BINBYTES* opcode:
// SHORT_BINBYTES with a one-byte length when the data is shorter than 256
// bytes, otherwise BINBYTES with the length as a little-endian uint32.
//
// For protocols 0..2 the data is emitted as the call
// `_codecs.encode(byt.decode('latin1'), 'latin1')`, the same way Python3 does.
func (e *Encoder) encodeBytes(byt Bytes) error {
	n := len(byt)

	if e.config.Protocol < 3 {
		// Decode as latin1: every byte maps to the code point with the same
		// value. Index explicitly - ranging over a string type would yield
		// UTF-8 decoded runes, not raw bytes.
		runes := make([]rune, 0, n)
		for i := 0; i < n; i++ {
			runes = append(runes, rune(byt[i]))
		}

		// []rune -> unicode conversion produces the UTF-8 encoded text;
		// the unicode type forces UNICODE opcodes on every protocol.
		return e.encodeCall(&Call{
			Callable: Class{Module: "_codecs", Name: "encode"},
			Args:     Tuple{unicode(runes), "latin1"},
		})
	}

	// protocol >= 3 -> opcode + length header, then the raw data
	var err error
	if n < 256 {
		err = e.emit(opShortBinbytes, byte(n))
	} else {
		var hdr [1 + 4]byte
		hdr[0] = opBinbytes
		binary.LittleEndian.PutUint32(hdr[1:], uint32(n))
		err = e.emitb(hdr[:])
	}
	if err != nil {
		return err
	}

	return e.emits(string(byt))
}
func (e *Encoder) encodeString(s string) error {
......
......@@ -126,6 +126,9 @@ type None struct{}
// Tuple is a representation of Python's tuple.
type Tuple []interface{}
// Bytes represents Python's bytes.
//
// It is derived from string, so, unlike []byte, a Bytes value is immutable
// and can be used as a map key, while still being a type distinct from string.
type Bytes string
// Decoder is a decoder for pickle streams.
type Decoder struct {
r *bufio.Reader
......@@ -285,6 +288,10 @@ loop:
err = d.loadSetItems()
case opBinfloat:
err = d.binFloat()
case opBinbytes:
err = d.loadBinBytes()
case opShortBinbytes:
err = d.loadShortBinBytes()
case opFrame:
err = d.loadFrame()
case opShortBinUnicode:
......@@ -632,8 +639,41 @@ func (d *Decoder) reduce() error {
if !ok {
return fmt.Errorf("pickle: reduce: invalid class: %T", xclass)
}
d.push(Call{Callable: class, Args: args})
return nil
// try to handle the call.
// If the call is unknown - represent it symbolically with Call{...} .
err := d.handleCall(class, args)
if err == errCallNotHandled {
d.push(Call{Callable: class, Args: args})
err = nil
}
return err
}
// errCallNotHandled is an internal error via which handleCall signals that it
// did not handle the call.
var errCallNotHandled = errors.New("handleCall: call not handled")
// handleCall translates known Python calls into the corresponding Go objects
// and pushes the result onto the decoder stack.
//
// Currently only `_codecs.encode(<unicode>, 'latin1')` is recognized: it is
// the form Python3 uses to pickle bytes for protocols <= 2, and it is decoded
// into Bytes.
//
// It returns errCallNotHandled when the call is not one of the known ones, and
// a descriptive error when a known call carries an invalid argument.
func (d *Decoder) handleCall(class Class, argv Tuple) error {
	isCodecsEncode := class.Module == "_codecs" && class.Name == "encode"
	if !isCodecsEncode || len(argv) != 2 || argv[1] != "latin1" {
		return errCallNotHandled
	}

	// argv[0] holds the bytes as latin1-decoded unicode
	raw, err := decodeLatin1Bytes(argv[0])
	if err != nil {
		return fmt.Errorf("_codecs.encode: %s", err)
	}

	d.push(Bytes(raw))
	return nil
}
// Push a string
......@@ -670,7 +710,9 @@ func (d *Decoder) loadString() error {
return nil
}
func (d *Decoder) loadBinString() error {
// bufLoadBinBytes decodes `len(LE32) [len]data` into d.buf .
// it serves loadBin{String,Bytes}.
func (d *Decoder) bufLoadBinBytes() error {
var b [4]byte
_, err := io.ReadFull(d.r, b[:])
if err != nil {
......@@ -689,11 +731,30 @@ func (d *Decoder) loadBinString() error {
if err != nil {
return err
}
return nil
}
// loadBinString fills d.buf via bufLoadBinBytes and pushes the buffered data
// obtained with d.buf.String().
func (d *Decoder) loadBinString() error {
	if err := d.bufLoadBinBytes(); err != nil {
		return err
	}

	d.push(d.buf.String())
	return nil
}
func (d *Decoder) loadShortBinString() error {
// loadBinBytes fills d.buf via bufLoadBinBytes and pushes the buffered data
// converted to Bytes.
func (d *Decoder) loadBinBytes() error {
	if err := d.bufLoadBinBytes(); err != nil {
		return err
	}

	d.push(Bytes(d.buf.Bytes()))
	return nil
}
// bufLoadShortBinBytes decodes `len(U8) [len]data` into d.buf .
// it serves loadShortBin{String,Bytes} .
func (d *Decoder) bufLoadShortBinBytes() error {
b, err := d.r.ReadByte()
if err != nil {
return err
......@@ -705,10 +766,27 @@ func (d *Decoder) loadShortBinString() error {
if err != nil {
return err
}
return nil
}
// loadShortBinString fills d.buf via bufLoadShortBinBytes and pushes the
// buffered data obtained with d.buf.String().
func (d *Decoder) loadShortBinString() error {
	if err := d.bufLoadShortBinBytes(); err != nil {
		return err
	}

	d.push(d.buf.String())
	return nil
}
// loadShortBinBytes fills d.buf via bufLoadShortBinBytes and pushes the
// buffered data converted to Bytes.
func (d *Decoder) loadShortBinBytes() error {
	if err := d.bufLoadShortBinBytes(); err != nil {
		return err
	}

	d.push(Bytes(d.buf.Bytes()))
	return nil
}
func (d *Decoder) loadUnicode() error {
line, err := d.readLine()
if err != nil {
......@@ -1200,3 +1278,26 @@ func decodeLong(data string) (*big.Int, error) {
}
return decoded, nil
}
// decodeLatin1Bytes recovers raw bytes from arg, which must be a string
// holding latin1-decoded unicode text: every code point below 0x100 becomes
// the byte with the same value.
//
// Python uses such a representation of bytes for protocols <= 2, where there
// are no BYTES* opcodes.
//
// An error is returned if arg is not a string, or if the text contains a code
// point that does not fit into one byte.
func decodeLatin1Bytes(arg interface{}) ([]byte, error) {
	text, isString := arg.(string)
	if !isString {
		return nil, fmt.Errorf("latin1: arg must be string, not %T", arg)
	}

	// at most one output byte per input byte, so len(text) is an upper bound
	out := make([]byte, 0, len(text))
	for _, ch := range text {
		if ch > 0xff {
			return nil, fmt.Errorf("latin1: cannot encode %q", ch)
		}
		out = append(out, byte(ch))
	}

	return out, nil
}
......@@ -274,6 +274,20 @@ var tests = []TestEntry{
"\n.")),
X(`bytes(b"hello\nмир\x01")`, Bytes("hello\nмир\x01"),
// GLOBAL + MARK + UNICODE + STRING + TUPLE + REDUCE
P0("c_codecs\nencode\n(Vhello\\u000aмир\x01\nS\"latin1\"\ntR."),
// GLOBAL + MARK + BINUNICODE + SHORT_BINSTRING + TUPLE + REDUCE
P1("c_codecs\nencode\n(X\x13\x00\x00\x00hello\n\xc3\x90\xc2\xbc\xc3\x90\xc2\xb8\xc3\x91\xc2\x80\x01U\x06latin1tR."),
// GLOBAL + BINUNICODE + SHORT_BINSTRING + TUPLE2 + REDUCE
P2("c_codecs\nencode\nX\x13\x00\x00\x00hello\n\xc3\x90\xc2\xbc\xc3\x90\xc2\xb8\xc3\x91\xc2\x80\x01U\x06latin1\x86R."),
P3_("C\x0dhello\nмир\x01."), // SHORT_BINBYTES
I("B\x0d\x00\x00\x00hello\nмир\x01.")), // BINBYTES
X("dict({})", make(map[interface{}]interface{}),
P0("(d."), // MARK + DICT
P1_("}."), // EMPTY_DICT
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment