Commit 8def3d02 authored by Kirill Smelkov

Preliminary support for protocol 5

Pickle protocol 5 adds out-of-band data and the BYTEARRAY8 opcode (see [1]
and [2]). We add support for BYTEARRAY8 here. Handling out-of-band data
would require more setup from users - on both the decoding and the encoding
side - and it is likely that currently no practical use case exists for
working with pickles with out-of-band data on the Go side. Therefore, for
the remaining protocol 5 opcodes (NEXT_BUFFER and READONLY_BUFFER), we
behave as if no out-of-band data was provided: decoding them returns an
error.

Hopefully fixes: https://github.com/kisielk/og-rek/issues/62

[1] https://www.python.org/dev/peps/pep-0574
[2] https://github.com/python/cpython/commit/91f4380cedba

/reviewed-by @kisielk
/reviewed-on https://github.com/kisielk/og-rek/pull/63
parent 24bb08c2
...@@ -54,7 +54,8 @@ ...@@ -54,7 +54,8 @@
// version 2 is the highest protocol version that is understood by standard // version 2 is the highest protocol version that is understood by standard
// pickle module of Python2. Protocol version 3 added ways to represent Python // pickle module of Python2. Protocol version 3 added ways to represent Python
// bytes objects from Python3(~). Protocol version 4 further enhances on // bytes objects from Python3(~). Protocol version 4 further enhances on
// version 3 and completely switches to binary-only encoding. Please see // version 3 and completely switches to binary-only encoding. Protocol
// version 5 added support for out-of-band data(%). Please see
// https://docs.python.org/3/library/pickle.html#data-stream-format for details. // https://docs.python.org/3/library/pickle.html#data-stream-format for details.
// //
// On decoding ogórek detects which protocol is being used and automatically // On decoding ogórek detects which protocol is being used and automatically
...@@ -112,4 +113,6 @@ ...@@ -112,4 +113,6 @@
// //
// (^) contrary to Python implementation, where malicious pickle can cause the // (^) contrary to Python implementation, where malicious pickle can cause the
// decoder to run arbitrary code, including e.g. os.system("rm -rf /"). // decoder to run arbitrary code, including e.g. os.system("rm -rf /").
//
// (%) ogórek currently does not support out-of-band data.
package ogórek package ogórek
...@@ -11,7 +11,7 @@ import ( ...@@ -11,7 +11,7 @@ import (
"strings" "strings"
) )
const highestProtocol = 4 // highest protocol version we support generating const highestProtocol = 5 // highest protocol version we support generating
// unicode is string that always encodes as unicode pickle object. // unicode is string that always encodes as unicode pickle object.
// (regular string encodes to unicode pickle object only for protocol >= 3) // (regular string encodes to unicode pickle object only for protocol >= 3)
...@@ -307,6 +307,18 @@ func (e *Encoder) encodeBytes(byt Bytes) error { ...@@ -307,6 +307,18 @@ func (e *Encoder) encodeBytes(byt Bytes) error {
} }
func (e *Encoder) encodeByteArray(bv []byte) error { func (e *Encoder) encodeByteArray(bv []byte) error {
// protocol >= 5 -> BYTEARRAY8
if e.config.Protocol >= 5 {
var b = [1+8]byte{opBytearray8}
binary.LittleEndian.PutUint64(b[1:], uint64(len(bv)))
err := e.emitb(b[:])
if err != nil {
return err
}
return e.emitb(bv)
}
// TODO protocol <= 2: pickle can be shorter if we emit -> bytearray(unicode, encoding) // TODO protocol <= 2: pickle can be shorter if we emit -> bytearray(unicode, encoding)
// instead of bytearray(_codecs.encode(unicode, encoding)) // instead of bytearray(_codecs.encode(unicode, encoding))
......
...@@ -96,6 +96,12 @@ const ( ...@@ -96,6 +96,12 @@ const (
opStackGlobal byte = '\x93' // same as OpGlobal but using names on the stacks opStackGlobal byte = '\x93' // same as OpGlobal but using names on the stacks
opMemoize byte = '\x94' // store top of the stack in memo opMemoize byte = '\x94' // store top of the stack in memo
opFrame byte = '\x95' // indicate the beginning of a new frame opFrame byte = '\x95' // indicate the beginning of a new frame
// Protocol 5
opBytearray8 byte = '\x96' // push a Python bytearray object (len ule64; [len]data)
opNextBuffer byte = '\x97' // push next out-of-band buffer
opReadOnlyBuffer byte = '\x98' // turn out-of-band buffer at stack top to be read-only
) )
var errNotImplemented = errors.New("unimplemented opcode") var errNotImplemented = errors.New("unimplemented opcode")
...@@ -301,11 +307,17 @@ loop: ...@@ -301,11 +307,17 @@ loop:
err = d.stackGlobal() err = d.stackGlobal()
case opMemoize: case opMemoize:
err = d.loadMemoize() err = d.loadMemoize()
case opBytearray8:
err = d.loadBytearray8()
case opNextBuffer:
err = d.loadNextBuffer()
case opReadOnlyBuffer:
err = d.readOnlyBuffer()
case opProto: case opProto:
var v byte var v byte
v, err = d.r.ReadByte() v, err = d.r.ReadByte()
if err == nil && !(0 <= v && v <= 4) { if err == nil && !(0 <= v && v <= 5) {
// We support protocol opcodes for up to protocol 4. // We support protocol opcodes for up to protocol 5.
// //
// The PROTO opcode documentation says protocol version must be in [2, 256). // The PROTO opcode documentation says protocol version must be in [2, 256).
// However CPython also loads PROTO with version 0 and 1 without error. // However CPython also loads PROTO with version 0 and 1 without error.
...@@ -740,24 +752,44 @@ func (d *Decoder) loadString() error { ...@@ -740,24 +752,44 @@ func (d *Decoder) loadString() error {
return nil return nil
} }
// bufLoadBinBytes decodes `len(LE32) [len]data` into d.buf . // bufLoadBinData4 decodes `len(LE32) [len]data` into d.buf .
// it serves loadBin{String,Bytes}. // it serves loadBin{String,Bytes}.
func (d *Decoder) bufLoadBinBytes() error { func (d *Decoder) bufLoadBinData4() error {
var b [4]byte var b [4]byte
_, err := io.ReadFull(d.r, b[:]) _, err := io.ReadFull(d.r, b[:])
if err != nil { if err != nil {
return err return err
} }
v := binary.LittleEndian.Uint32(b[:]) v := binary.LittleEndian.Uint32(b[:])
return d.bufLoadBytesData(uint64(v))
}
// bufLoadBinData8 decodes `len(LE64) [len]data` into d.buf .
//
// It reads an 8-byte little-endian length from d.r and delegates fetching
// of the payload itself to bufLoadBytesData.
//
// it serves loadBytearray8 (and TODO loadBinBytes8, loadBinUnicode8)
func (d *Decoder) bufLoadBinData8() error {
	var hdr [8]byte
	if _, err := io.ReadFull(d.r, hdr[:]); err != nil {
		return err
	}
	return d.bufLoadBytesData(binary.LittleEndian.Uint64(hdr[:]))
}
// bufLoadBytesData fetches [len]data into d.buf.
// it serves bufLoadBinData{4,8}.
func (d *Decoder) bufLoadBytesData(l uint64) error {
d.buf.Reset() d.buf.Reset()
// don't allow malicious `BINSTRING <bigsize> nodata` to make us out of memory // don't allow malicious `BINSTRING <bigsize> nodata` to make us out of memory
prealloc := int(v) prealloc := int(l)
if maxgrow := 0x10000; prealloc > maxgrow { if maxgrow := 0x10000; prealloc > maxgrow {
prealloc = maxgrow prealloc = maxgrow
} }
d.buf.Grow(prealloc) d.buf.Grow(prealloc)
_, err = io.CopyN(&d.buf, d.r, int64(v)) if l > math.MaxInt64 {
return fmt.Errorf("size([]data) > maxint64")
}
_, err := io.CopyN(&d.buf, d.r, int64(l))
if err != nil { if err != nil {
return err return err
} }
...@@ -765,7 +797,7 @@ func (d *Decoder) bufLoadBinBytes() error { ...@@ -765,7 +797,7 @@ func (d *Decoder) bufLoadBinBytes() error {
} }
func (d *Decoder) loadBinString() error { func (d *Decoder) loadBinString() error {
err := d.bufLoadBinBytes() err := d.bufLoadBinData4()
if err != nil { if err != nil {
return err return err
} }
...@@ -774,7 +806,7 @@ func (d *Decoder) loadBinString() error { ...@@ -774,7 +806,7 @@ func (d *Decoder) loadBinString() error {
} }
func (d *Decoder) loadBinBytes() error { func (d *Decoder) loadBinBytes() error {
err := d.bufLoadBinBytes() err := d.bufLoadBinData4()
if err != nil { if err != nil {
return err return err
} }
...@@ -1234,6 +1266,26 @@ func (d *Decoder) loadMemoize() error { ...@@ -1234,6 +1266,26 @@ func (d *Decoder) loadMemoize() error {
return d.memoTop(strconv.Itoa(len(d.memo))) return d.memoTop(strconv.Itoa(len(d.memo)))
} }
// loadBytearray8 handles the BYTEARRAY8 opcode.
//
// The payload (`len(LE64) [len]data`) is read into d.buf via
// bufLoadBinData8 and the resulting []byte is pushed with d.push.
func (d *Decoder) loadBytearray8() error {
	if err := d.bufLoadBinData8(); err != nil {
		return err
	}

	data := d.buf.Bytes()
	d.push(data)

	// data still points into d.buf's storage; give d.buf a brand-new
	// buffer so that later reuse of d.buf cannot overwrite what was pushed.
	d.buf = bytes.Buffer{}
	return nil
}
// loadNextBuffer handles the NEXT_BUFFER opcode.
//
// Out-of-band data is not supported, so the opcode always fails with an
// error saying that no out-of-band data is available.
func (d *Decoder) loadNextBuffer() error {
	// TODO consider adding support for out-of-band data in the future
	return errors.New("next_buffer: no out-of-band data")
}
// readOnlyBuffer handles the READONLY_BUFFER opcode.
//
// Out-of-band data is not supported, so the opcode always fails with an
// error saying that the stack top is not a buffer.
func (d *Decoder) readOnlyBuffer() error {
	// TODO consider adding support for out-of-band data in the future
	return errors.New("read_only_buffer: stack top is not buffer")
}
// unquoteChar is like strconv.UnquoteChar, but returns io.ErrUnexpectedEOF // unquoteChar is like strconv.UnquoteChar, but returns io.ErrUnexpectedEOF
// instead of strconv.ErrSyntax, when input is prematurely terminted. // instead of strconv.ErrSyntax, when input is prematurely terminted.
// //
......
...@@ -135,16 +135,18 @@ var ( ...@@ -135,16 +135,18 @@ var (
P1 = PP(1) P1 = PP(1)
P2 = PP(2) P2 = PP(2)
P3 = PP(3) P3 = PP(3)
P4 = PP(4)
P01 = PP(0,1) P01 = PP(0,1)
P0123 = PP(0,1,2,3) P0123 = PP(0,1,2,3)
P0_ = PP(0,1,2,3,4) P0_ = PP(0,1,2,3,4,5)
P12 = PP( 1,2) P12 = PP( 1,2)
P1_ = PP( 1,2,3,4) P1_ = PP( 1,2,3,4,5)
P23 = PP( 2,3) P23 = PP( 2,3)
P2_ = PP( 2,3,4) P2_ = PP( 2,3,4,5)
P3_ = PP( 3,4) P3_ = PP( 3,4,5)
P4_ = PP( 4) P4_ = PP( 4,5)
P5_ = PP( 5)
) )
// make sure we use test pickles in fuzz corpus // make sure we use test pickles in fuzz corpus
...@@ -307,7 +309,10 @@ var tests = []TestEntry{ ...@@ -307,7 +309,10 @@ var tests = []TestEntry{
P3("\x80\xffcbuiltins\nbytearray\nC\rhello\nмир\x01\x85R."), P3("\x80\xffcbuiltins\nbytearray\nC\rhello\nмир\x01\x85R."),
// PROTO + SHORT_BINUNICODE + STACK_GLOBAL + SHORT_BINBYTES + TUPLE1 + REDUCE // PROTO + SHORT_BINUNICODE + STACK_GLOBAL + SHORT_BINBYTES + TUPLE1 + REDUCE
P4_("\x80\xff\x8c\x08builtins\x8c\tbytearray\x93C\rhello\nмир\x01\x85R."), P4("\x80\xff\x8c\x08builtins\x8c\tbytearray\x93C\rhello\nмир\x01\x85R."),
// PROTO + BYTEARRAY8
P5_("\x80\xff\x96\x0d\x00\x00\x00\x00\x00\x00\x00hello\nмир\x01."),
// bytearray(text, encoding); GLOBAL + BINUNICODE + TUPLE + REDUCE // bytearray(text, encoding); GLOBAL + BINUNICODE + TUPLE + REDUCE
I("c__builtin__\nbytearray\nq\x00(X\x13\x00\x00\x00hello\n\xc3\x90\xc2\xbc\xc3\x90\xc2\xb8\xc3\x91\xc2\x80\x01q\x01X\x07\x00\x00\x00latin-1q\x02tq\x03Rq\x04.")), I("c__builtin__\nbytearray\nq\x00(X\x13\x00\x00\x00hello\n\xc3\x90\xc2\xbc\xc3\x90\xc2\xb8\xc3\x91\xc2\x80\x01q\x01X\x07\x00\x00\x00latin-1q\x02tq\x03Rq\x04.")),
...@@ -701,6 +706,10 @@ func TestDecodeError(t *testing.T) { ...@@ -701,6 +706,10 @@ func TestDecodeError(t *testing.T) {
// \r\n should not be read as combind EOL - only \n is // \r\n should not be read as combind EOL - only \n is
"L123L\r\n.", "L123L\r\n.",
"S'abc'\r\n.", "S'abc'\r\n.",
// out-of-band data (TODO might consider to add support for it in the future)
"\x97.", // NEXT_BUFFER
"\x98.", // READONLY_BUFFER
} }
for _, tt := range testv { for _, tt := range testv {
buf := bytes.NewBufferString(tt) buf := bytes.NewBufferString(tt)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment