Commit 06e06939, authored by Kirill Smelkov, committed by Kamil Kisiel

encoder: Allow to specify pickle protocol version

There are many pickle protocol versions - 0 to 4. Python2 for example
understands only versions 0 - 2. However we currently unconditionally
emit opcodes from higher versions, for example STACK_GLOBAL - from
version 4 - when encoding a Class, which leads to inability to decode
pickles generated by ogórek on Python2.

Similarly protocol 0 states that only text opcodes should be used,
however we currently unconditionally emit e.g. BININT (from protocol 1)
when encoding integers.

Always using protocol 0 opcodes would not be good either, since many
opcodes for efficiently encoding integers, booleans, unicode, etc.
are available only in protocol versions 2 and 4.

For this reason, similarly to Python[1], let's allow users to specify
desired pickle protocol when creating Encoder with config. For backward
compatibility and common sense the protocol version that plain
NewEncoder selects is 2.

This commit adds only above-described user interface and testing
infrastructure for verifying what was the result of encoding an object
at particular protocol version.

For now only a few of the pickle test vectors are correct with respect to
what the encoder should generate or currently generates. Thus in the next
patches we'll fix the encoder step by step in this respect.

[1] https://docs.python.org/3/library/pickle.html#pickle.dump
parent 93075d82
......@@ -10,6 +10,8 @@ import (
"strings"
)
// highestProtocol is the upper bound of the pickle protocol range that
// Encoder.Encode accepts; a configured protocol outside 0..highestProtocol
// makes Encode fail with an "invalid protocol" error.
const highestProtocol = 4 // highest protocol version we support generating
// TypeError is an error type that carries a type name in typ.
//
// NOTE(review): presumably returned when the encoder meets a value of a type
// it cannot encode — confirm against its Error method and the callers, which
// are outside this view.
type TypeError struct {
typ string
}
......@@ -26,6 +28,9 @@ type Encoder struct {
// EncoderConfig allows to tune Encoder.
type EncoderConfig struct {
// Protocol specifies which pickle protocol version should be used.
Protocol int
// PersistentRef, if !nil, will be used by encoder to encode objects as persistent references.
//
// Whenever the encoders sees pointer to a Go struct object, it will call
......@@ -39,7 +44,10 @@ type EncoderConfig struct {
// NewEncoder returns a new Encoder that writes to w using the default
// configuration.
//
// The default pickle protocol is 2. Use NewEncoderWithConfig to select
// another protocol version.
func NewEncoder(w io.Writer) *Encoder {
	return NewEncoderWithConfig(w, &EncoderConfig{
		// allow both Python2 and Python3 to decode what ogórek produces by default
		Protocol: 2,
	})
}
// NewEncoderWithConfig is similar to NewEncoder, but allows specifying the encoder configuration.
......@@ -49,6 +57,18 @@ func NewEncoderWithConfig(w io.Writer, config *EncoderConfig) *Encoder {
// Encode writes the pickle encoding of v to w, the encoder's writer
func (e *Encoder) Encode(v interface{}) error {
proto := e.config.Protocol
if !(0 <= proto && proto <= highestProtocol) {
return fmt.Errorf("pickle: encode: invalid protocol %d", proto)
}
// protocol >= 2 -> emit PROTO <protocol>
if proto >= 2 {
err := e.emit(opProto, byte(proto))
if err != nil {
return err
}
}
rv := reflectValueOf(v)
err := e.encode(rv)
if err != nil {
......
......@@ -34,14 +34,14 @@ func TestMarker(t *testing.T) {
}
}
// hexInput decodes hex-encoded data into string.
// hexInput decodes hex-encoded data into input TestPickle.
// it panics on decode errors.
func hexInput(hexdata string) string {
func hexInput(hexdata string) TestPickle {
data, err := hex.DecodeString(hexdata)
if err != nil {
panic(err)
}
return string(data)
return I(string(data))
}
var graphitePickle1 = hexInput("80025d71017d710228550676616c75657371035d71042847407d90000000000047407f100000000000474080e0000000000047409764000000000047409c40000000000047409d88000000000047409f74000000000047409c74000000000047409cdc00000000004740a10000000000004740a0d800000000004740938800000000004740a00e00000000004740988800000000004e4e655505737461727471054a00d87a5255047374657071064a805101005503656e6471074a00f08f5255046e616d657108552d5a5a5a5a2e55555555555555552e43434343434343432e4d4d4d4d4d4d4d4d2e5858585858585858582e545454710975612e")
......@@ -55,126 +55,188 @@ var graphiteObject3 = []interface{}{map[interface{}]interface{}{"intervals": []i
const longLine = "28,34,30,55,100,130,87,169,194,202,232,252,267,274,286,315,308,221,358,368,401,406,434,452,475,422,497,530,517,559,400,418,571,578,599,600,625,630,635,647,220,715,736,760,705,785,794,495,808,852,861,863,869,875,890,893,896,922,812,980,1074,1087,1145,1153,1163,1171,445,1195,1203,1242,1255,1274,52,1287,1319,636,1160,1339,1345,1353,1369,1391,1396,1405,1221,1410,1431,1451,1460,1470,1472,1492,1517,1528,419,1530,1532,1535,1573,1547,1574,1437,1594,1595,847,1551,983,1637,1647,1666,1672,1691,1726,1515,1731,1739,1741,1723,1776,1685,505,1624,1436,1890,728,1910,1931,1544,2013,2025,2030,2043,2069,1162,2129,2160,2199,2210,1911,2246,804,2276,1673,2299,2315,2322,2328,2355,2376,2405,1159,2425,2430,2452,1804,2442,2567,2577,1167,2611,2534,1879,2623,2682,2699,2652,2742,2754,2774,2782,2795,2431,2821,2751,2850,2090,513,2898,592,2932,2933,1555,2969,3003,3007,3010,2595,3064,3087,3105,3106,3110,151,3129,3132,304,3173,3205,3233,3245,3279,3302,3307,714,316,3331,3347,3360,3375,3380,3442,2620,3482,3493,3504,3516,3517,3518,3533,3511,2681,3530,3601,3606,3615,1210,3633,3651,3688,3690,3781,1907,3839,3840,3847,3867,3816,3899,3924,2345,3912,3966,982,4040,4056,4076,4084,4105,2649,4171,3873,1415,3567,4188,4221,4227,4231,2279,4250,4253,770,894,4343,4356,4289,4404,4438,2572,3124,4334,2114,3953,4522,4537,4561,4571,641,4629,4640,4664,4687,4702,4709,4740,4605,4746,4768,3856,3980,4814,2984,4895,4908,1249,4944,4947,4979,4988,4995,32,4066,5043,4956,5069,5072,5076,5084,5085,5137,4262,5152,479,5156,3114,1277,5183,5186,1825,5106,5216,963,5239,5252,5218,5284,1980,1972,5352,5364,5294,5379,5387,5391,5397,5419,5434,5468,5471,3350,5510,5522,5525,5538,5554,5573,5597,5610,5615,5624,842,2851,5641,5655,5656,5658,5678,5682,5696,5699,5709,5728,5753,851,5805,3528,5822,801,5855,2929,5871,5899,5918,5925,5927,5931,5935,5939,5958,778,5971,5980,5300,6009,6023,6030,6032,6016,6110,5009,6155,6197,1760,6253,6267,4886,5608,6289,6308,6311,6321,6316,6333,6244,6070,6349,6353,6186,6357,6366,6386,6387,6389,6399,6411,6421,
6432,6437,6465,6302,6493,5602,6511,6529,6536,6170,6557,6561,6577,6581,6590,5290,5649,6231,6275,6635,6651,6652,5929,6692,6693,6695,6705,6711,6723,6738,6752,6753,3629,2975,6790,5845,338,6814,6826,6478,6860,6872,6882,880,356,6897,4102,6910,6611,1030,6934,6936,6987,6984,6999,827,6902,7027,7049,7051,4628,7084,7083,7071,7102,7137,5867,7152,6048,2410,3896,7168,7177,7224,6606,7233,1793,7261,7284,7290,7292,5212,7315,6964,3238,355,1969,4256,448,7325,908,2824,2981,3193,3363,3613,5325,6388,2247,1348,72,131,5414,7285,7343,7349,7362,7372,7381,7410,7418,7443,5512,7470,7487,7497,7516,7277,2622,2863,945,4344,3774,1024,2272,7523,4476,256,5643,3164,7539,7540,7489,1932,7559,7575,7602,7605,7609,7608,7619,7204,7652,7663,6907,7672,7654,7674,7687,7718,7745,1202,4030,7797,7801,7799,2924,7871,7873,7900,7907,7911,7912,7917,7923,7935,8007,8017,7636,8084,8087,3686,8114,8153,8158,8171,8175,8182,8205,8222,8225,8229,8232,8234,8244,8247,7256,8279,6929,8285,7040,8328,707,6773,7949,8468,5759,6344,8509,1635"
// TestPickle represents a test pickle that ogórek encoder produces at particular protocols.
//
// If protov is empty there is no connection in between ogórek encoder and the
// data. However the test data can still be used to feed ogórek decoder.
//
// TestEncode checks the encoder against data/err once for every protocol
// listed in protov; TestDecode feeds data to the decoder unless err is set.
type TestPickle struct {
// protocols at which encoding the test object must give data (or fail with err)
protov []int
// expected pickle bytes; TestEncode prepends `PROTO <ver>` itself for protocols >= 2
data string // without `PROTO <ver>` prefix
// expected encode error; entries with err set are skipped by TestDecode
err error // !nil if encoding should fail
}
// TestEntry represents one decode/encode test.
type TestEntry struct {
	// name identifies the entry in subtest names.
	name string

	// object(s) and []TestPickle. All pickles must decode to objectOut.
	// Encoding objectIn at particular protocol must give particular TestPickle.
	//
	// In the usual case objectIn == objectOut and they can only differ if
	// objectIn contains a Go struct.
	objectIn  interface{}
	picklev   []TestPickle
	objectOut interface{}
}
// X, I, P0, P1, P* form a language to describe decode/encode tests:
//
// - X(name, object, ...) represents one test entry. All pickles from "..."
// (see below) must decode to object. Encoding the object at particular
// settings (e.g. at protocol=1 for P1 pickle) must give specified pickle data.
//
// - I denotes arbitrary input. Decoding it must produce the object.
//
// - P* denotes a TestPickle. Encoding the object at particular setting (e.g. P1
// represents protocol=1, P1_ represents protocol >= 1) must give the pickle data.
// Decoding the pickle data must give the object.
// X is syntatic sugar to prepare one TestEntry.
func X(name string, object interface{}, picklev ...string) TestEntry {
func X(name string, object interface{}, picklev ...TestPickle) TestEntry {
return TestEntry{name: name, objectIn: object, objectOut: object, picklev: picklev}
}
// Xloosy is syntatic sugar to prepare one TestEntry with loosy incoding.
//
// It should be used only if objectIn contains Go structs.
func Xloosy(name string, objectIn, objectOut interface{}, picklev ...string) TestEntry {
func Xloosy(name string, objectIn, objectOut interface{}, picklev ...TestPickle) TestEntry {
return TestEntry{name: name, objectIn: objectIn, objectOut: objectOut, picklev: picklev}
}
// I wraps arbitrary input into a TestPickle that has no protocols and no
// expected error attached, i.e. data that is only fed to the decoder.
func I(input string) TestPickle {
	var p TestPickle
	p.data = input
	return p
}
// PP returns a constructor of TestPickle values bound to protocols protov.
//
// The returned function accepts either a string, which becomes the expected
// pickle data, or an error, which becomes the expected encode error. Any
// other argument makes it panic.
func PP(protov ...int) func(xpickle interface{}) TestPickle {
	return func(xpickle interface{}) TestPickle {
		if data, isString := xpickle.(string); isString {
			return TestPickle{protov: protov, data: data}
		}
		if err, isError := xpickle.(error); isError {
			return TestPickle{protov: protov, err: err}
		}
		panic(fmt.Sprintf("P* accept only string|error, not %T (%v)", xpickle, xpickle))
	}
}
// Constructors of TestPickle values for fixed sets of protocols:
//
//   - PX creates TestPickle with .protov={x}.
//   - PXY... creates TestPickle with .protov={x,y,...}.
//   - PX_ creates TestPickle with .protov={x,x+1,...} up to protocol 4.
var (
	P0    = PP(0)
	P1    = PP(1)
	P2    = PP(2)
	P3    = PP(3)
	P01   = PP(0, 1)
	P0123 = PP(0, 1, 2, 3)
	P0_   = PP(0, 1, 2, 3, 4)
	P12   = PP(1, 2)
	P1_   = PP(1, 2, 3, 4)
	P23   = PP(2, 3)
	P2_   = PP(2, 3, 4)
	P3_   = PP(3, 4)
	P4_   = PP(4)
)
// tests is the main registry for decode/encode tests.
var tests = []TestEntry{
X("None", None{},
"N."),
P0_("N.")), // NONE
X("int(5)", int64(5),
"I5\n."),
I("I5\n.")), // INT
X("int(0x123)", int64(0x123),
"I291\n."),
I("I291\n.")), // INT
X("int(0x12345)", int64(0x12345),
"I74565\n."),
I("I74565\n.")), // INT
X("float", float64(1.23),
"F1.23\n."),
I("F1.23\n.")), // FLOAT
X("long", bigInt("12321231232131231231"),
"L12321231232131231231L\n."),
P0("L12321231232131231231L\n.")), // LONG
X("tuple()", Tuple{},
"(t."),
I("(t.")), // MARK + TUPLE
X("tuple((1,))", Tuple{int64(1)},
"I1\n\x85."), // TUPLE1 + INT
I("I1\n\x85.")), // TUPLE1 + INT
X("tuple((1,2))", Tuple{int64(1), int64(2)},
"(I1\nI2\ntp0\n.", // MARK + TUPLE + INT
"I1\nI2\n\x86."), // TUPLE2 + INT
I("(I1\nI2\ntp0\n."), // MARK + TUPLE + INT
I("I1\nI2\n\x86.")), // TUPLE2 + INT
X("tuple((1,2,3))", Tuple{int64(1), int64(2), int64(3)},
"I1\nI2\nI3\n\x87."), // TUPLE3 + INT
I("I1\nI2\nI3\n\x87.")), // TUPLE3 + INT
X("tuple(((1,2), (3,4)))", Tuple{Tuple{int64(1), int64(2)}, Tuple{int64(3), int64(4)}},
"((I1\nI2\ntp0\n(I3\nI4\ntp1\ntp2\n."),
I("((I1\nI2\ntp0\n(I3\nI4\ntp1\ntp2\n.")),
X("list([])", []interface{}{},
"(lp0\n."),
I("(lp0\n.")),
X("list([1,2,3,True])", []interface{}{int64(1), int64(2), int64(3), true},
"(lp0\nI1\naI2\naI3\naI01\na."),
I("(lp0\nI1\naI2\naI3\naI01\na.")),
X("str('abc')", "abc",
"S'abc'\np0\n."),
I("S'abc'\np0\n.")),
X("unicode('日本語')", "日本語",
"V\\u65e5\\u672c\\u8a9e\np0\n.", // UNICODE
"\x8c\t\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\x94."), // SHORT_BINUNICODE
I("\x8c\t\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\x94."), // SHORT_BINUNICODE
I("V\\u65e5\\u672c\\u8a9e\np0\n.")), // UNICODE
X("unicode('\\' 知事少时烦恼少、识人多处是非多。')", "' 知事少时烦恼少、识人多处是非多。",
"V' \\u77e5\\u4e8b\\u5c11\\u65f6\\u70e6\\u607c\\u5c11\\u3001\\u8bc6\\u4eba\\u591a\\u5904\\u662f\\u975e\\u591a\\u3002\n."),
// UNICODE
I("V' \\u77e5\\u4e8b\\u5c11\\u65f6\\u70e6\\u607c\\u5c11\\u3001\\u8bc6\\u4eba\\u591a\\u5904\\u662f\\u975e\\u591a\\u3002\n.")),
X("dict({})", make(map[interface{}]interface{}),
"(dp0\n."),
I("(dp0\n.")),
X("dict({'a': '1', 'b': '2'})", map[interface{}]interface{}{"a": "1", "b": "2"},
"(dp0\nS'a'\np1\nS'1'\np2\nsS'b'\np3\nS'2'\np4\ns."),
I("(dp0\nS'a'\np1\nS'1'\np2\nsS'b'\np3\nS'2'\np4\ns.")),
X("foo.bar # global", Class{Module: "foo", Name: "bar"},
"S'foo'\nS'bar'\n\x93."),
I("S'foo'\nS'bar'\n\x93.")), // STRING + STACK_GLOBAL
X(`foo.bar("bing") # global + reduce`, Call{Callable: Class{Module: "foo", Name: "bar"}, Args: []interface{}{"bing"}},
"cfoo\nbar\nS'bing'\n\x85R."),
I("cfoo\nbar\nS'bing'\n\x85R.")), // GLOBAL + STRING + TUPLE1 + REDUCE
X(`persref("abc")`, Ref{"abc"},
"Pabc\n."),
P0("Pabc\n.")), // PERSID
X(`persref("abc\nd")`, Ref{"abc\nd"},
"U\x05abc\ndQ."),
P12("U\x05abc\ndQ.")), // SHORT_BINSTRING + BINPERSID
X(`persref((1, 2))`, Ref{Tuple{int64(1), int64(2)}},
"(I1\nI2\ntQ."),
I("(I1\nI2\ntQ.")),
// decode only
// TODO PUT + GET + BINGET + LONG_BINGET
X("LONG_BINPUT", []interface{}{int64(17)},
"(lr0000I17\na."),
I("(lr0000I17\na.")),
X("graphite message1", graphiteObject1, graphitePickle1),
X("graphite message2", graphiteObject2, graphitePickle2),
X("graphite message3", graphiteObject3, graphitePickle3),
X("too long line", longLine, "V" + longLine + "\n."),
X("too long line", longLine, I("V" + longLine + "\n.")),
// opcodes from protocol 4
X("FRAME opcode", int64(5),
"\x95\x00\x00\x00\x00\x00\x00\x00\x00I5\n."), // FRAME is just skipped
I("\x95\x00\x00\x00\x00\x00\x00\x00\x00I5\n.")), // FRAME is just skipped
// loosy encode: decoding back gives another object.
// the only case where ogórek encoding is loosy is for Go struct types.
Xloosy("[]ogórek.foo{\"Qux\", 4}", []foo{{"Qux", 4}},
[]interface{}{map[interface{}]interface{}{"Foo": "Qux", "Bar": int64(4)}},
"((S\"Foo\"\nS\"Qux\"\nS\"Bar\"\nI4\ndl."),
// MARK + STRING + INT + DICT + LIST
I("((S\"Foo\"\nS\"Qux\"\nS\"Bar\"\nI4\ndl.")),
}
// foo is a type to test how encoder handles Go structs.
......@@ -187,8 +249,12 @@ type foo struct {
func TestDecode(t *testing.T) {
for _, test := range tests {
for _, pickle := range test.picklev {
t.Run(fmt.Sprintf("%s/%q", test.name, pickle), func(t *testing.T) {
testDecode(t, test.objectOut, pickle)
if pickle.err != nil {
continue
}
t.Run(fmt.Sprintf("%s/%q", test.name, pickle.data), func(t *testing.T) {
testDecode(t, test.objectOut, pickle.data)
})
}
}
......@@ -197,9 +263,33 @@ func TestDecode(t *testing.T) {
// TestEncode verifies ogórek encoder.
//
// For every entry in tests it runs testEncode once per protocol listed in the
// entry's pickles, expecting exactly that pickle data (or that error). The
// protocols not mentioned by any pickle of the entry are still exercised, but
// only as an encode/decode roundtrip without checking the produced bytes.
func TestEncode(t *testing.T) {
	for _, test := range tests {
		alreadyTested := make(map[int]bool) // protocols we tested encode with so far
		for _, pickle := range test.picklev {
			for _, proto := range pickle.protov {
				dataOk := pickle.data
				// protocols >= 2 must include "PROTO <ver>" prefix
				// (test data is stored without it; nothing is expected on error)
				if proto >= 2 && pickle.err == nil {
					dataOk = string([]byte{opProto, byte(proto)}) + dataOk
				}

				t.Run(fmt.Sprintf("%s/proto=%d", test.name, proto), func(t *testing.T) {
					testEncode(t, proto, test.objectIn, test.objectOut, dataOk, pickle.err)
				})

				alreadyTested[proto] = true
			}
		}

		// test encode-decode roundtrip on not yet tested protocols
		for proto := 0; proto <= highestProtocol; proto++ {
			if alreadyTested[proto] {
				continue
			}

			t.Run(fmt.Sprintf("%s/proto=%d(roundtrip)", test.name, proto), func(t *testing.T) {
				testEncode(t, proto, test.objectIn, test.objectOut, "", nil)
			})
		}
	}
}
......@@ -252,27 +342,46 @@ func testDecode(t *testing.T, object interface{}, input string) {
}
}
// testEncode encodes object and verifies it is ok.
// testEncode encodes object using proto for pickle protocol, and verifies the result == dataOk.
//
// It also verifies that encoder handles write errors via using it on all kinds
// of limited writers. The data, that encoder produces, must decode back to
// expected object.
func testEncode(t *testing.T, object, objectDecodedBack interface{}) {
//
// If dataOk == "" no `result == dataOk` check is done, but encoding + followup
// encode-back tests are still performed.
//
// If errOk != nil, object encoding must produce that error.
func testEncode(t *testing.T, proto int, object, objectDecodedBack interface{}, dataOk string, errOk error) {
buf := &bytes.Buffer{}
enc := NewEncoder(buf)
enc := NewEncoderWithConfig(buf, &EncoderConfig{
Protocol: proto,
})
// encode(object)
// encode(object) == expected data
err := enc.Encode(object)
if errOk != nil {
if err != errOk {
t.Errorf("encode: expected error:\nhave: %#v\nwant: %#v", err, errOk)
}
return
}
if err != nil {
t.Fatalf("encode error: %s", err)
}
data := buf.String()
if dataOk != "" && data != dataOk {
t.Errorf("encode:\nhave: %q\nwant: %q", data, dataOk)
}
// encode | limited writer -> write error
for l := int64(len(data))-1; l >= 0; l-- {
buf.Reset()
enc = NewEncoder(LimitWriter(buf, l))
enc = NewEncoderWithConfig(LimitWriter(buf, l), &EncoderConfig{
Protocol: proto,
})
err = enc.Encode(object)
if err != io.EOF {
t.Errorf("encoder did not handle write error @%d: got %#v", l, err)
......@@ -531,7 +640,9 @@ func BenchmarkDecode(b *testing.B) {
npickle := 0
for _, test := range tests {
for _, pickle := range test.picklev {
input = append(input, pickle...)
// not prepending `PROTO <ver>` - decoder should be
// able to decode without it.
input = append(input, pickle.data...)
npickle++
}
}
......@@ -564,7 +675,7 @@ func BenchmarkEncode(b *testing.B) {
approxOutSize := 0
for _, test := range tests {
input = append(input, test.objectIn)
approxOutSize += len(test.picklev[0])
approxOutSize += len(test.picklev[0].data)
}
buf := bytes.NewBuffer(make([]byte, approxOutSize))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment