decoder: Don't treat \r\n as combined EOL

Currently we use bufio.Reader.ReadLine which accepts either \n or \r\n as line ending. That is however not correct: - we should not accept e.g. "S'abc'\r\n." pickle, because it is invalid: In [32]: pickle.loads(b"S'abc'\r\n.") --------------------------------------------------------------------------- UnpicklingError Traceback (most recent call last) <ipython-input-32-b1da1988bae1> in <module>() ----> 1 pickle.loads(b"S'abc'\r\n.") UnpicklingError: the STRING opcode argument must be quoted - we should not accept e.g. "L123L\r\n.", because it is also invalid: In [33]: pickle.loads(b"L123L\r\n.") --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-33-7231ec07f5c4> in <module>() ----> 1 pickle.loads(b"L123L\r\n.") ValueError: invalid literal for int() with base 10: '123L\r\n' - treating \r as part of EOL in e.g. UNICODE pickle would just drop encoded information: # python In [34]: pickle.loads(b"Vabc\r\n.") Out[34]: 'abc\r' while ogórek currently decodes it as just 'abc' (no trailing \r). For this reason let's fix Decoder.readLine to treat only \n as EOL. Besides this fix, we now get another property: previously, when internally using bufio.Reader.ReadLine we were not able to distinguish two situations: - a line was abruptly ended without any EOL characters at all, - a line was properly ended with EOL character. Now after we switched to internally using bufio.Reader.ReadSlice, we will be able to properly detect EOF and return that as error. This property will be needed in the following patch.

decoder: Don't treat \r\n as combined EOL
Currently we use bufio.Reader.ReadLine which accepts either \n or \r\n as line ending. That is however not correct: - we should not accept e.g. "S'abc'\r\n." pickle, because it is invalid: In [32]: pickle.loads(b"S'abc'\r\n.") --------------------------------------------------------------------------- UnpicklingError Traceback (most recent call last) <ipython-input-32-b1da1988bae1> in <module>() ----> 1 pickle.loads(b"S'abc'\r\n.") UnpicklingError: the STRING opcode argument must be quoted - we should not accept e.g. "L123L\r\n.", because it is also invalid: In [33]: pickle.loads(b"L123L\r\n.") --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-33-7231ec07f5c4> in <module>() ----> 1 pickle.loads(b"L123L\r\n.") ValueError: invalid literal for int() with base 10: '123L\r\n' - treating \r as part of EOL in e.g. UNICODE pickle would just drop encoded information: # python In [34]: pickle.loads(b"Vabc\r\n.") Out[34]: 'abc\r' while ogórek currently decodes it as just 'abc' (no trailing \r). For this reason let's fix Decoder.readLine to treat only \n as EOL. Besides this fix, we now get another property: previously, when internally using bufio.Reader.ReadLine we were not able to distinguish two situations: - a line was abruptly ended without any EOL characters at all, - a line was properly ended with EOL character. Now after we switched to internally using bufio.Reader.ReadSlice, we will be able to properly detect EOF and return that as error. This property will be needed in the following patch.
57137139 · Kirill Smelkov · Kamil Kisiel · 9d1344ba · 57137139 · 57137139
Commit 57137139 authored Sep 21, 2018 by Kirill Smelkov Committed by Kamil Kisiel Sep 21, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 9 deletions

ogorek.go ogorek.go +18 -9

ogorek_test.go ogorek_test.go +8 -0

No files found.
--- a/ogorek.go
+++ b/ogorek.go
@@ -325,23 +325,32 @@ loop:
 	return d.popUser()
 }

-// readLine reads next line from pickle stream
-// returned line is valid only till next call to readLine
+// readLine reads next line from pickle stream.
+//
+// returned line does not contain \n.
+// returned line is valid only till next call to readLine.
 func (d *Decoder) readLine() ([]byte, error) {
 	var (
 		data     []byte
-		isPrefix = true
 		err      error
 	)
 	d.line = d.line[:0]
-	for isPrefix {
-		data, isPrefix, err = d.r.ReadLine()
-		if err != nil {
-			return d.line, err
-		}
+	for {
+		data, err = d.r.ReadSlice('\n')
 		d.line = append(d.line, data...)
+
+		// either have read till \n or got another error
+		if err != bufio.ErrBufferFull {
+			break
+		}
 	}
-	return d.line, nil
+
+	// trim trailing \n
+	if l := len(d.line); l > 0 && d.line[l-1] == '\n' {
+		d.line = d.line[:l-1]
+	}
+
+	return d.line, err
 }

 // userOK tells whether it is ok to return all objects to user.

--- a/ogorek_test.go
+++ b/ogorek_test.go
@@ -231,6 +231,10 @@ var tests = []TestEntry{
 		I("S'abc'\np0\n."),
 		I("S'abc'\n.")),

+	// TODO: reenable after we fix string escape decoding (https://github.com/kisielk/og-rek/issues/48)
+	// X(`unicode('abc\r')`, "abc\r",
+	//	I("Vabc\r\n.")),
+
 	X("unicode('日本語')", "日本語",
 		P0("S\"日本語\"\n."),                                 // STRING
 		P12("U\x09日本語."),                                  // SHORT_BINSTRING
@@ -583,6 +587,10 @@ func TestDecodeError(t *testing.T) {
 		"}I1\n(s.",                  // EMPTY_DICT + INT + MARK + SETITEM
 		"}(I1\ns.",                  // EMPTY_DICT + MARK + INT + SETITEM
 		"(Q.",                       // MARK + BINPERSID
+
+		// \r\n should not be read as combind EOL - only \n is
+		"L123L\r\n.",
+		"S'abc'\r\n.",
 	}
 	for _, tt := range testv {
 		buf := bytes.NewBufferString(tt)