go/zodb/zodbtools: Dump

Add `zodb dump` command to dump an arbitrary ZODB database in a generic format. The dump protocol used here is the same as in zodbtools/py with https://lab.nexedi.com/zodbtools/merge_requests/3 applied (the MR there is OK and is just waiting for upstream ZODB to negotiate a way to retrieve transaction extension data in raw form).

go/zodb/zodbtools: Dump
Add `zodb dump` command to dump an arbitrary ZODB database in a generic format. The dump protocol used here is the same as in zodbtools/py with https://lab.nexedi.com/zodbtools/merge_requests/3 applied (the MR there is OK and is just waiting for upstream ZODB to negotiate a way to retrieve transaction extension data in raw form).
dbb63f65 · Kirill Smelkov · c6457cf7 · dbb63f65 · dbb63f65 · dbb63f65
Commit dbb63f65 authored Jan 15, 2018 by Kirill Smelkov
5 changed files
--- a/go/zodb/zodbtools/dump.go
+++ b/go/zodb/zodbtools/dump.go
+// Copyright (C) 2016-2017  Nexedi SA and Contributors.
+//                          Kirill Smelkov <kirr@nexedi.com>
+//
+// This program is free software: you can Use, Study, Modify and Redistribute
+// it under the terms of the GNU General Public License version 3, or (at your
+// option) any later version, as published by the Free Software Foundation.
+//
+// You can also Link and Combine this program with other software covered by
+// the terms of any of the Free Software licenses or any of the Open Source
+// Initiative approved licenses and Convey the resulting work. Corresponding
+// source of such a combination shall include the source code for all other
+// software used.
+//
+// This program is distributed WITHOUT ANY WARRANTY; without even the implied
+// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See COPYING file for full licensing terms.
+// See https://www.nexedi.com/licensing for rationale and options.
+
+/*
+Zodbdump - Tool to dump content of a ZODB database
+
+This program dumps content of a ZODB database.
+It uses ZODB Storage iteration API to get list of transactions and for every
+transaction prints transaction's header and information about changed objects.
+
+The information dumped is complete raw information as stored in ZODB storage
+and should be suitable for restoring the database from the dump file bit-to-bit
+identical to its original. It is dumped in semi text-binary format where
+object data is output as raw binary and everything else is text.
+
+There is also a shortened mode, activated via -hashonly, where only the hash of
+object data is printed without content.
+
+Dump format:
+
+    txn <tid> <status|quote>
+    user <user|quote>
+    description <description|quote>
+    extension <extension|quote>
+    obj <oid> (delete | from <tid> | <size> <hashfunc>:<hash> (-|LF <raw-content>)) LF
+    obj ...
+    ...
+    obj ...
+    LF
+    txn ...
+
+quote:      quote string with " with non-printable and control characters \-escaped
+hashfunc:   one of sha1, sha256, sha512 ...
+
+TODO also protect txn record by hash.
+*/
+
+package zodbtools
+
+import (
+	"context"
+	"crypto/sha1"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+
+	"lab.nexedi.com/kirr/go123/prog"
+	"lab.nexedi.com/kirr/go123/xfmt"
+	"lab.nexedi.com/kirr/neo/go/zodb"
+)
+
+
+// dumper dumps ZODB transaction and data records to a writer.
+type dumper struct {
+	W          io.Writer // destination the dump is written to
+	HashOnly   bool		// when true, only the hash of object data is dumped, not its content
+
+	afterFirst bool // true once the first transaction has been dumped; used to emit LF between transactions
+
+	buf xfmt.Buffer // reusable buffer for formatting a data record header
+}
+
+var _LF = []byte{'\n'} // line feed written at the end of every data record
+
+
+// DumpData writes a single data record to d.W.
+//
+// The record starts with "obj <oid> " and continues with one of:
+//
+//   - "delete"              when datai.Data is nil,
+//   - "from <tid>"          when datai.DataTidHint is set,
+//   - "<size> sha1:<hash>"  otherwise; it is followed by " -" in HashOnly
+//     mode, or by LF and the raw object data in full mode.
+//
+// The record is always terminated by LF. Any write error is returned
+// prefixed with the object id.
+func (d *dumper) DumpData(datai *zodb.DataInfo) error {
+	hdr := &d.buf
+	hdr.Reset()
+	hdr.S("obj ").V(&datai.Oid).Cb(' ')
+
+	// payload stays nil unless raw object content has to follow the header
+	var payload []byte
+
+	if datai.Data == nil {
+		hdr.S("delete")
+	} else if datai.DataTidHint != 0 {
+		hdr.S("from ").V(&datai.DataTidHint)
+	} else {
+		// XXX sha1 is hardcoded for now. Dump format allows other hashes.
+		sum := sha1.Sum(datai.Data)
+		hdr.D(len(datai.Data)).S(" sha1:").Xb(sum[:])
+
+		if d.HashOnly {
+			hdr.S(" -")
+		} else {
+			hdr.Cb('\n')
+			payload = datai.Data
+		}
+	}
+
+	// XXX do we need this context ?
+	// the underlying writer error is expected to carry its own context
+	withOid := func(err error) error {
+		return fmt.Errorf("%v: %v", datai.Oid, err)
+	}
+
+	// TODO use writev(buf, data, "\n") via net.Buffers (it is already available)
+	if _, err := d.W.Write(hdr.Bytes()); err != nil {
+		return withOid(err)
+	}
+
+	if payload != nil {
+		if _, err := d.W.Write(payload); err != nil {
+			return withOid(err)
+		}
+	}
+
+	if _, err := d.W.Write(_LF); err != nil {
+		return withOid(err)
+	}
+
+	return nil
+}
+
+// DumpTxn dumps one transaction record
+func (d *dumper) DumpTxn(ctx context.Context, txni *zodb.TxnInfo, dataIter zodb.IDataIterator) error {
+	var datai *zodb.DataInfo
+
+	// LF in-between txn records
+	vskip := "\n"
+	if !d.afterFirst {
+		vskip = ""
+		d.afterFirst = true
+	}
+
+	_, err := fmt.Fprintf(d.W, "%stxn %s %q\nuser %q\ndescription %q\nextension %q\n",
+			vskip, txni.Tid, string(txni.Status), txni.User, txni.Description, txni.Extension)
+	if err != nil {
+		goto out
+	}
+
+	// data records
+	for {
+		datai, err = dataIter.NextData(ctx)
+		if err != nil {
+			if err == io.EOF {
+				err = nil	// XXX -> okEOF ?
+			}
+
+			break
+		}
+
+		err = d.DumpData(datai)
+		if err != nil {
+			break
+		}
+	}
+
+out:
+	// XXX do we need this context ?
+	// rationale: dataIter.NextData() if error in db - will include db context
+	// if error is in writer - it will include its own context
+	if err != nil {
+		return fmt.Errorf("%v: %v", txni.Tid, err)
+	}
+
+	return nil
+}
+
+// Dump dumps transaction records in between tidMin..tidMax
+func (d *dumper) Dump(ctx context.Context, stor zodb.IStorage, tidMin, tidMax zodb.Tid) error {
+	var txni     *zodb.TxnInfo
+	var dataIter zodb.IDataIterator
+	var err      error
+
+	iter := stor.Iterate(ctx, tidMin, tidMax)
+
+	// transactions
+	for {
+		txni, dataIter, err = iter.NextTxn(ctx)
+		if err != nil {
+			if err == io.EOF {
+				err = nil	// XXX -> okEOF ?
+			}
+
+			break
+		}
+
+		err = d.DumpTxn(ctx, txni, dataIter)
+		if err != nil {
+			break
+		}
+	}
+
+	if err != nil {
+		return fmt.Errorf("%s: dump %v..%v: %v", stor.URL(), tidMin, tidMax, err)
+	}
+
+	return nil
+}
+
+// Dump writes the content of stor within the tidMin..tidMax range to w.
+//
+// With hashOnly set, only hashes of object data are written, not the data
+// itself. See top-level documentation for the dump format.
+func Dump(ctx context.Context, w io.Writer, stor zodb.IStorage, tidMin, tidMax zodb.Tid, hashOnly bool) error {
+	d := &dumper{
+		W:        w,
+		HashOnly: hashOnly,
+	}
+
+	return d.Dump(ctx, stor, tidMin, tidMax)
+}
+
+// ----------------------------------------
+
+const dumpSummary = "dump content of a ZODB database" // summary text registered for the "dump" command
+
+func dumpUsage(w io.Writer) {
+	fmt.Fprintf(w,
+`Usage: zodb dump [OPTIONS] <storage> [tidmin..tidmax]
+Dump content of a ZODB database.
+
+<storage> is an URL (see 'zodb help zurl') of a ZODB-storage.
+
+Options:
+
+	-h --help       this help text.
+	-hashonly	dump only hashes of objects without content.
+`)
+}
+
+func dumpMain(argv []string) {
+	hashOnly := false
+	tidRange := ".." // [0, +inf]
+
+	flags := flag.FlagSet{Usage: func() { dumpUsage(os.Stderr) }}
+	flags.Init("", flag.ExitOnError)
+	flags.BoolVar(&hashOnly, "hashonly", hashOnly, "dump only hashes of objects")
+	flags.Parse(argv[1:])
+
+	argv = flags.Args()
+	if len(argv) < 1 {
+		flags.Usage()
+		prog.Exit(2)
+	}
+	storUrl := argv[0]
+
+	if len(argv) > 1 {
+		tidRange = argv[1]
+	}
+
+	tidMin, tidMax, err := zodb.ParseTidRange(tidRange)
+	if err != nil {
+		prog.Fatal(err)
+	}
+
+	ctx := context.Background()
+
+	stor, err := zodb.OpenStorage(ctx, storUrl, &zodb.OpenOptions{ReadOnly: true})
+	if err != nil {
+		prog.Fatal(err)
+	}
+	// TODO defer stor.Close()
+
+	err = Dump(ctx, os.Stdout, stor, tidMin, tidMax, hashOnly)
+	if err != nil {
+		prog.Fatal(err)
+	}
+}
--- a/go/zodb/zodbtools/dump_test.go
+++ b/go/zodb/zodbtools/dump_test.go
+// Copyright (C) 2016-2017  Nexedi SA and Contributors.
+//                          Kirill Smelkov <kirr@nexedi.com>
+//
+// This program is free software: you can Use, Study, Modify and Redistribute
+// it under the terms of the GNU General Public License version 3, or (at your
+// option) any later version, as published by the Free Software Foundation.
+//
+// You can also Link and Combine this program with other software covered by
+// the terms of any of the Free Software licenses or any of the Open Source
+// Initiative approved licenses and Convey the resulting work. Corresponding
+// source of such a combination shall include the source code for all other
+// software used.
+//
+// This program is distributed WITHOUT ANY WARRANTY; without even the implied
+// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See COPYING file for full licensing terms.
+// See https://www.nexedi.com/licensing for rationale and options.
+
+package zodbtools
+
+//go:generate sh -c "python2 -m zodbtools.zodb dump ../../zodb/storage/fs1/testdata/1.fs >testdata/1.zdump.pyok"
+//go:generate sh -c "python2 -m zodbtools.zodb dump ../../zodb/storage/fs1/testdata/empty.fs >testdata/empty.zdump.pyok"
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io/ioutil"
+	"regexp"
+	"testing"
+
+	"lab.nexedi.com/kirr/neo/go/zodb"
+	_ "lab.nexedi.com/kirr/neo/go/zodb/wks"
+
+	"github.com/kylelemons/godebug/diff"
+	"lab.nexedi.com/kirr/go123/exc"
+)
+
+// loadZdumpPy loads a zdump file and normalizes escaped strings to the way go
+// would escape them.
+func loadZdumpPy(t *testing.T, path string) string {
+	dump, err := ioutil.ReadFile(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// python quotes "\v" as "\x0b", go as "\v"; same for "\f", "\a", "\b".
+	// XXX this is a bit hacky. We could compare quoted strings as decoded,
+	// but this would need zdump format parser which could contain other
+	// bugs.  Here we want to compare output ideally bit-to-bit but those
+	// \v vs \x0b glitches prevents that to be done directly. So here we
+	// are with this ugly hack:
+	var pyNoBackLetter = []struct{ backNoLetterRe, backLetter string }{
+		{`\\x07`, `\a`},
+		{`\\x08`, `\b`},
+		{`\\x0b`, `\v`},
+		{`\\x0c`, `\f`},
+	}
+
+	for _, __ := range pyNoBackLetter {
+		re := regexp.MustCompile(__.backNoLetterRe)
+		dump = re.ReplaceAllLiteral(dump, []byte(__.backLetter))
+	}
+
+	return string(dump)
+}
+
+// withTestdataFs opens ../../zodb/storage/fs1/testdata/<db>.fs read-only and
+// calls f with the opened storage.
+//
+// The test is failed via t.Fatal if the storage cannot be opened. On return
+// the storage's Close is run through exc.XRun.
+func withTestdataFs(t testing.TB, db string, f func(zstor zodb.IStorage)) {
+	path := fmt.Sprintf("../../zodb/storage/fs1/testdata/%s.fs", db)
+	opts := &zodb.OpenOptions{ReadOnly: true}
+
+	zstor, err := zodb.OpenStorage(context.Background(), path, opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer exc.XRun(zstor.Close)
+
+	f(zstor)
+}
+
+func TestZodbDump(t *testing.T) {
+	testv := []string{"1", "empty"}
+	for _, tt := range testv {
+		t.Run("db=" + tt, func(t *testing.T) {
+			withTestdataFs(t, tt, func(zstor zodb.IStorage) {
+				buf := bytes.Buffer{}
+
+				err := Dump(context.Background(), &buf, zstor, 0, zodb.TidMax, false)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				dumpOk := loadZdumpPy(t, fmt.Sprintf("testdata/%s.zdump.pyok", tt))
+
+				if dumpOk != buf.String() {
+					t.Errorf("dump different:\n%v", diff.Diff(dumpOk, buf.String()))
+				}
+			})
+		})
+	}
+}
+
+
+// BenchmarkZodbDump measures a full dump of the "1" test database with the
+// output discarded.
+func BenchmarkZodbDump(b *testing.B) {
+	// FIXME small testdata/1.fs is not representative for benchmarking
+	withTestdataFs(b, "1", func(zstor zodb.IStorage) {
+		ctx := context.Background()
+
+		// opening the storage must not be accounted
+		b.ResetTimer()
+
+		for n := 0; n < b.N; n++ {
+			if err := Dump(ctx, ioutil.Discard, zstor, 0, zodb.TidMax, false); err != nil {
+				b.Fatal(err)
+			}
+		}
+
+		b.StopTimer()
+	})
+}
--- a/go/zodb/zodbtools/main.go
+++ b/go/zodb/zodbtools/main.go
@@ -25,6 +25,7 @@ import "lab.nexedi.com/kirr/go123/prog"
 // registry of all zodbtools commands
 var commands = prog.CommandRegistry{
 	// NOTE the order commands are listed here is the order how they will appear in help
+	{"dump", dumpSummary, dumpUsage, dumpMain}, // `zodb dump`, implemented in dump.go
 }

 // main zodbtools driver

--- a/go/zodb/zodbtools/testdata/1.zdump.pyok
+++ b/go/zodb/zodbtools/testdata/1.zdump.pyok
--- a/go/zodb/zodbtools/testdata/empty.zdump.pyok
+++ b/go/zodb/zodbtools/testdata/empty.zdump.pyok