zodbdump += DumpReader - to read/parse zodbdump stream

We will likely need this reader for `zodb restore` in the future. We will also use this reader for `zodb commit` in the next patch. pygolang dependency v↑ because we use the recently introduced golang.strconv to unquote user/desc/extension strings. Python2 works. Python3 support is only minimal and incomplete.

zodbdump += DumpReader - to read/parse zodbdump stream
We will likely need this reader for `zodb restore` in the future. We will also use this reader for `zodb commit` in the next patch. pygolang dependency v↑ because we use the recently introduced golang.strconv to unquote user/desc/extension strings. Python2 works. Python3 support is only minimal and incomplete.
dd959b28 · Kirill Smelkov · e973d519 · dd959b28 · dd959b28 · dd959b28
Commit dd959b28 authored Dec 13, 2018 by Kirill Smelkov
Showing with 361 additions and 8 deletions

setup.py setup.py +1 -1

zodbtools/test/test_dump.py zodbtools/test/test_dump.py +108 -3

zodbtools/util.py zodbtools/util.py +4 -1

zodbtools/zodbdump.py zodbtools/zodbdump.py +248 -3

No files found.
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ setup(
    keywords    = 'zodb utility tool',

    packages    = find_packages(),
-    install_requires = ['ZODB', 'zodburi', 'pygolang >= 0.0.0.dev3', 'six'],
+    install_requires = ['ZODB', 'zodburi', 'zope.interface', 'pygolang >= 0.0.0.dev6', 'six'],

    extras_require = {
                  'test': ['pytest'],

--- a/zodbtools/test/test_dump.py
+++ b/zodbtools/test/test_dump.py
-# Copyright (C) 2017  Nexedi SA and Contributors.
-#                     Kirill Smelkov <kirr@nexedi.com>
+# Copyright (C) 2017-2018  Nexedi SA and Contributors.
+#                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
 # it under the terms of the GNU General Public License version 3, or (at your
@@ -17,12 +17,18 @@
 # See COPYING file for full licensing terms.
 # See https://www.nexedi.com/licensing for rationale and options.

-from zodbtools.zodbdump import zodbdump
+from zodbtools.zodbdump import (
+        zodbdump, DumpReader, Transaction, ObjectDelete, ObjectCopy,
+        ObjectData, HashOnly
+    )
 from ZODB.FileStorage import FileStorage
+from ZODB.utils import p64
 from cStringIO import StringIO

 from os.path import dirname

+from pytest import raises
+
 # verify zodbdump output against golden
 def test_zodbdump():
    tdir = dirname(__file__)
@@ -35,3 +41,102 @@ def test_zodbdump():
    zodbdump(stor, None, None, out=out)

    assert out.getvalue() == dumpok
+
+
+# verify zodbdump.DumpReader
+def test_dumpreader():
+    in_ = b"""\
+txn 0123456789abcdef " "
+user "my name"
+description "o la-la..."
+extension "zzz123 def"
+obj 0000000000000001 delete
+obj 0000000000000002 from 0123456789abcdee
+obj 0000000000000003 54 adler32:01234567 -
+obj 0000000000000004 4 sha1:9865d483bc5a94f2e30056fc256ed3066af54d04
+ZZZZ
+obj 0000000000000005 9 crc32:52fdeac5
+ABC
+
+DEF!
+
+txn 0123456789abcdf0 " "
+user "author2"
+description "zzz"
+extension "qqq"
+
+"""
+
+    r = DumpReader(StringIO(in_))
+    t1 = r.readtxn()
+    assert isinstance(t1, Transaction)
+    assert t1.tid == '0123456789abcdef'.decode('hex')
+    assert t1.user              == b'my name'
+    assert t1.description       == b'o la-la...'
+    assert t1.extension_bytes   == b'zzz123 def'
+    assert len(t1.objv)         == 5
+    _ = t1.objv[0]
+    assert isinstance(_, ObjectDelete)
+    assert _.oid        == p64(1)
+    _ = t1.objv[1]
+    assert isinstance(_, ObjectCopy)
+    assert _.oid        == p64(2)
+    assert _.copy_from  == '0123456789abcdee'.decode('hex')
+    _ = t1.objv[2]
+    assert isinstance(_, ObjectData)
+    assert _.oid        == p64(3)
+    assert _.data       == HashOnly(54)
+    assert _.hashfunc   == 'adler32'
+    assert _.hash_      == '01234567'.decode('hex')
+    _ = t1.objv[3]
+    assert isinstance(_, ObjectData)
+    assert _.oid        == p64(4)
+    assert _.data       == b'ZZZZ'
+    assert _.hashfunc   == 'sha1'
+    assert _.hash_      == '9865d483bc5a94f2e30056fc256ed3066af54d04'.decode('hex')
+    _ = t1.objv[4]
+    assert isinstance(_, ObjectData)
+    assert _.oid        == p64(5)
+    assert _.data       == b'ABC\n\nDEF!'
+    assert _.hashfunc   == 'crc32'
+    assert _.hash_      == '52fdeac5'.decode('hex')
+
+    t2 = r.readtxn()
+    assert isinstance(t2, Transaction)
+    assert t2.tid == '0123456789abcdf0'.decode('hex')
+    assert t2.user              == b'author2'
+    assert t2.description       == b'zzz'
+    assert t2.extension_bytes   == b'qqq'
+    assert t2.objv              == []
+
+    assert r.readtxn() == None
+
+    z = ''.join([_.zdump() for _ in (t1, t2)])
+    assert z == in_
+
+    # unknown hash function
+    r = DumpReader(StringIO("""\
+txn 0000000000000000 " "
+user ""
+description ""
+extension ""
+obj 0000000000000001 1 xyz:0123 -
+
+"""))
+    with raises(RuntimeError) as exc:
+        r.readtxn()
+    assert exc.value.args == ("""+5: invalid line: unknown hash function "xyz" ('obj 0000000000000001 1 xyz:0123 -')""",)
+
+    # data integrity error
+    r = DumpReader(StringIO("""\
+txn 0000000000000000 " "
+user ""
+description ""
+extension ""
+obj 0000000000000001 5 crc32:01234567
+hello
+
+"""))
+    with raises(RuntimeError) as exc:
+        r.readtxn()
+    assert exc.value.args == ("""+6: data corrupt: crc32 = 3610a686, expected 01234567""",)
--- a/zodbtools/util.py
+++ b/zodbtools/util.py
@@ -18,7 +18,7 @@
 # See COPYING file for full licensing terms.
 # See https://www.nexedi.com/licensing for rationale and options.

-import hashlib, struct
+import hashlib, struct, codecs
 import zodburi
 from six.moves.urllib_parse import urlsplit, urlunsplit
 from zlib import crc32, adler32
@@ -26,6 +26,9 @@ from zlib import crc32, adler32
 def ashex(s):
    return s.encode('hex')

+def fromhex(s):
+    return codecs.decode(s, 'hex')
+
 def sha1(data):
    m = hashlib.sha1()
    m.update(data)

--- a/zodbtools/zodbdump.py
+++ b/zodbtools/zodbdump.py
-# Copyright (C) 2016-2017  Nexedi SA and Contributors.
+# Copyright (C) 2016-2018  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -53,15 +53,19 @@ TODO also protect txn record by hash.
 """

 from __future__ import print_function
-from zodbtools.util import ashex, sha1, txnobjv, parse_tidrange, TidRangeInvalid,   \
-        storageFromURL
+from zodbtools.util import ashex, fromhex, sha1, txnobjv, parse_tidrange, TidRangeInvalid,   \
+        storageFromURL, hashRegistry
 from ZODB._compat import loads, _protocol, BytesIO
 from zodbpickle.slowpickle import Pickler as pyPickler
 #import pickletools
+from ZODB.interfaces import IStorageTransactionMetaData
+from zope.interface import implementer

 import sys
 import logging
+import re
 from golang.gcompat import qq
+from golang import strconv

 # txn_raw_extension returns raw extension from txn metadata
 def txn_raw_extension(stor, txn):
@@ -271,3 +275,244 @@ def main(argv):
    stor = storageFromURL(storurl, read_only=True)

    zodbdump(stor, tidmin, tidmax, hashonly)
+
+
+# ----------------------------------------
+# dump reading/parsing
+
+_txn_re = re.compile(b'^txn (?P<tid>[0-9a-f]{16}) "(?P<status>.)"$')
+_obj_re = re.compile(b'^obj (?P<oid>[0-9a-f]{16}) ((?P<delete>delete)|from (?P<from>[0-9a-f]{16})|(?P<size>[0-9]+) (?P<hashfunc>\w+):(?P<hash>[0-9a-f]+)(?P<hashonly> -)?)')
+
+# _ioname returns name of the reader r, if it has one.
+# if there is no name - '' is returned.
+def _ioname(r):
+    return getattr(r, 'name', '')
+
+
+# DumpReader wraps IO reader to read transactions from zodbdump stream.
+#
+# The reader must provide .readline() and .read() methods.
+# The reader must be opened in binary mode.
+class DumpReader(object):
+    # .lineno   - line number position in read stream
+
+    def __init__(self, r):
+        self._r         = r
+        self._line      = None  # last read line
+        self.lineno     = 0
+
+    def _readline(self):
+        l = self._r.readline()
+        if l == '':
+            self._line = None
+            return None # EOF
+
+        l = l.rstrip(b'\n')
+        self.lineno += 1
+        self._line = l
+        return l
+
+    # report a problem found around currently-read line
+    def _badline(self, msg):
+        raise RuntimeError("%s+%d: invalid line: %s (%r)" % (_ioname(self._r), self.lineno, msg, self._line))
+
+    # readtxn reads one transaction record from input stream and returns
+    # Transaction instance or None at EOF.
+    def readtxn(self):
+        # header
+        l = self._readline()
+        if l is None:
+            return None
+        m = _txn_re.match(l)
+        if m is None:
+            self._badline('no txn start')
+        tid = fromhex(m.group('tid'))
+        status = m.group('status')
+
+        def get(name):
+            l = self._readline()
+            if l is None or not l.startswith(b'%s ' % name):
+                self._badline('no %s' % name)
+
+            return strconv.unquote(l[len(name) + 1:])
+
+        user          = get(b'user')
+        description   = get(b'description')
+        extension     = get(b'extension')
+
+        # objects
+        objv = []
+        while 1:
+            l = self._readline()
+            if l == '':
+                break   # empty line - end of transaction
+
+            if l is None or not l.startswith(b'obj '):
+                self._badline('no obj')
+
+            m = _obj_re.match(l)
+            if m is None:
+                self._badline('invalid obj entry')
+
+            obj = None # will be Object*
+            oid = fromhex(m.group('oid'))
+
+            from_ = m.group('from')
+
+            if m.group('delete'):
+                obj = ObjectDelete(oid)
+
+            elif from_:
+                copy_from = fromhex(from_)
+                obj = ObjectCopy(oid, copy_from)
+
+            else:
+                size     = int(m.group('size'))
+                hashfunc = m.group('hashfunc')
+                hashok   = fromhex(m.group('hash'))
+                hashonly = m.group('hashonly') is not None
+                data     = None # see vvv
+
+                hcls = hashRegistry.get(hashfunc)
+                if hcls is None:
+                    self._badline('unknown hash function %s' % qq(hashfunc))
+
+                if hashonly:
+                    data = HashOnly(size)
+                else:
+                    # XXX -> io.readfull
+                    n = size+1  # data LF
+                    data = b''
+                    while n > 0:
+                        chunk = self._r.read(n)
+                        data += chunk
+                        n -= len(chunk)
+                    self.lineno += data.count('\n')
+                    self._line = None
+                    if data[-1:] != b'\n':
+                        raise RuntimeError('%s+%d: no LF after obj data' % (_ioname(self._r), self.lineno))
+                    data = data[:-1]
+
+                    # verify data integrity
+                    # TODO option to allow reading corrupted data
+                    h = hcls()
+                    h.update(data)
+                    hash_ = h.digest()
+                    if hash_ != hashok:
+                        raise RuntimeError('%s+%d: data corrupt: %s = %s, expected %s' % (
+                            _ioname(self._r), self.lineno, h.name, ashex(hash_), ashex(hashok)))
+
+                obj = ObjectData(oid, data, hashfunc, hashok)
+
+            objv.append(obj)
+
+        return Transaction(tid, status, user, description, extension, objv)
+
+
+# Transaction represents one transaction record in zodbdump stream.
+@implementer(IStorageTransactionMetaData)
+class Transaction(object):
+    # .tid              p64         transaction ID
+    # .status           char        status of the transaction
+    # .user             bytes       transaction author
+    # .description      bytes       transaction description
+    # .extension_bytes  bytes       transaction extension
+    # .objv             []Object*   objects changed by transaction
+    def __init__(self, tid, status, user, description, extension, objv):
+        self.tid                = tid
+        self.status             = status
+        self.user               = user
+        self.description        = description
+        self.extension_bytes    = extension
+        self.objv               = objv
+
+    # ZODB wants to work with extension as {} - try to convert it on the fly.
+    #
+    # The conversion can fail for arbitrary .extension_bytes input.
+    # The conversion should become not needed once
+    #
+    #   https://github.com/zopefoundation/ZODB/pull/183, or
+    #   https://github.com/zopefoundation/ZODB/pull/207
+    #
+    # is in ZODB.
+    @property
+    def extension(self):
+        if not self.extension_bytes:
+            return {}
+        return loads(self.extension_bytes)
+
+    # zdump returns text representation of a record in zodbdump format.
+    def zdump(self):
+        z  = 'txn %s %s\n' % (ashex(self.tid), qq(self.status))
+        z += 'user %s\n' % qq(self.user)
+        z += 'description %s\n' % qq(self.description)
+        z += 'extension %s\n' % qq(self.extension_bytes)
+        for obj in self.objv:
+            z += obj.zdump()
+        z += '\n'
+        return z
+
+
+# Object is base class for object records in zodbdump stream.
+class Object(object):
+    # .oid          p64         object ID
+    def __init__(self, oid):
+        self.oid = oid
+
+# ObjectDelete represents object deletion.
+class ObjectDelete(Object):
+
+    def __init__(self, oid):
+        super(ObjectDelete, self).__init__(oid)
+
+    def zdump(self):
+        return 'obj %s delete\n' % (ashex(self.oid))
+
+# ObjectCopy represents object data copy.
+class ObjectCopy(Object):
+    # .copy_from    tid         copy object data from object's revision tid
+    def __init__(self, oid, copy_from):
+        super(ObjectCopy, self).__init__(oid)
+        self.copy_from = copy_from
+
+    def zdump(self):
+        return 'obj %s from %s\n' % (ashex(self.oid), ashex(self.copy_from))
+
+# ObjectData represents record with object data.
+class ObjectData(Object):
+    # .data         HashOnly | bytes
+    # .hashfunc     str             hash function used for integrity
+    # .hash_        bytes           hash of the object's data
+    def __init__(self, oid, data, hashfunc, hash_):
+        super(ObjectData, self).__init__(oid)
+        self.data       = data
+        self.hashfunc   = hashfunc
+        self.hash_      = hash_
+
+    def zdump(self):
+        data = self.data
+        hashonly = isinstance(data, HashOnly)
+        if hashonly:
+            size = data.size
+        else:
+            size = len(data)
+        z = 'obj %s %d %s:%s' % (ashex(self.oid), size, self.hashfunc, ashex(self.hash_))
+        if hashonly:
+            z += ' -'
+        else:
+            z += '\n'
+            z += data
+        z += '\n'
+        return z
+
+# HashOnly indicates that this ObjectData record contains only hash and does not contain object data.
+class HashOnly(object):
+    # .size         int
+    def __init__(self, size):
+        self.size = size
+
+    def __repr__(self):
+        return 'HashOnly(%d)' % self.size
+
+    def __eq__(a, b):
+        return isinstance(b, HashOnly) and a.size == b.size