Commit dd959b28 authored by Kirill Smelkov

zodbdump += DumpReader - to read/parse zodbdump stream

We will likely need this reader for `zodb restore` in the future.
We will also use this reader for `zodb commit` in the next patch.

pygolang dependency v↑ because we use the recently introduced
golang.strconv to unquote user/desc/extension strings.

Python2 works. Python3 support is only minimal and incomplete.
parent e973d519
...@@ -19,7 +19,7 @@ setup( ...@@ -19,7 +19,7 @@ setup(
keywords = 'zodb utility tool', keywords = 'zodb utility tool',
packages = find_packages(), packages = find_packages(),
install_requires = ['ZODB', 'zodburi', 'pygolang >= 0.0.0.dev3', 'six'], install_requires = ['ZODB', 'zodburi', 'zope.interface', 'pygolang >= 0.0.0.dev6', 'six'],
extras_require = { extras_require = {
'test': ['pytest'], 'test': ['pytest'],
......
# Copyright (C) 2017 Nexedi SA and Contributors. # Copyright (C) 2017-2018 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -17,12 +17,18 @@ ...@@ -17,12 +17,18 @@
# See COPYING file for full licensing terms. # See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options. # See https://www.nexedi.com/licensing for rationale and options.
from zodbtools.zodbdump import zodbdump from zodbtools.zodbdump import (
zodbdump, DumpReader, Transaction, ObjectDelete, ObjectCopy,
ObjectData, HashOnly
)
from ZODB.FileStorage import FileStorage from ZODB.FileStorage import FileStorage
from ZODB.utils import p64
from cStringIO import StringIO from cStringIO import StringIO
from os.path import dirname from os.path import dirname
from pytest import raises
# verify zodbdump output against golden # verify zodbdump output against golden
def test_zodbdump(): def test_zodbdump():
tdir = dirname(__file__) tdir = dirname(__file__)
...@@ -35,3 +41,102 @@ def test_zodbdump(): ...@@ -35,3 +41,102 @@ def test_zodbdump():
zodbdump(stor, None, None, out=out) zodbdump(stor, None, None, out=out)
assert out.getvalue() == dumpok assert out.getvalue() == dumpok
# verify zodbdump.DumpReader
#
# The test feeds a handcrafted dump with two transactions to DumpReader and
# checks that:
#
#   - every record kind (delete, copy, hash-only data, inline data) is parsed
#     into the corresponding Object* instance;
#   - serializing parsed transactions back via .zdump() reproduces the input;
#   - unknown hash function and data/hash mismatch are reported as RuntimeError.
def test_dumpreader():
    # NOTE blank lines inside the literal are significant:
    #   - the one between "ABC" and "DEF!" is part of object 5 data
    #     (9 bytes = b'ABC\n\nDEF!');
    #   - the one after each transaction is the end-of-transaction marker.
    in_ = b"""\
txn 0123456789abcdef " "
user "my name"
description "o la-la..."
extension "zzz123 def"
obj 0000000000000001 delete
obj 0000000000000002 from 0123456789abcdee
obj 0000000000000003 54 adler32:01234567 -
obj 0000000000000004 4 sha1:9865d483bc5a94f2e30056fc256ed3066af54d04
ZZZZ
obj 0000000000000005 9 crc32:52fdeac5
ABC

DEF!

txn 0123456789abcdf0 " "
user "author2"
description "zzz"
extension "qqq"

"""

    r = DumpReader(StringIO(in_))

    # first transaction: 5 objects covering all record kinds
    t1 = r.readtxn()
    assert isinstance(t1, Transaction)
    assert t1.tid == '0123456789abcdef'.decode('hex')
    assert t1.user == b'my name'
    assert t1.description == b'o la-la...'
    assert t1.extension_bytes == b'zzz123 def'
    assert len(t1.objv) == 5

    _ = t1.objv[0]
    assert isinstance(_, ObjectDelete)
    assert _.oid == p64(1)

    _ = t1.objv[1]
    assert isinstance(_, ObjectCopy)
    assert _.oid == p64(2)
    assert _.copy_from == '0123456789abcdee'.decode('hex')

    _ = t1.objv[2]
    assert isinstance(_, ObjectData)
    assert _.oid == p64(3)
    assert _.data == HashOnly(54)
    assert _.hashfunc == 'adler32'
    assert _.hash_ == '01234567'.decode('hex')

    _ = t1.objv[3]
    assert isinstance(_, ObjectData)
    assert _.oid == p64(4)
    assert _.data == b'ZZZZ'
    assert _.hashfunc == 'sha1'
    assert _.hash_ == '9865d483bc5a94f2e30056fc256ed3066af54d04'.decode('hex')

    _ = t1.objv[4]
    assert isinstance(_, ObjectData)
    assert _.oid == p64(5)
    assert _.data == b'ABC\n\nDEF!'
    assert _.hashfunc == 'crc32'
    assert _.hash_ == '52fdeac5'.decode('hex')

    # second transaction: metadata only, no objects
    t2 = r.readtxn()
    assert isinstance(t2, Transaction)
    assert t2.tid == '0123456789abcdf0'.decode('hex')
    assert t2.user == b'author2'
    assert t2.description == b'zzz'
    assert t2.extension_bytes == b'qqq'
    assert t2.objv == []

    # EOF
    assert r.readtxn() is None

    # parsed transactions serialize back to exactly what was read
    z = ''.join([_.zdump() for _ in (t1, t2)])
    assert z == in_

    # unknown hash function
    r = DumpReader(StringIO("""\
txn 0000000000000000 " "
user ""
description ""
extension ""
obj 0000000000000001 1 xyz:0123 -
"""))
    with raises(RuntimeError) as exc:
        r.readtxn()
    assert exc.value.args == ("""+5: invalid line: unknown hash function "xyz" ('obj 0000000000000001 1 xyz:0123 -')""",)

    # data integrity error
    r = DumpReader(StringIO("""\
txn 0000000000000000 " "
user ""
description ""
extension ""
obj 0000000000000001 5 crc32:01234567
hello
"""))
    with raises(RuntimeError) as exc:
        r.readtxn()
    assert exc.value.args == ("""+6: data corrupt: crc32 = 3610a686, expected 01234567""",)
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
# See COPYING file for full licensing terms. # See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options. # See https://www.nexedi.com/licensing for rationale and options.
import hashlib, struct import hashlib, struct, codecs
import zodburi import zodburi
from six.moves.urllib_parse import urlsplit, urlunsplit from six.moves.urllib_parse import urlsplit, urlunsplit
from zlib import crc32, adler32 from zlib import crc32, adler32
...@@ -26,6 +26,9 @@ from zlib import crc32, adler32 ...@@ -26,6 +26,9 @@ from zlib import crc32, adler32
# ashex returns hexadecimal representation of binary string s.
#
# It is the inverse of fromhex. codecs.encode is used instead of
# s.encode('hex') because the latter exists only on Python2; the result is the
# same there. On Python3 the result is bytes.
def ashex(s):
    return codecs.encode(s, 'hex')
# fromhex decodes hexadecimal string s and returns the binary string it encodes.
def fromhex(s):
    raw = codecs.decode(s, 'hex')
    return raw
def sha1(data): def sha1(data):
m = hashlib.sha1() m = hashlib.sha1()
m.update(data) m.update(data)
......
# Copyright (C) 2016-2017 Nexedi SA and Contributors. # Copyright (C) 2016-2018 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -53,15 +53,19 @@ TODO also protect txn record by hash. ...@@ -53,15 +53,19 @@ TODO also protect txn record by hash.
""" """
from __future__ import print_function from __future__ import print_function
from zodbtools.util import ashex, sha1, txnobjv, parse_tidrange, TidRangeInvalid, \ from zodbtools.util import ashex, fromhex, sha1, txnobjv, parse_tidrange, TidRangeInvalid, \
storageFromURL storageFromURL, hashRegistry
from ZODB._compat import loads, _protocol, BytesIO from ZODB._compat import loads, _protocol, BytesIO
from zodbpickle.slowpickle import Pickler as pyPickler from zodbpickle.slowpickle import Pickler as pyPickler
#import pickletools #import pickletools
from ZODB.interfaces import IStorageTransactionMetaData
from zope.interface import implementer
import sys import sys
import logging import logging
import re
from golang.gcompat import qq from golang.gcompat import qq
from golang import strconv
# txn_raw_extension returns raw extension from txn metadata # txn_raw_extension returns raw extension from txn metadata
def txn_raw_extension(stor, txn): def txn_raw_extension(stor, txn):
...@@ -271,3 +275,244 @@ def main(argv): ...@@ -271,3 +275,244 @@ def main(argv):
stor = storageFromURL(storurl, read_only=True) stor = storageFromURL(storurl, read_only=True)
zodbdump(stor, tidmin, tidmax, hashonly) zodbdump(stor, tidmin, tidmax, hashonly)
# ----------------------------------------
# dump reading/parsing
_txn_re = re.compile(b'^txn (?P<tid>[0-9a-f]{16}) "(?P<status>.)"$')
_obj_re = re.compile(b'^obj (?P<oid>[0-9a-f]{16}) ((?P<delete>delete)|from (?P<from>[0-9a-f]{16})|(?P<size>[0-9]+) (?P<hashfunc>\w+):(?P<hash>[0-9a-f]+)(?P<hashonly> -)?)')
# _ioname returns name of the reader r, if it has one.
# if there is no name - '' is returned.
def _ioname(r):
return getattr(r, 'name', '')
# DumpReader wraps IO reader to read transactions from zodbdump stream.
#
# The reader must provide .readline() and .read() methods.
# The reader must be opened in binary mode.
#
# Problems found in the stream are reported by raising RuntimeError with
# message prefixed by "<reader name>+<lineno>: ".
class DumpReader(object):
    # .lineno   - line number position in read stream

    def __init__(self, r):
        self._r = r
        self._line = None   # last read line (LF stripped); None at EOF / after raw data
        self.lineno = 0

    # _readline reads next line from the stream.
    #
    # The line is returned with trailing LF removed; None is returned at EOF.
    def _readline(self):
        l = self._r.readline()
        if not l:
            # empty result from readline means EOF - an empty line in the
            # stream still comes with its LF.
            self._line = None
            return None

        l = l.rstrip(b'\n')
        self.lineno += 1
        self._line = l
        return l

    # _readfull reads exactly n bytes from the stream.
    #
    # RuntimeError is raised if the stream ends before n bytes were read -
    # without this check a truncated stream would make the loop spin forever,
    # because .read() keeps returning empty data at EOF.
    def _readfull(self, n):
        chunkv = []
        while n > 0:
            chunk = self._r.read(n)
            if not chunk:
                raise RuntimeError('%s+%d: unexpected EOF' % (_ioname(self._r), self.lineno))
            chunkv.append(chunk)
            n -= len(chunk)
        return b''.join(chunkv)

    # report a problem found around currently-read line
    def _badline(self, msg):
        raise RuntimeError("%s+%d: invalid line: %s (%r)" % (_ioname(self._r), self.lineno, msg, self._line))

    # readtxn reads one transaction record from input stream and returns
    # Transaction instance or None at EOF.
    #
    # RuntimeError is raised if the record is malformed, uses unknown hash
    # function, is truncated, or if object data does not match its hash.
    def readtxn(self):
        # header
        l = self._readline()
        if l is None:
            return None
        m = _txn_re.match(l)
        if m is None:
            self._badline('no txn start')
        tid = fromhex(m.group('tid'))
        status = m.group('status')

        # get reads next line, which must be `<name> <quoted value>`, and
        # returns the value unquoted.
        def get(name):
            l = self._readline()
            if l is None or not l.startswith(b'%s ' % name):
                self._badline('no %s' % name)

            return strconv.unquote(l[len(name) + 1:])

        user        = get(b'user')
        description = get(b'description')
        extension   = get(b'extension')

        # objects
        objv = []
        while True:
            l = self._readline()
            if l == b'':
                break   # empty line - end of transaction

            if l is None or not l.startswith(b'obj '):
                self._badline('no obj')

            m = _obj_re.match(l)
            if m is None:
                self._badline('invalid obj entry')

            oid = fromhex(m.group('oid'))
            from_ = m.group('from')

            if m.group('delete'):
                obj = ObjectDelete(oid)

            elif from_:
                copy_from = fromhex(from_)
                obj = ObjectCopy(oid, copy_from)

            else:
                size     = int(m.group('size'))
                hashfunc = m.group('hashfunc')
                hashok   = fromhex(m.group('hash'))
                hashonly = m.group('hashonly') is not None

                hcls = hashRegistry.get(hashfunc)
                if hcls is None:
                    self._badline('unknown hash function %s' % qq(hashfunc))

                if hashonly:
                    # only size and hash are present in the stream - no data follows
                    data = HashOnly(size)
                else:
                    data = self._readfull(size + 1)     # data LF
                    # raw data can itself contain LFs - keep .lineno in sync
                    self.lineno += data.count(b'\n')
                    self._line = None
                    if data[-1:] != b'\n':
                        raise RuntimeError('%s+%d: no LF after obj data' % (_ioname(self._r), self.lineno))
                    data = data[:-1]

                    # verify data integrity
                    # TODO option to allow reading corrupted data
                    h = hcls()
                    h.update(data)
                    hash_ = h.digest()
                    if hash_ != hashok:
                        raise RuntimeError('%s+%d: data corrupt: %s = %s, expected %s' % (
                            _ioname(self._r), self.lineno, h.name, ashex(hash_), ashex(hashok)))

                obj = ObjectData(oid, data, hashfunc, hashok)

            objv.append(obj)

        return Transaction(tid, status, user, description, extension, objv)
# Transaction represents one transaction record in zodbdump stream.
@implementer(IStorageTransactionMetaData)
class Transaction(object):
    # .tid              p64         transaction ID
    # .status           char        status of the transaction
    # .user             bytes       transaction author
    # .description      bytes       transaction description
    # .extension_bytes  bytes       transaction extension
    # .objv             []Object*   objects changed by transaction

    def __init__(self, tid, status, user, description, extension, objv):
        self.tid, self.status = tid, status
        self.user, self.description = user, description
        self.extension_bytes = extension    # kept raw; see .extension for {} view
        self.objv = objv

    # ZODB wants to work with extension as {} - try to convert it on the fly.
    #
    # The conversion can fail for arbitrary .extension_bytes input.
    # The conversion should become not needed once
    #
    #   https://github.com/zopefoundation/ZODB/pull/183, or
    #   https://github.com/zopefoundation/ZODB/pull/207
    #
    # is in ZODB.
    #
    # NOTE(review): `loads` presumably unpickles its argument - if so, do not
    # access .extension for dumps coming from untrusted sources; confirm.
    @property
    def extension(self):
        raw = self.extension_bytes
        return loads(raw) if raw else {}

    # zdump returns text representation of a record in zodbdump format:
    # txn header, user/description/extension lines, all object records and
    # the terminating empty line.
    def zdump(self):
        partv = [
            'txn %s %s\n' % (ashex(self.tid), qq(self.status)),
            'user %s\n' % qq(self.user),
            'description %s\n' % qq(self.description),
            'extension %s\n' % qq(self.extension_bytes),
        ]
        partv.extend(obj.zdump() for obj in self.objv)
        partv.append('\n')
        return ''.join(partv)
# Object is the common base of all object records in zodbdump stream.
# It only remembers which object the record is about.
class Object(object):
    # .oid  p64     object ID

    def __init__(self, oid):
        self.oid = oid
# ObjectDelete represents object deletion.
class ObjectDelete(Object):

    def __init__(self, oid):
        super(ObjectDelete, self).__init__(oid)

    # zdump returns the record in zodbdump format: `obj <oid> delete` LF.
    def zdump(self):
        oid_hex = ashex(self.oid)
        return 'obj %s delete\n' % oid_hex
# ObjectCopy represents object data copy.
class ObjectCopy(Object):
    # .copy_from    tid     copy object data from object's revision tid

    def __init__(self, oid, copy_from):
        super(ObjectCopy, self).__init__(oid)
        self.copy_from = copy_from

    # zdump returns the record in zodbdump format: `obj <oid> from <tid>` LF.
    def zdump(self):
        oid_hex = ashex(self.oid)
        from_hex = ashex(self.copy_from)
        return 'obj %s from %s\n' % (oid_hex, from_hex)
# ObjectData represents record with object data.
class ObjectData(Object):
    # .data     HashOnly | bytes
    # .hashfunc str             hash function used for integrity
    # .hash_    bytes           hash of the object's data

    def __init__(self, oid, data, hashfunc, hash_):
        super(ObjectData, self).__init__(oid)
        self.data = data
        self.hashfunc = hashfunc
        self.hash_ = hash_

    # zdump returns the record in zodbdump format:
    #
    #   obj <oid> <size> <hashfunc>:<hash> -        ; .data is HashOnly
    #   obj <oid> <size> <hashfunc>:<hash> LF <data>    ; .data is bytes
    #
    # both forms are terminated by LF.
    def zdump(self):
        payload = self.data
        only_hash = isinstance(payload, HashOnly)
        size = payload.size if only_hash else len(payload)

        header = 'obj %s %d %s:%s' % (ashex(self.oid), size, self.hashfunc, ashex(self.hash_))
        if only_hash:
            return header + ' -' + '\n'
        return header + '\n' + payload + '\n'
# HashOnly indicates that an ObjectData record contains only the hash and size
# of object data, and does not contain the data itself.
#
# HashOnly instances compare equal iff their sizes are equal. __ne__ and
# __hash__ are defined consistently with __eq__: on Python2 `!=` does not fall
# back to __eq__, and on Python3 defining __eq__ alone makes the class
# unhashable.
class HashOnly(object):
    # .size     int

    def __init__(self, size):
        self.size = size

    def __repr__(self):
        return 'HashOnly(%d)' % self.size

    def __eq__(self, other):
        return isinstance(other, HashOnly) and self.size == other.size

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash((HashOnly, self.size))
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment