Commit 289e1cac authored by Nicolas Wavrant's avatar Nicolas Wavrant Committed by Jérome Perrin

ZODB: import patch "incpozo"

Patch submitted here:
https://github.com/zopefoundation/ZODB/pull/403

We backport it in SlapOS directly as the branch 5.8 is not being
developed anymore.
parent 95b1d7df
Pipeline #39848 passed with stage
in 0 seconds
......@@ -60,14 +60,22 @@ egg-versions =
[ZODB5]
<= _ZODB
egg-versions =
ZODB = 5.8.1
ZODB = 5.8.1+SlapOSPatched001
transaction = 4.0.0
patch-binary = ${patch:location}/bin/patch
ZODB-patches =
${:_profile_base_location_}/incpozo-5.8.0.patch#99a9e7889b06ab3fdb1adbd2b152afba
ZODB-patch-options = -p1
[ZODB5:python2]
<= _ZODB
egg-versions =
ZODB = 5.8.1
ZODB = 5.8.1+SlapOSPatched001
transaction = 3.0.1
patch-binary = ${patch:location}/bin/patch
ZODB-patches =
${:_profile_base_location_}/incpozo-5.8.0.patch#99a9e7889b06ab3fdb1adbd2b152afba
ZODB-patch-options = -p1
[ZODB6]
<= _ZODB
......
From 265adfd18f3d5d1c478e63037e37f250604d881d Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant <nicolas.wavrant@nexedi.com>
Date: Fri, 18 Oct 2024 10:21:38 +0900
Subject: [PATCH 1/5] repozo: factorize code doing the actual recover (write),
in preparation for the implementation of the incremental recover
---
src/ZODB/scripts/repozo.py | 71 ++++++++++++++++++++------------------
1 file changed, 37 insertions(+), 34 deletions(-)
diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index 8dce5dc80..32d54f561 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -396,6 +396,42 @@ def func(data):
return bytesread, sum.hexdigest()
+def recover_repofiles(options, repofiles, outfp):
+ if options.withverify:
+ datfile = os.path.splitext(repofiles[0])[0] + '.dat'
+ with open(datfile) as fp:
+ truth_dict = {}
+ for line in fp:
+ fn, startpos, endpos, sum = line.split()
+ startpos = int(startpos)
+ endpos = int(endpos)
+ filename = os.path.join(options.repository,
+ os.path.basename(fn))
+ truth_dict[filename] = {
+ 'size': endpos - startpos,
+ 'sum': sum,
+ }
+ totalsz = 0
+ for repofile in repofiles:
+ reposz, reposum = concat([repofile], outfp)
+ expected_truth = truth_dict[repofile]
+ if reposz != expected_truth['size']:
+ raise VerificationFail(
+ "%s is %d bytes, should be %d bytes" % (
+ repofile, reposz, expected_truth['size']))
+ if reposum != expected_truth['sum']:
+ raise VerificationFail(
+                    "%s has checksum %s instead of %s" % (
+ repofile, reposum, expected_truth['sum']))
+ totalsz += reposz
+ log("Recovered chunk %s : %s bytes, md5: %s",
+ repofile, reposz, reposum)
+ log("Recovered a total of %s bytes", totalsz)
+ else:
+ reposz, reposum = concat(repofiles, outfp)
+ log('Recovered %s bytes, md5: %s', reposz, reposum)
+
+
def gen_filedate(options):
return getattr(options, 'test_now', time.gmtime()[:6])
@@ -698,40 +734,7 @@ def do_recover(options):
files_to_close += (outfp,)
try:
- if options.withverify:
- datfile = os.path.splitext(repofiles[0])[0] + '.dat'
- with open(datfile) as fp:
- truth_dict = {}
- for line in fp:
- fn, startpos, endpos, sum = line.split()
- startpos = int(startpos)
- endpos = int(endpos)
- filename = os.path.join(options.repository,
- os.path.basename(fn))
- truth_dict[filename] = {
- 'size': endpos - startpos,
- 'sum': sum,
- }
- totalsz = 0
- for repofile in repofiles:
- reposz, reposum = concat([repofile], outfp)
- expected_truth = truth_dict[repofile]
- if reposz != expected_truth['size']:
- raise VerificationFail(
- "%s is %d bytes, should be %d bytes" % (
- repofile, reposz, expected_truth['size']))
- if reposum != expected_truth['sum']:
- raise VerificationFail(
- "%s has checksum %s instead of %s" % (
- repofile, reposum, expected_truth['sum']))
- totalsz += reposz
- log("Recovered chunk %s : %s bytes, md5: %s",
- repofile, reposz, reposum)
- log("Recovered a total of %s bytes", totalsz)
- else:
- reposz, reposum = concat(repofiles, outfp)
- log('Recovered %s bytes, md5: %s', reposz, reposum)
-
+ recover_repofiles(options, repofiles, outfp)
if options.output is not None:
last_base = os.path.splitext(repofiles[-1])[0]
source_index = '%s.index' % last_base
From f5eacd0a6116a977f65c1efe5a5eb5945d458fad Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant <nicolas.wavrant@nexedi.com>
Date: Thu, 17 Oct 2024 17:20:03 +0900
Subject: [PATCH 2/5] repozo: support incremental recover
This allows recovering a ZODB filestorage by only appending the missing
chunks after the latest recovered file, instead of always recovering from
zero.
Based on the work of @vpelletier (incpozo).
---
src/ZODB/scripts/repozo.py | 121 +++++++++++---
src/ZODB/scripts/tests/test_repozo.py | 227 +++++++++++++++++++++++++-
3 files changed, 331 insertions(+), 28 deletions(-)
diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index 32d54f561..283cae2a3 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -73,6 +73,12 @@
Note: for the stdout case, the index file will **not** be restored
automatically.
+ -F / --full
+ Force a full recover. By default, an incremental recover is made
+ if possible, by only copying the latest backup delta to the recovered
+ ZODB file. A full recover will always be done if a pack has occured
+ since the last incremental backup.
+
-w
--with-verify
Verify on the fly the backup files on recovering. This option runs
@@ -185,7 +191,7 @@ class Options:
mode = None # BACKUP, RECOVER or VERIFY
file = None # name of input Data.fs file
repository = None # name of directory holding backups
- full = False # True forces full backup
+ full = False # True forces full backup or full recovery
date = None # -D argument, if any
output = None # where to write recovered data; None = stdout
quick = False # -Q flag state
@@ -396,9 +402,8 @@ def func(data):
return bytesread, sum.hexdigest()
-def recover_repofiles(options, repofiles, outfp):
+def recover_repofiles(options, repofiles, datfile, outfp):
if options.withverify:
- datfile = os.path.splitext(repofiles[0])[0] + '.dat'
with open(datfile) as fp:
truth_dict = {}
for line in fp:
@@ -709,15 +714,7 @@ def do_backup(options):
do_full_backup(options)
-def do_recover(options):
- # Find the first full backup at or before the specified date
- repofiles = find_files(options)
- if not repofiles:
- if options.date:
- raise NoFiles('No files in repository before %s' % (options.date,))
- else:
- raise NoFiles('No files in repository')
-
+def do_full_recover(options, repofiles):
files_to_close = ()
if options.output is None:
log('Recovering file to stdout')
@@ -734,17 +731,8 @@ def do_recover(options):
files_to_close += (outfp,)
try:
- recover_repofiles(options, repofiles, outfp)
- if options.output is not None:
- last_base = os.path.splitext(repofiles[-1])[0]
- source_index = '%s.index' % last_base
- target_index = '%s.index' % options.output
- if os.path.exists(source_index):
- log('Restoring index file %s to %s',
- source_index, target_index)
- shutil.copyfile(source_index, target_index)
- else:
- log('No index file to restore: %s', source_index)
+ datfile = os.path.splitext(repofiles[0])[0] + '.dat'
+ recover_repofiles(options, repofiles, datfile, outfp)
finally:
for f in files_to_close:
f.close()
@@ -758,6 +746,93 @@ def do_recover(options):
raise
+def do_incremental_recover(options, repofiles):
+ datfile = os.path.splitext(repofiles[0])[0] + '.dat'
+ log('Recovering (incrementally) file to %s', options.output)
+ with open(options.output, 'r+b') as outfp:
+ outfp.seek(0, 2)
+ initial_length = outfp.tell()
+ with open(datfile) as fp:
+ previous_chunk = None
+ for line in fp:
+ fn, startpos, endpos, _ = chunk = line.split()
+ startpos = int(startpos)
+ endpos = int(endpos)
+ if endpos > initial_length:
+ break
+ previous_chunk = chunk
+
+ if previous_chunk is None:
+ log('Target file smaller than full backup, '
+ 'falling back to a full recover.')
+ return do_full_recover(options, repofiles)
+ if endpos < initial_length:
+ log('Target file is larger than latest backup, '
+ 'falling back to a full recover.')
+ return do_full_recover(options, repofiles)
+ check_startpos = int(previous_chunk[1])
+ check_endpos = int(previous_chunk[2])
+ with open(options.output, 'r+b') as outfp:
+ outfp.seek(check_startpos)
+ check_sum = checksum(outfp, check_endpos - check_startpos)
+ if endpos == initial_length and chunk[3] == check_sum:
+ log('Target file is same size as latest backup, '
+ 'doing nothing.')
+ return
+ elif previous_chunk[3] != check_sum:
+ if endpos == initial_length:
+ log('Target file is not consistent with latest backup, '
+ 'falling back to a full recover.')
+ return do_full_recover(options, repofiles)
+ else:
+ log('Last whole common chunk checksum did not match with backup, '
+ 'falling back to a full recover.')
+ return do_full_recover(options, repofiles)
+
+ filename = os.path.join(options.repository,
+ os.path.basename(fn))
+ first_file_to_restore = repofiles.index(filename)
+ assert first_file_to_restore > 0, (
+ first_file_to_restore, options.repository, fn, filename, repofiles)
+
+ temporary_output_file = options.output + '.part'
+ os.rename(options.output, temporary_output_file)
+ with open(temporary_output_file, 'r+b') as outfp:
+ outfp.seek(startpos)
+ recover_repofiles(options,
+ repofiles[first_file_to_restore:],
+ datfile,
+ outfp)
+ os.rename(temporary_output_file, options.output)
+
+
+def do_recover(options):
+ # Find the first full backup at or before the specified date
+ repofiles = find_files(options)
+ if not repofiles:
+ if options.date:
+ raise NoFiles('No files in repository before %s' % (options.date,))
+ else:
+ raise NoFiles('No files in repository')
+
+ if (options.full or options.output is None
+ or not os.path.exists(options.output)):
+ do_full_recover(options, repofiles)
+ else:
+ do_incremental_recover(options, repofiles)
+
+ if options.output is not None:
+ last_base = os.path.splitext(repofiles[-1])[0]
+ source_index = '%s.index' % last_base
+ target_index = '%s.index' % options.output
+ if os.path.exists(source_index):
+ log('Restoring index file %s to %s',
+ source_index, target_index)
+ shutil.copyfile(source_index, target_index)
+ else:
+ log('No index file to restore: %s', source_index)
+
+
def do_verify(options):
# Verify the sizes and checksums of all files mentioned in the .dat file
repofiles = find_files(options)
diff --git a/src/ZODB/scripts/tests/test_repozo.py b/src/ZODB/scripts/tests/test_repozo.py
index 13d24bef5..eccb4f1c1 100644
--- a/src/ZODB/scripts/tests/test_repozo.py
+++ b/src/ZODB/scripts/tests/test_repozo.py
@@ -864,8 +864,7 @@ def test_w_changes(self):
self.assertEqual(index.maxKey(), db.maxkey)
-class Test_do_recover(OptionsTestBase, unittest.TestCase):
-
+class Mixin_do_recover:
def _callFUT(self, options):
from ZODB.scripts.repozo import do_recover
return do_recover(options)
@@ -902,6 +901,17 @@ def test_no_files_before_explicit_date(self):
files.append(self._makeFile(h, m, s, e))
self.assertRaises(NoFiles, self._callFUT, options)
+
+class Test_do_full_recover(
+ Mixin_do_recover,
+ OptionsTestBase,
+ unittest.TestCase
+):
+ def _makeOptions(self, **kw):
+ options = super()._makeOptions(**kw)
+ options.full = True
+ return options
+
def test_w_full_backup_latest_no_index(self):
import tempfile
dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
@@ -1010,6 +1020,216 @@ def test_w_incr_backup_with_verify_size_inconsistent(self):
self.assertTrue(os.path.exists(output + '.part'))
+class Test_do_incremental_recover(
+ Mixin_do_recover,
+ OptionsTestBase,
+ unittest.TestCase
+):
+ def setUp(self):
+ from ZODB.scripts import repozo
+ self._old_verbosity = repozo.VERBOSE
+ self._old_stderr = sys.stderr
+ repozo.VERBOSE = True
+ sys.stderr = StringIO()
+
+ def tearDown(self):
+ from ZODB.scripts import repozo
+ sys.stderr = self._old_stderr
+ repozo.VERBOSE = self._old_verbosity
+
+ def _makeOptions(self, **kw):
+ options = super()._makeOptions(**kw)
+ options.full = False
+ return options
+
+ def _createRecoveredDataFS(self, output, options):
+ self._makeFile(2, 3, 4, '.fs', 'AAA')
+ self._makeFile(4, 5, 6, '.deltafs', 'BBB')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n') # noqa: E501 line too long
+ self._callFUT(options)
+ self.assertEqual(_read_file(output), b'AAABBB')
+ self.assertFalse(os.path.exists(output + '.part'))
+ return output
+
+ def test_do_nothing(self):
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ withverify=False)
+ self._createRecoveredDataFS(output, options)
+ self._callFUT(options)
+ self.assertIn(
+ "doing nothing", sys.stderr.getvalue())
+
+ def test_w_incr_recover_from_incr_backup(self):
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ withverify=False)
+ self._createRecoveredDataFS(output, options)
+ # Create 2 more .deltafs, to prove the code knows where to pick up
+ self._makeFile(6, 7, 8, '.deltafs', 'CCC')
+ self._makeFile(8, 9, 10, '.deltafs', 'DDD')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long
+ '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n' # noqa: E501 line too long
+ '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long
+ os.unlink(
+ os.path.join(self._repository_directory,
+ '2010-05-14-04-05-06.deltafs'))
+ self._callFUT(options)
+ self.assertNotIn('falling back to a full recover.',
+ sys.stderr.getvalue())
+ self.assertEqual(_read_file(output), b'AAABBBCCCDDD')
+ self.assertFalse(os.path.exists(output + '.part'))
+
+ def test_w_incr_backup_with_verify_sum_inconsistent(self):
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ withverify=True)
+ self._createRecoveredDataFS(output, options)
+ self._makeFile(6, 7, 8, '.deltafs', 'CCC')
+ self._makeFile(8, 9, 10, '.deltafs', 'DDD')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long
+ '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166af\n' # noqa: E501 line too long
+ '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long
+ from ZODB.scripts.repozo import VerificationFail
+ self.assertRaises(VerificationFail, self._callFUT, options)
+ self.assertNotIn('falling back to a full recover.',
+ sys.stderr.getvalue())
+ self.assertTrue(os.path.exists(output + '.part'))
+
+ def test_w_incr_backup_with_verify_size_inconsistent_too_small(self):
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ withverify=True)
+ self._createRecoveredDataFS(output, options)
+ self._makeFile(6, 7, 8, '.deltafs', 'CCC')
+ self._makeFile(8, 9, 10, '.deltafs', 'DDD')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long
+ '/backup/2010-05-14-06-07-08.deltafs 6 8 defb99e69a9f1f6e06f15006b1f166ae\n' # noqa: E501 line too long
+ '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long
+ from ZODB.scripts.repozo import VerificationFail
+ self.assertRaises(VerificationFail, self._callFUT, options)
+ self.assertNotIn('falling back to a full recover.',
+ sys.stderr.getvalue())
+ self.assertTrue(os.path.exists(output + '.part'))
+
+ def test_w_incr_backup_with_verify_size_inconsistent_too_big(self):
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ withverify=True)
+ self._createRecoveredDataFS(output, options)
+ self._makeFile(6, 7, 8, '.deltafs', 'CCC')
+ self._makeFile(8, 9, 10, '.deltafs', 'DDD')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long
+ '/backup/2010-05-14-06-07-08.deltafs 6 10 defb99e69a9f1f6e06f15006b1f166ae\n' # noqa: E501 line too long
+ '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long
+ from ZODB.scripts.repozo import VerificationFail
+ self.assertRaises(VerificationFail, self._callFUT, options)
+ self.assertNotIn('falling back to a full recover.',
+ sys.stderr.getvalue())
+ self.assertTrue(os.path.exists(output + '.part'))
+
+ def test_w_incr_backup_switch_auto_to_full_recover_if_output_larger_than_dat(self): # noqa: E501 line too long
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ withverify=False)
+ self._createRecoveredDataFS(output, options)
+ self._makeFile(6, 7, 8, '.deltafs', 'CCC')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long
+ '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long
+ # The ZODB is longer than announced in the .dat file
+ with open(output, 'r+b') as f:
+ f.write(b'AAABBBCCCDDD')
+ self._callFUT(options)
+ self.assertEqual(_read_file(output), b'AAABBBCCC')
+ self.assertFalse(os.path.exists(output + '.part'))
+ self.assertIn(
+ "falling back to a full recover", sys.stderr.getvalue())
+
+ def test_w_incr_backup_switch_auto_to_full_recover_if_chunk_is_wrong(self): # noqa: E501 line too long
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ withverify=False)
+ self._createRecoveredDataFS(output, options)
+ self._makeFile(6, 7, 8, '.deltafs', 'CCC')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a4\n' # noqa: E501 line too long
+ '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long
+ self._callFUT(options)
+ self.assertEqual(_read_file(output), b'AAABBBCCC')
+ self.assertFalse(os.path.exists(output + '.part'))
+ self.assertIn(
+ "Last whole common chunk checksum did not match with backup, falling back to a full recover.", # noqa: E501 line too long
+ sys.stderr.getvalue())
+
+ def test_w_incr_backup_switch_auto_to_full_recover_after_pack(self):
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ withverify=False)
+ self._makeFile(2, 3, 4, '.fs', 'AAA')
+ self._makeFile(4, 5, 6, '.deltafs', 'BBB')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n') # noqa: E501 line too long
+ self._callFUT(options)
+ self.assertEqual(_read_file(output), b'AAABBB')
+
+ self._makeFile(6, 7, 8, '.fs', 'CCDD')
+ self._makeFile(
+ 6, 7, 8, '.dat',
+ '/backup/2010-05-14-06-07-08.fs 0 4 dc0ee37408176d839c13f291a4d588de\n') # noqa: E501 line too long
+ self._callFUT(options)
+ self.assertEqual(_read_file(output), b'CCDD')
+ self.assertFalse(os.path.exists(output + '.part'))
+ self.assertIn(
+ 'Target file is larger than latest backup, falling back to a full recover.', # noqa: E501 line too long
+ sys.stderr.getvalue())
+
+
class Test_do_verify(OptionsTestBase, unittest.TestCase):
def _callFUT(self, options):
@@ -1296,7 +1516,8 @@ def test_suite():
unittest.makeSuite(Test_do_full_backup),
unittest.makeSuite(Test_do_incremental_backup),
# unittest.makeSuite(Test_do_backup), #TODO
- unittest.makeSuite(Test_do_recover),
+ unittest.makeSuite(Test_do_full_recover),
+ unittest.makeSuite(Test_do_incremental_recover),
unittest.makeSuite(Test_do_verify),
# N.B.: this test take forever to run (~40sec on a fast laptop),
# *and* it is non-deterministic.
From c21a6df92544302d971594c893452224c3e71279 Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant <nicolas.wavrant@nexedi.com>
Date: Thu, 14 Nov 2024 16:17:42 +0900
Subject: [PATCH 3/5] repozo: integrate a quick option for the incremental
recover
The default behavior becomes to check the integrity of the existing
recovered Data.fs before incrementally restoring onto it. The quick option
allows verifying only the latest chunk previously recovered on the
Data.fs before restoring incrementally. This saves many reads.
---
src/ZODB/scripts/repozo.py | 59 +++++++++++-------
src/ZODB/scripts/tests/test_repozo.py | 87 +++++++++++++++++++++++++--
2 files changed, 120 insertions(+), 26 deletions(-)
diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index 283cae2a3..8240bebc7 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -79,6 +79,13 @@
ZODB file. A full recover will always be done if a pack has occured
since the last incremental backup.
+ -Q / --quick
+        Verify via md5 checksum only the last incrementally recovered chunk
+        of the output file. This reduces the disk i/o at the (theoretical) cost of
+ inconsistency. This is a probabilistic way of determining whether a
+ full recover is necessary. This argument is ignored when -F / --full
+ is used.
+
-w
--with-verify
Verify on the fly the backup files on recovering. This option runs
@@ -267,6 +274,9 @@ class Options:
if options.killold:
log('--kill-old-on-full option is ignored in recover mode')
options.killold = False
+ if options.full and options.quick:
+ log('--quick option is ignored if --full option is used')
+ options.quick = None
else:
assert options.mode == VERIFY
if options.date is not None:
@@ -752,42 +762,49 @@ def do_incremental_recover(options, repofiles):
with open(options.output, 'r+b') as outfp:
outfp.seek(0, 2)
initial_length = outfp.tell()
- with open(datfile) as fp:
- previous_chunk = None
+
+ error = ''
+ previous_chunk = None
+ with open(datfile) as fp, open(options.output, 'r+b') as outfp:
for line in fp:
- fn, startpos, endpos, _ = chunk = line.split()
+ fn, startpos, endpos, check_sum = chunk = line.split()
startpos = int(startpos)
endpos = int(endpos)
if endpos > initial_length:
break
+ if not options.quick:
+ if check_sum != checksum(outfp, endpos - startpos):
+ error = ('Target file is not consistent with backup %s, '
+ 'falling back to a full recover.') % fn
+ break
previous_chunk = chunk
-
- if previous_chunk is None:
+ if error:
+ log(error)
+ return do_full_recover(options, repofiles)
+ elif previous_chunk is None:
log('Target file smaller than full backup, '
'falling back to a full recover.')
return do_full_recover(options, repofiles)
- if endpos < initial_length:
+ elif endpos < initial_length:
log('Target file is larger than latest backup, '
'falling back to a full recover.')
return do_full_recover(options, repofiles)
- check_startpos = int(previous_chunk[1])
- check_endpos = int(previous_chunk[2])
- with open(options.output, 'r+b') as outfp:
- outfp.seek(check_startpos)
- check_sum = checksum(outfp, check_endpos - check_startpos)
- if endpos == initial_length and chunk[3] == check_sum:
+ if options.quick:
+ check_startpos = int(previous_chunk[1])
+ check_endpos = int(previous_chunk[2])
+ with open(options.output, 'r+b') as outfp:
+ outfp.seek(check_startpos)
+ if previous_chunk[3] != checksum(
+ outfp, check_endpos - check_startpos):
+ error = ('Target file is not consistent with backup %s, '
+ 'falling back to a full recover.' % previous_chunk[0])
+ if error:
+ log(error)
+ return do_full_recover(options, repofiles)
+ if endpos == initial_length:
log('Target file is same size as latest backup, '
'doing nothing.')
return
- elif previous_chunk[3] != check_sum:
- if endpos == initial_length:
- log('Target file is not consistent with latest backup, '
- 'falling back to a full recover.')
- return do_full_recover(options, repofiles)
- else:
- log('Last whole common chunk checksum did not match with backup, '
- 'falling back to a full recover.')
- return do_full_recover(options, repofiles)
filename = os.path.join(options.repository,
os.path.basename(fn))
diff --git a/src/ZODB/scripts/tests/test_repozo.py b/src/ZODB/scripts/tests/test_repozo.py
index eccb4f1c1..9d5696e72 100644
--- a/src/ZODB/scripts/tests/test_repozo.py
+++ b/src/ZODB/scripts/tests/test_repozo.py
@@ -220,13 +220,16 @@ def test_recover_ignored_args(self):
from ZODB.scripts import repozo
options = repozo.parseargs(['-R', '-r', '/tmp/nosuchdir', '-v',
'-f', '/tmp/ignored.fs',
- '-k'])
+ '-k', '--full', '--quick'])
self.assertEqual(options.file, None)
self.assertIn('--file option is ignored in recover mode',
sys.stderr.getvalue())
self.assertEqual(options.killold, False)
self.assertIn('--kill-old-on-full option is ignored in recover mode',
sys.stderr.getvalue())
+ self.assertEqual(options.quick, None)
+ self.assertIn('--quick option is ignored if --full option is used',
+ sys.stderr.getvalue())
def test_verify_ignored_args(self):
from ZODB.scripts import repozo
@@ -1040,6 +1043,7 @@ def tearDown(self):
def _makeOptions(self, **kw):
options = super()._makeOptions(**kw)
options.full = False
+ options.quick = kw.get('quick', False)
return options
def _createRecoveredDataFS(self, output, options):
@@ -1092,6 +1096,33 @@ def test_w_incr_recover_from_incr_backup(self):
self.assertEqual(_read_file(output), b'AAABBBCCCDDD')
self.assertFalse(os.path.exists(output + '.part'))
+ def test_w_quick_incr_recover_from_incr_backup(self):
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ quick=True,
+ withverify=False)
+ self._createRecoveredDataFS(output, options)
+ # Create 2 more .deltafs, to prove the code knows where to pick up
+ self._makeFile(6, 7, 8, '.deltafs', 'CCC')
+ self._makeFile(8, 9, 10, '.deltafs', 'DDD')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long
+ '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n' # noqa: E501 line too long
+ '/backup/2010-05-14-08-09-10.deltafs 9 12 45054f47ac3305a2a33e9bcceadff712\n') # noqa: E501 line too long
+ os.unlink(
+ os.path.join(self._repository_directory,
+ '2010-05-14-04-05-06.deltafs'))
+ self._callFUT(options)
+ self.assertNotIn('falling back to a full recover.',
+ sys.stderr.getvalue())
+ self.assertEqual(_read_file(output), b'AAABBBCCCDDD')
+ self.assertFalse(os.path.exists(output + '.part'))
+
def test_w_incr_backup_with_verify_sum_inconsistent(self):
import tempfile
dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
@@ -1192,14 +1223,14 @@ def test_w_incr_backup_switch_auto_to_full_recover_if_chunk_is_wrong(self): # n
self._makeFile(6, 7, 8, '.deltafs', 'CCC')
self._makeFile(
2, 3, 4, '.dat',
- '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
- '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a4\n' # noqa: E501 line too long
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b8\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long
'/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long
self._callFUT(options)
self.assertEqual(_read_file(output), b'AAABBBCCC')
self.assertFalse(os.path.exists(output + '.part'))
self.assertIn(
- "Last whole common chunk checksum did not match with backup, falling back to a full recover.", # noqa: E501 line too long
+ "Target file is not consistent with backup /backup/2010-05-14-02-03-04.fs, falling back to a full recover.", # noqa: E501 line too long
sys.stderr.getvalue())
def test_w_incr_backup_switch_auto_to_full_recover_after_pack(self):
@@ -1226,9 +1257,55 @@ def test_w_incr_backup_switch_auto_to_full_recover_after_pack(self):
self.assertEqual(_read_file(output), b'CCDD')
self.assertFalse(os.path.exists(output + '.part'))
self.assertIn(
- 'Target file is larger than latest backup, falling back to a full recover.', # noqa: E501 line too long
+ "Target file is not consistent with backup /backup/2010-05-14-06-07-08.fs, falling back to a full recover.", # noqa: E501 line too long
sys.stderr.getvalue())
+ def test_w_quick_incr_backup_switch_auto_to_full_recover_if_last_chunk_is_wrong(self): # noqa: E501 line too long
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ quick=True,
+ withverify=False)
+ self._createRecoveredDataFS(output, options)
+ self._makeFile(6, 7, 8, '.deltafs', 'CCC')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a4\n' # noqa: E501 line too long
+ '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long
+ self._callFUT(options)
+ self.assertEqual(_read_file(output), b'AAABBBCCC')
+ self.assertFalse(os.path.exists(output + '.part'))
+ self.assertIn(
+ "Target file is not consistent with backup /backup/2010-05-14-04-05-06.deltafs, falling back to a full recover.", # noqa: E501 line too long
+ sys.stderr.getvalue())
+
+ def test_w_quick_incr_backup_dont_see_old_inconsistencies(self):
+ import tempfile
+ dd = self._data_directory = tempfile.mkdtemp(prefix='zodb-test-')
+ output = os.path.join(dd, 'Data.fs')
+ options = self._makeOptions(date='2010-05-15-13-30-57',
+ output=output,
+ quick=True,
+ withverify=False)
+ self._createRecoveredDataFS(output, options)
+ self._makeFile(6, 7, 8, '.deltafs', 'CCC')
+ self._makeFile(
+ 2, 3, 4, '.dat',
+ '/backup/2010-05-14-02-03-04.fs 0 3 e1faffb3e614e6c2fba74296962386b7\n' # noqa: E501 line too long
+ '/backup/2010-05-14-04-05-06.deltafs 3 6 2bb225f0ba9a58930757a868ed57d9a3\n' # noqa: E501 line too long
+ '/backup/2010-05-14-06-07-08.deltafs 6 9 defb99e69a9f1f6e06f15006b1f166ae\n') # noqa: E501 line too long
+ # The ZODB is longer than announced in the .dat file
+ with open(output, 'r+b') as f:
+ f.write(b'ZZZBBBCCC')
+ self._callFUT(options)
+ self.assertEqual(_read_file(output), b'ZZZBBBCCC')
+ self.assertFalse(os.path.exists(output + '.part'))
+ self.assertNotIn(
+ "falling back to a full recover", sys.stderr.getvalue())
+
class Test_do_verify(OptionsTestBase, unittest.TestCase):
From 93a1e0653873067db46cc675c548750310189576 Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant <nicolas.wavrant@nexedi.com>
Date: Fri, 15 Nov 2024 16:25:09 +0900
Subject: [PATCH 4/5] repozo: be more explicit on the FileStorage size
retrieval methodology
---
src/ZODB/scripts/repozo.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index 8240bebc7..aaf5620a6 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -760,7 +760,10 @@ def do_incremental_recover(options, repofiles):
datfile = os.path.splitext(repofiles[0])[0] + '.dat'
log('Recovering (incrementally) file to %s', options.output)
with open(options.output, 'r+b') as outfp:
- outfp.seek(0, 2)
+ # Note that we do not open the FileStorage to use getSize here,
+ # we really want the actual file size, even if there is invalid
+ # transaction data at the end.
+ outfp.seek(0, os.SEEK_END)
initial_length = outfp.tell()
error = ''
From 3b4abd939171fc5af92de3016f66ce20f55e5a2f Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant <nicolas.wavrant@nexedi.com>
Date: Fri, 15 Nov 2024 16:53:04 +0900
Subject: [PATCH 5/5] repozo: log files that will be used by the incremental
recovery
As find_files logs all the files needed to recover the state from 0,
add a log to show which files will be effectively used in the case
of an incremental recovery.
---
src/ZODB/scripts/repozo.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
index aaf5620a6..015418c12 100755
--- a/src/ZODB/scripts/repozo.py
+++ b/src/ZODB/scripts/repozo.py
@@ -814,6 +814,9 @@ def do_incremental_recover(options, repofiles):
first_file_to_restore = repofiles.index(filename)
assert first_file_to_restore > 0, (
first_file_to_restore, options.repository, fn, filename, repofiles)
+ log('remaining files needed to recover incrementally onto target file:')
+ for f in repofiles[first_file_to_restore:]:
+ log('\t%s', f)
temporary_output_file = options.output + '.part'
os.rename(options.output, temporary_output_file)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment