ZBigFile: Add ZBlk format option 'h' (heuristic)

There are two formats to save data with a ZBigFile: ZBlk0 and ZBlk1. They make a different trade-off between access time and disk-space growth: ZBlk1 is better with regard to disk space, while ZBlk0 has a better access time. Wendelin.core users may not always know, or care, which format fits their data better. In this case it may be easier for users to just let the program select the ZBlk format automatically. With this patch and the new 'h' (for heuristic) option of the 'zblk_fmt' argument of ZBigFile, this is now possible. The 'h' option isn't really a new ZBlk format in itself; it just tries to automatically select the best ZBlk format according to the characteristics of the changes that the user applies to the ZBigFile.

With this patch comes a test (bigfile/tests/test-zblk-fmt) that creates benchmarks for different combinations of change sizes and ZBlk formats. The test aims at checking that the 'heuristic' format behaves mostly as well as the explicitly set formats:

Use only a very small change size, so that heuristic always uses ZBlk1
---------------------------------------------
---------------------------------------------
Set change_size_set to 20
Set change_count to 500
Set arrsize to 1000000

Run tests with format h:
    ZODB storage size: 20.388751 MB
    Access time: 3.83012294769

Run tests with format ZBlk0:
    ZODB storage size: 1064.636095 MB
    Access time: 3.63488578796

Run tests with format ZBlk1:
    ZODB storage size: 18.59421 MB
    Access time: 3.93918204308

---------------------------------------------
---------------------------------------------

Use only a very big change size, so that heuristic always uses ZBlk0
---------------------------------------------
---------------------------------------------
Set change_size_set to 200000
Set change_count to 500
Set arrsize to 1000000

Run tests with format h:
    ZODB storage size: 2113.32534 MB
    Access time: 3.79592084885

Run tests with format ZBlk0:
    ZODB storage size: 2113.254473 MB
    Access time: 3.76431703568

Run tests with format ZBlk1:
    ZODB storage size: 1651.236315 MB
    Access time: 4.11528992653

---------------------------------------------
---------------------------------------------

Mix between change size so that heuristic switches between ZBlk0 and ZBlk1
---------------------------------------------
---------------------------------------------
Set change_size_set to 20,200000
Set change_count to 500
Set arrsize to 1000000

Run tests with format h:
    ZODB storage size: 820.17736 MB
    Access time: 3.85217094421

Run tests with format ZBlk0:
    ZODB storage size: 1576.361791 MB
    Access time: 3.65322995186

Run tests with format ZBlk1:
    ZODB storage size: 815.323463 MB
    Access time: 3.96401691437

---------------------------------------------
---------------------------------------------

ZBigFile: Add ZBlk format option 'h' (heuristic)
There are two formats to save data with a ZBigFile: ZBlk0 and ZBlk1. They make a different trade-off between access time and disk-space growth: ZBlk1 is better with regard to disk space, while ZBlk0 has a better access time. Wendelin.core users may not always know, or care, which format fits their data better. In this case it may be easier for users to just let the program select the ZBlk format automatically. With this patch and the new 'h' (for heuristic) option of the 'zblk_fmt' argument of ZBigFile, this is now possible. The 'h' option isn't really a new ZBlk format in itself; it just tries to automatically select the best ZBlk format according to the characteristics of the changes that the user applies to the ZBigFile.

With this patch comes a test (bigfile/tests/test-zblk-fmt) that creates benchmarks for different combinations of change sizes and ZBlk formats. The test aims at checking that the 'heuristic' format behaves mostly as well as the explicitly set formats:

Use only a very small change size, so that heuristic always uses ZBlk1
---------------------------------------------
---------------------------------------------
Set change_size_set to 20
Set change_count to 500
Set arrsize to 1000000

Run tests with format h:
    ZODB storage size: 20.388751 MB
    Access time: 3.83012294769

Run tests with format ZBlk0:
    ZODB storage size: 1064.636095 MB
    Access time: 3.63488578796

Run tests with format ZBlk1:
    ZODB storage size: 18.59421 MB
    Access time: 3.93918204308

---------------------------------------------
---------------------------------------------

Use only a very big change size, so that heuristic always uses ZBlk0
---------------------------------------------
---------------------------------------------
Set change_size_set to 200000
Set change_count to 500
Set arrsize to 1000000

Run tests with format h:
    ZODB storage size: 2113.32534 MB
    Access time: 3.79592084885

Run tests with format ZBlk0:
    ZODB storage size: 2113.254473 MB
    Access time: 3.76431703568

Run tests with format ZBlk1:
    ZODB storage size: 1651.236315 MB
    Access time: 4.11528992653

---------------------------------------------
---------------------------------------------

Mix between change size so that heuristic switches between ZBlk0 and ZBlk1
---------------------------------------------
---------------------------------------------
Set change_size_set to 20,200000
Set change_count to 500
Set arrsize to 1000000

Run tests with format h:
    ZODB storage size: 820.17736 MB
    Access time: 3.85217094421

Run tests with format ZBlk0:
    ZODB storage size: 1576.361791 MB
    Access time: 3.65322995186

Run tests with format ZBlk1:
    ZODB storage size: 815.323463 MB
    Access time: 3.96401691437

---------------------------------------------
---------------------------------------------
850703f8 · Levin Zimmermann · 743937a4 · 850703f8 · 850703f8 · 850703f8
Commit 850703f8 authored Oct 25, 2023 by Levin Zimmermann
5 changed files
--- a/bigfile/file_zodb.py
+++ b/bigfile/file_zodb.py
@@ -414,14 +414,7 @@ class ZBlk1(ZBlkBase):
                break

        # scan over buf and update/delete changed chunks
-        for start in range(0, len(buf), CHUNKSIZE):
-            data = buf[start:start+CHUNKSIZE]   # FIXME copy on py2
-            # make sure data is bytes
-            # (else we cannot .rstrip() it below)
-            if not isinstance(data, bytes):
-                data = bytes(data)              # FIXME copy on py3
-            # trim trailing \0
-            data = data.rstrip(b'\0')           # FIXME copy
+        for data, start in _buf_iterator(buf, CHUNKSIZE):
            chunk = chunktab.get(start)

            # all 0 -> make sure to remove chunk
@@ -511,22 +504,23 @@ class ZBigFile(LivePersistent):

    def __init__(self, blksize, zblk_fmt=""):
        LivePersistent.__init__(self)
-        self.__setstate__((blksize, LOBTree(), zblk_fmt))     # NOTE L enough for blk_t
+        self.__setstate__((blksize, LOBTree(), zblk_fmt, 0, 0))     # NOTE L enough for blk_t
        self.zblk_fmt = zblk_fmt  # Evoke check if zblk_fmt is valid


-    # state is (.blksize, .blktab, .zblk_fmt)
+    # state is (.blksize, .blktab, .zblk_fmt, .zblk_fmt0_counter, .zblk_fmt1_counter)
    def __getstate__(self):
-        return (self.blksize, self.blktab, self.zblk_fmt)
+        return (self.blksize, self.blktab, self.zblk_fmt, self.zblk_fmt0_counter, self.zblk_fmt1_counter)

    def __setstate__(self, state):
        state_length = len(state)
-        # NOTE set _zblk_fmt instead of zblk_fmt to avoid check => ↑ performance
        if state_length == 2:  # BBB
-            self.blksize, self.blktab = state
-            self._zblk_fmt = ""
-        elif state_length == 3:
-            self.blksize, self.blktab, self._zblk_fmt = state
+            self.__setstate__(tuple(state) + ("", 0, 0))
+        elif state_length == 3:  # BBB
+            self.__setstate__(tuple(state) + (0, 0))
+        elif state_length == 5:
+            # NOTE set _zblk_fmt instead of zblk_fmt to avoid check => ↑ performance
+            self.blksize, self.blktab, self._zblk_fmt, self.zblk_fmt0_counter, self.zblk_fmt1_counter = state
        else:
            raise RuntimeError("E: Unexpected state length: %s" % state)
        self._v_file = _ZBigFile._new(self, self.blksize)
@@ -555,7 +549,10 @@ class ZBigFile(LivePersistent):
    # store data    dirty page -> ZODB obj
    def storeblk(self, blk, buf):
        zblk = self.blktab.get(blk)
-        zblk_type_write = ZBlk_fmt_registry[self.zblk_fmt or ZBlk_fmt_write]
+        zblk_fmt = self.zblk_fmt
+        if zblk_fmt == "h":  # apply heuristic
+            zblk_fmt = self._zblk_fmt_heuristic(zblk, buf)
+        zblk_type_write = ZBlk_fmt_registry[zblk_fmt or ZBlk_fmt_write]
        # if zblk was absent or of different type - we (re-)create it anew
        if zblk is None  or \
           type(zblk) is not zblk_type_write:
@@ -576,6 +573,43 @@ class ZBigFile(LivePersistent):
        zblk.bindzfile(self, blk)


+    # Heuristically determine zblk format by optimizing
+    # storage-space/access-speed ratio. Both can't be ideal, see
+    # module docstring: "Due to weakness of current ZODB storage
+    # servers, wendelin.core cannot provide at the same time both
+    # fast reads and small database size growth ..."
+    #
+    # zblk is the object currently stored for the block (None if there is
+    # none yet), buf is the data about to be stored. Returns the name of
+    # the format to write with: 'ZBlk0' or 'ZBlk1'.
+    #
+    # Side effects: increments .zblk_fmt0_counter or .zblk_fmt1_counter for
+    # every decision that is based on buf, and sets .zblk_fmt to 'ZBlk1'
+    # once the heuristic gives up.
+    def _zblk_fmt_heuristic(self, zblk, buf):
+        # If the heuristic often switches between ZBlk0 and ZBlk1 the
+        # access time is even worse than when using only ZBlk1. Therefore
+        # the heuristic keeps track of how often each format was chosen.
+        # If the less frequent format was chosen more than half as often
+        # as the more frequent one, it switches forever to ZBlk1 and
+        # doesn't apply the heuristic anymore.
+        c0, c1 = self.zblk_fmt0_counter, self.zblk_fmt1_counter
+        most = max(c0, c1)
+        # float(): keep this a true division also where int / int
+        # truncates (py2); there the ratio would otherwise be 0 unless
+        # both counters are equal. No choice counted yet => ratio 0.
+        zblk_fmt_ratio = min(c0, c1) / float(most) if most else 0
+        if zblk_fmt_ratio > 0.5:  # Switch forever to ZBlk1
+            self.zblk_fmt = zblk_fmt = 'ZBlk1'
+            return zblk_fmt
+
+        if zblk is None:  # no data yet => can't make any assumptions yet
+            return "ZBlk0"
+
+        # We already committed our first data. Now let's
+        # see whether it's better to use ZBlk0 or ZBlk1.
+        p = _change_percentage(zblk, buf)
+        if p > 0.5:  # more than half of all chunks changed
+            # Pick ZBlk0 in case of wide change: ZBlk1 advantage of
+            # a smaller disk footprint isn't so strong then:
+            # we can go for a faster read access with ZBlk0.
+            self.zblk_fmt0_counter += 1
+            return 'ZBlk0'
+
+        self.zblk_fmt1_counter += 1
+        return 'ZBlk1'
+
+
    # invalidate data   .blktab[blk] invalidated -> invalidate page
    def invalidateblk(self, blk):
        for fileh in self._v_filehset:
@@ -622,7 +656,7 @@ class ZBigFile(LivePersistent):

    @zblk_fmt.setter
    def zblk_fmt(self, zblk_fmt):
-        if zblk_fmt and zblk_fmt not in ZBlk_fmt_registry:
+        if zblk_fmt and zblk_fmt != "h" and zblk_fmt not in ZBlk_fmt_registry:
            raise RuntimeError('E: Unknown ZBlk format %r' % zblk_fmt)
        self._zblk_fmt = zblk_fmt

@@ -851,3 +885,61 @@ class _ZBigFileH(object):
        # and also more right - tpc_finish is there assumed as non-failing by
        # ZODB design)
        self.abort(txn)
+
+
+# Utility functions for zblk
+
+# Fraction of the page's chunks that changed compared to the previous commit:
+#   0.0 = nothing changed
+#   0.5 = half of data changed
+#   1.0 = all data changed
+#
+# zblk is the block object written by the previous commit, buf is the new
+# page data. An empty buf yields 0.0.
+def _change_percentage(zblk, buf):
+    if type(zblk) is ZBlk0:
+        # build a throw-away chunk table from the stored block data
+        CHUNKSIZE = 4096
+        chunktab = _adhoc_chunktab(zblk.loadblkdata(), CHUNKSIZE)
+    else:
+        chunktab, CHUNKSIZE = zblk.chunktab, zblk.CHUNKSIZE
+    # Number of chunks _buf_iterator yields for buf: round up, so that a
+    # trailing partial chunk is counted too. `//` gives the same integer
+    # result on py2 and py3.
+    chunk_count = (len(buf) + CHUNKSIZE - 1) // CHUNKSIZE
+    if chunk_count == 0:  # nothing to compare => avoid division by zero
+        return 0.0
+    change_count = _count_changes(buf, chunktab, CHUNKSIZE)
+    return change_count / float(chunk_count)
+
+
+# Count how many chunks of buf differ from the previous commit.
+#
+# chunktab maps chunk start offset -> object whose .data holds the chunk
+# content of the previous commit; buf is scanned in CHUNKSIZE steps.
+def _count_changes(buf, chunktab, CHUNKSIZE):
+    change_count = 0
+    for data, start in _buf_iterator(buf, CHUNKSIZE):
+        chunk = chunktab.get(start)
+        # data has its trailing \0 trimmed, so an all-\0 chunk is b''.
+        # A missing table entry and an entry with empty data (as
+        # _adhoc_chunktab creates) both mean "previously all \0" and must
+        # not count as a change when the new chunk is all \0 as well.
+        old = chunk.data if chunk is not None else b''
+        if old != data:
+            change_count += 1
+    return change_count
+
+
+# Build a {start offset -> chunk} table out of a raw buffer.
+# The chunk objects only carry .data, to mimic ZData objects.
+def _adhoc_chunktab(buf, CHUNKSIZE):
+    class _AdhocChunk():  # mimic ZData
+        def __init__(self, data):
+            self.data = data
+
+    return {start: _AdhocChunk(data)
+            for data, start in _buf_iterator(buf, CHUNKSIZE)}
+
+
+# Generator: walk over buf in CHUNKSIZE steps and yield (data, start) for
+# every step, where data is the chunk as bytes with trailing \0 removed
+# and start is the chunk's offset in buf.
+def _buf_iterator(buf, CHUNKSIZE):
+    for offset in range(0, len(buf), CHUNKSIZE):
+        piece = buf[offset:offset+CHUNKSIZE]
+        # .rstrip() is only available once the chunk is bytes
+        piece = piece if isinstance(piece, bytes) else bytes(piece)  # FIXME copy on py3
+        yield piece.rstrip(b'\0'), offset                            # FIXME copy
--- a/bigfile/tests/_test_zblk_fmt
+++ b/bigfile/tests/_test_zblk_fmt
+# Copyright (C) 2023  Nexedi SA and Contributors.
+#
+# This program is free software: you can Use, Study, Modify and Redistribute
+# it under the terms of the GNU General Public License version 3, or (at your
+# option) any later version, as published by the Free Software Foundation.
+#
+# You can also Link and Combine this program with other software covered by
+# the terms of any of the Free Software licenses or any of the Open Source
+# Initiative approved licenses and Convey the resulting work. Corresponding
+# source of such a combination shall include the source code for all other
+# software used.
+#
+# This program is distributed WITHOUT ANY WARRANTY; without even the implied
+# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See COPYING file for full licensing terms.
+# See https://www.nexedi.com/licensing for rationale and options.
+
+# Test to compare disk-space and access-speed of the different ZBlk format options:
+#
+# 	- ZBlk0
+# 	- ZBlk1
+# 	- h
+# 
+# The heuristic 'h' should behave as well as ZBlk0 in case of wide changes
+# and as well as ZBlk1 in case of small changes.
+
+import atexit
+import os
+import random
+import resource
+import shutil
+import sys
+import tempfile
+import timeit
+
+# Add relative module path, to run tests on local code
+sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '.'))
+
+from golang import defer, func
+import numpy as np
+import transaction
+import ZODB, ZODB.FileStorage
+
+from wendelin.bigarray.array_zodb import ZBigArray
+
+# Fixed seed, so that every run works on the same pseudo-random data.
+random.seed(10)
+
+# Avoid error due to too many opened file descriptors:
+# raise the soft RLIMIT_NOFILE limit up to the hard limit.
+cur_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
+new_limit = (cur_limit[1], cur_limit[1])
+resource.setrlimit(resource.RLIMIT_NOFILE, new_limit)
+
+# Keep the storage in a private temporary directory that is removed at exit.
+# tempfile.mktemp() only hands out a name, which is documented as unsafe,
+# and nothing in this script removed the storage file afterwards.
+storage_dir = tempfile.mkdtemp()
+atexit.register(shutil.rmtree, storage_dir, ignore_errors=True)
+storage_path = os.path.join(storage_dir, 'data.fs')
+
+# Declare test parameters; each one can be overridden through the
+# environment variable of the same name.
+zblk_fmt = os.environ.get('zblk_fmt', 'h')
+change_size_set = tuple(int(n) for n in os.environ.get('change_size_set', '20').split(','))
+change_count = int(os.environ.get('change_count', '1000'))
+arrsize = int(os.environ.get('arrsize', '1000000'))
+
+# Utility functions
+
+# Return a numpy array of `size` rows, each holding two random integers
+# drawn from [1, 1000].
+def randarr(size=1000000):
+    rows = []
+    for _ in range(size):
+        first = random.randint(1, 1000)
+        second = random.randint(1, 1000)
+        rows.append([first, second])
+    return np.array(rows)
+
+# Overwrite the first `size` rows of A with random values and commit.
+def setrand(A, size=20):
+    values = randarr(size)
+    A[0:size][:] = values
+    transaction.commit()
+
+# Slice `size` rows out of A, starting at a random offset within the
+# first arrsize rows; the slice itself is not used any further.
+def accessrand(A, size=1000):
+    offset = random.randint(0, arrsize - size)
+    _ = A[offset:offset+size]
+
+# Decorator-style helper: open the FileStorage at storage_path, call
+# func(root) with the root of a fresh connection and release connection,
+# database and storage again via defer.
+#
+# Note that func is called right away and nothing is returned, so a name
+# decorated with @root is not bound to a callable afterwards.
+@func
+def root(func):
+    storage = ZODB.FileStorage.FileStorage(storage_path)
+    db = ZODB.DB(storage)
+    connection = db.open()
+    root = connection.root
+
+    # NOTE(review): presumably deferred calls run in reverse order when
+    # root returns, i.e. storage, db, connection -- confirm against golang.defer
+    defer(connection.close)
+    defer(db.close)
+    defer(storage.close)
+
+    func(root)
+
+# Executed immediately through @root: create the array under root.A with
+# the ZBlk format under test and commit it.
+@root
+def setup(root):
+    root.A = ZBigArray(shape=[1, 2], dtype=int, zblk_fmt=zblk_fmt)
+    transaction.commit()
+
+# Executed immediately through @root: append arrsize random rows to
+# root.A and commit.
+@root
+def fillup(root):
+    root.A.append(randarr(arrsize))
+    transaction.commit()
+
+# Executed immediately through @root: apply change_count random changes
+# to root.A; the size of every change is picked at random from
+# change_size_set.
+@root
+def change(root):
+    A = root.A
+    for _ in range(change_count):
+        change_size = random.choice(change_size_set)
+        setrand(A, change_size)
+        # NOTE(review): setrand already commits, so this second commit
+        # looks redundant -- confirm it has no effect on the measurements.
+        transaction.commit()
+
+# One timed access: via @root, open the database anew, slice root.A at a
+# random position (accessrand) and close everything again.
+def access():
+    @root
+    def _(root):
+        accessrand(root.A)
+
+# Print the size of the storage file and the time needed for 5000 calls
+# of access().
+def statistics():
+    size_mb = os.path.getsize(storage_path) / float(10**6)
+    print("\tZODB storage size: %s MB" % size_mb)
+
+    # reseed, so that the accessed positions are the same for every run
+    random.seed(10)
+    elapsed = timeit.timeit("access()", "from __main__ import access", number=5000)
+    print("\tAccess time: %s" % elapsed)
+
+statistics()
--- a/bigfile/tests/test-zblk-fmt
+++ b/bigfile/tests/test-zblk-fmt
+#!/usr/bin/env bash
+
+# Copyright (C) 2023  Nexedi SA and Contributors.
+#
+# This program is free software: you can Use, Study, Modify and Redistribute
+# it under the terms of the GNU General Public License version 3, or (at your
+# option) any later version, as published by the Free Software Foundation.
+#
+# You can also Link and Combine this program with other software covered by
+# the terms of any of the Free Software licenses or any of the Open Source
+# Initiative approved licenses and Convey the resulting work. Corresponding
+# source of such a combination shall include the source code for all other
+# software used.
+#
+# This program is distributed WITHOUT ANY WARRANTY; without even the implied
+# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See COPYING file for full licensing terms.
+# See https://www.nexedi.com/licensing for rationale and options.
+
+# Test to compare disk-space and access-speed of the different ZBlk format options:
+#
+# 	- ZBlk0
+# 	- ZBlk1
+# 	- h
+# 
+# The heuristic 'h' should behave as well as ZBlk0 in case of wide changes
+# and as well as ZBlk1 in case of small changes.
+
+# Directory holding this script and the benchmark script next to it, so
+# that the benchmark can be started from any working directory.
+here=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+# run_fmt ZBLK_FMT
+#
+# Run the benchmark script once with the given ZBlk format, which is
+# handed over through the zblk_fmt environment variable.
+function run_fmt {
+	local zblkfmt=$1
+	echo "Run tests with format $zblkfmt:"
+	echo ""
+	zblk_fmt="$zblkfmt" python "$here/_test_zblk_fmt"
+	echo ""
+	echo ""
+}
+
+# bench CHANGE_SIZE_SET CHANGE_COUNT ARRSIZE
+#
+# Run the benchmark for the formats h, ZBlk0 and ZBlk1 with the given
+# parameters, which are handed to the benchmark script via the environment.
+# (Not named `test`: a function of that name shadows the shell builtin.)
+function bench {
+	export change_size_set=$1
+	export change_count=$2
+	export arrsize=$3
+
+	echo "---------------------------------------------"
+	echo "---------------------------------------------"
+	echo "Set change_size_set to $change_size_set"
+	echo "Set change_count to $change_count"
+	echo "Set arrsize to $arrsize"
+
+	echo ""
+
+	run_fmt h
+	run_fmt ZBlk0
+	run_fmt ZBlk1
+
+	echo ""
+	echo "---------------------------------------------"
+	echo "---------------------------------------------"
+	echo ""
+}
+
+echo "Use only a very small change size, so that heuristic always uses ZBlk1"
+bench 20 500 1000000
+
+echo "Use only a very big change size, so that heuristic always uses ZBlk0"
+bench 200000 500 1000000
+
+echo "Mix between change size so that heuristic switches between ZBlk0 and ZBlk1"
+bench 20,200000 500 1000000
--- a/bigfile/tests/test_filezodb.py
+++ b/bigfile/tests/test_filezodb.py
 # Wendelin.core.bigfile | Tests for ZODB BigFile backend
-# Copyright (C) 2014-2021  Nexedi SA and Contributors.
+# Copyright (C) 2014-2023  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -716,3 +716,27 @@ def test_bigfile_set_zblk_fmt():
    transaction.commit()

    assert type(f.blktab[0]) is file_zodb.ZBlk1
+
+
+# Minimal test to ensure normal operations work as expected
+# with zblk fmt 'h': data written through a mapping of a ZBigFile created
+# with zblk_fmt="h" is still there after commit, both for a write to the
+# whole block and for a following write to a single element.
+#
+# NOTE(review): the test does not check which ZBlk class the heuristic
+# picked for the block, nor the zblk_fmt0/1 counters.
+@func
+def test_bigfile_zblk_fmt_heuristic():
+    root = dbopen()
+    defer(lambda: dbclose(root))
+    root['zfile8'] = f = ZBigFile(blksize, zblk_fmt="h")
+    transaction.commit()
+
+    fh  = f.fileh_open()
+    vma = fh.mmap(0, blen)
+
+    # first store of block 0: nothing stored for it before
+    b = Blk(vma, 0)
+    b[:] = 1
+    transaction.commit()
+
+    assert (b == 1).all()
+
+    # second store of block 0: only one element differs from the
+    # previous commit
+    b[0] = 2
+    transaction.commit()
+
+    assert b[0] == 2
--- a/wcfs/internal/zdata/zblk.go
+++ b/wcfs/internal/zdata/zblk.go
@@ -354,37 +354,42 @@ func (zb *ZBlk1) LoadBlkData(ctx context.Context) (_ []byte, _ zodb.Tid, err err
 type ZBigFile struct {
 	zodb.Persistent

-	// state: (.blksize, .blktab, .zblk_fmt)
-	blksize  int64
-	blktab   *btree.LOBTree // {}  blk -> ZBlk*(blkdata)
-	zblk_fmt string
+	// state: (.blksize, .blktab, .zblk_fmt, .zblk_fmt0_counter, .zblk_fmt1_counter)
+	blksize           int64
+	blktab            *btree.LOBTree // {}  blk -> ZBlk*(blkdata)
+	zblk_fmt          string
+	zblk_fmt0_counter int64
+	zblk_fmt1_counter int64
 }

 type zBigFileState ZBigFile // hide state methods from public API

 // DropState implements zodb.Ghostable.
 func (bf *zBigFileState) DropState() {
-	bf.blksize  = 0
-	bf.blktab   = nil
-	bf.zblk_fmt = ""
+	bf.blksize            = 0
+	bf.blktab             = nil
+	bf.zblk_fmt           = ""
+	bf.zblk_fmt0_counter  = 0
+	bf.zblk_fmt1_counter  = 0
 }

 // PyGetState implements zodb.PyStateful.
 func (bf *zBigFileState) PyGetState() interface{} {
-	return pickle.Tuple{bf.blksize, bf.blktab, bf.zblk_fmt}
+	return pickle.Tuple{bf.blksize, bf.blktab, bf.zblk_fmt, bf.zblk_fmt0_counter, bf.zblk_fmt1_counter}
 }

 // PySetState implements zodb.PyStateful.
 func (bf *zBigFileState) PySetState(pystate interface{}) (err error) {
 	t, ok := pystate.(pickle.Tuple)
 	if !ok {
-		return fmt.Errorf("expect [2|3](); got %s", xzodb.TypeOf(pystate))
+		return fmt.Errorf("expect [2|3|5](); got %s", xzodb.TypeOf(pystate))
 	}
 	// BBB: we either accept data before adding zblk_fmt to state
-	// (lent==2) or data after adding zblk_fmt to state (lent==3).
+	// (lent==2) or data after adding zblk_fmt to state (lent==3) or
+	// data after adding zblk_fmt counter (lent==5).
 	lent := len(t)
-	if lent != 2 && lent != 3 {
-		return fmt.Errorf("expect [2|3](); got [%d]()", len(t))
+	if lent != 2 && lent != 3 && lent != 5 {
+		return fmt.Errorf("expect [2|3|5](); got [%d]()", len(t))
 	}

 	blksize, ok := pycompat.Int64(t[0])
@@ -409,6 +414,21 @@ func (bf *zBigFileState) PySetState(pystate interface{}) (err error) {
 			return fmt.Errorf("zblk_fmt: expect str; got %s", xzodb.TypeOf(t[2]))
 		}
 		bf.zblk_fmt = zblk_fmt
+
+		if lent == 5 {
+			zblk_fmt0_counter, ok := pycompat.Int64(t[3])
+			if !ok {
+				return fmt.Errorf("zblk_fmt0_counter: expect integer; got %s", xzodb.TypeOf(t[3]))
+			}
+
+			zblk_fmt1_counter, ok := pycompat.Int64(t[4])
+			if !ok {
+				return fmt.Errorf("zblk_fmt1_counter: expect integer; got %s", xzodb.TypeOf(t[4]))
+			}
+
+			bf.zblk_fmt0_counter = zblk_fmt0_counter
+			bf.zblk_fmt1_counter = zblk_fmt1_counter
+		}
 	}

 	return nil