bigfile/zodb: Add ZBlk format option 'auto' (heuristic)

There are two formats to save data with a ZBigFile: ZBlk0 and ZBlk1. They differ by adjusting the ratio between access-time and growing disk-space, where ZBlk1 is better regarding disk space, while ZBlk0 has a better access-time. Wendelin.core users may not always know yet or care which format fits better for their data. In this case it may be easier for users to just let the program automatically select the ZBlk format. With this patch and the new 'auto' (for heuristic) option of the 'ZBlk' argument of ZBigFile, this is now possible. The 'auto' option isn't really a new ZBlk format in itself, but it just tries to automatically select the best ZBlk format option according to the characteristics of the changes that the user applies to the ZBigFile. In its current implementation, the heuristic tackles the use-case of large arrays with many small append-only changes. In this case 'auto' is smaller in space than ZBlk0, but faster to read than ZBlk1. It does so, by initially using ZBlk1 until a blk is filled up. Once a blk is full, it switches to ZBlk0, as it was recommended by @kirr in nexedi/wendelin.core!20 (comment 196084). With this patch comes a test (bigfile/tests/bench_zblkfmt) that creates benchmarks for different combinations and zblk formats. 
The test aims to check how the 'heuristic' format performs in contrast to 'ZBlk0' and 'ZBlk1': BenchmarkAppendSize/zblk=ZBlk0/change_count=500/change_percentage_set=[0.014] 1 538.1 MB BenchmarkAppendRandRead/zblk=ZBlk0/change_count=500/change_percentage_set=[0.014] 6 2.085 ms/blk BenchmarkAppendSize/zblk=ZBlk1/change_count=500/change_percentage_set=[0.014] 1 16.8 MB BenchmarkAppendRandRead/zblk=ZBlk1/change_count=500/change_percentage_set=[0.014] 6 14.564 ms/blk BenchmarkAppendSize/zblk=auto/change_count=500/change_percentage_set=[0.014] 1 29.4 MB BenchmarkAppendRandRead/zblk=auto/change_count=500/change_percentage_set=[0.014] 6 2.119 ms/blk BenchmarkRandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 1 1021.1 MB BenchmarkRandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 3 2.324 ms/blk BenchmarkRandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 1 216.2 MB BenchmarkRandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 3 15.317 ms/blk BenchmarkRandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 1 219.8 MB BenchmarkRandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 3 14.027 ms/blk BenchmarkRandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[1] 1 1048.6 MB BenchmarkRandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[1] 3 2.126 ms/blk BenchmarkRandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[1] 1 1070.4 MB BenchmarkRandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[1] 3 14.284 ms/blk BenchmarkRandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[1] 1 1070.3 MB BenchmarkRandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[1] 3 14.072 ms/blk 
BenchmarkRandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 1 1046.4 MB BenchmarkRandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 3 2.137 ms/blk BenchmarkRandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 1 638.2 MB BenchmarkRandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 3 14.083 ms/blk BenchmarkRandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 1 639.5 MB BenchmarkRandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 3 13.937 ms/blk and post-processed with benchstat from 3 such runs: │ x.log │ │ B │ AppendSize/zblk=ZBlk0/change_count=500/change_percentage_set=[0.014] 513.2Mi ± 0% AppendSize/zblk=ZBlk1/change_count=500/change_percentage_set=[0.014] 16.02Mi ± 0% AppendSize/zblk=auto/change_count=500/change_percentage_set=[0.014] 28.04Mi ± 0% RandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 973.8Mi ± 0% RandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 206.2Mi ± 0% RandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 209.6Mi ± 0% RandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[1] 1000.0Mi ± 0% RandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[1] 1020.8Mi ± 0% RandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[1] 1020.7Mi ± 0% RandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 997.9Mi ± 0% RandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 608.6Mi ± 0% RandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 609.9Mi ± 0% geomean 353.0Mi │ x.log │ │ ms/blk │ AppendRandRead/zblk=ZBlk0/change_count=500/change_percentage_set=[0.014] 2.094 ± 12% 
AppendRandRead/zblk=ZBlk1/change_count=500/change_percentage_set=[0.014] 14.47 ± 1% AppendRandRead/zblk=auto/change_count=500/change_percentage_set=[0.014] 2.168 ± 2% RandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 2.324 ± 1% RandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 13.73 ± 12% RandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 13.60 ± 3% RandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[1] 2.125 ± 2% RandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[1] 14.18 ± 3% RandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[1] 14.17 ± 1% RandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 2.118 ± 1% RandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 13.85 ± 2% RandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 13.80 ± 1% geomean 6.423 See nexedi/wendelin.core!20 and kirr/wendelin.core@da765ef7...0c6f0850 for the preliminary history of this patch. Co-authored-by: Kirill Smelkov <kirr@nexedi.com> Fix typo.

bigfile/zodb: Add ZBlk format option 'auto' (heuristic)
There are two formats to save data with a ZBigFile: ZBlk0 and ZBlk1. They differ by adjusting the ratio between access-time and growing disk-space, where ZBlk1 is better regarding disk space, while ZBlk0 has a better access-time. Wendelin.core users may not always know yet or care which format fits better for their data. In this case it may be easier for users to just let the program automatically select the ZBlk format. With this patch and the new 'auto' (for heuristic) option of the 'ZBlk' argument of ZBigFile, this is now possible. The 'auto' option isn't really a new ZBlk format in itself, but it just tries to automatically select the best ZBlk format option according to the characteristics of the changes that the user applies to the ZBigFile. In its current implementation, the heuristic tackles the use-case of large arrays with many small append-only changes. In this case 'auto' is smaller in space than ZBlk0, but faster to read than ZBlk1. It does so, by initially using ZBlk1 until a blk is filled up. Once a blk is full, it switches to ZBlk0, as it was recommended by @kirr in nexedi/wendelin.core!20 (comment 196084). With this patch comes a test (bigfile/tests/bench_zblkfmt) that creates benchmarks for different combinations and zblk formats. 
The test aims to check how the 'heuristic' format performs in contrast to 'ZBlk0' and 'ZBlk1': BenchmarkAppendSize/zblk=ZBlk0/change_count=500/change_percentage_set=[0.014] 1 538.1 MB BenchmarkAppendRandRead/zblk=ZBlk0/change_count=500/change_percentage_set=[0.014] 6 2.085 ms/blk BenchmarkAppendSize/zblk=ZBlk1/change_count=500/change_percentage_set=[0.014] 1 16.8 MB BenchmarkAppendRandRead/zblk=ZBlk1/change_count=500/change_percentage_set=[0.014] 6 14.564 ms/blk BenchmarkAppendSize/zblk=auto/change_count=500/change_percentage_set=[0.014] 1 29.4 MB BenchmarkAppendRandRead/zblk=auto/change_count=500/change_percentage_set=[0.014] 6 2.119 ms/blk BenchmarkRandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 1 1021.1 MB BenchmarkRandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 3 2.324 ms/blk BenchmarkRandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 1 216.2 MB BenchmarkRandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 3 15.317 ms/blk BenchmarkRandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 1 219.8 MB BenchmarkRandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 3 14.027 ms/blk BenchmarkRandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[1] 1 1048.6 MB BenchmarkRandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[1] 3 2.126 ms/blk BenchmarkRandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[1] 1 1070.4 MB BenchmarkRandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[1] 3 14.284 ms/blk BenchmarkRandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[1] 1 1070.3 MB BenchmarkRandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[1] 3 14.072 ms/blk 
BenchmarkRandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 1 1046.4 MB BenchmarkRandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 3 2.137 ms/blk BenchmarkRandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 1 638.2 MB BenchmarkRandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 3 14.083 ms/blk BenchmarkRandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 1 639.5 MB BenchmarkRandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 3 13.937 ms/blk and post-processed with benchstat from 3 such runs: │ x.log │ │ B │ AppendSize/zblk=ZBlk0/change_count=500/change_percentage_set=[0.014] 513.2Mi ± 0% AppendSize/zblk=ZBlk1/change_count=500/change_percentage_set=[0.014] 16.02Mi ± 0% AppendSize/zblk=auto/change_count=500/change_percentage_set=[0.014] 28.04Mi ± 0% RandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 973.8Mi ± 0% RandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 206.2Mi ± 0% RandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 209.6Mi ± 0% RandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[1] 1000.0Mi ± 0% RandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[1] 1020.8Mi ± 0% RandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[1] 1020.7Mi ± 0% RandWriteSize/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 997.9Mi ± 0% RandWriteSize/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 608.6Mi ± 0% RandWriteSize/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 609.9Mi ± 0% geomean 353.0Mi │ x.log │ │ ms/blk │ AppendRandRead/zblk=ZBlk0/change_count=500/change_percentage_set=[0.014] 2.094 ± 12% 
AppendRandRead/zblk=ZBlk1/change_count=500/change_percentage_set=[0.014] 14.47 ± 1% AppendRandRead/zblk=auto/change_count=500/change_percentage_set=[0.014] 2.168 ± 2% RandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 2.324 ± 1% RandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 13.73 ± 12% RandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2] 13.60 ± 3% RandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[1] 2.125 ± 2% RandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[1] 14.18 ± 3% RandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[1] 14.17 ± 1% RandWriteRandRead/zblk=ZBlk0/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 2.118 ± 1% RandWriteRandRead/zblk=ZBlk1/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 13.85 ± 2% RandWriteRandRead/zblk=auto/arrsize=1000000/change_count=500/change_percentage_set=[0.2,1] 13.80 ± 1% geomean 6.423 See nexedi/wendelin.core!20 and kirr/wendelin.core@da765ef7...0c6f0850 for the preliminary history of this patch. Co-authored-by: Kirill Smelkov <kirr@nexedi.com> Fix typo.
d6628427 · Levin Zimmermann · Kirill Smelkov · 84def52e · d6628427 · d6628427
Commit d6628427 authored Oct 25, 2023 by Levin Zimmermann Committed by Kirill Smelkov Apr 03, 2024
5 changed files
--- a/bigfile/file_zodb.py
+++ b/bigfile/file_zodb.py
@@ -83,13 +83,18 @@ changes. "Small" here means something like 1-10000 bytes per transaction as
 larger changes become comparable to 2M block size and are handled efficiently
 out of the box. Until the problem is fixed on ZODB server side, wendelin.core
 provides on-client workaround in the form of specialized block format, and
-users have to explicitly indicate via environment variable that their workload
-is "small changes" if they prefer to prioritize database size over access
-speed::
+users can explicitly indicate via environment variable that their workload is
+either "big changes", if they prefer to prioritize access speed, or "small
+changes" if they prefer to prioritize database size over access speed. There is
+also "auto" mode that tries to heuristically use both ZBlk0 and ZBlk1 depending
+on change pattern and works relatively well regarding both access speed and
+database size for append-like workloads::

  $WENDELIN_CORE_ZBLK_FMT
-      ZBlk0             fast reads      (default)
+      ZBlk0             fast reads
      ZBlk1             small changes
+      auto  (default)   heuristically use either ZBlk0 or ZBlk1
+                        depending on change pattern

 Description of block formats follow:

@@ -159,7 +164,7 @@ will be our future approach after we teach NEO about object deduplication.

 from wendelin.bigfile import WRITEOUT_STORE, WRITEOUT_MARKSTORED
 from wendelin.bigfile._file_zodb import _ZBigFile
-from wendelin.lib.mem import bzero, memcpy
+from wendelin.lib.mem import bzero, memcpy, memdelta
 from wendelin.lib.zodb import LivePersistent, deactivate_btree

 from transaction.interfaces import IDataManager, ISynchronizer
@@ -476,10 +481,14 @@ class ZBlk1(ZBlkBase):
 # backward compatibility (early versions wrote ZBlk0 named as ZBlk)
 ZBlk = ZBlk0

+# _ZBlk_auto indicates to heuristically select ZBlk format
+_ZBlk_auto = object()
+
 # format-name -> blk format type
 ZBlk_fmt_registry = {
    'ZBlk0':    ZBlk0,
    'ZBlk1':    ZBlk1,
+    'auto':     _ZBlk_auto,
 }

 # format for updated blocks
@@ -547,6 +556,11 @@ class ZBigFile(LivePersistent):
    def storeblk(self, blk, buf):
        zblk = self.blktab.get(blk)
        zblk_type_write = ZBlk_fmt_registry[ZBlk_fmt_write]
+        if zblk_type_write is _ZBlk_auto:  # apply heuristic
+            zblk_type_write = self._zblk_fmt_heuristic(zblk, blk, buf)
+        self._setzblk(blk, zblk, buf, zblk_type_write)
+
+    def _setzblk(self, blk, zblk, buf, zblk_type_write):  # helper
        # if zblk was absent or of different type - we (re-)create it anew
        if zblk is None  or \
           type(zblk) is not zblk_type_write:
@@ -567,6 +581,72 @@ class ZBigFile(LivePersistent):
        zblk.bindzfile(self, blk)


+    # Heuristically determine zblk format by optimizing
+    # storage-space/access-speed ratio. Both can't be ideal, see
+    # module docstring: "Due to weakness of current ZODB storage
+    # servers, wendelin.core cannot provide at the same time both
+    # fast reads and small database size growth ..."
+    def _zblk_fmt_heuristic(self, zblk, blk, buf):
+        # see if we are doing a "small append" like change
+        # load previous data and compute the difference along the way
+        new_data = bytes(buf).rstrip(b'\0')
+        if zblk is None:
+            old_data = b''
+        else:
+            assert not zblk._p_changed
+            old_data  = bytes(zblk.loadblkdata()).rstrip(b'\0')
+        ndelta = memdelta(old_data, new_data)
+
+        try:
+            last_blk = self.blktab.maxKey()
+        except ValueError: # empty tree
+            last_blk = -1
+
+        append_oldblk = ((blk == last_blk)   and (new_data[:len(old_data)] == old_data))
+        append_newblk = ((blk == last_blk+1) and (len(old_data) == 0))
+
+        append = (append_oldblk or append_newblk)
+        small  = (ndelta < 0.5*self.blksize)
+        filled = (len(new_data) == self.blksize)  # filled full with non-zeros at the end
+
+        # append - migrate previously filled-up block to ZBlk0 for fast reads
+        #        - for current block use ZBlk1 if the append is small and not fully filled, and ZBlk0 otherwise
+        #
+        # do the migration of previous block only if it is also changed in
+        # current transaction. This preserves the invariant that "transaction
+        # changes ZBlk objects only for modified blocks of the file".
+        # NOTE: this misses a case when append stops exactly on blocks boundary
+        # after appending some zeros. For now we ignore such case as improbable.
+        #
+        # For the implementation we rely on that zfileh.dirty_writeout()
+        # invokes storeblk in ascending order of blk, so that when we are here,
+        # we can be sure that if previous block is also modified, then .blktab
+        # already has corresponding entry for it and the entry is changed.
+        if append:
+            if append_newblk:
+                zblk_prev = self.blktab.get(blk-1)
+                if zblk_prev is not None  and   \
+                   zblk_prev._p_changed   and   \
+                   type(zblk_prev) is not ZBlk0:
+                    # gather data prepared for previous block
+                    # NOTE: loadblkdata throws away all changes inside zblk_prev
+                    zblk_prev_data = zblk_prev.loadblkdata()
+                    # but we re-save that updated data immediately after
+                    self._setzblk(blk-1, zblk_prev, zblk_prev_data, ZBlk0)
+            return ZBlk1 if (small and not filled) else ZBlk0
+
+        # all other changes - use ZBlk1 if the change is small and ZBlk0 otherwise
+        else:
+            if small:
+                # TODO(kirr): "to support sporadic small changes over initial big fillup [...]
+                # we could introduce e.g. a ZBlkδ object, which would refer to base
+                # underlying ZBlk object and add "patch" information on top of that [...]."
+                # See https://lab.nexedi.com/nexedi/wendelin.core/merge_requests/20#note_196084
+                return ZBlk1
+            else:
+                return ZBlk0
+
+
    # invalidate data   .blktab[blk] invalidated -> invalidate page
    def invalidateblk(self, blk):
        for fileh in self._v_filehset:

--- a/bigfile/tests/bench_zblkfmt
+++ b/bigfile/tests/bench_zblkfmt
+#!/usr/bin/env python
+# Copyright (C) 2023  Nexedi SA and Contributors.
+#
+# This program is free software: you can Use, Study, Modify and Redistribute
+# it under the terms of the GNU General Public License version 3, or (at your
+# option) any later version, as published by the Free Software Foundation.
+#
+# You can also Link and Combine this program with other software covered by
+# the terms of any of the Free Software licenses or any of the Open Source
+# Initiative approved licenses and Convey the resulting work. Corresponding
+# source of such a combination shall include the source code for all other
+# software used.
+#
+# This program is distributed WITHOUT ANY WARRANTY; without even the implied
+# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See COPYING file for full licensing terms.
+# See https://www.nexedi.com/licensing for rationale and options.
+
+# Test to compare disk-space and access-speed of the different ZBlk format options:
+#
+#   - ZBlk0
+#   - ZBlk1
+#   - auto
+
+from __future__ import print_function, absolute_import, division
+
+import os
+import random
+import tempfile
+import timeit
+import shutil
+import multiprocessing
+from time import sleep
+ms = 1e-3
+
+from golang import defer, func
+import numpy as np
+import transaction
+import ZODB, ZODB.FileStorage
+
+from wendelin.bigarray.array_zodb import ZBigArray
+from wendelin.bigfile import file_zodb
+
+
+# IWriteWorkLoad represents write workload type:
+class IWriteWorkLoad:
+    # do_write should perform all write operations of the workload on the
+    # database associated with root object.
+    def do_write(wrk, root):  raise NotImplementedError()
+
+    # args should be set to string with arguments used to parameterize this workload.
+    args = ''
+
+
+# benchwrk benchmarks database size and read speed under write workload wrk.
+#
+# the benchmark is run for all supported ZBlk formats.
+def benchwrk(wrk):
+    # run each benchmark in separate process so that there is no chance they
+    # somehow affect each other.
+    zblk_fmtv = list(file_zodb.ZBlk_fmt_registry.keys())
+    zblk_fmtv.sort()
+    for zblk_fmt in zblk_fmtv:
+        def _():
+            file_zodb.ZBlk_fmt_write = zblk_fmt
+            _benchwrk(wrk)
+        p = multiprocessing.Process(target=_)
+        p.start()
+        p.join()
+
+@func
+def _benchwrk(wrk):
+    tmpd = tempfile.mkdtemp('', 'zblkbench')
+    def _():
+        shutil.rmtree(tmpd)
+    defer(_)
+
+    storage_path = '%s/data.fs' % tmpd
+
+    # with_db runs f(root) on a freshly-opened connection to test database.
+    traceload = False
+    delayload = False
+    @func
+    def with_db(f):
+        storage = ZODB.FileStorage.FileStorage(storage_path)
+        defer(storage.close)
+
+        # simulate loading latency as actually seen on NEO.
+        # there I was seeing latencies up to _1_ millisecond, but even with
+        # "modest" 0.2 ms it really shows in the figures.
+        #
+        # (activated only during read benchmark to avoid wasting time
+        # while preparing data)
+        tloaddelay = 0.2 * ms
+        stor_load       = storage.load
+        stor_loadBefore = storage.loadBefore
+        def loadBefore(oid, tid):
+            if traceload:
+                print('# loadBefore %r %r' % (oid, tid))
+            if delayload:
+                sleep(tloaddelay)
+            return stor_loadBefore(oid, tid)
+        def load(oid):
+            # load is used on plain ZODB4; ZODB5 and ZODB4-wc2 use loadBefore only
+            if traceload:
+                print('# load %r' % (oid,))
+            # see loadBefore above
+            if delayload:
+                sleep(tloaddelay)
+            return stor_load(oid)
+        storage.loadBefore = loadBefore
+        storage.load       = load
+
+        db = ZODB.DB(storage)   ; defer(db.close)
+        connection = db.open()  ; defer(connection.close)
+        root = connection.root
+
+        f(root)
+
+    # create test database with empty array, then run specified write workload
+    # and see how big ZODB size is.
+    @with_db
+    def _(root):
+        root.A = ZBigArray(shape=[0], dtype=int)
+        transaction.commit()
+
+        random.seed(10)
+        wrk.do_write(root)
+        transaction.commit()    # just in case
+
+    def emitbench(name, data):
+        wrkname = wrk.__class__.__name__
+        benchprefix = "Benchmark%s%s/zblk=%s/%s" % (wrkname, name, file_zodb.ZBlk_fmt_write, wrk.args)
+        print('%s\t%s' % (benchprefix, data))
+    emitbench("Size", "1\t%.1f MB" % (os.path.getsize(storage_path) / 1E6))
+
+    # now benchmark random reads.
+    delayload = True
+    @with_db
+    def _(root):
+        A = root.A
+        blklen = arr_blklen(A)
+
+        # make sure we never read the same block twice - else we will start to
+        # measure time of hot access without any ZODB loading
+        random.seed(10)
+        blkv = list(range(len(A) // blklen))
+        random.shuffle(blkv)
+
+        a = A[:]
+        def _():
+            blk = blkv.pop()
+            # force load of ZBlk data via reading ndarray element from inside the block
+            a[blk*blklen]
+
+        niter = min(len(blkv), 10)
+        assert niter >= 3, niter
+        taccess = timeit.timeit(_, number=niter) / niter
+        emitbench("RandRead", "%d %.3f ms/blk" % (niter, taccess/ms))
+
+
+# Append simulates workload when data are appended in chunks to end of array.
+class Append(IWriteWorkLoad):
+    def __init__(wrk, change_count, change_percentage_set):
+        wrk.change_count = change_count
+        wrk.change_percentage_set = change_percentage_set
+        wrk.args = "change_count=%d/change_percentage_set=%s" % (
+                                    change_count, repr(change_percentage_set).replace(' ',''))
+
+    def do_write(wrk, root):
+        A = root.A
+        for _ in range(wrk.change_count):
+            change_percentage = random.choice(wrk.change_percentage_set)
+            size = int(arr_blklen(A) * change_percentage)
+            A.append(randarr(size))
+            transaction.commit()
+
+
+# RandWrite simulates workload when data is written randomly in the array.
+class RandWrite(IWriteWorkLoad):
+    def __init__(wrk, arrsize, change_count, change_percentage_set):
+        wrk.arrsize = arrsize
+        wrk.change_count = change_count
+        wrk.change_percentage_set = change_percentage_set
+        wrk.args = "arrsize=%d/change_count=%d/change_percentage_set=%s" % (
+                                    arrsize, change_count,
+                                    repr(change_percentage_set).replace(' ',''))
+
+    def do_write(wrk, root):
+        A = root.A
+        A.append([0]*wrk.arrsize)
+        transaction.commit()
+
+        for _ in range(wrk.change_count):
+            change_percentage = random.choice(wrk.change_percentage_set)
+            setrand(A, change_percentage)
+            transaction.commit()
+
+
+# Utility functions
+
+# randarr returns random [size]int array.
+def randarr(size):
+    return np.array([random.randint(1, 1000) for _ in range(size)])
+
+# setrand makes random write access to ZBigArray A.
+#
+# The amount of changed data is fraction of underlying block size.
+# Only one block is changed.
+def setrand(A, change_percentage):
+    blklen = arr_blklen(A)
+    change_size = int(blklen * change_percentage)
+    blk_index = random.randrange(0, len(A) // blklen)
+    blk_offset = blk_index * blklen
+    # Ensure we don't always only change the beginning of a block
+    blk_offset = blk_offset + random.randint(0, blklen - change_size)
+    A[blk_offset:blk_offset+change_size][:] = randarr(change_size)
+
+# arr_blklen returns how many ZBigArray items make up a block in underlying ZBigFile.
+def arr_blklen(A):
+    assert isinstance(A, ZBigArray)
+    assert len(A.shape) == 1
+    assert A.zfile.blksize  % A.itemsize == 0
+    return A.zfile.blksize // A.itemsize
+
+
+# ---- benchmarks we want to run ----
+
+def main():
+    _ = benchwrk
+
+    _(Append(            500, [0.014]))     # appends of ~ 30K
+    _(RandWrite(1000000, 500, [0.2]))       # small change size, so that heuristic always uses ZBlk1
+    _(RandWrite(1000000, 500, [1]))         # big change size,   so that heuristic always uses ZBlk0
+    _(RandWrite(1000000, 500, [0.2, 1]))    # Mix between change size so that heuristic switches
+                                            # between ZBlk0 and ZBlk1
+
+
+if __name__ == '__main__':
+    main()
--- a/bigfile/tests/test_filezodb.py
+++ b/bigfile/tests/test_filezodb.py
 # Wendelin.core.bigfile | Tests for ZODB BigFile backend
-# Copyright (C) 2014-2021  Nexedi SA and Contributors.
+# Copyright (C) 2014-2023  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -17,7 +17,7 @@
 #
 # See COPYING file for full licensing terms.
 # See https://www.nexedi.com/licensing for rationale and options.
-from wendelin.bigfile.file_zodb import ZBigFile, ZBlk_fmt_registry
+from wendelin.bigfile.file_zodb import ZBigFile, ZBlk_fmt_registry, _ZBlk_auto
 from wendelin.bigfile import file_zodb, ram_reclaim
 from wendelin.bigfile.tests.test_thread import NotifyChannel
 from wendelin.lib.zodb import LivePersistent, dbclose
@@ -631,6 +631,8 @@ def test_bigfile_filezodb_fmt_change():
        for dst_fmt, dst_type in ZBlk_fmt_registry.items():
            if src_fmt == dst_fmt:
                continue    # skip checking e.g. ZBlk0 -> ZBlk0
+            if src_type is _ZBlk_auto   or  dst_type is _ZBlk_auto:
+                continue    # skip checking e.g. * -> auto

            file_zodb.ZBlk_fmt_write = src_fmt
            struct.pack_into('p', vma, 0, b(src_fmt))
@@ -690,3 +692,34 @@ def test_bigfile_zblk1_zdata_reuse():
    assert len(zdata_v1) == len(zdata_v2)
    for i in range(len(zdata_v1)):
        assert zdata_v1[i] is zdata_v2[i]
+
+
+# Minimal test to ensure normal operations work as expected with zblk format 'auto'.
+@func
+def test_bigfile_zblk_fmt_auto():
+    root = dbopen()
+    defer(lambda: dbclose(root))
+
+    # set ZBlk_fmt_write to 'auto' for this test
+    fmt_write_save = file_zodb.ZBlk_fmt_write
+    file_zodb.ZBlk_fmt_write = 'auto'
+    def _():
+        file_zodb.ZBlk_fmt_write = fmt_write_save
+    defer(_)
+
+    root['zfile8'] = f = ZBigFile(blksize)
+    transaction.commit()
+
+    fh  = f.fileh_open()
+    vma = fh.mmap(0, blen)
+
+    b = Blk(vma, 0)
+    b[:] = 1
+    transaction.commit()
+
+    assert (b == 1).all()
+
+    b[0] = 2
+    transaction.commit()
+
+    assert b[0] == 2
--- a/bigfile/virtmem.c
+++ b/bigfile/virtmem.c
@@ -388,7 +388,8 @@ int fileh_dirty_writeout(BigFileH *fileh, enum WriteoutFlags flags)
    BUG_ON(fileh->writeout_inprogress);
    fileh->writeout_inprogress = 1;

-    /* pages are stored (if stored) in sorted order */
+    /* pages are stored (if stored) in sorted order
+     * NOTE writeout of ZBlk format 'auto' relies on this */
    if (flags & WRITEOUT_STORE)
        list_sort(&fileh->dirty_pages, hpage_indirty_cmp_bypgoffset, NULL);


--- a/tox.ini
+++ b/tox.ini
 # wendelin.core | tox setup
 [tox]
-envlist = py27-{ZODB4,ZODB5}-{zblk0,zblk1}-{fs,zeo,neo}-{numpy115,numpy116}-{!wcfs,wcfs,wcfs:1,wcfs:2},
-          {py36,py37}-{ZODB4,ZODB5}-{zblk0,zblk1}-fs-{numpy115,numpy116}-{!wcfs,wcfs,wcfs:1,wcfs:2},
-          py36-{ZODB4,ZODB5}-{zblk0,zblk1}-zeo-{numpy115,numpy116}-{!wcfs,wcfs,wcfs:1,wcfs:2},
-          py37-ZODB5-{zblk0,zblk1}-zeo-{numpy115,numpy116-{!wcfs,wcfs,wcfs:1,wcfs:2}}
+envlist = py27-{ZODB4,ZODB5}-{zblk0,zblk1,auto}-{fs,zeo,neo}-{numpy115,numpy116}-{!wcfs,wcfs,wcfs:1,wcfs:2},
+          {py36,py37}-{ZODB4,ZODB5}-{zblk0,zblk1,auto}-fs-{numpy115,numpy116}-{!wcfs,wcfs,wcfs:1,wcfs:2},
+          py36-{ZODB4,ZODB5}-{zblk0,zblk1,auto}-zeo-{numpy115,numpy116}-{!wcfs,wcfs,wcfs:1,wcfs:2},
+          py37-ZODB5-{zblk0,zblk1,auto}-zeo-{numpy115,numpy116-{!wcfs,wcfs,wcfs:1,wcfs:2}}
 # (NOTE ZEO4 does not work with python3.7)
 # (NOTE NEO does not work on python3 at all)
 # (XXX ZODB5-*-neo are currently failing)
@@ -40,6 +40,7 @@ setenv =

    zblk0:  WENDELIN_CORE_ZBLK_FMT=ZBlk0
    zblk1:  WENDELIN_CORE_ZBLK_FMT=ZBlk1
+    auto:   WENDELIN_CORE_ZBLK_FMT=auto

    !wcfs:  WENDELIN_CORE_VIRTMEM=rw:uvmm
    wcfs:   WENDELIN_CORE_VIRTMEM=r:wcfs+w:uvmm