zodbanalyze: now supports both FileStorage and repozo deltafs

/reviewed-on nexedi/zodbtools!1 /see-also nexedi/slapos!116

zodbanalyze: now supports both FileStorage and repozo deltafs
/reviewed-on nexedi/zodbtools!1 /see-also nexedi/slapos!116
1e506a81 · Kazuhiko Shiozaki · Kirill Smelkov · ab17cf2d · 1e506a81 · 1e506a81
Commit 1e506a81, authored Nov 17, 2016 by Kazuhiko Shiozaki; committed by Kirill Smelkov on Nov 17, 2016.
Hide whitespace changes
Inline Side-by-side

Showing 3 changed files with 140 additions and 62 deletions

README.rst README.rst +1 -0

setup.py setup.py +3 -2

zodbtool/zodbanalyze.py zodbtool/zodbanalyze.py +136 -60

No files found.
--- a/README.rst
+++ b/README.rst
@@ -8,5 +8,6 @@ scripts anymore. So we are here:
 __ https://github.com/zopefoundation/ZODB/pull/128#issuecomment-260970932
+- `zodbanalyze` - analyze FileStorage or repozo deltafs usage.
 - `zodbcmp` - compare content of two ZODB databases bit-to-bit.
 - `zodbdump` - dump content of a ZODB database.
--- a/setup.py
+++ b/setup.py
@@ -26,8 +26,9 @@ setup(
    # zodb cmd ...
    # zodb dump ...
    entry_points= {'console_scripts': [
-                        'zodbcmp    = zodbtool.zodbcmp:main',
+                        'zodbanalyze = zodbtool.zodbanalyze:main',
-                        'zodbdump   = zodbtool.zodbdump:main',
+                        'zodbcmp     = zodbtool.zodbcmp:main',
+                        'zodbdump    = zodbtool.zodbdump:main',
                      ]
                  },

--- a/zodbtool/zodbanalyze.py
+++ b/zodbtool/zodbanalyze.py
-#!/usr/bin/env python2.4
+#!/usr/bin/env python
 # Based on a transaction analyzer by Matt Kromer.
@@ -8,12 +8,43 @@ import getopt
 import anydbm as dbm
 import tempfile
 import shutil
-from ZODB.FileStorage import FileStorage
+from ZODB.FileStorage import FileIterator, FileStorage, packed_version
+from ZODB.FileStorage.format import FileStorageFormatter
 from ZODB.utils import get_pickle_metadata
+class DeltaFileStorage(
+    FileStorageFormatter,
+    ):
+    def __init__(self, file_name, **kw):
+        self._file_name = file_name
+    def iterator(self, start=None, stop=None):
+        return DeltaFileIterator(self._file_name, start, stop)
+class DeltaFileIterator(FileIterator):
+    def __init__(self, filename, start=None, stop=None, pos=0L):
+        assert isinstance(filename, str)
+        file = open(filename, 'rb')
+        self._file = file
+        file.seek(0,2)
+        self._file_size = file.tell()
+        if pos > self._file_size:
+            raise ValueError("Given position is greater than the file size",
+                             pos, self._file_size)
+        self._pos = pos
+        assert start is None or isinstance(start, str)
+        assert stop is None or isinstance(stop, str)
+        self._start = start
+        self._stop = stop
+        if start:
+            if self._file_size <= 4:
+                return
+            self._skip_to_start(start)
 class Report:
-    def __init__(self, use_dbm=False):
+    def __init__(self, use_dbm=False, delta_fs=False):
        self.use_dbm = use_dbm
+        self.delta_fs = delta_fs
        if use_dbm:
            self.temp_dir = tempfile.mkdtemp()
            self.OIDMAP = dbm.open(os.path.join(self.temp_dir, 'oidmap.db'),
@@ -52,6 +83,7 @@ def shorten(s, n):
    return "..." + s
 def report(rep, csv=False):
+    delta_fs = rep.delta_fs
    if not csv:
        print "Processed %d records in %d transactions" % (rep.OIDS, rep.TIDS)
        print "Average record size is %7.2f bytes" % (rep.DBYTES * 1.0 / rep.OIDS)
@@ -59,17 +91,28 @@ def report(rep, csv=False):
               (rep.DBYTES * 1.0 / rep.TIDS))
        print "Types used:"
-    if csv:
+    if delta_fs:
-        fmt = "%s,%s,%s,%s,%s,%s,%s,%s,%s"
+        if csv:
-        fmtp = "%s,%d,%d,%f%%,%f,%d,%d,%d,%d" # per-class format
+            fmt = "%s,%s,%s,%s,%s"
+            fmtp = "%s,%d,%d,%f%%,%f" # per-class format
+        else:
+            fmt = "%-46s %7s %9s %6s %7s"
+            fmtp = "%-46s %7d %9d %5.1f%% %7.2f" # per-class format
+        print fmt % ("Class Name", "T.Count", "T.Bytes", "Pct", "AvgSize")
+        if not csv:
+            print fmt % ('-'*46, '-'*7, '-'*9, '-'*5, '-'*7)
    else:
-        fmt = "%-46s %7s %9s %6s %7s %7s %9s %7s %9s"
+        if csv:
-        fmtp = "%-46s %7d %9d %5.1f%% %7.2f %7d %9d %7d %9d" # per-class format
+            fmt = "%s,%s,%s,%s,%s,%s,%s,%s,%s"
+            fmtp = "%s,%d,%d,%f%%,%f,%d,%d,%d,%d" # per-class format
+        else:
+            fmt = "%-46s %7s %9s %6s %7s %7s %9s %7s %9s"
+            fmtp = "%-46s %7d %9d %5.1f%% %7.2f %7d %9d %7d %9d" # per-class format
+        print fmt % ("Class Name", "T.Count", "T.Bytes", "Pct", "AvgSize",
+                     "C.Count", "C.Bytes", "O.Count", "O.Bytes")
+        if not csv:
+            print fmt % ('-'*46, '-'*7, '-'*9, '-'*5, '-'*7, '-'*7, '-'*9, '-'*7, '-'*9)
    fmts = "%46s %7d %8dk %5.1f%% %7.2f" # summary format
-    print fmt % ("Class Name", "T.Count", "T.Bytes", "Pct", "AvgSize",
-                 "C.Count", "C.Bytes", "O.Count", "O.Bytes")
-    if not csv:
-        print fmt % ('-'*46, '-'*7, '-'*9, '-'*5, '-'*7, '-'*7, '-'*9, '-'*7, '-'*9)
    typemap = rep.TYPEMAP.keys()
    typemap.sort(key=lambda a:rep.TYPESIZE[a])
    cumpct = 0.0
@@ -80,32 +123,46 @@ def report(rep, csv=False):
            t_display = t
        else:
            t_display = shorten(t, 46)
-        print fmtp % (t_display, rep.TYPEMAP[t], rep.TYPESIZE[t],
+        if delta_fs:
-                      pct, rep.TYPESIZE[t] * 1.0 / rep.TYPEMAP[t],
+            print fmtp % (t_display, rep.TYPEMAP[t], rep.TYPESIZE[t],
-                      rep.COIDSMAP[t], rep.CBYTESMAP[t],
+                          pct, rep.TYPESIZE[t] * 1.0 / rep.TYPEMAP[t])
-                      rep.FOIDSMAP.get(t, 0), rep.FBYTESMAP.get(t, 0))
+        else:
+            print fmtp % (t_display, rep.TYPEMAP[t], rep.TYPESIZE[t],
+                          pct, rep.TYPESIZE[t] * 1.0 / rep.TYPEMAP[t],
+                          rep.COIDSMAP[t], rep.CBYTESMAP[t],
+                          rep.FOIDSMAP.get(t, 0), rep.FBYTESMAP.get(t, 0))
    if csv:
        return
-    print fmt % ('='*46, '='*7, '='*9, '='*5, '='*7, '='*7, '='*9, '='*7, '='*9)
+    if delta_fs:
-    print "%46s %7d %9s %6s %6.2fk" % ('Total Transactions', rep.TIDS, ' ',
+        print fmt % ('='*46, '='*7, '='*9, '='*5, '='*7)
-        ' ', rep.DBYTES * 1.0 / rep.TIDS / 1024.0)
+        print "%46s %7d %9s %6s %6.2f" % ('Total Transactions', rep.TIDS, ' ',
-    print fmts % ('Total Records', rep.OIDS, rep.DBYTES / 1024.0, cumpct,
+                                          ' ', rep.DBYTES * 1.0 / rep.TIDS)
-                  rep.DBYTES * 1.0 / rep.OIDS)
+        print fmts % ('Total Records', rep.OIDS, rep.DBYTES, cumpct,
+                      rep.DBYTES * 1.0 / rep.OIDS)
-    print fmts % ('Current Objects', rep.COIDS, rep.CBYTES / 1024.0,
+    else:
-                  rep.CBYTES * 100.0 / rep.DBYTES,
+        print fmt % ('='*46, '='*7, '='*9, '='*5, '='*7, '='*7, '='*9, '='*7, '='*9)
-                  rep.CBYTES * 1.0 / rep.COIDS)
+        print "%46s %7d %9s %6s %6.2fk" % ('Total Transactions', rep.TIDS, ' ',
-    if rep.FOIDS:
+            ' ', rep.DBYTES * 1.0 / rep.TIDS / 1024.0)
-        print fmts % ('Old Objects', rep.FOIDS, rep.FBYTES / 1024.0,
+        print fmts % ('Total Records', rep.OIDS, rep.DBYTES / 1024.0, cumpct,
-                      rep.FBYTES * 100.0 / rep.DBYTES,
+                      rep.DBYTES * 1.0 / rep.OIDS)
-                      rep.FBYTES * 1.0 / rep.FOIDS)
+        print fmts % ('Current Objects', rep.COIDS, rep.CBYTES / 1024.0,
-def analyze(path, use_dbm):
+                      rep.CBYTES * 100.0 / rep.DBYTES,
-    fs = FileStorage(path, read_only=1)
+                      rep.CBYTES * 1.0 / rep.COIDS)
+        if rep.FOIDS:
+            print fmts % ('Old Objects', rep.FOIDS, rep.FBYTES / 1024.0,
+                          rep.FBYTES * 100.0 / rep.DBYTES,
+                          rep.FBYTES * 1.0 / rep.FOIDS)
+def analyze(path, use_dbm, delta_fs):
+    if delta_fs:
+        fs = DeltaFileStorage(path, read_only=1)
+    else:
+        fs = FileStorage(path, read_only=1)
    fsi = fs.iterator()
-    report = Report(use_dbm)
+    report = Report(use_dbm, delta_fs)
    for txn in fsi:
        analyze_trans(report, txn)
    if use_dbm:
@@ -130,44 +187,52 @@ def analyze_rec(report, record):
    try:
        size = len(record.data) # Ignores various overhead
        report.DBYTES += size
-        if oid not in report.OIDMAP:
+        if report.delta_fs:
            type = get_type(record)
-            report.OIDMAP[oid] = type
+            report.TYPEMAP[type] = report.TYPEMAP.get(type, 0) + 1
-            if report.use_dbm:
+            report.TYPESIZE[type] = report.TYPESIZE.get(type, 0) + size
-                report.USEDMAP[oid] = str(size)
-            else:
-                report.USEDMAP[oid] = size
-            report.COIDS += 1
-            report.CBYTES += size
-            report.COIDSMAP[type] = report.COIDSMAP.get(type, 0) + 1
-            report.CBYTESMAP[type] = report.CBYTESMAP.get(type, 0) + size
        else:
-            type = report.OIDMAP[oid]
+            if oid not in report.OIDMAP:
-            if report.use_dbm:
+                type = get_type(record)
-                fsize = int(report.USEDMAP[oid])
+                report.OIDMAP[oid] = type
-                report.USEDMAP[oid] = str(size)
+                if report.use_dbm:
+                    report.USEDMAP[oid] = str(size)
+                else:
+                    report.USEDMAP[oid] = size
+                report.COIDS += 1
+                report.CBYTES += size
+                report.COIDSMAP[type] = report.COIDSMAP.get(type, 0) + 1
+                report.CBYTESMAP[type] = report.CBYTESMAP.get(type, 0) + size
            else:
-                fsize = report.USEDMAP[oid]
+                type = report.OIDMAP[oid]
-                report.USEDMAP[oid] = size
+                if report.use_dbm:
-            report.FOIDS += 1
+                    fsize = int(report.USEDMAP[oid])
-            report.FBYTES += fsize
+                    report.USEDMAP[oid] = str(size)
-            report.CBYTES += size - fsize
+                else:
-            report.FOIDSMAP[type] = report.FOIDSMAP.get(type, 0) + 1
+                    fsize = report.USEDMAP[oid]
-            report.FBYTESMAP[type] = report.FBYTESMAP.get(type, 0) + fsize
+                    report.USEDMAP[oid] = size
-            report.CBYTESMAP[type] = report.CBYTESMAP.get(type, 0) + size - fsize
+                report.FOIDS += 1
-        report.TYPEMAP[type] = report.TYPEMAP.get(type, 0) + 1
+                report.FBYTES += fsize
-        report.TYPESIZE[type] = report.TYPESIZE.get(type, 0) + size
+                report.CBYTES += size - fsize
+                report.FOIDSMAP[type] = report.FOIDSMAP.get(type, 0) + 1
+                report.FBYTESMAP[type] = report.FBYTESMAP.get(type, 0) + fsize
+                report.CBYTESMAP[type] = report.CBYTESMAP.get(type, 0) + size - fsize
+            report.TYPEMAP[type] = report.TYPEMAP.get(type, 0) + 1
+            report.TYPESIZE[type] = report.TYPESIZE.get(type, 0) + size
    except Exception, err:
        print err
-__doc__ = """%(program)s: Data.fs analyzer
+__doc__ = """%(program)s: Analyzer for FileStorage data or repozo deltafs
-usage: %(program)s [options] /path/to/Data.fs
+usage: %(program)s [options] /path/to/Data.fs (or /path/to/file.deltafs)
 Options:
  -h, --help                 this help screen
  -c, --csv                  output CSV
  -d, --dbm                  use DBM as temporary storage to limit memory usage
+                             (no meaning for deltafs case)
+Note:
+  Input deltafs file should be uncompressed.
 """
 def usage(stream, msg=None):
@@ -196,7 +261,18 @@ def main():
        if opt in ('-h', '--help'):
            usage(sys.stdout)
            sys.exit()
-    report(analyze(path, use_dbm), csv)
+    header = open(path, 'rb').read(4)
+    if header == packed_version:
+        delta_fs = False
+    else:
+        delta_fs = True
+        _orig_read_data_header = FileStorageFormatter._read_data_header
+        def _read_data_header(self, pos, oid=None):
+            h = _orig_read_data_header(self, pos, oid=oid)
+            h.tloc = self._tpos
+            return h
+        FileStorageFormatter._read_data_header = _read_data_header
+    report(analyze(path, use_dbm, delta_fs), csv)
 if __name__ == "__main__":
    main()