virtmem: Benchmarks for pagefault handling

Benchmark the time it takes for virtmem to handle a pagefault with a noop loadblk, for loadblk implemented both in C and in Python.

On my computer it is:

    name          µs/op
    PagefaultC    269 ± 0%
    pagefault_py  291 ± 0%

Quite a big time, in other words.

It turned out to be mostly spent in fallocate'ing pages on tmpfs from /dev/shm. Part of the above 269 µs/op is taken by freeing (reclaiming) pages back when the benchmarking work size exceeds the /dev/shm size, and part by allocating. If I limit the work size (via npage in benchmem.c) to be less than the whole /dev/shm, it becomes ~ 170 µs/op, and with additional tracing it shows as something like this:

    .. on_pagefault_start       0.954 µs
    .. vma_on_pagefault_pre     0.954 µs
    .. ramh_alloc_page_pre      0.954 µs
    .. ramh_alloc_page        169.992 µs
    .. vma_on_pagefault       172.853 µs
    .. vma_on_pagefault_pre   172.853 µs
    .. vma_on_pagefault       174.046 µs
    .. on_pagefault_end       174.046 µs
    .. whole:                 171.900 µs

so almost all the time is spent in ramh_alloc_page, which is doing the fallocate:

    https://lab.nexedi.com/nexedi/wendelin.core/blob/f11386a4/bigfile/ram_shmfs.c#L125

A simple benchmark[1] confirmed that fallocate(tmpfs) is indeed relatively slow[2] (and that for recent kernels it regressed somewhat compared to Linux 3.16). The profile flamegraph for that benchmark[3] shows the internal loading of shmem_fallocate, which for 1 hardware page is not too slow (e.g. <1µs), but when a request comes for a region it internally performs it page by page and so accumulates that ~ 170µs for 2M.

I've tried to briefly rerun the benchmark with huge pages activated on /dev/shm via

    mount /dev/shm -o huge=always,remount

as both a regular user and as root, but it was executing several times slower. Probably something to investigate more later.

[1] https://lab.nexedi.com/kirr/misc/blob/4f84a06e/tmpfs/t_fallocate.c
[2] https://lab.nexedi.com/kirr/misc/blob/4f84a06e/tmpfs/1.txt
[3] https://lab.nexedi.com/kirr/misc/raw/4f84a06e/tmpfs/fallocate-2M-nohuge.svg

virtmem: Benchmarks for pagefault handling
Benchmark the time it takes for virtmem to handle a pagefault with a noop loadblk, for loadblk implemented both in C and in Python.

On my computer it is:

    name          µs/op
    PagefaultC    269 ± 0%
    pagefault_py  291 ± 0%

Quite a big time, in other words.

It turned out to be mostly spent in fallocate'ing pages on tmpfs from /dev/shm. Part of the above 269 µs/op is taken by freeing (reclaiming) pages back when the benchmarking work size exceeds the /dev/shm size, and part by allocating. If I limit the work size (via npage in benchmem.c) to be less than the whole /dev/shm, it becomes ~ 170 µs/op, and with additional tracing it shows as something like this:

    .. on_pagefault_start       0.954 µs
    .. vma_on_pagefault_pre     0.954 µs
    .. ramh_alloc_page_pre      0.954 µs
    .. ramh_alloc_page        169.992 µs
    .. vma_on_pagefault       172.853 µs
    .. vma_on_pagefault_pre   172.853 µs
    .. vma_on_pagefault       174.046 µs
    .. on_pagefault_end       174.046 µs
    .. whole:                 171.900 µs

so almost all the time is spent in ramh_alloc_page, which is doing the fallocate:

    https://lab.nexedi.com/nexedi/wendelin.core/blob/f11386a4/bigfile/ram_shmfs.c#L125

A simple benchmark[1] confirmed that fallocate(tmpfs) is indeed relatively slow[2] (and that for recent kernels it regressed somewhat compared to Linux 3.16). The profile flamegraph for that benchmark[3] shows the internal loading of shmem_fallocate, which for 1 hardware page is not too slow (e.g. <1µs), but when a request comes for a region it internally performs it page by page and so accumulates that ~ 170µs for 2M.

I've tried to briefly rerun the benchmark with huge pages activated on /dev/shm via

    mount /dev/shm -o huge=always,remount

as both a regular user and as root, but it was executing several times slower. Probably something to investigate more later.

[1] https://lab.nexedi.com/kirr/misc/blob/4f84a06e/tmpfs/t_fallocate.c
[2] https://lab.nexedi.com/kirr/misc/blob/4f84a06e/tmpfs/1.txt
[3] https://lab.nexedi.com/kirr/misc/raw/4f84a06e/tmpfs/fallocate-2M-nohuge.svg
3cfc2728 · Kirill Smelkov · 51f252d4 · 3cfc2728 · 3cfc2728 · 3cfc2728
Commit 3cfc2728 authored Dec 06, 2017 by Kirill Smelkov
5 changed files
--- a/Makefile
+++ b/Makefile
@@ -192,5 +192,10 @@ test.fault : $(FAULTS:%=%.tfault)


 # -*- benchmarking -*-
-bench	: bigfile/_bigfile.so
+# C benchmark programs: every bigfile/tests/bench_*.c, with the .c suffix stripped
+BENCHV.C:= $(patsubst %.c,%,$(wildcard bigfile/tests/bench_*.c))
+# `make bench` runs both the C and the Python benchmarks
+bench	: bench.t bench.py
+
+# C benchmarks: one <name>.trun prerequisite per benchmark program
+# NOTE(review): the %.trun rule is defined elsewhere in this Makefile - confirm it builds and runs the program
+bench.t	: $(BENCHV.C:%=%.trun)
+
+# Python benchmarks; depends on bigfile/_bigfile.so being built first
+bench.py: bigfile/_bigfile.so
 	$(PYBENCH) $(PYTEST_IGNORE)
--- a/bigfile/tests/bench_0virtmem.py
+++ b/bigfile/tests/bench_0virtmem.py
@@ -29,6 +29,7 @@ from io import FileIO
 from wendelin.bigfile.file_file import BigFile_File
 from wendelin.bigfile import WRITEOUT_STORE, WRITEOUT_MARKSTORED
 from wendelin.lib.testing import Adler32, nulladler32_bysize, ffadler32_bysize
+from wendelin.bigarray.tests.test_basic import BigFile_Zero
 from wendelin.lib.mem import bzero, memset
 from tempfile import NamedTemporaryFile

@@ -69,6 +70,36 @@ def teardown_module():
    unlink(tmpf.name)


+# BigFile that reads as zeros and tracks last loadblk request
+class BigFile_ZeroTrack(BigFile_Zero):
+
+    def loadblk(self, blk, buf):
+        #print('zload #%d' % blk)
+        self.last_load = blk
+        super(BigFile_ZeroTrack, self).loadblk(blk, buf)
+
+# benchmark the time it takes for virtmem to handle pagefault with noop loadblk
+# implemented  in Python.
+def bench_pagefault_py(b):
+    npage = b.N
+    PS  = blksize   # XXX assumes blksize = pagesize
+
+    f   = BigFile_ZeroTrack(PS)
+    fh  = f.fileh_open()
+    vma = fh.mmap(0, npage)
+    m   = memoryview(vma)
+
+    b.reset_timer()
+    for p in xrange(npage):
+        m[p*PS]
+        assert f.last_load == p
+
+    del m
+    del vma # vma.close()
+    del fh  # fh.close()
+    del f   # f.close()
+
+
 # compute hash via mmaping the file at OS-level
 def _bench_file_mmapread(hasher, expect):
    fd = os.open(tmpf.name, O_RDONLY)

--- a/bigfile/tests/bench_virtmem.c
+++ b/bigfile/tests/bench_virtmem.c
+/* Wendelin.bigfile | virtual memory benchmarks
+ * Copyright (C) 2017  Nexedi SA and Contributors.
+ *                     Kirill Smelkov <kirr@nexedi.com>
+ *
+ * This program is free software: you can Use, Study, Modify and Redistribute
+ * it under the terms of the GNU General Public License version 3, or (at your
+ * option) any later version, as published by the Free Software Foundation.
+ *
+ * You can also Link and Combine this program with other software covered by
+ * the terms of any of the Free Software licenses or any of the Open Source
+ * Initiative approved licenses and Convey the resulting work. Corresponding
+ * source of such a combination shall include the source code for all other
+ * software used.
+ *
+ * This program is distributed WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See COPYING file for full licensing terms.
+ * See https://www.nexedi.com/licensing for rationale and options.
+ */
+
+// XXX better link with it
+#include "../virtmem.c"
+#include    "../pagemap.c"
+#include    "../ram.c"
+#include "../ram_shmfs.c"
+#include "../pagefault.c"
+
+#include <ccan/tap/tap.h>
+
+#include "../../t/t_utils.h"
+#include   "../../t/t_utils.c"
+
+/* BigFile_ZeroTrack - file that reads as zeros and tracks last loadblk request.
+ *
+ * BigFile is embedded as an unnamed member, so its fields (e.g. blksize,
+ * file_ops) are accessed directly on a BigFile_ZeroTrack.
+ * NOTE(review): an unnamed member given by a typedef name is a compiler
+ * extension (e.g. GCC -fplan9-extensions / -fms-extensions) - confirm build flags.
+ */
+typedef struct BigFile_ZeroTrack {
+    BigFile;                /* base file */
+    blk_t   last_load;      /* block number of the most recent loadblk call */
+} BigFile_ZeroTrack;
+
+/* loadblk for BigFile_ZeroTrack: remember blk as the last requested block.
+ *
+ * buf is left untouched - the memory buf obtained from OS comes pre-cleared.
+ * XXX clear buf explicitly once/if memory comes uninitialized here.
+ *
+ * Always returns 0.
+ */
+int zero_loadblk(BigFile *file0, blk_t blk, void *buf)
+{
+    BigFile_ZeroTrack *ztrack = upcast(BigFile_ZeroTrack *, file0);
+
+    //diag("zload #%ld", blk);
+    ztrack->last_load = blk;
+
+    return 0;
+}
+
+/* operations of the zero-tracking file: only loadblk is provided */
+static const struct bigfile_ops filez_ops = {
+    .loadblk  = zero_loadblk,
+    .storeblk = NULL,   /* XXX */
+    .release  = NULL,   /* XXX */
+};
+
+/* benchmark the time it takes for virtmem to handle pagefault with noop loadblk
+ *
+ * A zero-reading tracking file is mapped for npage pages and the first byte
+ * of every page is accessed once. After each access the file's last_load is
+ * compared with the page number just touched (blksize is set to the RAM page
+ * size for this purpose). The average time per access is printed as a
+ * "BenchmarkPagefaultC" line in µs/op.
+ */
+void bench_pagefault(void) {
+    RAM *ram;
+    BigFileH fh_struct, *fh = &fh_struct;
+    VMA vma_struct, *vma = &vma_struct;
+    pgoff_t p, npage = 10000;
+    size_t PS;
+    int err;
+
+    double Tstart, Tend;
+
+    ok1(!pagefault_init());
+
+    ram = ram_new(NULL,NULL);
+    ok1(ram);
+    PS = ram->pagesize;
+
+    /* setup zero file */
+    BigFile_ZeroTrack f = {
+        .blksize    = ram->pagesize,    /* artificially blksize = pagesize */
+        .file_ops   = &filez_ops,
+    };
+
+    /* setup f mapping */
+    err = fileh_open(fh, &f, ram);
+    ok1(!err);
+
+    err = fileh_mmap(vma, fh, 0, npage);
+    ok1(!err);
+
+    Tstart = microtime();
+
+    // access first byte of every page
+    for (p = 0; p < npage; p++) {
+        b(vma, p * PS);
+        if (f.last_load != p) {
+            /* pgoff_t and blk_t are typedefs declared elsewhere: convert
+             * explicitly so the arguments always match %ld */
+            fail("accessed page #%ld but last loadblk was for block #%ld",
+                 (long)p, (long)f.last_load);
+        }
+    }
+
+    Tend = microtime();
+
+    /* microtime() is in seconds -> scale the per-page average to µs;
+     * npage is converted explicitly to match %ld */
+    printf("BenchmarkPagefaultC\t%ld\t%.3lf µs/op\n", (long)npage, (Tend - Tstart) * 1E6 / npage);
+
+    vma_unmap(vma);
+    fileh_close(fh);
+    ram_close(ram);
+}
+
+/* run the pagefault benchmark several times in a row */
+int main()
+{
+    const int nrun = 3;
+
+    // XXX to catch failure immediately
+    tap_fail_callback = abort;
+
+    for (int run = 0; run < nrun; run++) {
+        bench_pagefault();
+    }
+
+    return 0;
+}
--- a/t/t_utils.c
+++ b/t/t_utils.c
@@ -21,6 +21,8 @@

 #include <wendelin/utils.h>

+#include <sys/time.h>
+

 static const struct ram_ops ram_limited_ops;
 static const struct ramh_ops ramh_limited_ops;
@@ -155,3 +157,17 @@ static const struct ramh_ops ramh_limited_ops = {
    .mmap_page      = ramh_limited_mmap_page,
    .close          = ramh_limited_close,
 };
+
+
+/* microtime returns current time as float: seconds, with the sub-second part
+ * taken from gettimeofday's microseconds.
+ *
+ * If gettimeofday fails, the error is reported via perror and the program
+ * is aborted.
+ */
+double microtime(void) {
+        struct timeval tv;
+
+        if (gettimeofday(&tv, NULL) == -1) {
+                perror("gettimeofday");
+                abort();
+        }
+
+        return tv.tv_sec + 1E-6 * tv.tv_usec;
+}
--- a/t/t_utils.h
+++ b/t/t_utils.h
@@ -43,4 +43,7 @@ typedef struct RAMLimited RAMLimited;

 RAMLimited *ram_limited_new(RAM *backend, size_t alloc_max);

+/* current time as float: seconds, including the sub-second (microsecond) part */
+double microtime();
+
 #endif