Commit cdde581d authored by Kirill Smelkov

.

parent 3054e4a3
@@ -431,43 +431,44 @@ class tFile:
# mmap the file past the end up to _max_tracked_pages and setup
# invariants on which we rely to verify OS cache state:
#
-# 1. lock pages with MLOCK_ONFAULT: this way when a page is read by
+# 1. lock pages with MLOCK_ONFAULT: this way after a page is read by
# mmap access we have the guarantee from kernel that the page will
# stay in pagecache.
#
-# 2. madvise in interleaved mode blocks memory to be either
-# MADV_NORMAL or MAD_RANDOM. This adjusts kernel readahead (which
-# triggers for MADV_NORMAL memory) to not go over to next block and
-# thus a read access to one block won't trigger implicit read access
-# to neighbour block.
+# 2. madvise memory with MADV_NORMAL and MADV_RANDOM in interleaved
+# mode. This adjusts kernel readahead (which triggers for a MADV_NORMAL
+# vma) to not cross into the next block, and thus a read access to one
+# block won't trigger an implicit read access to its neighbour block.
#
# https://www.quora.com/What-heuristics-does-the-adaptive-readahead-implementation-in-the-Linux-kernel-use
# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/madvise.c?h=v5.2-rc4#n51
#
# don't disable readahead universally, since enabled readahead helps
-# to test how wcfs handles simultaneous read vs wcfs uploading data
-# for the same block into OS cache.
+# to test how wcfs handles a simultaneous read, triggered by async
+# kernel readahead, vs wcfs uploading data for the same block into the
+# OS cache. Also, fully enabled readahead is how wcfs is actually used.
assert t.blksize % mm.PAGE_SIZE == 0
t.fmmap = mm.map_ro(t.f.fileno(), 0, t._max_tracked_pages*t.blksize)
mm.lock(t.fmmap, mm.MLOCK_ONFAULT)
for blk in range(t._max_tracked_pages):
blkmmap = t.fmmap[blk*t.blksize:(blk+1)*t.blksize]
-# FIXME somehow does not completely prevent readahead to go into MADV_RANDOM page
-# NOTE with MADV_RANDOM the kernel issues 4K sized reads; wcfs
-# starts uploading into cache almost immediately, but the kernel
-# still issues many reads to read the full 2MB of the block. This
-# works slow.
-# XXX -> make read(while-uploading) wait for uploading to complete
-# and only then return? (maybe it will help performance even in normal case)
-#mm.advise(blkmmap, (mm.MADV_NORMAL, mm.MADV_RANDOM)[blk%2])
-#mm.advise(blkmmap, (mm.MADV_RANDOM, mm.MADV_NORMAL)[blk%2])
-#mm.advise(blkmmap, mm.MADV_NORMAL)
-#mm.advise(blkmmap, mm.MADV_RANDOM)
-# XXX vvv works - at the end of every block there is MAD_RANDOM
-# range which is wider than RA window (XXX implicit) and so RA
-# triggered before that, even if it overlaps with that last 1/4,
-# don't trigger RA that overlaps with next block.
+# NOTE the kernel does not start readahead from an access to a
+# MADV_RANDOM vma, but for a MADV_NORMAL vma it starts readahead which
+# can go _beyond_ the vma that was used to decide RA start. For this
+# reason - to prevent RA started at one block from overlapping with the
+# next block - we put a MADV_RANDOM vma at the end of every block,
+# covering the last 1/4 of it.
+# XXX implicit assumption that RA window is < 1/4·blksize
+#
+# NOTE with a block completely covered by MADV_RANDOM the kernel
+# issues 4K sized reads; wcfs starts uploading into cache almost
+# immediately, but the kernel still issues many reads to read the
+# full 2MB of the block. This is slow.
+# XXX -> investigate and maybe make read(while-uploading) wait for
+# uploading to complete and only then return? (maybe it will help
+# performance even in the normal case)
mm.advise(blkmmap[len(blkmmap)*3//4:], mm.MADV_RANDOM)
tdb._files.add(t)
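
Aside: below is a standalone sketch of the same setup, and of how the two invariants let one verify OS cache state, using only stock Python + ctypes (the stdlib exposes neither MLOCK_ONFAULT nor mincore). It assumes Linux with glibc >= 2.27 for the mlock2 wrapper; "data.bin", blksize and ntracked are made-up placeholders, and the real test goes through wendelin's own mm wrapper module instead.

import ctypes, os, resource

libc = ctypes.CDLL("libc.so.6", use_errno=True)
libc.mmap.restype  = ctypes.c_void_p
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int,
                      ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mlock2.argtypes  = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint]
libc.madvise.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]
libc.mincore.argtypes = [ctypes.c_void_p, ctypes.c_size_t,
                         ctypes.POINTER(ctypes.c_ubyte)]

PROT_READ, MAP_SHARED = 0x1, 0x01   # <sys/mman.h>
MADV_RANDOM   = 1                   # <sys/mman.h>
MLOCK_ONFAULT = 1                   # <linux/mman.h>

PAGE_SIZE = resource.getpagesize()
blksize   = 2*1024*1024             # example block size (2MB)
ntracked  = 8                       # example number of tracked blocks
assert blksize % PAGE_SIZE == 0

fd   = os.open("data.bin", os.O_RDONLY)         # hypothetical data file
size = ntracked*blksize
addr = libc.mmap(None, size, PROT_READ, MAP_SHARED, fd, 0)
if addr in (None, ctypes.c_void_p(-1).value):   # NULL or MAP_FAILED
    raise OSError(ctypes.get_errno(), "mmap")

# invariant 1: lock pages on fault - once a page was read it stays in pagecache
if libc.mlock2(addr, size, MLOCK_ONFAULT) != 0:
    raise OSError(ctypes.get_errno(), "mlock2")

# invariant 2: mark the last 1/4 of every block MADV_RANDOM, so that readahead
# started inside one block does not spill over into the next block
for blk in range(ntracked):
    tail = addr + blk*blksize + blksize*3//4
    if libc.madvise(tail, blksize//4, MADV_RANDOM) != 0:
        raise OSError(ctypes.get_errno(), "madvise")

def blkcached(blk):
    # -> fraction of pages of block blk that are currently in pagecache
    npages = blksize // PAGE_SIZE
    vec = (ctypes.c_ubyte * npages)()
    if libc.mincore(addr + blk*blksize, blksize, vec) != 0:
        raise OSError(ctypes.get_errno(), "mincore")
    return sum(v & 1 for v in vec) / npages

With MLOCK_ONFAULT in effect, a page that blkcached once reported as resident is guaranteed to stay resident, so a test can assert on pagecache state without racing against kernel eviction.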