Commit 8293025b authored by Kirill Smelkov's avatar Kirill Smelkov

X Thoughts on how to avoid readahead touching pages of neighbour block

parent 8687b6f6
...@@ -35,6 +35,13 @@ cpdef enum: ...@@ -35,6 +35,13 @@ cpdef enum:
MCL_FUTURE = mman.MCL_FUTURE MCL_FUTURE = mman.MCL_FUTURE
MCL_ONFAULT = mman.MCL_ONFAULT MCL_ONFAULT = mman.MCL_ONFAULT
# XXX there is no mman.MADV_{NORMAL,RANDOM} -> use POSIX_... as workaround
MADV_NORMAL = mman.POSIX_MADV_NORMAL
MADV_RANDOM = mman.POSIX_MADV_RANDOM
MADV_SEQUENTIAL = mman.POSIX_MADV_SEQUENTIAL
MADV_WILLNEED = mman.POSIX_MADV_WILLNEED
MADV_DONTNEED = mman.POSIX_MADV_DONTNEED
# incore returns bytearray vector indicating whether page of mem is in core or not. # incore returns bytearray vector indicating whether page of mem is in core or not.
# #
# mem start must be page-aligned. # mem start must be page-aligned.
...@@ -96,7 +103,7 @@ def map_ro(int fd, off_t offset, size_t size): ...@@ -96,7 +103,7 @@ def map_ro(int fd, off_t offset, size_t size):
return <unsigned char[:size:1]>addr return <unsigned char[:size:1]>addr
# unmap unmaps memory coverd by mem. # unmap unmaps memory covered by mem.
def unmap(const unsigned char[::1] mem not None): def unmap(const unsigned char[::1] mem not None):
cdef const void *addr = &mem[0] cdef const void *addr = &mem[0]
cdef size_t size = mem.shape[0] cdef size_t size = mem.shape[0]
...@@ -105,3 +112,16 @@ def unmap(const unsigned char[::1] mem not None): ...@@ -105,3 +112,16 @@ def unmap(const unsigned char[::1] mem not None):
if err: if err:
PyErr_SetFromErrno(OSError) PyErr_SetFromErrno(OSError)
# ok # ok
# advise advises kernel about use of mem's memory.
#
# advice is passed through to posix_madvise as-is; the MADV_* constants
# exported by this module are defined via the corresponding POSIX_MADV_* values.
#
# OSError is raised if the kernel rejects the request.
#
# see madvise(2) and posix_madvise(3) for details.
def advise(const unsigned char[::1] mem not None, int advice):
    cdef const void *addr = &mem[0]
    cdef size_t      size = mem.shape[0]

    # XXX using posix_madvise, not madvise
    cdef int err = mman.posix_madvise(<void *>addr, size, advice)
    if err:
        # posix_madvise, unlike madvise, returns the error number directly and
        # does not set errno -> build OSError from the returned code instead
        # of from (possibly stale) errno.
        import os
        raise OSError(err, os.strerror(err))
...@@ -428,13 +428,40 @@ class tFile: ...@@ -428,13 +428,40 @@ class tFile:
t.f = tdb._open(zf, at=at) t.f = tdb._open(zf, at=at)
t.blksize = zf.blksize t.blksize = zf.blksize
# mmap the file past the end up to _max_tracked_pages and lock the # mmap the file past the end up to _max_tracked_pages and setup
# pages with MLOCK_ONFAULT. This way when a page is read by mmap access # invariants on which we rely to verify OS cache state:
# we have the guarantee from kernel that the page will stay in #
# pagecache. We rely on this to verify OS cache state. # 1. lock pages with MLOCK_ONFAULT: this way when a page is read by
# mmap access we have the guarantee from kernel that the page will
# stay in pagecache.
#
# 2. madvise in interleaved mode blocks memory to be either
# MADV_NORMAL or MADV_RANDOM. This adjusts kernel readahead (which
# triggers for MADV_NORMAL memory) to not go over to next block and
# thus a read access to one block won't trigger implicit read access
# to neighbour block.
#
# https://www.quora.com/What-heuristics-does-the-adaptive-readahead-implementation-in-the-Linux-kernel-use
# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/madvise.c?h=v5.2-rc4#n51
#
# don't disable readahead universally, since enabled readahead helps
# to test how wcfs handles simultaneous read vs wcfs uploading data
# for the same block into OS cache.
assert t.blksize % mm.PAGE_SIZE == 0 assert t.blksize % mm.PAGE_SIZE == 0
t.fmmap = mm.map_ro(t.f.fileno(), 0, t._max_tracked_pages*t.blksize) t.fmmap = mm.map_ro(t.f.fileno(), 0, t._max_tracked_pages*t.blksize)
mm.lock(t.fmmap, mm.MLOCK_ONFAULT) mm.lock(t.fmmap, mm.MLOCK_ONFAULT)
for blk in range(t._max_tracked_pages):
blkmmap = t.fmmap[blk*t.blksize:(blk+1)*t.blksize]
# FIXME somehow this does not completely prevent readahead from going into a MADV_RANDOM page
# NOTE with MADV_RANDOM the kernel issues 4K sized reads; wcfs
# starts uploading into cache almost immediately, but the kernel
# still issues many reads to read the full 2MB of the block. This
# is slow.
# XXX -> make read(while-uploading) wait for uploading to complete
# and only then return? (maybe it will help performance even in normal case)
mm.advise(blkmmap, (mm.MADV_NORMAL, mm.MADV_RANDOM)[blk%2])
#mm.advise(blkmmap, mm.MADV_NORMAL)
#mm.advise(blkmmap, mm.MADV_RANDOM)
tdb._files.add(t) tdb._files.add(t)
...@@ -490,6 +517,7 @@ class tFile: ...@@ -490,6 +517,7 @@ class tFile:
# pinokByWLink: {} tWatchLink -> {} blk -> at. # pinokByWLink: {} tWatchLink -> {} blk -> at.
# pinokByWLink can be None - in that case it is computed automatically. # pinokByWLink can be None - in that case it is computed automatically.
def assertBlk(t, blk, dataok, pinokByWLink=None): def assertBlk(t, blk, dataok, pinokByWLink=None):
print('assertBlk #%d' % blk)
if not isinstance(dataok, bytes): if not isinstance(dataok, bytes):
dataok = dataok.encode('utf-8') dataok = dataok.encode('utf-8')
assert len(dataok) <= t.blksize assert len(dataok) <= t.blksize
...@@ -499,7 +527,7 @@ class tFile: ...@@ -499,7 +527,7 @@ class tFile:
assert blk < t._sizeinblk() assert blk < t._sizeinblk()
cached = t.cached()[blk] cached = t.cached()[blk]
assert cached in (0, 1) # XXX temp - breaks becuase of kernel readahead assert cached in (0, 1), "blk #%d" % blk # XXX temp - breaks becuase of kernel readahead
shouldPin = False # whether at least one wlink should receive a pin shouldPin = False # whether at least one wlink should receive a pin
# watches must be notified if access goes to @head/file; not if to @rev/file. XXX text # watches must be notified if access goes to @head/file; not if to @rev/file. XXX text
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment