Commit 1db98bcf authored by Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge still more updates from Andrew Morton:
 "18 patches.

  Subsystems affected by this patch series: mm (memcg and cleanups) and
  epoll"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/Kconfig: fix spelling mistake "whats" -> "what's"
  selftests/filesystems: expand epoll with epoll_pwait2
  epoll: wire up syscall epoll_pwait2
  epoll: add syscall epoll_pwait2
  epoll: convert internal api to timespec64
  epoll: eliminate unnecessary lock for zero timeout
  epoll: replace gotos with a proper loop
  epoll: pull all code between fetch_events and send_event into the loop
  epoll: simplify and optimize busy loop logic
  epoll: move eavail next to the list_empty_careful check
  epoll: pull fatal signal checks into ep_send_events()
  epoll: simplify signal handling
  epoll: check for events when removing a timed out thread from the wait queue
  mm/memcontrol:rewrite mem_cgroup_page_lruvec()
  mm, kvm: account kvm_vcpu_mmap to kmemcg
  mm/memcg: remove unused definitions
  mm/memcg: warning on !memcg after readahead page charged
  mm/memcg: bail early from swap accounting if memcg disabled
parents 3644e2d2 01ab1ede
...@@ -480,3 +480,4 @@ ...@@ -480,3 +480,4 @@
548 common pidfd_getfd sys_pidfd_getfd 548 common pidfd_getfd sys_pidfd_getfd
549 common faccessat2 sys_faccessat2 549 common faccessat2 sys_faccessat2
550 common process_madvise sys_process_madvise 550 common process_madvise sys_process_madvise
551 common epoll_pwait2 sys_epoll_pwait2
...@@ -454,3 +454,4 @@ ...@@ -454,3 +454,4 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
...@@ -38,7 +38,7 @@ ...@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
#define __NR_compat_syscalls 441 #define __NR_compat_syscalls 442
#endif #endif
#define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_CLONE
......
...@@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) ...@@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
__SYSCALL(__NR_faccessat2, sys_faccessat2) __SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_process_madvise 440 #define __NR_process_madvise 440
__SYSCALL(__NR_process_madvise, sys_process_madvise) __SYSCALL(__NR_process_madvise, sys_process_madvise)
#define __NR_epoll_pwait2 441
__SYSCALL(__NR_epoll_pwait2, sys_epoll_pwait2)
/* /*
* Please add new compat syscalls above this comment and update * Please add new compat syscalls above this comment and update
......
...@@ -361,3 +361,4 @@ ...@@ -361,3 +361,4 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
...@@ -440,3 +440,4 @@ ...@@ -440,3 +440,4 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
...@@ -446,3 +446,4 @@ ...@@ -446,3 +446,4 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
...@@ -379,3 +379,4 @@ ...@@ -379,3 +379,4 @@
438 n32 pidfd_getfd sys_pidfd_getfd 438 n32 pidfd_getfd sys_pidfd_getfd
439 n32 faccessat2 sys_faccessat2 439 n32 faccessat2 sys_faccessat2
440 n32 process_madvise sys_process_madvise 440 n32 process_madvise sys_process_madvise
441 n32 epoll_pwait2 sys_epoll_pwait2
...@@ -355,3 +355,4 @@ ...@@ -355,3 +355,4 @@
438 n64 pidfd_getfd sys_pidfd_getfd 438 n64 pidfd_getfd sys_pidfd_getfd
439 n64 faccessat2 sys_faccessat2 439 n64 faccessat2 sys_faccessat2
440 n64 process_madvise sys_process_madvise 440 n64 process_madvise sys_process_madvise
441 n64 epoll_pwait2 sys_epoll_pwait2
...@@ -428,3 +428,4 @@ ...@@ -428,3 +428,4 @@
438 o32 pidfd_getfd sys_pidfd_getfd 438 o32 pidfd_getfd sys_pidfd_getfd
439 o32 faccessat2 sys_faccessat2 439 o32 faccessat2 sys_faccessat2
440 o32 process_madvise sys_process_madvise 440 o32 process_madvise sys_process_madvise
441 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
...@@ -438,3 +438,4 @@ ...@@ -438,3 +438,4 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
...@@ -530,3 +530,4 @@ ...@@ -530,3 +530,4 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
...@@ -443,3 +443,4 @@ ...@@ -443,3 +443,4 @@
438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 sys_epoll_pwait2
...@@ -443,3 +443,4 @@ ...@@ -443,3 +443,4 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
...@@ -486,3 +486,4 @@ ...@@ -486,3 +486,4 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
...@@ -445,3 +445,4 @@ ...@@ -445,3 +445,4 @@
438 i386 pidfd_getfd sys_pidfd_getfd 438 i386 pidfd_getfd sys_pidfd_getfd
439 i386 faccessat2 sys_faccessat2 439 i386 faccessat2 sys_faccessat2
440 i386 process_madvise sys_process_madvise 440 i386 process_madvise sys_process_madvise
441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
...@@ -362,6 +362,7 @@ ...@@ -362,6 +362,7 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
# #
# Due to a historical design error, certain syscalls are numbered differently # Due to a historical design error, certain syscalls are numbered differently
......
...@@ -9869,7 +9869,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) ...@@ -9869,7 +9869,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
r = -ENOMEM; r = -ENOMEM;
page = alloc_page(GFP_KERNEL | __GFP_ZERO); page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page) if (!page)
goto fail_free_lapic; goto fail_free_lapic;
vcpu->arch.pio_data = page_address(page); vcpu->arch.pio_data = page_address(page);
......
...@@ -411,3 +411,4 @@ ...@@ -411,3 +411,4 @@
438 common pidfd_getfd sys_pidfd_getfd 438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
...@@ -389,19 +389,24 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) ...@@ -389,19 +389,24 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
* *
* we must do our busy polling with irqs enabled * we must do our busy polling with irqs enabled
*/ */
static void ep_busy_loop(struct eventpoll *ep, int nonblock) static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{ {
unsigned int napi_id = READ_ONCE(ep->napi_id); unsigned int napi_id = READ_ONCE(ep->napi_id);
if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
BUSY_POLL_BUDGET); BUSY_POLL_BUDGET);
} if (ep_events_available(ep))
return true;
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) /*
{ * Busy poll timed out. Drop NAPI ID for now, we can add
if (ep->napi_id) * it back in when we have moved a socket with a valid NAPI
* ID onto the ready list.
*/
ep->napi_id = 0; ep->napi_id = 0;
return false;
}
return false;
} }
/* /*
...@@ -441,12 +446,9 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) ...@@ -441,12 +446,9 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
#else #else
static inline void ep_busy_loop(struct eventpoll *ep, int nonblock) static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
}
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{ {
return false;
} }
static inline void ep_set_busy_poll_napi_id(struct epitem *epi) static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
...@@ -1625,6 +1627,14 @@ static int ep_send_events(struct eventpoll *ep, ...@@ -1625,6 +1627,14 @@ static int ep_send_events(struct eventpoll *ep,
poll_table pt; poll_table pt;
int res = 0; int res = 0;
/*
* Always short-circuit for fatal signals to allow threads to make a
* timely exit without the chance of finding more events available and
* fetching repeatedly.
*/
if (fatal_signal_pending(current))
return -EINTR;
init_poll_funcptr(&pt, NULL); init_poll_funcptr(&pt, NULL);
mutex_lock(&ep->mtx); mutex_lock(&ep->mtx);
...@@ -1702,15 +1712,25 @@ static int ep_send_events(struct eventpoll *ep, ...@@ -1702,15 +1712,25 @@ static int ep_send_events(struct eventpoll *ep,
return res; return res;
} }
static inline struct timespec64 ep_set_mstimeout(long ms) static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{ {
struct timespec64 now, ts = { struct timespec64 now;
.tv_sec = ms / MSEC_PER_SEC,
.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), if (ms < 0)
}; return NULL;
if (!ms) {
to->tv_sec = 0;
to->tv_nsec = 0;
return to;
}
to->tv_sec = ms / MSEC_PER_SEC;
to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
ktime_get_ts64(&now); ktime_get_ts64(&now);
return timespec64_add_safe(now, ts); *to = timespec64_add_safe(now, *to);
return to;
} }
/** /**
...@@ -1722,8 +1742,8 @@ static inline struct timespec64 ep_set_mstimeout(long ms) ...@@ -1722,8 +1742,8 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
* stored. * stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer. * @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in * @timeout: Maximum timeout for the ready events fetch operation, in
* milliseconds. If the @timeout is zero, the function will not block, * timespec. If the timeout is zero, the function will not block,
* while if the @timeout is less than zero, the function will block * while if the @timeout ptr is NULL, the function will block
* until at least one event has been retrieved (or an error * until at least one event has been retrieved (or an error
* occurred). * occurred).
* *
...@@ -1731,55 +1751,59 @@ static inline struct timespec64 ep_set_mstimeout(long ms) ...@@ -1731,55 +1751,59 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
* error code, in case of error. * error code, in case of error.
*/ */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout) int maxevents, struct timespec64 *timeout)
{ {
int res = 0, eavail, timed_out = 0; int res, eavail, timed_out = 0;
u64 slack = 0; u64 slack = 0;
wait_queue_entry_t wait; wait_queue_entry_t wait;
ktime_t expires, *to = NULL; ktime_t expires, *to = NULL;
lockdep_assert_irqs_enabled(); lockdep_assert_irqs_enabled();
if (timeout > 0) { if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
struct timespec64 end_time = ep_set_mstimeout(timeout); slack = select_estimate_accuracy(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires; to = &expires;
*to = timespec64_to_ktime(end_time); *to = timespec64_to_ktime(*timeout);
} else if (timeout == 0) { } else if (timeout) {
/* /*
* Avoid the unnecessary trip to the wait queue loop, if the * Avoid the unnecessary trip to the wait queue loop, if the
* caller specified a non blocking operation. We still need * caller specified a non blocking operation.
* lock because we could race and not see an epi being added
* to the ready list while in irq callback. Thus incorrectly
* returning 0 back to userspace.
*/ */
timed_out = 1; timed_out = 1;
}
write_lock_irq(&ep->lock); /*
* This call is racy: We may or may not see events that are being added
* to the ready list under the lock (e.g., in IRQ callbacks). For cases
* with a non-zero timeout, this thread will check the ready list under
* lock and will be added to the wait queue. For cases with a zero
* timeout, the user by definition should not care and will have to
* recheck again.
*/
eavail = ep_events_available(ep); eavail = ep_events_available(ep);
write_unlock_irq(&ep->lock);
goto send_events; while (1) {
if (eavail) {
/*
* Try to transfer events to user space. In case we get
* 0 events and there's still timeout left over, we go
* trying again in search of more luck.
*/
res = ep_send_events(ep, events, maxevents);
if (res)
return res;
} }
fetch_events: if (timed_out)
return 0;
if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);
eavail = ep_events_available(ep); eavail = ep_busy_loop(ep, timed_out);
if (eavail) if (eavail)
goto send_events; continue;
/* if (signal_pending(current))
* Busy poll timed out. Drop NAPI ID for now, we can add return -EINTR;
* it back in when we have moved a socket with a valid NAPI
* ID onto the ready list.
*/
ep_reset_busy_poll_napi_id(ep);
do {
/* /*
* Internally init_wait() uses autoremove_wake_function(), * Internally init_wait() uses autoremove_wake_function(),
* thus wait entry is removed from the wait queue on each * thus wait entry is removed from the wait queue on each
...@@ -1809,55 +1833,38 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, ...@@ -1809,55 +1833,38 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* important. * important.
*/ */
eavail = ep_events_available(ep); eavail = ep_events_available(ep);
if (!eavail) { if (!eavail)
if (signal_pending(current))
res = -EINTR;
else
__add_wait_queue_exclusive(&ep->wq, &wait); __add_wait_queue_exclusive(&ep->wq, &wait);
}
write_unlock_irq(&ep->lock);
if (eavail || res) write_unlock_irq(&ep->lock);
break;
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { if (!eavail)
timed_out = 1; timed_out = !schedule_hrtimeout_range(to, slack,
break; HRTIMER_MODE_ABS);
} __set_current_state(TASK_RUNNING);
/* We were woken up, thus go and try to harvest some events */ /*
* We were woken up, thus go and try to harvest some events.
* If timed out and still on the wait queue, recheck eavail
* carefully under lock, below.
*/
eavail = 1; eavail = 1;
} while (0);
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait.entry)) { if (!list_empty_careful(&wait.entry)) {
write_lock_irq(&ep->lock); write_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
* timeout expired before it could reacquire the lock.
* Thus, when wait.entry is empty, it needs to harvest
* events.
*/
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait); __remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock); write_unlock_irq(&ep->lock);
} }
send_events:
if (fatal_signal_pending(current)) {
/*
* Always short-circuit for fatal signals to allow
* threads to make a timely exit without the chance of
* finding more events available and fetching
* repeatedly.
*/
res = -EINTR;
} }
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
if (!res && eavail &&
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
return res;
} }
/** /**
...@@ -2176,7 +2183,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, ...@@ -2176,7 +2183,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* part of the user space epoll_wait(2). * part of the user space epoll_wait(2).
*/ */
static int do_epoll_wait(int epfd, struct epoll_event __user *events, static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout) int maxevents, struct timespec64 *to)
{ {
int error; int error;
struct fd f; struct fd f;
...@@ -2210,7 +2217,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, ...@@ -2210,7 +2217,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = f.file->private_data; ep = f.file->private_data;
/* Time to fish for events ... */ /* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout); error = ep_poll(ep, events, maxevents, to);
error_fput: error_fput:
fdput(f); fdput(f);
...@@ -2220,16 +2227,19 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, ...@@ -2220,16 +2227,19 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout) int, maxevents, int, timeout)
{ {
return do_epoll_wait(epfd, events, maxevents, timeout); struct timespec64 to;
return do_epoll_wait(epfd, events, maxevents,
ep_timeout_to_timespec(&to, timeout));
} }
/* /*
* Implement the event wait interface for the eventpoll file. It is the kernel * Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2). * part of the user space epoll_pwait(2).
*/ */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask, int maxevents, struct timespec64 *to,
size_t, sigsetsize) const sigset_t __user *sigmask, size_t sigsetsize)
{ {
int error; int error;
...@@ -2241,18 +2251,47 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, ...@@ -2241,18 +2251,47 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
if (error) if (error)
return error; return error;
error = do_epoll_wait(epfd, events, maxevents, timeout); error = do_epoll_wait(epfd, events, maxevents, to);
restore_saved_sigmask_unless(error == -EINTR); restore_saved_sigmask_unless(error == -EINTR);
return error; return error;
} }
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
struct timespec64 to;
return do_epoll_pwait(epfd, events, maxevents,
ep_timeout_to_timespec(&to, timeout),
sigmask, sigsetsize);
}
SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
int, maxevents, const struct __kernel_timespec __user *, timeout,
const sigset_t __user *, sigmask, size_t, sigsetsize)
{
struct timespec64 ts, *to = NULL;
if (timeout) {
if (get_timespec64(&ts, timeout))
return -EFAULT;
to = &ts;
if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
return -EINVAL;
}
return do_epoll_pwait(epfd, events, maxevents, to,
sigmask, sigsetsize);
}
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
struct epoll_event __user *, events, int maxevents, struct timespec64 *timeout,
int, maxevents, int, timeout, const compat_sigset_t __user *sigmask,
const compat_sigset_t __user *, sigmask, compat_size_t sigsetsize)
compat_size_t, sigsetsize)
{ {
long err; long err;
...@@ -2265,10 +2304,46 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, ...@@ -2265,10 +2304,46 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
return err; return err;
err = do_epoll_wait(epfd, events, maxevents, timeout); err = do_epoll_wait(epfd, events, maxevents, timeout);
restore_saved_sigmask_unless(err == -EINTR); restore_saved_sigmask_unless(err == -EINTR);
return err; return err;
} }
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
struct epoll_event __user *, events,
int, maxevents, int, timeout,
const compat_sigset_t __user *, sigmask,
compat_size_t, sigsetsize)
{
struct timespec64 to;
return do_compat_epoll_pwait(epfd, events, maxevents,
ep_timeout_to_timespec(&to, timeout),
sigmask, sigsetsize);
}
COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
struct epoll_event __user *, events,
int, maxevents,
const struct __kernel_timespec __user *, timeout,
const compat_sigset_t __user *, sigmask,
compat_size_t, sigsetsize)
{
struct timespec64 ts, *to = NULL;
if (timeout) {
if (get_timespec64(&ts, timeout))
return -EFAULT;
to = &ts;
if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
return -EINVAL;
}
return do_compat_epoll_pwait(epfd, events, maxevents, to,
sigmask, sigsetsize);
}
#endif #endif
static int __init eventpoll_init(void) static int __init eventpoll_init(void)
......
...@@ -537,6 +537,12 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, ...@@ -537,6 +537,12 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
int maxevents, int timeout, int maxevents, int timeout,
const compat_sigset_t __user *sigmask, const compat_sigset_t __user *sigmask,
compat_size_t sigsetsize); compat_size_t sigsetsize);
asmlinkage long compat_sys_epoll_pwait2(int epfd,
struct epoll_event __user *events,
int maxevents,
const struct __kernel_timespec __user *timeout,
const compat_sigset_t __user *sigmask,
compat_size_t sigsetsize);
/* fs/fcntl.c */ /* fs/fcntl.c */
asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
......
...@@ -620,9 +620,10 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) ...@@ -620,9 +620,10 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
/** /**
* mem_cgroup_lruvec - get the lru list vector for a memcg & node * mem_cgroup_lruvec - get the lru list vector for a memcg & node
* @memcg: memcg of the wanted lruvec * @memcg: memcg of the wanted lruvec
* @pgdat: pglist_data
* *
* Returns the lru list vector holding pages for a given @memcg & * Returns the lru list vector holding pages for a given @memcg &
* @node combination. This can be the node lruvec, if the memory * @pgdat combination. This can be the node lruvec, if the memory
* controller is disabled. * controller is disabled.
*/ */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
...@@ -652,7 +653,21 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, ...@@ -652,7 +653,21 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
return lruvec; return lruvec;
} }
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); /**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
* @pgdat: pgdat of the page
*
* This function relies on page->mem_cgroup being stable.
*/
static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
struct pglist_data *pgdat)
{
struct mem_cgroup *memcg = page_memcg(page);
VM_WARN_ON_ONCE_PAGE(!memcg, page);
return mem_cgroup_lruvec(memcg, pgdat);
}
static inline bool lruvec_holds_page_lru_lock(struct page *page, static inline bool lruvec_holds_page_lru_lock(struct page *page,
struct lruvec *lruvec) struct lruvec *lruvec)
...@@ -913,41 +928,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, ...@@ -913,41 +928,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
local_irq_restore(flags); local_irq_restore(flags);
} }
/**
* mod_memcg_page_state - update page state statistics
* @page: the page
* @idx: page state item to account
* @val: number of pages (positive or negative)
*
* The @page must be locked or the caller must use lock_page_memcg()
* to prevent double accounting when the page is concurrently being
* moved to another memcg:
*
* lock_page(page) or lock_page_memcg(page)
* if (TestClearPageState(page))
* mod_memcg_page_state(page, state, -1);
* unlock_page(page) or unlock_page_memcg(page)
*
* Kernel pages are an exception to this, since they'll never move.
*/
static inline void __mod_memcg_page_state(struct page *page,
int idx, int val)
{
struct mem_cgroup *memcg = page_memcg(page);
if (memcg)
__mod_memcg_state(memcg, idx, val);
}
static inline void mod_memcg_page_state(struct page *page,
int idx, int val)
{
struct mem_cgroup *memcg = page_memcg(page);
if (memcg)
mod_memcg_state(memcg, idx, val);
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec, static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx) enum node_stat_item idx)
{ {
...@@ -1395,18 +1375,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, ...@@ -1395,18 +1375,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
{ {
} }
static inline void __mod_memcg_page_state(struct page *page,
int idx,
int nr)
{
}
static inline void mod_memcg_page_state(struct page *page,
int idx,
int nr)
{
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec, static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx) enum node_stat_item idx)
{ {
...@@ -1479,34 +1447,6 @@ static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) ...@@ -1479,34 +1447,6 @@ static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
} }
#endif /* CONFIG_MEMCG */ #endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void __inc_memcg_state(struct mem_cgroup *memcg,
int idx)
{
__mod_memcg_state(memcg, idx, 1);
}
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void __dec_memcg_state(struct mem_cgroup *memcg,
int idx)
{
__mod_memcg_state(memcg, idx, -1);
}
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void __inc_memcg_page_state(struct page *page,
int idx)
{
__mod_memcg_page_state(page, idx, 1);
}
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void __dec_memcg_page_state(struct page *page,
int idx)
{
__mod_memcg_page_state(page, idx, -1);
}
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
{ {
__mod_lruvec_kmem_state(p, idx, 1); __mod_lruvec_kmem_state(p, idx, 1);
...@@ -1517,34 +1457,6 @@ static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) ...@@ -1517,34 +1457,6 @@ static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
__mod_lruvec_kmem_state(p, idx, -1); __mod_lruvec_kmem_state(p, idx, -1);
} }
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void inc_memcg_state(struct mem_cgroup *memcg,
int idx)
{
mod_memcg_state(memcg, idx, 1);
}
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void dec_memcg_state(struct mem_cgroup *memcg,
int idx)
{
mod_memcg_state(memcg, idx, -1);
}
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void inc_memcg_page_state(struct page *page,
int idx)
{
mod_memcg_page_state(page, idx, 1);
}
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void dec_memcg_page_state(struct page *page,
int idx)
{
mod_memcg_page_state(page, idx, -1);
}
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{ {
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
...@@ -1733,21 +1645,6 @@ static inline void memcg_kmem_uncharge_page(struct page *page, int order) ...@@ -1733,21 +1645,6 @@ static inline void memcg_kmem_uncharge_page(struct page *page, int order)
__memcg_kmem_uncharge_page(page, order); __memcg_kmem_uncharge_page(page, order);
} }
static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
unsigned int nr_pages)
{
if (memcg_kmem_enabled())
return __memcg_kmem_charge(memcg, gfp, nr_pages);
return 0;
}
static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg,
unsigned int nr_pages)
{
if (memcg_kmem_enabled())
__memcg_kmem_uncharge(memcg, nr_pages);
}
/* /*
* A helper for accessing memcg's kmem_id, used for getting * A helper for accessing memcg's kmem_id, used for getting
* corresponding LRU lists. * corresponding LRU lists.
......
...@@ -37,6 +37,18 @@ void dump_mm(const struct mm_struct *mm); ...@@ -37,6 +37,18 @@ void dump_mm(const struct mm_struct *mm);
BUG(); \ BUG(); \
} \ } \
} while (0) } while (0)
#define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \
static bool __section(".data.once") __warned; \
int __ret_warn_once = !!(cond); \
\
if (unlikely(__ret_warn_once && !__warned)) { \
dump_page(page, "VM_WARN_ON_ONCE_PAGE(" __stringify(cond)")");\
__warned = true; \
WARN_ON(1); \
} \
unlikely(__ret_warn_once); \
})
#define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON(cond) (void)WARN_ON(cond)
#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
#define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) #define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
...@@ -48,6 +60,7 @@ void dump_mm(const struct mm_struct *mm); ...@@ -48,6 +60,7 @@ void dump_mm(const struct mm_struct *mm);
#define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
#endif #endif
......
...@@ -362,6 +362,11 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, ...@@ -362,6 +362,11 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout, int maxevents, int timeout,
const sigset_t __user *sigmask, const sigset_t __user *sigmask,
size_t sigsetsize); size_t sigsetsize);
/*
 * Same argument list as sys_epoll_pwait() above, except that the timeout
 * is a user pointer to a struct __kernel_timespec rather than an int.
 */
asmlinkage long sys_epoll_pwait2(int epfd, struct epoll_event __user *events,
int maxevents,
const struct __kernel_timespec __user *timeout,
const sigset_t __user *sigmask,
size_t sigsetsize);
/* fs/fcntl.c */ /* fs/fcntl.c */
asmlinkage long sys_dup(unsigned int fildes); asmlinkage long sys_dup(unsigned int fildes);
......
...@@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) ...@@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
__SYSCALL(__NR_faccessat2, sys_faccessat2) __SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_process_madvise 440 #define __NR_process_madvise 440
__SYSCALL(__NR_process_madvise, sys_process_madvise) __SYSCALL(__NR_process_madvise, sys_process_madvise)
#define __NR_epoll_pwait2 441
__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
#undef __NR_syscalls #undef __NR_syscalls
#define __NR_syscalls 441 #define __NR_syscalls 442
/* /*
* 32 bit systems traditionally used different * 32 bit systems traditionally used different
......
...@@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1); ...@@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1);
COND_SYSCALL(epoll_ctl); COND_SYSCALL(epoll_ctl);
COND_SYSCALL(epoll_pwait); COND_SYSCALL(epoll_pwait);
COND_SYSCALL_COMPAT(epoll_pwait); COND_SYSCALL_COMPAT(epoll_pwait);
COND_SYSCALL(epoll_pwait2);
COND_SYSCALL_COMPAT(epoll_pwait2);
/* fs/fcntl.c */ /* fs/fcntl.c */
......
...@@ -713,7 +713,7 @@ config ZSMALLOC_STAT ...@@ -713,7 +713,7 @@ config ZSMALLOC_STAT
select DEBUG_FS select DEBUG_FS
help help
This option enables code in the zsmalloc to collect various This option enables code in the zsmalloc to collect various
statistics about whats happening in zsmalloc and exports that statistics about what's happening in zsmalloc and exports that
information to userspace via debugfs. information to userspace via debugfs.
If unsure, say N. If unsure, say N.
......
...@@ -1342,46 +1342,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) ...@@ -1342,46 +1342,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
} }
#endif #endif
/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
* @pgdat: pgdat of the page
*
* This function relies on page's memcg being stable - see the
* access rules in commit_charge().
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
struct mem_cgroup_per_node *mz;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
lruvec = &pgdat->__lruvec;
goto out;
}
memcg = page_memcg(page);
/*
* Swapcache readahead pages are added to the LRU - and
* possibly migrated - before they are charged.
*/
if (!memcg)
memcg = root_mem_cgroup;
mz = mem_cgroup_page_nodeinfo(memcg, page);
lruvec = &mz->lruvec;
out:
/*
* Since a node can be onlined after the mem_cgroup was created,
* we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it.
*/
if (unlikely(lruvec->pgdat != pgdat))
lruvec->pgdat = pgdat;
return lruvec;
}
/** /**
* lock_page_lruvec - lock and return lruvec for a given page. * lock_page_lruvec - lock and return lruvec for a given page.
* @page: the page * @page: the page
...@@ -6987,6 +6947,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) ...@@ -6987,6 +6947,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return; return;
memcg = page_memcg(oldpage); memcg = page_memcg(oldpage);
VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
if (!memcg) if (!memcg)
return; return;
...@@ -7178,12 +7139,15 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) ...@@ -7178,12 +7139,15 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_count(page), page);
if (mem_cgroup_disabled())
return;
if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return; return;
memcg = page_memcg(page); memcg = page_memcg(page);
/* Readahead page, never charged */ VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg) if (!memcg)
return; return;
...@@ -7242,12 +7206,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) ...@@ -7242,12 +7206,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
unsigned short oldid; unsigned short oldid;
if (mem_cgroup_disabled())
return 0;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0; return 0;
memcg = page_memcg(page); memcg = page_memcg(page);
/* Readahead page, never charged */ VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg) if (!memcg)
return 0; return 0;
......
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE #define _GNU_SOURCE
#include <asm/unistd.h>
#include <linux/time_types.h>
#include <poll.h> #include <poll.h>
#include <unistd.h> #include <unistd.h>
#include <assert.h> #include <assert.h>
...@@ -21,6 +23,19 @@ struct epoll_mtcontext ...@@ -21,6 +23,19 @@ struct epoll_mtcontext
pthread_t waiter; pthread_t waiter;
}; };
/*
 * If the included headers do not provide a syscall number for
 * epoll_pwait2, define it as -1 so this file still builds.
 */
#ifndef __NR_epoll_pwait2
#define __NR_epoll_pwait2 -1
#endif

/*
 * Invoke epoll_pwait2 through syscall() directly, forwarding every
 * argument unchanged and returning syscall()'s result as an int.
 */
static inline int sys_epoll_pwait2(int fd, struct epoll_event *events,
				   int maxevents,
				   const struct __kernel_timespec *timeout,
				   const sigset_t *sigset, size_t sigsetsize)
{
	long ret;

	ret = syscall(__NR_epoll_pwait2, fd, events, maxevents, timeout,
		      sigset, sigsetsize);

	return (int)ret;
}
static void signal_handler(int signum) static void signal_handler(int signum)
{ {
} }
...@@ -3377,4 +3392,61 @@ TEST(epoll61) ...@@ -3377,4 +3392,61 @@ TEST(epoll61)
close(ctx.evfd); close(ctx.evfd);
} }
/*
 * Equivalent to basic test epoll1, but exercising epoll_pwait2.
 *
 * One end of a socketpair is registered for EPOLLIN and a single byte is
 * written to the other end. The byte is never read back, so both
 * epoll_pwait2 calls (NULL timeout, NULL signal mask) are expected to
 * report exactly one ready event.
 */
TEST(epoll62)
{
	int efd;
	int sfd[2];
	/*
	 * Initialize the whole struct so epoll_ctl() is not handed an
	 * indeterminate data member.
	 */
	struct epoll_event e = { .events = EPOLLIN };

	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);

	efd = epoll_create(1);
	ASSERT_GE(efd, 0);

	ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);

	ASSERT_EQ(write(sfd[1], "w", 1), 1);

	EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
	EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);

	close(efd);
	close(sfd[0]);
	close(sfd[1]);
}
/*
 * Epoll_pwait2 basic timeout test.
 *
 * One end of a socketpair is registered for EPOLLIN but nothing is ever
 * written, so epoll_pwait2 is expected to return 0 events. The elapsed
 * time measured with msecs() must be at least the requested delay.
 */
TEST(epoll63)
{
	const int cfg_delay_ms = 10;
	unsigned long long tdiff;
	/* Requested timeout: cfg_delay_ms expressed in nanoseconds. */
	struct __kernel_timespec ts = {
		.tv_sec = 0,
		.tv_nsec = cfg_delay_ms * 1000 * 1000,
	};
	int efd;
	int sfd[2];
	/*
	 * Initialize the whole struct so epoll_ctl() is not handed an
	 * indeterminate data member.
	 */
	struct epoll_event e = { .events = EPOLLIN };

	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);

	efd = epoll_create(1);
	ASSERT_GE(efd, 0);

	ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);

	tdiff = msecs();
	EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, &ts, NULL, 0), 0);
	tdiff = msecs() - tdiff;

	EXPECT_GE(tdiff, cfg_delay_ms);

	close(efd);
	close(sfd[0]);
	close(sfd[1]);
}
TEST_HARNESS_MAIN TEST_HARNESS_MAIN
...@@ -111,7 +111,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm) ...@@ -111,7 +111,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
{ {
struct page *page; struct page *page;
page = alloc_page(GFP_KERNEL | __GFP_ZERO); page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page) if (!page)
return -ENOMEM; return -ENOMEM;
......
...@@ -3116,7 +3116,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) ...@@ -3116,7 +3116,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
} }
BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
page = alloc_page(GFP_KERNEL | __GFP_ZERO); page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page) { if (!page) {
r = -ENOMEM; r = -ENOMEM;
goto vcpu_free; goto vcpu_free;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment