Commit 7fd3253a authored by Björn Töpel, committed by Daniel Borkmann

net: Introduce preferred busy-polling

The existing busy-polling mode, enabled by the SO_BUSY_POLL socket
option or system-wide using the /proc/sys/net/core/busy_read knob, is
opportunistic. That means that if the NAPI context is not
scheduled, busy-polling will poll it. If, after busy-polling, the budget
is exceeded, the busy-polling logic will schedule the NAPI onto the
regular softirq handling.

One implication of the behavior above is that a busy/heavily loaded NAPI
context will never enter/allow for busy-polling. Some applications
prefer that most NAPI processing be done by busy-polling.

This series adds a new socket option, SO_PREFER_BUSY_POLL, that works
in concert with the napi_defer_hard_irqs and gro_flush_timeout
knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were
introduced in commit 6f8b12d6 ("net: napi: add hard irqs deferral
feature"), and allow a user to defer the re-enabling of interrupts and
instead schedule the NAPI context from a watchdog timer. When a user
enables SO_PREFER_BUSY_POLL, again with the other knobs enabled,
and the NAPI context is being processed by a softirq, the softirq NAPI
processing will exit early to allow the busy-polling to be performed.

If the application stops performing busy-polling via a system call,
the watchdog timer defined by gro_flush_timeout will time out, and
regular softirq handling will resume.

In summary: heavy-traffic applications that prefer busy-polling over
softirq processing should use this option.

Example usage:

  $ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs
  $ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout

Note that the timeout should be larger than the userspace processing
window, otherwise the watchdog will time out and fall back to regular
softirq processing.

Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com
parent 854055c0
...@@ -124,6 +124,8 @@ ...@@ -124,6 +124,8 @@
#define SO_DETACH_REUSEPORT_BPF 68 #define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 #if __BITS_PER_LONG == 64
......
...@@ -135,6 +135,8 @@ ...@@ -135,6 +135,8 @@
#define SO_DETACH_REUSEPORT_BPF 68 #define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 #if __BITS_PER_LONG == 64
......
...@@ -116,6 +116,8 @@ ...@@ -116,6 +116,8 @@
#define SO_DETACH_REUSEPORT_BPF 0x4042 #define SO_DETACH_REUSEPORT_BPF 0x4042
#define SO_PREFER_BUSY_POLL 0x4043
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 #if __BITS_PER_LONG == 64
......
...@@ -117,6 +117,8 @@ ...@@ -117,6 +117,8 @@
#define SO_DETACH_REUSEPORT_BPF 0x0047 #define SO_DETACH_REUSEPORT_BPF 0x0047
#define SO_PREFER_BUSY_POLL 0x0048
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
......
...@@ -397,7 +397,7 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) ...@@ -397,7 +397,7 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock)
unsigned int napi_id = READ_ONCE(ep->napi_id); unsigned int napi_id = READ_ONCE(ep->napi_id);
if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false);
} }
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
......
...@@ -355,8 +355,9 @@ enum { ...@@ -355,8 +355,9 @@ enum {
NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_DISABLE, /* Disable pending */
NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_LISTED, /* NAPI added to system lists */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/
}; };
enum { enum {
...@@ -367,6 +368,7 @@ enum { ...@@ -367,6 +368,7 @@ enum {
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL),
}; };
enum gro_result { enum gro_result {
...@@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n) ...@@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n)
return test_bit(NAPI_STATE_DISABLE, &n->state); return test_bit(NAPI_STATE_DISABLE, &n->state);
} }
/**
 * napi_prefer_busy_poll - test whether busy-polling is preferred for a NAPI
 * @n: NAPI context
 *
 * Return: true if the NAPI_STATE_PREFER_BUSY_POLL bit is set in @n->state,
 * false otherwise.
 */
static inline bool napi_prefer_busy_poll(struct napi_struct *n)
{
return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
}
bool napi_schedule_prep(struct napi_struct *n); bool napi_schedule_prep(struct napi_struct *n);
/** /**
......
...@@ -43,7 +43,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); ...@@ -43,7 +43,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time);
void napi_busy_loop(unsigned int napi_id, void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long), bool (*loop_end)(void *, unsigned long),
void *loop_end_arg); void *loop_end_arg, bool prefer_busy_poll);
#else /* CONFIG_NET_RX_BUSY_POLL */ #else /* CONFIG_NET_RX_BUSY_POLL */
static inline unsigned long net_busy_loop_on(void) static inline unsigned long net_busy_loop_on(void)
...@@ -105,7 +105,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) ...@@ -105,7 +105,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
unsigned int napi_id = READ_ONCE(sk->sk_napi_id); unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
if (napi_id >= MIN_NAPI_ID) if (napi_id >= MIN_NAPI_ID)
napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk,
READ_ONCE(sk->sk_prefer_busy_poll));
#endif #endif
} }
......
...@@ -301,6 +301,7 @@ struct bpf_local_storage; ...@@ -301,6 +301,7 @@ struct bpf_local_storage;
* @sk_ack_backlog: current listen backlog * @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen() * @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner * @sk_uid: user id of owner
* @sk_prefer_busy_poll: prefer busypolling over softirq processing
* @sk_priority: %SO_PRIORITY setting * @sk_priority: %SO_PRIORITY setting
* @sk_type: socket type (%SOCK_STREAM, etc) * @sk_type: socket type (%SOCK_STREAM, etc)
* @sk_protocol: which protocol this socket belongs in this network family * @sk_protocol: which protocol this socket belongs in this network family
...@@ -479,6 +480,9 @@ struct sock { ...@@ -479,6 +480,9 @@ struct sock {
u32 sk_ack_backlog; u32 sk_ack_backlog;
u32 sk_max_ack_backlog; u32 sk_max_ack_backlog;
kuid_t sk_uid; kuid_t sk_uid;
#ifdef CONFIG_NET_RX_BUSY_POLL
u8 sk_prefer_busy_poll;
#endif
struct pid *sk_peer_pid; struct pid *sk_peer_pid;
const struct cred *sk_peer_cred; const struct cred *sk_peer_cred;
long sk_rcvtimeo; long sk_rcvtimeo;
......
...@@ -119,6 +119,8 @@ ...@@ -119,6 +119,8 @@
#define SO_DETACH_REUSEPORT_BPF 68 #define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
......
...@@ -6458,7 +6458,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) ...@@ -6458,7 +6458,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set, /* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time. * because we will call napi->poll() one more time.
...@@ -6497,8 +6498,29 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) ...@@ -6497,8 +6498,29 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
#define BUSY_POLL_BUDGET 8 #define BUSY_POLL_BUDGET 8
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{ {
if (!skip_schedule) {
gro_normal_list(napi);
__napi_schedule(napi);
return;
}
if (napi->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
napi_gro_flush(napi, HZ >= 1000);
}
gro_normal_list(napi);
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll)
{
bool skip_schedule = false;
unsigned long timeout;
int rc; int rc;
/* Busy polling means there is a high chance device driver hard irq /* Busy polling means there is a high chance device driver hard irq
...@@ -6515,6 +6537,15 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) ...@@ -6515,6 +6537,15 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
local_bh_disable(); local_bh_disable();
if (prefer_busy_poll) {
napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
timeout = READ_ONCE(napi->dev->gro_flush_timeout);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
skip_schedule = true;
}
}
/* All we really want here is to re-enable device interrupts. /* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round. * Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/ */
...@@ -6525,19 +6556,14 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) ...@@ -6525,19 +6556,14 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
*/ */
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
netpoll_poll_unlock(have_poll_lock); netpoll_poll_unlock(have_poll_lock);
if (rc == BUSY_POLL_BUDGET) { if (rc == BUSY_POLL_BUDGET)
/* As the whole budget was spent, we still own the napi so can __busy_poll_stop(napi, skip_schedule);
* safely handle the rx_list.
*/
gro_normal_list(napi);
__napi_schedule(napi);
}
local_bh_enable(); local_bh_enable();
} }
void napi_busy_loop(unsigned int napi_id, void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long), bool (*loop_end)(void *, unsigned long),
void *loop_end_arg) void *loop_end_arg, bool prefer_busy_poll)
{ {
unsigned long start_time = loop_end ? busy_loop_current_time() : 0; unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget); int (*napi_poll)(struct napi_struct *napi, int budget);
...@@ -6565,12 +6591,18 @@ void napi_busy_loop(unsigned int napi_id, ...@@ -6565,12 +6591,18 @@ void napi_busy_loop(unsigned int napi_id,
* we avoid dirtying napi->state as much as we can. * we avoid dirtying napi->state as much as we can.
*/ */
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
NAPIF_STATE_IN_BUSY_POLL)) NAPIF_STATE_IN_BUSY_POLL)) {
if (prefer_busy_poll)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count; goto count;
}
if (cmpxchg(&napi->state, val, if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL | val | NAPIF_STATE_IN_BUSY_POLL |
NAPIF_STATE_SCHED) != val) NAPIF_STATE_SCHED) != val) {
if (prefer_busy_poll)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count; goto count;
}
have_poll_lock = netpoll_poll_lock(napi); have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll; napi_poll = napi->poll;
} }
...@@ -6588,7 +6620,7 @@ void napi_busy_loop(unsigned int napi_id, ...@@ -6588,7 +6620,7 @@ void napi_busy_loop(unsigned int napi_id,
if (unlikely(need_resched())) { if (unlikely(need_resched())) {
if (napi_poll) if (napi_poll)
busy_poll_stop(napi, have_poll_lock); busy_poll_stop(napi, have_poll_lock, prefer_busy_poll);
preempt_enable(); preempt_enable();
rcu_read_unlock(); rcu_read_unlock();
cond_resched(); cond_resched();
...@@ -6599,7 +6631,7 @@ void napi_busy_loop(unsigned int napi_id, ...@@ -6599,7 +6631,7 @@ void napi_busy_loop(unsigned int napi_id,
cpu_relax(); cpu_relax();
} }
if (napi_poll) if (napi_poll)
busy_poll_stop(napi, have_poll_lock); busy_poll_stop(napi, have_poll_lock, prefer_busy_poll);
preempt_enable(); preempt_enable();
out: out:
rcu_read_unlock(); rcu_read_unlock();
...@@ -6650,8 +6682,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) ...@@ -6650,8 +6682,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
* NAPI_STATE_MISSED, since we do not react to a device IRQ. * NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/ */
if (!napi_disable_pending(napi) && if (!napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi); __napi_schedule_irqoff(napi);
}
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
} }
...@@ -6709,6 +6743,7 @@ void napi_disable(struct napi_struct *n) ...@@ -6709,6 +6743,7 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer); hrtimer_cancel(&n->timer);
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state);
} }
EXPORT_SYMBOL(napi_disable); EXPORT_SYMBOL(napi_disable);
...@@ -6781,6 +6816,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) ...@@ -6781,6 +6816,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock; goto out_unlock;
} }
/* The NAPI context has more processing work, but busy-polling
* is preferred. Exit early.
*/
if (napi_prefer_busy_poll(n)) {
if (napi_complete_done(n, work)) {
/* If timeout is not set, we need to make sure
* that the NAPI is re-scheduled.
*/
napi_schedule(n);
}
goto out_unlock;
}
if (n->gro_bitmask) { if (n->gro_bitmask) {
/* flush too old packets /* flush too old packets
* If HZ < 1000, flush all packets. * If HZ < 1000, flush all packets.
......
...@@ -1159,6 +1159,12 @@ int sock_setsockopt(struct socket *sock, int level, int optname, ...@@ -1159,6 +1159,12 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_ll_usec = val; sk->sk_ll_usec = val;
} }
break; break;
case SO_PREFER_BUSY_POLL:
if (valbool && !capable(CAP_NET_ADMIN))
ret = -EPERM;
else
WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
break;
#endif #endif
case SO_MAX_PACING_RATE: case SO_MAX_PACING_RATE:
...@@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, ...@@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BUSY_POLL: case SO_BUSY_POLL:
v.val = sk->sk_ll_usec; v.val = sk->sk_ll_usec;
break; break;
case SO_PREFER_BUSY_POLL:
v.val = READ_ONCE(sk->sk_prefer_busy_poll);
break;
#endif #endif
case SO_MAX_PACING_RATE: case SO_MAX_PACING_RATE:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment