Commit fc3f9146 authored by David S. Miller

Merge branch 'busypoll-preemption-and-other-optimizations'

Eric Dumazet says:

====================
net: busy-poll: allow preemption and other optimizations

It is time to have preemption points in sk_busy_loop() and improve
its scalability.

Also napi_complete() and friends can tell drivers when it is safe to
not re-enable device interrupts, saving some overhead under
high busy polling.

mlx4 and bnx2x are changed accordingly, to show how this busy polling
status can be exploited by drivers.

Next steps will implement Zach Brown's suggestion, where NAPI polling
would be enabled all the time for some chosen queues.
This is needed for efficient epoll() support anyway.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 2874aa2e 80f1c21c
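
The driver-visible contract in this series shows up in the bnx2x and mlx4 hunks below: napi_complete() and napi_complete_done() now return a bool, and a driver should re-arm its device interrupts only when that return value is true (false means a busy-polling thread still owns the NAPI and will keep polling). The following sketch illustrates the resulting generic poll-handler pattern; it is an illustration, not code from this commit, and example_poll(), example_clean_rx() and example_enable_irqs() are hypothetical driver hooks.

#include <linux/netdevice.h>

/* Hypothetical driver hooks, declared only so the sketch is self-contained. */
static int example_clean_rx(struct napi_struct *napi, int budget);
static void example_enable_irqs(struct napi_struct *napi);

/* Illustrative NAPI poll handler following the pattern this series enables. */
static int example_poll(struct napi_struct *napi, int budget)
{
        int work_done = example_clean_rx(napi, budget); /* process up to budget packets */

        if (work_done < budget) {
                /* Re-arm device interrupts only if NAPI really completed.
                 * A false return means sk_busy_loop() owns this NAPI and
                 * will call us again, so interrupts can stay masked.
                 */
                if (napi_complete_done(napi, work_done))
                        example_enable_irqs(napi);
        }
        return work_done;
}

This is exactly the shape of the bnx2x and mlx4 changes that follow.
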
@@ -3248,13 +3248,14 @@ static int bnx2x_poll(struct napi_struct *napi, int budget)
                 rmb();
 
                 if (!(bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) {
-                        napi_complete(napi);
-                        /* Re-enable interrupts */
-                        DP(NETIF_MSG_RX_STATUS,
-                           "Update index to %d\n", fp->fp_hc_idx);
-                        bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID,
-                                     le16_to_cpu(fp->fp_hc_idx),
-                                     IGU_INT_ENABLE, 1);
+                        if (napi_complete_done(napi, rx_work_done)) {
+                                /* Re-enable interrupts */
+                                DP(NETIF_MSG_RX_STATUS,
+                                   "Update index to %d\n", fp->fp_hc_idx);
+                                bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID,
+                                             le16_to_cpu(fp->fp_hc_idx),
+                                             IGU_INT_ENABLE, 1);
+                        }
                 } else {
                         rx_work_done = budget;
                 }
...
@@ -1137,8 +1137,8 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
                 done = 0;
         }
 
         /* Done for now */
-        napi_complete_done(napi, done);
-        mlx4_en_arm_cq(priv, cq);
+        if (napi_complete_done(napi, done))
+                mlx4_en_arm_cq(priv, cq);
         return done;
 }
...
@@ -334,6 +334,16 @@ enum {
         NAPI_STATE_NPSVC,       /* Netpoll - don't dequeue from poll_list */
         NAPI_STATE_HASHED,      /* In NAPI hash (busy polling possible) */
         NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
+        NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+};
+
+enum {
+        NAPIF_STATE_SCHED        = (1UL << NAPI_STATE_SCHED),
+        NAPIF_STATE_DISABLE      = (1UL << NAPI_STATE_DISABLE),
+        NAPIF_STATE_NPSVC        = (1UL << NAPI_STATE_NPSVC),
+        NAPIF_STATE_HASHED       = (1UL << NAPI_STATE_HASHED),
+        NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
+        NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
 };
 
 enum gro_result {
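
The new NAPIF_* constants are simply the mask form of the NAPI_STATE_* bit numbers. They let a caller take one READ_ONCE() snapshot of napi->state, test several conditions at once, and claim the NAPI with a single cmpxchg(), which is how the reworked sk_busy_loop() further down uses them. A minimal sketch of that idea follows; napi_try_claim_busy_poll() is a hypothetical helper name, not an API added by this commit.

#include <linux/netdevice.h>

/* Hypothetical helper (illustration only): try to take busy-poll ownership
 * of a NAPI instance with one state snapshot and one cmpxchg(), mirroring
 * what sk_busy_loop() does in this series.
 */
static bool napi_try_claim_busy_poll(struct napi_struct *napi)
{
        unsigned long val = READ_ONCE(napi->state);

        /* Give up if the NAPI is being disabled, is already scheduled for
         * softirq processing, or is already owned by another busy poller.
         */
        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
                   NAPIF_STATE_IN_BUSY_POLL))
                return false;

        return cmpxchg(&napi->state, val,
                       val | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL) == val;
}
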
@@ -453,16 +463,17 @@ static inline bool napi_reschedule(struct napi_struct *napi)
         return false;
 }
 
-void __napi_complete(struct napi_struct *n);
-void napi_complete_done(struct napi_struct *n, int work_done);
+bool __napi_complete(struct napi_struct *n);
+bool napi_complete_done(struct napi_struct *n, int work_done);
 
 /**
  *      napi_complete - NAPI processing complete
  *      @n: NAPI context
  *
  * Mark NAPI processing as complete.
  * Consider using napi_complete_done() instead.
+ * Return false if device should avoid rearming interrupts.
  */
-static inline void napi_complete(struct napi_struct *n)
+static inline bool napi_complete(struct napi_struct *n)
 {
         return napi_complete_done(n, 0);
 }
...
@@ -58,10 +58,9 @@ static inline unsigned long busy_loop_end_time(void)
         return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll);
 }
 
-static inline bool sk_can_busy_loop(struct sock *sk)
+static inline bool sk_can_busy_loop(const struct sock *sk)
 {
-        return sk->sk_ll_usec && sk->sk_napi_id &&
-               !need_resched() && !signal_pending(current);
+        return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current);
 }
...
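
For context, sk_can_busy_loop() only returns true when sk->sk_ll_usec is non-zero, i.e. when busy polling has been enabled globally through the net.core.busy_read / net.core.busy_poll sysctls or per socket via SO_BUSY_POLL; the need_resched() test drops out of this helper because the reworked sk_busy_loop() below now handles rescheduling itself. A small userspace sketch of the per-socket knob (illustration only, not part of this commit; raising the value typically requires CAP_NET_ADMIN):

#include <stdio.h>
#include <sys/socket.h>

/* Enable busy polling on one socket: the value is the number of
 * microseconds to busy-poll before falling back to sleeping.
 */
static int enable_busy_poll(int fd, unsigned int usecs)
{
        if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs)) < 0) {
                perror("setsockopt(SO_BUSY_POLL)");
                return -1;
        }
        return 0;
}
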
@@ -4898,26 +4898,36 @@ void __napi_schedule_irqoff(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 
-void __napi_complete(struct napi_struct *n)
+bool __napi_complete(struct napi_struct *n)
 {
         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 
+        /* Some drivers call us directly, instead of calling
+         * napi_complete_done().
+         */
+        if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
+                return false;
+
         list_del_init(&n->poll_list);
         smp_mb__before_atomic();
         clear_bit(NAPI_STATE_SCHED, &n->state);
+        return true;
 }
 EXPORT_SYMBOL(__napi_complete);
 
-void napi_complete_done(struct napi_struct *n, int work_done)
+bool napi_complete_done(struct napi_struct *n, int work_done)
 {
         unsigned long flags;
 
         /*
-         * don't let napi dequeue from the cpu poll list
-         * just in case its running on a different cpu
+         * 1) Don't let napi dequeue from the cpu poll list
+         *    just in case its running on a different cpu.
+         * 2) If we are busy polling, do nothing here, we have
+         *    the guarantee we will be called later.
          */
-        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
-                return;
+        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+                                 NAPIF_STATE_IN_BUSY_POLL)))
+                return false;
 
         if (n->gro_list) {
                 unsigned long timeout = 0;
@@ -4939,6 +4949,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
                 __napi_complete(n);
                 local_irq_restore(flags);
         }
+        return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
@@ -4956,13 +4967,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
 }
 
 #if defined(CONFIG_NET_RX_BUSY_POLL)
 #define BUSY_POLL_BUDGET 8
 
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+        int rc;
+
+        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+        local_bh_disable();
+
+        /* All we really want here is to re-enable device interrupts.
+         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+         */
+        rc = napi->poll(napi, BUSY_POLL_BUDGET);
+        netpoll_poll_unlock(have_poll_lock);
+        if (rc == BUSY_POLL_BUDGET)
+                __napi_schedule(napi);
+        local_bh_enable();
+        if (local_softirq_pending())
+                do_softirq();
+}
+
 bool sk_busy_loop(struct sock *sk, int nonblock)
 {
         unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+        int (*napi_poll)(struct napi_struct *napi, int budget);
         int (*busy_poll)(struct napi_struct *dev);
+        void *have_poll_lock = NULL;
         struct napi_struct *napi;
-        int rc = false;
+        int rc;
+
+restart:
+        rc = false;
+        napi_poll = NULL;
 
         rcu_read_lock();
@@ -4973,24 +5012,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
         /* Note: ndo_busy_poll method is optional in linux-4.5 */
         busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
 
-        do {
+        preempt_disable();
+        for (;;) {
                 rc = 0;
                 local_bh_disable();
                 if (busy_poll) {
                         rc = busy_poll(napi);
-                } else if (napi_schedule_prep(napi)) {
-                        void *have = netpoll_poll_lock(napi);
-
-                        if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
-                                rc = napi->poll(napi, BUSY_POLL_BUDGET);
-                                trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
-                                if (rc == BUSY_POLL_BUDGET) {
-                                        napi_complete_done(napi, rc);
-                                        napi_schedule(napi);
-                                }
-                        }
-                        netpoll_poll_unlock(have);
+                        goto count;
                 }
+                if (!napi_poll) {
+                        unsigned long val = READ_ONCE(napi->state);
+
+                        /* If multiple threads are competing for this napi,
+                         * we avoid dirtying napi->state as much as we can.
+                         */
+                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+                                   NAPIF_STATE_IN_BUSY_POLL))
+                                goto count;
+                        if (cmpxchg(&napi->state, val,
+                                    val | NAPIF_STATE_IN_BUSY_POLL |
+                                          NAPIF_STATE_SCHED) != val)
+                                goto count;
+                        have_poll_lock = netpoll_poll_lock(napi);
+                        napi_poll = napi->poll;
+                }
+                rc = napi_poll(napi, BUSY_POLL_BUDGET);
+                trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+count:
                 if (rc > 0)
                         __NET_ADD_STATS(sock_net(sk),
                                         LINUX_MIB_BUSYPOLLRXPACKETS, rc);
@@ -4999,10 +5047,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
                 if (rc == LL_FLUSH_FAILED)
                         break; /* permanent failure */
 
-                cpu_relax();
-        } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
-                 !need_resched() && !busy_loop_timeout(end_time));
+                if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+                    busy_loop_timeout(end_time))
+                        break;
+                if (unlikely(need_resched())) {
+                        if (napi_poll)
+                                busy_poll_stop(napi, have_poll_lock);
+                        preempt_enable();
+                        rcu_read_unlock();
+                        cond_resched();
+                        rc = !skb_queue_empty(&sk->sk_receive_queue);
+                        if (rc || busy_loop_timeout(end_time))
+                                return rc;
+                        goto restart;
+                }
+                cpu_relax_lowlatency();
+        }
+        if (napi_poll)
+                busy_poll_stop(napi, have_poll_lock);
+        preempt_enable();
 
         rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
         rcu_read_unlock();
...