Commit 4c532b14 authored by David S. Miller

Merge branch 'net-napi-addition-of-napi_defer_hard_irqs'

Eric Dumazet says:

====================
net: napi: addition of napi_defer_hard_irqs

This patch series augments the gro_flush_timeout feature with napi_defer_hard_irqs.

As extensively described in the first patch changelog, this can suppress
the chit-chat traffic between NIC and host to signal interrupts and re-arm
them, since this can be an issue on high speed NICs with many queues.

The last patch in this series converts mlx4 TX completion to
napi_complete_done(), to enable this new mechanism.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents e6acd2b6 cf4058db
...@@ -946,7 +946,7 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) ...@@ -946,7 +946,7 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring]; xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring];
if (xdp_tx_cq->xdp_busy) { if (xdp_tx_cq->xdp_busy) {
clean_complete = mlx4_en_process_tx_cq(dev, xdp_tx_cq, clean_complete = mlx4_en_process_tx_cq(dev, xdp_tx_cq,
budget); budget) < budget;
xdp_tx_cq->xdp_busy = !clean_complete; xdp_tx_cq->xdp_busy = !clean_complete;
} }
} }
......
...@@ -382,7 +382,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) ...@@ -382,7 +382,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
return cnt; return cnt;
} }
bool mlx4_en_process_tx_cq(struct net_device *dev, int mlx4_en_process_tx_cq(struct net_device *dev,
struct mlx4_en_cq *cq, int napi_budget) struct mlx4_en_cq *cq, int napi_budget)
{ {
struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_priv *priv = netdev_priv(dev);
...@@ -405,7 +405,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, ...@@ -405,7 +405,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev,
u32 ring_cons; u32 ring_cons;
if (unlikely(!priv->port_up)) if (unlikely(!priv->port_up))
return true; return 0;
netdev_txq_bql_complete_prefetchw(ring->tx_queue); netdev_txq_bql_complete_prefetchw(ring->tx_queue);
...@@ -480,7 +480,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, ...@@ -480,7 +480,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev,
WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped); WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped);
if (cq->type == TX_XDP) if (cq->type == TX_XDP)
return done < budget; return done;
netdev_tx_completed_queue(ring->tx_queue, packets, bytes); netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
...@@ -492,7 +492,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, ...@@ -492,7 +492,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev,
ring->wake_queue++; ring->wake_queue++;
} }
return done < budget; return done;
} }
void mlx4_en_tx_irq(struct mlx4_cq *mcq) void mlx4_en_tx_irq(struct mlx4_cq *mcq)
...@@ -512,13 +512,13 @@ int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget) ...@@ -512,13 +512,13 @@ int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi); struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
struct net_device *dev = cq->dev; struct net_device *dev = cq->dev;
struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_priv *priv = netdev_priv(dev);
bool clean_complete; int work_done;
clean_complete = mlx4_en_process_tx_cq(dev, cq, budget); work_done = mlx4_en_process_tx_cq(dev, cq, budget);
if (!clean_complete) if (work_done >= budget)
return budget; return budget;
napi_complete(napi); if (napi_complete_done(napi, work_done))
mlx4_en_arm_cq(priv, cq); mlx4_en_arm_cq(priv, cq);
return 0; return 0;
......
...@@ -737,7 +737,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, ...@@ -737,7 +737,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev,
int budget); int budget);
int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget); int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget);
int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget); int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget);
bool mlx4_en_process_tx_cq(struct net_device *dev, int mlx4_en_process_tx_cq(struct net_device *dev,
struct mlx4_en_cq *cq, int napi_budget); struct mlx4_en_cq *cq, int napi_budget);
u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_tx_ring *ring, struct mlx4_en_tx_ring *ring,
......
...@@ -329,6 +329,7 @@ struct napi_struct { ...@@ -329,6 +329,7 @@ struct napi_struct {
unsigned long state; unsigned long state;
int weight; int weight;
int defer_hard_irqs_count;
unsigned long gro_bitmask; unsigned long gro_bitmask;
int (*poll)(struct napi_struct *, int); int (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL #ifdef CONFIG_NETPOLL
...@@ -1995,6 +1996,7 @@ struct net_device { ...@@ -1995,6 +1996,7 @@ struct net_device {
struct bpf_prog __rcu *xdp_prog; struct bpf_prog __rcu *xdp_prog;
unsigned long gro_flush_timeout; unsigned long gro_flush_timeout;
int napi_defer_hard_irqs;
rx_handler_func_t __rcu *rx_handler; rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data; void __rcu *rx_handler_data;
......
...@@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); ...@@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
bool napi_complete_done(struct napi_struct *n, int work_done) bool napi_complete_done(struct napi_struct *n, int work_done)
{ {
unsigned long flags, val, new; unsigned long flags, val, new, timeout = 0;
bool ret = true;
/* /*
* 1) Don't let napi dequeue from the cpu poll list * 1) Don't let napi dequeue from the cpu poll list
...@@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) ...@@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL))) NAPIF_STATE_IN_BUSY_POLL)))
return false; return false;
if (work_done) {
if (n->gro_bitmask)
timeout = READ_ONCE(n->dev->gro_flush_timeout);
n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
timeout = READ_ONCE(n->dev->gro_flush_timeout);
if (timeout)
ret = false;
}
if (n->gro_bitmask) { if (n->gro_bitmask) {
unsigned long timeout = 0;
if (work_done)
timeout = n->dev->gro_flush_timeout;
/* When the NAPI instance uses a timeout and keeps postponing /* When the NAPI instance uses a timeout and keeps postponing
* it, we need to bound somehow the time packets are kept in * it, we need to bound somehow the time packets are kept in
* the GRO layer * the GRO layer
*/ */
napi_gro_flush(n, !!timeout); napi_gro_flush(n, !!timeout);
if (timeout)
hrtimer_start(&n->timer, ns_to_ktime(timeout),
HRTIMER_MODE_REL_PINNED);
} }
gro_normal_list(n); gro_normal_list(n);
...@@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) ...@@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
return false; return false;
} }
return true; if (timeout)
hrtimer_start(&n->timer, ns_to_ktime(timeout),
HRTIMER_MODE_REL_PINNED);
return ret;
} }
EXPORT_SYMBOL(napi_complete_done); EXPORT_SYMBOL(napi_complete_done);
...@@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) ...@@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting /* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ. * NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/ */
if (napi->gro_bitmask && !napi_disable_pending(napi) && if (!napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
__napi_schedule_irqoff(napi); __napi_schedule_irqoff(napi);
......
...@@ -367,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); ...@@ -367,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{ {
dev->gro_flush_timeout = val; WRITE_ONCE(dev->gro_flush_timeout, val);
return 0; return 0;
} }
...@@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, ...@@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
} }
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
/*
 * Setter for dev->napi_defer_hard_irqs, passed as the callback to
 * netdev_store() by napi_defer_hard_irqs_store().
 *
 * @dev: device whose napi_defer_hard_irqs value is updated
 * @val: requested value, already parsed as an unsigned long
 *
 * The destination field is a plain int, so anything above INT_MAX would be
 * narrowed on assignment and typically end up negative. A negative count
 * never satisfies the "defer_hard_irqs_count > 0" test in
 * napi_complete_done(), which would silently turn the feature off while
 * sysfs reports a bogus negative number. Reject such values instead.
 *
 * Returns 0 on success, -ERANGE if @val does not fit in an int.
 * NOTE(review): assumes netdev_store() hands a non-zero return back to the
 * writer - confirm against its definition.
 */
static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
{
	if (val > INT_MAX)
		return -ERANGE;

	/* Paired with the READ_ONCE() of this field in napi_complete_done(). */
	WRITE_ONCE(dev->napi_defer_hard_irqs, val);
	return 0;
}
/*
 * sysfs store handler for the napi_defer_hard_irqs attribute.
 *
 * Callers lacking CAP_NET_ADMIN get -EPERM. Otherwise the write is handed
 * to netdev_store() with change_napi_defer_hard_irqs() as the setter, and
 * whatever netdev_store() returns is passed back unchanged.
 */
static ssize_t napi_defer_hard_irqs_store(struct device *dev,
					  struct device_attribute *attr,
					  const char *buf, size_t len)
{
	if (capable(CAP_NET_ADMIN))
		return netdev_store(dev, attr, buf, len,
				    change_napi_defer_hard_irqs);

	return -EPERM;
}
NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len) const char *buf, size_t len)
{ {
...@@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { ...@@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_flags.attr, &dev_attr_flags.attr,
&dev_attr_tx_queue_len.attr, &dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr, &dev_attr_gro_flush_timeout.attr,
&dev_attr_napi_defer_hard_irqs.attr,
&dev_attr_phys_port_id.attr, &dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr, &dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr, &dev_attr_phys_switch_id.attr,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment