Commit e61caf04 authored by Jakub Kicinski

Merge branch 'page_pool-allow-caching-from-safely-localized-napi'

Jakub Kicinski says:

====================
page_pool: allow caching from safely localized NAPI

I went back to the explicit "are we in NAPI" method, mostly
because I don't like having both around :( (even tho I maintain
that in_softirq() && !in_hardirq() is as safe, as softirqs do
not nest).
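
Concretely, the recycling path (see the page_pool.c hunk below) only
allows a lockless, direct recycle when both conditions line up:

	allow_direct = napi_safe && napi &&
		READ_ONCE(napi->list_owner) == smp_processor_id();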

Still returning the skbs to a CPU, tho, not to the NAPI instance.
I reckon we could create a small refcounted struct per NAPI instance
which would allow sockets and other users to hold a persistent
and safe reference. But that's a bigger change, and I get 90+%
recycling thru the cache with just these patches (for RR and
streaming tests with 100% CPU use it's almost 100%).

Some numbers for the streaming test with 100% CPU use (from the
previous version, but they really perform the same):

                        HW-GRO                     page=page
                   before       after         before       after
recycle:
cached:                 0   138669686              0   150197505
cache_full:             0      223391              0       74582
ring:           138551933     9997191      149299454           0
ring_full:              0         488           3154      127590
released_refcnt:        0           0              0           0

alloc:
fast:           136491361   148615710      146969587   150322859
slow:                1772        1799            144         105
slow_high_order:        0           0              0           0
empty:               1772        1799            144         105
refill:           2165245      156302        2332880        2128
waive:                  0           0              0           0

v1: https://lore.kernel.org/all/20230411201800.596103-1-kuba@kernel.org/
rfcv2: https://lore.kernel.org/all/20230405232100.103392-1-kuba@kernel.org/
====================

Link: https://lore.kernel.org/r/20230413042605.895677-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents c11d2e71 294e39e0
--- a/Documentation/networking/page_pool.rst
+++ b/Documentation/networking/page_pool.rst
@@ -165,6 +165,7 @@ Registration
     pp_params.pool_size = DESC_NUM;
     pp_params.nid = NUMA_NO_NODE;
     pp_params.dev = priv->dev;
+    pp_params.napi = napi; /* only if locking is tied to NAPI */
     pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
     page_pool = page_pool_create(&pp_params);

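For reference, a minimal sketch of a driver opting in (hypothetical
drv_* names; only the .napi member is new, and it must only be set if
the pool's pages are consumed from that NAPI's poll context alone):

	static int drv_create_rx_pool(struct drv_priv *priv)
	{
		struct page_pool_params pp_params = {
			.pool_size = DESC_NUM,
			.nid       = NUMA_NO_NODE,
			.dev       = priv->dev,
			.napi      = &priv->napi, /* sole consumer of pages */
			.dma_dir   = DMA_FROM_DEVICE,
		};

		priv->page_pool = page_pool_create(&pp_params);
		return PTR_ERR_OR_ZERO(priv->page_pool);
	}
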
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3211,6 +3211,7 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
 	pp.pool_size = bp->rx_ring_size;
 	pp.nid = dev_to_node(&bp->pdev->dev);
+	pp.napi = &rxr->bnapi->napi;
 	pp.dev = &bp->pdev->dev;
 	pp.dma_dir = DMA_BIDIRECTIONAL;

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -360,8 +360,11 @@ struct napi_struct {
 	unsigned long gro_bitmask;
 	int (*poll)(struct napi_struct *, int);
 #ifdef CONFIG_NETPOLL
+	/* CPU actively polling if netpoll is configured */
 	int poll_owner;
 #endif
+	/* CPU on which NAPI has been scheduled for processing */
+	int list_owner;
 	struct net_device *dev;
 	struct gro_list gro_hash[GRO_HASH_BUCKETS];
 	struct sk_buff *skb;

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
+static inline void
+napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+{
+	struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+	if (recycle && page_pool_return_skb_page(page, napi_safe))
+		return;
+#endif
+	put_page(page);
+}
+
 /**
  * __skb_frag_unref - release a reference on a paged fragment.
  * @frag: the paged fragment
@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	struct page *page = skb_frag_page(frag);
-
-#ifdef CONFIG_PAGE_POOL
-	if (recycle && page_pool_return_skb_page(page))
-		return;
-#endif
-	put_page(page);
+	napi_frag_unref(frag, recycle, false);
 }
 
 /**

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -77,6 +77,7 @@ struct page_pool_params {
 	unsigned int	pool_size;
 	int		nid;  /* Numa node id to allocate from pages from */
 	struct device	*dev; /* device, for DMA pre-mapping purposes */
+	struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
 	enum dma_data_direction dma_dir; /* DMA mapping direction */
 	unsigned int	max_len; /* max DMA sync memory size */
 	unsigned int	offset;  /* DMA addr offset */
@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
 	return pool->p.dma_dir;
 }
 
-bool page_pool_return_skb_page(struct page *page);
+bool page_pool_return_skb_page(struct page *page, bool napi_safe);
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4359,6 +4359,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 	}
 
 	list_add_tail(&napi->poll_list, &sd->poll_list);
+	WRITE_ONCE(napi->list_owner, smp_processor_id());
 	/* If not called from net_rx_action()
 	 * we have to raise NET_RX_SOFTIRQ.
 	 */
@@ -6069,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
+	WRITE_ONCE(n->list_owner, -1);
 
 	val = READ_ONCE(n->state);
 	do {
@@ -6384,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
+	napi->list_owner = -1;
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);

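Taken together: list_owner is set when the NAPI is queued to a CPU's
poll_list and cleared again in napi_complete_done(). Since softirqs do
not nest, a matching CPU observed from softirq context means the poll
cannot be running concurrently anywhere else. An illustrative helper
capturing the check (not part of the patch, hypothetical name):

	/* Illustrative only: true while the calling CPU owns this NAPI */
	static inline bool napi_on_owning_cpu(const struct napi_struct *napi)
	{
		return READ_ONCE(napi->list_owner) == smp_processor_id();
	}
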
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h> /* for put_page() */
 #include <linux/poison.h>
 #include <linux/ethtool.h>
+#include <linux/netdevice.h>
 
 #include <trace/events/page_pool.h>
@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
 }
 EXPORT_SYMBOL(page_pool_update_nid);
 
-bool page_pool_return_skb_page(struct page *page)
+bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 {
+	struct napi_struct *napi;
 	struct page_pool *pp;
+	bool allow_direct;
 
 	page = compound_head(page);
@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
 	pp = page->pp;
 
+	/* Allow direct recycle if we have reasons to believe that we are
+	 * in the same context as the consumer would run, so there's
+	 * no possible race.
+	 */
+	napi = pp->p.napi;
+	allow_direct = napi_safe && napi &&
+		READ_ONCE(napi->list_owner) == smp_processor_id();
+
 	/* Driver set this to memory recycling info. Reset it on recycle.
	 * This will *not* work for NIC using a split-page memory model.
	 * The page will be returned to the pool here regardless of the
	 * 'flipped' fragment being in use or not.
	 */
-	page_pool_put_full_page(pp, page, false);
+	page_pool_put_full_page(pp, page, allow_direct);
 
 	return true;
 }

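On the free side nothing new is required from drivers: completing skbs
from the poll callback via napi_consume_skb() with a non-zero budget is
what ends up passing napi_safe = true down to this function, through
the skbuff.c changes below. A hedged sketch of such a poll loop
(hypothetical drv_poll/drv_reap_tx/drv_priv names):

	static int drv_poll(struct napi_struct *napi, int budget)
	{
		struct drv_priv *priv = container_of(napi, struct drv_priv, napi);
		struct sk_buff *skb;
		int done = 0;

		while (done < budget && (skb = drv_reap_tx(priv))) {
			/* budget != 0 here, so the skb's page_pool pages
			 * may be recycled directly into the percpu cache.
			 */
			napi_consume_skb(skb, budget);
			done++;
		}

		if (done < budget)
			napi_complete_done(napi, done);
		return done;
	}
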
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -839,11 +839,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
-static bool skb_pp_recycle(struct sk_buff *skb, void *data)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
 {
 	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 		return false;
-	return page_pool_return_skb_page(virt_to_page(data));
+	return page_pool_return_skb_page(virt_to_page(data), napi_safe);
 }
 
 static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -856,12 +856,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
 	kfree(head);
 }
 
-static void skb_free_head(struct sk_buff *skb)
+static void skb_free_head(struct sk_buff *skb, bool napi_safe)
 {
 	unsigned char *head = skb->head;
 
 	if (skb->head_frag) {
-		if (skb_pp_recycle(skb, head))
+		if (skb_pp_recycle(skb, head, napi_safe))
 			return;
 		skb_free_frag(head);
 	} else {
@@ -869,7 +869,8 @@ static void skb_free_head(struct sk_buff *skb)
 	}
 }
 
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
+			     bool napi_safe)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int i;
@@ -888,13 +889,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
 
 free_head:
 	if (shinfo->frag_list)
 		kfree_skb_list_reason(shinfo->frag_list, reason);
 
-	skb_free_head(skb);
+	skb_free_head(skb, napi_safe);
 exit:
 	/* When we clone an SKB we copy the reycling bit. The pp_recycle
 	 * bit is only set on the head though, so in order to avoid races
@@ -955,11 +956,12 @@ void skb_release_head_state(struct sk_buff *skb)
 }
 
 /* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
+			    bool napi_safe)
 {
 	skb_release_head_state(skb);
 	if (likely(skb->head))
-		skb_release_data(skb, reason);
+		skb_release_data(skb, reason, napi_safe);
 }
 
 /**
@@ -973,7 +975,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
 void __kfree_skb(struct sk_buff *skb)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
 	kfree_skbmem(skb);
 }
 EXPORT_SYMBOL(__kfree_skb);
@@ -1027,7 +1029,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
 		return;
 	}
 
-	skb_release_all(skb, reason);
+	skb_release_all(skb, reason, false);
 	sa->skb_array[sa->skb_count++] = skb;
 
 	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1201,7 +1203,7 @@ EXPORT_SYMBOL(consume_skb);
 void __consume_stateless_skb(struct sk_buff *skb)
 {
 	trace_consume_skb(skb, __builtin_return_address(0));
-	skb_release_data(skb, SKB_CONSUMED);
+	skb_release_data(skb, SKB_CONSUMED, false);
 	kfree_skbmem(skb);
 }
@@ -1226,7 +1228,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 void __kfree_skb_defer(struct sk_buff *skb)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true);
 	napi_skb_cache_put(skb);
 }
@@ -1264,7 +1266,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	skb_release_all(skb, SKB_CONSUMED);
+	skb_release_all(skb, SKB_CONSUMED, !!budget);
 	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
@@ -1395,7 +1397,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
  */
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 {
-	skb_release_all(dst, SKB_CONSUMED);
+	skb_release_all(dst, SKB_CONSUMED, false);
 	return __skb_clone(dst, src);
 }
 EXPORT_SYMBOL_GPL(skb_morph);
@@ -2018,9 +2020,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
 
-		skb_release_data(skb, SKB_CONSUMED);
+		skb_release_data(skb, SKB_CONSUMED, false);
 	} else {
-		skb_free_head(skb);
+		skb_free_head(skb, false);
 	}
 	off = (data + nhead) - skb->head;
@@ -6389,12 +6391,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 			skb_frag_ref(skb, i);
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
-		skb_release_data(skb, SKB_CONSUMED);
+		skb_release_data(skb, SKB_CONSUMED, false);
 	} else {
 		/* we can reuse existing recount- all we did was
		 * relocate values
		 */
-		skb_free_head(skb);
+		skb_free_head(skb, false);
 	}
 	skb->head = data;
@@ -6529,7 +6531,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
-	skb_release_data(skb, SKB_CONSUMED);
+	skb_release_data(skb, SKB_CONSUMED, false);
 
 	skb->head = data;
 	skb->head_frag = 0;