Commit 69cf8730 authored by Jakub Kicinski

Merge branch '200GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue

Tony Nguyen says:

====================
idpf: XDP chapter I: convert Rx to libeth

Alexander Lobakin says:

XDP for idpf is currently 5 chapters:
* convert Rx to libeth (this);
* convert Tx and stats to libeth;
* generic XDP and XSk code changes, libeth_xdp;
* actual XDP for idpf via libeth_xdp;
* XSk for idpf (^).

Part I does the following:
* splits &idpf_queue into 4 (RQ, SQ, FQ, CQ) and puts them on a diet;
* ensures optimal cacheline placement, strictly asserts CL sizes;
* moves currently unused/dead singleq mode out of line;
* reuses libeth's Rx ptype definitions and helpers;
* uses libeth's Rx buffer management for both header and payload;
* eliminates memcpy()s and coherent DMA uses on hotpath, uses
  napi_build_skb() instead of in-place short skb allocation.

Most idpf patches, except for the queue split, remove more lines than
they add.

Expect far better memory utilization and +5-8% on Rx depending on
the case (+17% on skb XDP_DROP :>).
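As a rough illustration of what the libeth Rx conversion asks of a driver,
here is a minimal sketch built only from the struct libeth_fq fields and the
libeth_rx_fq_create() signature visible in the hunks below; the example_rxq
structure, its fields and the libeth_rx_fq_destroy() teardown call are
illustrative assumptions, not code from this series.

/* Hypothetical consumer of the libeth fill-queue API (not part of this
 * series). One fill queue is sized from the MTU for payload buffers, a
 * second one holds LIBETH_MAX_HEAD-sized buffers when header split is on.
 */
static int example_rxq_init_fqs(struct example_rxq *rxq,
				struct napi_struct *napi)
{
	struct libeth_fq payload = {
		.count	= rxq->desc_count,	/* ring length */
		.type	= LIBETH_FQE_MTU,	/* size buffers from the MTU */
		.hsplit	= rxq->hsplit_ena,	/* header split enabled? */
		.nid	= NUMA_NO_NODE,
		/* .buf_len left 0: no extra cap from the driver (assumption) */
	};
	struct libeth_fq hdr = {
		.count	= rxq->desc_count,
		.type	= LIBETH_FQE_HDR,	/* LIBETH_MAX_HEAD-sized buffers */
		.nid	= NUMA_NO_NODE,
	};
	int err;

	err = libeth_rx_fq_create(&payload, napi);
	if (err)
		return err;

	/* libeth has now filled ->buf_len and ->truesize; the driver passes
	 * buf_len to the HW and uses truesize when fragmenting pages.
	 */
	rxq->rx_buf_size = payload.buf_len;

	if (!rxq->hsplit_ena)
		return 0;

	err = libeth_rx_fq_create(&hdr, napi);
	if (err)
		libeth_rx_fq_destroy(&payload);	/* assumed teardown helper */

	return err;
}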

* '200GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  idpf: use libeth Rx buffer management for payload buffer
  idpf: convert header split mode to libeth + napi_build_skb()
  libeth: support different types of buffers for Rx
  idpf: remove legacy Page Pool Ethtool stats
  idpf: reuse libeth's definitions of parsed ptype structures
  idpf: compile singleq code only under default-n CONFIG_IDPF_SINGLEQ
  idpf: merge singleq and splitq &net_device_ops
  idpf: strictly assert cachelines of queue and queue vector structures
  idpf: avoid bloating &idpf_q_vector with big %NR_CPUS
  idpf: split &idpf_queue into 4 strictly-typed queue structures
  idpf: stop using macros for accessing queue descriptors
  libeth: add cacheline / struct layout assertion helpers
  page_pool: use __cacheline_group_{begin, end}_aligned()
  cache: add __cacheline_group_{begin, end}_aligned() (+ couple more)
====================

Link: https://patch.msgid.link/20240710203031.188081-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 26f45317 74d1412a
@@ -384,17 +384,6 @@ config IGC_LEDS
Optional support for controlling the NIC LED's with the netdev
LED trigger.
config IDPF
tristate "Intel(R) Infrastructure Data Path Function Support"
depends on PCI_MSI
select DIMLIB
select PAGE_POOL
select PAGE_POOL_STATS
help
This driver supports Intel(R) Infrastructure Data Path Function
devices.
To compile this driver as a module, choose M here. The module
will be called idpf.
source "drivers/net/ethernet/intel/idpf/Kconfig"
endif # NET_VENDOR_INTEL
# SPDX-License-Identifier: GPL-2.0-only
# Copyright (C) 2024 Intel Corporation
config IDPF
tristate "Intel(R) Infrastructure Data Path Function Support"
depends on PCI_MSI
select DIMLIB
select LIBETH
help
This driver supports Intel(R) Infrastructure Data Path Function
devices.
To compile this driver as a module, choose M here. The module
will be called idpf.
if IDPF
config IDPF_SINGLEQ
bool "idpf singleq support"
help
This option enables support for legacy single Rx/Tx queues w/no
completion and fill queues. Only enable if you have hardware which
wants to work in this mode as it increases the driver size and adds
runtime checks on hotpath.
endif # IDPF
@@ -12,7 +12,8 @@ idpf-y := \
idpf_ethtool.o \
idpf_lib.o \
idpf_main.o \
idpf_singleq_txrx.o \
idpf_txrx.o \
idpf_virtchnl.o \
idpf_vf_dev.o
idpf-$(CONFIG_IDPF_SINGLEQ) += idpf_singleq_txrx.o
@@ -17,10 +17,8 @@ struct idpf_vport_max_q;
#include <linux/sctp.h>
#include <linux/ethtool_netlink.h>
#include <net/gro.h>
#include <linux/dim.h>
#include "virtchnl2.h"
#include "idpf_lan_txrx.h"
#include "idpf_txrx.h"
#include "idpf_controlq.h"
@@ -266,7 +264,6 @@ struct idpf_port_stats {
* the worst case.
* @num_bufqs_per_qgrp: Buffer queues per RX queue in a given grouping
* @bufq_desc_count: Buffer queue descriptor count
* @bufq_size: Size of buffers in ring (e.g. 2K, 4K, etc)
* @num_rxq_grp: Number of RX queues in a group
* @rxq_grps: Total number of RX groups. Number of groups * number of RX per
* group will yield total number of RX queues.
@@ -302,7 +299,7 @@ struct idpf_vport {
u16 num_txq_grp;
struct idpf_txq_group *txq_grps;
u32 txq_model;
struct idpf_queue **txqs;
struct idpf_tx_queue **txqs;
bool crc_enable;
u16 num_rxq;
@@ -310,11 +307,10 @@ struct idpf_vport {
u32 rxq_desc_count;
u8 num_bufqs_per_qgrp;
u32 bufq_desc_count[IDPF_MAX_BUFQS_PER_RXQ_GRP];
u32 bufq_size[IDPF_MAX_BUFQS_PER_RXQ_GRP];
u16 num_rxq_grp;
struct idpf_rxq_group *rxq_grps;
u32 rxq_model;
struct idpf_rx_ptype_decoded rx_ptype_lkup[IDPF_RX_MAX_PTYPE];
struct libeth_rx_pt *rx_ptype_lkup;
struct idpf_adapter *adapter;
struct net_device *netdev;
@@ -601,7 +597,8 @@ struct idpf_adapter {
*/
static inline int idpf_is_queue_model_split(u16 q_model)
{
return q_model == VIRTCHNL2_QUEUE_MODEL_SPLIT;
return !IS_ENABLED(CONFIG_IDPF_SINGLEQ) ||
q_model == VIRTCHNL2_QUEUE_MODEL_SPLIT;
}
#define idpf_is_cap_ena(adapter, field, flag) \
......
@@ -4,6 +4,8 @@
#ifndef _IDPF_LAN_TXRX_H_
#define _IDPF_LAN_TXRX_H_
#include <linux/bits.h>
enum idpf_rss_hash {
IDPF_HASH_INVALID = 0,
/* Values 1 - 28 are reserved for future use */
......
@@ -4,8 +4,7 @@
#include "idpf.h"
#include "idpf_virtchnl.h"
static const struct net_device_ops idpf_netdev_ops_splitq;
static const struct net_device_ops idpf_netdev_ops_singleq;
static const struct net_device_ops idpf_netdev_ops;
/**
* idpf_init_vector_stack - Fill the MSIX vector stack with vector index
@@ -69,7 +68,7 @@ static void idpf_deinit_vector_stack(struct idpf_adapter *adapter)
static void idpf_mb_intr_rel_irq(struct idpf_adapter *adapter)
{
clear_bit(IDPF_MB_INTR_MODE, adapter->flags);
free_irq(adapter->msix_entries[0].vector, adapter);
kfree(free_irq(adapter->msix_entries[0].vector, adapter));
queue_delayed_work(adapter->mbx_wq, &adapter->mbx_task, 0);
}
@@ -124,15 +123,14 @@ static void idpf_mb_irq_enable(struct idpf_adapter *adapter)
*/
static int idpf_mb_intr_req_irq(struct idpf_adapter *adapter)
{
struct idpf_q_vector *mb_vector = &adapter->mb_vector;
int irq_num, mb_vidx = 0, err;
char *name;
irq_num = adapter->msix_entries[mb_vidx].vector;
mb_vector->name = kasprintf(GFP_KERNEL, "%s-%s-%d",
name = kasprintf(GFP_KERNEL, "%s-%s-%d",
dev_driver_string(&adapter->pdev->dev),
"Mailbox", mb_vidx);
err = request_irq(irq_num, adapter->irq_mb_handler, 0,
mb_vector->name, adapter);
err = request_irq(irq_num, adapter->irq_mb_handler, 0, name, adapter);
if (err) {
dev_err(&adapter->pdev->dev,
"IRQ request for mailbox failed, error: %d\n", err);
@@ -765,10 +763,7 @@ static int idpf_cfg_netdev(struct idpf_vport *vport)
}
/* assign netdev_ops */
if (idpf_is_queue_model_split(vport->txq_model))
netdev->netdev_ops = &idpf_netdev_ops_splitq;
else
netdev->netdev_ops = &idpf_netdev_ops_singleq;
netdev->netdev_ops = &idpf_netdev_ops;
/* setup watchdog timeout value to be 5 second */
netdev->watchdog_timeo = 5 * HZ;
@@ -946,6 +941,9 @@ static void idpf_decfg_netdev(struct idpf_vport *vport)
{
struct idpf_adapter *adapter = vport->adapter;
kfree(vport->rx_ptype_lkup);
vport->rx_ptype_lkup = NULL;
unregister_netdev(vport->netdev);
free_netdev(vport->netdev);
vport->netdev = NULL;
@@ -1318,14 +1316,14 @@ static void idpf_rx_init_buf_tail(struct idpf_vport *vport)
if (idpf_is_queue_model_split(vport->rxq_model)) {
for (j = 0; j < vport->num_bufqs_per_qgrp; j++) {
struct idpf_queue *q =
const struct idpf_buf_queue *q =
&grp->splitq.bufq_sets[j].bufq;
writel(q->next_to_alloc, q->tail);
}
} else {
for (j = 0; j < grp->singleq.num_rxq; j++) {
struct idpf_queue *q =
const struct idpf_rx_queue *q =
grp->singleq.rxqs[j];
writel(q->next_to_alloc, q->tail);
@@ -1855,7 +1853,7 @@ int idpf_initiate_soft_reset(struct idpf_vport *vport,
enum idpf_vport_state current_state = np->state;
struct idpf_adapter *adapter = vport->adapter;
struct idpf_vport *new_vport;
int err, i;
int err;
/* If the system is low on memory, we can end up in bad state if we
* free all the memory for queue resources and try to allocate them
@@ -1929,46 +1927,6 @@ int idpf_initiate_soft_reset(struct idpf_vport *vport,
*/
memcpy(vport, new_vport, offsetof(struct idpf_vport, link_speed_mbps));
/* Since idpf_vport_queues_alloc was called with new_port, the queue
* back pointers are currently pointing to the local new_vport. Reset
* the backpointers to the original vport here
*/
for (i = 0; i < vport->num_txq_grp; i++) {
struct idpf_txq_group *tx_qgrp = &vport->txq_grps[i];
int j;
tx_qgrp->vport = vport;
for (j = 0; j < tx_qgrp->num_txq; j++)
tx_qgrp->txqs[j]->vport = vport;
if (idpf_is_queue_model_split(vport->txq_model))
tx_qgrp->complq->vport = vport;
}
for (i = 0; i < vport->num_rxq_grp; i++) {
struct idpf_rxq_group *rx_qgrp = &vport->rxq_grps[i];
struct idpf_queue *q;
u16 num_rxq;
int j;
rx_qgrp->vport = vport;
for (j = 0; j < vport->num_bufqs_per_qgrp; j++)
rx_qgrp->splitq.bufq_sets[j].bufq.vport = vport;
if (idpf_is_queue_model_split(vport->rxq_model))
num_rxq = rx_qgrp->splitq.num_rxq_sets;
else
num_rxq = rx_qgrp->singleq.num_rxq;
for (j = 0; j < num_rxq; j++) {
if (idpf_is_queue_model_split(vport->rxq_model))
q = &rx_qgrp->splitq.rxq_sets[j]->rxq;
else
q = rx_qgrp->singleq.rxqs[j];
q->vport = vport;
}
}
if (reset_cause == IDPF_SR_Q_CHANGE)
idpf_vport_alloc_vec_indexes(vport);
@@ -2393,24 +2351,10 @@ void idpf_free_dma_mem(struct idpf_hw *hw, struct idpf_dma_mem *mem)
mem->pa = 0;
}
static const struct net_device_ops idpf_netdev_ops_splitq = {
.ndo_open = idpf_open,
.ndo_stop = idpf_stop,
.ndo_start_xmit = idpf_tx_splitq_start,
.ndo_features_check = idpf_features_check,
.ndo_set_rx_mode = idpf_set_rx_mode,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = idpf_set_mac,
.ndo_change_mtu = idpf_change_mtu,
.ndo_get_stats64 = idpf_get_stats64,
.ndo_set_features = idpf_set_features,
.ndo_tx_timeout = idpf_tx_timeout,
};
static const struct net_device_ops idpf_netdev_ops_singleq = {
static const struct net_device_ops idpf_netdev_ops = {
.ndo_open = idpf_open,
.ndo_stop = idpf_stop,
.ndo_start_xmit = idpf_tx_singleq_start,
.ndo_start_xmit = idpf_tx_start,
.ndo_features_check = idpf_features_check,
.ndo_set_rx_mode = idpf_set_rx_mode,
.ndo_validate_addr = eth_validate_addr,
......
@@ -8,6 +8,7 @@
#define DRV_SUMMARY "Intel(R) Infrastructure Data Path Function Linux Driver"
MODULE_DESCRIPTION(DRV_SUMMARY);
MODULE_IMPORT_NS(LIBETH);
MODULE_LICENSE("GPL");
/**
......
This diff is collapsed.
This diff is collapsed.
@@ -6,7 +6,7 @@
/* Rx buffer management */
/**
* libeth_rx_hw_len - get the actual buffer size to be passed to HW
* libeth_rx_hw_len_mtu - get the actual buffer size to be passed to HW
* @pp: &page_pool_params of the netdev to calculate the size for
* @max_len: maximum buffer size for a single descriptor
*
@@ -14,7 +14,7 @@
* MTU the @dev has, HW required alignment, minimum and maximum allowed values,
* and system's page size.
*/
static u32 libeth_rx_hw_len(const struct page_pool_params *pp, u32 max_len)
static u32 libeth_rx_hw_len_mtu(const struct page_pool_params *pp, u32 max_len)
{
u32 len;
@@ -26,6 +26,118 @@ static u32 libeth_rx_hw_len(const struct page_pool_params *pp, u32 max_len)
return len;
}
/**
* libeth_rx_hw_len_truesize - get the short buffer size to be passed to HW
* @pp: &page_pool_params of the netdev to calculate the size for
* @max_len: maximum buffer size for a single descriptor
* @truesize: desired truesize for the buffers
*
* Return: HW-writeable length per one buffer to pass it to the HW ignoring the
* MTU and closest to the passed truesize. Can be used for "short" buffer
* queues to fragment pages more efficiently.
*/
static u32 libeth_rx_hw_len_truesize(const struct page_pool_params *pp,
u32 max_len, u32 truesize)
{
u32 min, len;
min = SKB_HEAD_ALIGN(pp->offset + LIBETH_RX_BUF_STRIDE);
truesize = clamp(roundup_pow_of_two(truesize), roundup_pow_of_two(min),
PAGE_SIZE << LIBETH_RX_PAGE_ORDER);
len = SKB_WITH_OVERHEAD(truesize - pp->offset);
len = ALIGN_DOWN(len, LIBETH_RX_BUF_STRIDE) ? : LIBETH_RX_BUF_STRIDE;
len = min3(len, ALIGN_DOWN(max_len ? : U32_MAX, LIBETH_RX_BUF_STRIDE),
pp->max_len);
return len;
}
/**
* libeth_rx_page_pool_params - calculate params with the stack overhead
* @fq: buffer queue to calculate the size for
* @pp: &page_pool_params of the netdev
*
* Set the PP params to fit all needed stack overhead (headroom, tailroom) and
* both the HW buffer length and the truesize for all types of buffers. For
* "short" buffers, truesize never exceeds the "wanted" one; for the rest,
* it can be up to the page size.
*
* Return: true on success, false on invalid input params.
*/
static bool libeth_rx_page_pool_params(struct libeth_fq *fq,
struct page_pool_params *pp)
{
pp->offset = LIBETH_SKB_HEADROOM;
/* HW-writeable / syncable length per one page */
pp->max_len = LIBETH_RX_PAGE_LEN(pp->offset);
/* HW-writeable length per buffer */
switch (fq->type) {
case LIBETH_FQE_MTU:
fq->buf_len = libeth_rx_hw_len_mtu(pp, fq->buf_len);
break;
case LIBETH_FQE_SHORT:
fq->buf_len = libeth_rx_hw_len_truesize(pp, fq->buf_len,
fq->truesize);
break;
case LIBETH_FQE_HDR:
fq->buf_len = ALIGN(LIBETH_MAX_HEAD, LIBETH_RX_BUF_STRIDE);
break;
default:
return false;
}
/* Buffer size to allocate */
fq->truesize = roundup_pow_of_two(SKB_HEAD_ALIGN(pp->offset +
fq->buf_len));
return true;
}
/**
* libeth_rx_page_pool_params_zc - calculate params without the stack overhead
* @fq: buffer queue to calculate the size for
* @pp: &page_pool_params of the netdev
*
* Set the PP params to exclude the stack overhead and both the buffer length
* and the truesize, which are equal for the data buffers. Note that this
* requires separate header buffers to be always active and account the
* overhead.
* With the MTU == ``PAGE_SIZE``, this allows the kernel to enable the zerocopy
* mode.
*
* Return: true on success, false on invalid input params.
*/
static bool libeth_rx_page_pool_params_zc(struct libeth_fq *fq,
struct page_pool_params *pp)
{
u32 mtu, max;
pp->offset = 0;
pp->max_len = PAGE_SIZE << LIBETH_RX_PAGE_ORDER;
switch (fq->type) {
case LIBETH_FQE_MTU:
mtu = READ_ONCE(pp->netdev->mtu);
break;
case LIBETH_FQE_SHORT:
mtu = fq->truesize;
break;
default:
return false;
}
mtu = roundup_pow_of_two(mtu);
max = min(rounddown_pow_of_two(fq->buf_len ? : U32_MAX),
pp->max_len);
fq->buf_len = clamp(mtu, LIBETH_RX_BUF_STRIDE, max);
fq->truesize = fq->buf_len;
return true;
}
/**
* libeth_rx_fq_create - create a PP with the default libeth settings
* @fq: buffer queue struct to fill
@@ -44,19 +156,17 @@ int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi)
.netdev = napi->dev,
.napi = napi,
.dma_dir = DMA_FROM_DEVICE,
.offset = LIBETH_SKB_HEADROOM,
};
struct libeth_fqe *fqes;
struct page_pool *pool;
bool ret;
/* HW-writeable / syncable length per one page */
pp.max_len = LIBETH_RX_PAGE_LEN(pp.offset);
/* HW-writeable length per buffer */
fq->buf_len = libeth_rx_hw_len(&pp, fq->buf_len);
/* Buffer size to allocate */
fq->truesize = roundup_pow_of_two(SKB_HEAD_ALIGN(pp.offset +
fq->buf_len));
if (!fq->hsplit)
ret = libeth_rx_page_pool_params(fq, &pp);
else
ret = libeth_rx_page_pool_params_zc(fq, &pp);
if (!ret)
return -EINVAL;
pool = page_pool_create(&pp);
if (IS_ERR(pool))
......
@@ -13,6 +13,32 @@
#define SMP_CACHE_BYTES L1_CACHE_BYTES
#endif
/**
* SMP_CACHE_ALIGN - align a value to the L2 cacheline size
* @x: value to align
*
* On some architectures, L2 ("SMP") CL size is bigger than L1, and sometimes,
* this needs to be accounted.
*
* Return: aligned value.
*/
#ifndef SMP_CACHE_ALIGN
#define SMP_CACHE_ALIGN(x) ALIGN(x, SMP_CACHE_BYTES)
#endif
/*
* ``__aligned_largest`` aligns a field to the value most optimal for the
* target architecture to perform memory operations. Get the actual value
* to be able to use it anywhere else.
*/
#ifndef __LARGEST_ALIGN
#define __LARGEST_ALIGN sizeof(struct { long x; } __aligned_largest)
#endif
#ifndef LARGEST_ALIGN
#define LARGEST_ALIGN(x) ALIGN(x, __LARGEST_ALIGN)
#endif
/*
* __read_mostly is used to keep rarely changing variables out of frequently
* updated cachelines. Its use should be reserved for data that is used
@@ -95,6 +121,39 @@
__u8 __cacheline_group_end__##GROUP[0]
#endif
/**
* __cacheline_group_begin_aligned - declare an aligned group start
* @GROUP: name of the group
* @...: optional group alignment
*
* The following block inside a struct:
*
* __cacheline_group_begin_aligned(grp);
* field a;
* field b;
* __cacheline_group_end_aligned(grp);
*
* will always be aligned to either the specified alignment or
* ``SMP_CACHE_BYTES``.
*/
#define __cacheline_group_begin_aligned(GROUP, ...) \
__cacheline_group_begin(GROUP) \
__aligned((__VA_ARGS__ + 0) ? : SMP_CACHE_BYTES)
/**
* __cacheline_group_end_aligned - declare an aligned group end
* @GROUP: name of the group
* @...: optional alignment (same as was in __cacheline_group_begin_aligned())
*
* Note that the end marker is aligned to sizeof(long) to allow more precise
* size assertion. It also declares a padding at the end to avoid next field
* falling into this cacheline.
*/
#define __cacheline_group_end_aligned(GROUP, ...) \
__cacheline_group_end(GROUP) __aligned(sizeof(long)); \
struct { } __cacheline_group_pad__##GROUP \
__aligned((__VA_ARGS__ + 0) ? : SMP_CACHE_BYTES)
#ifndef CACHELINE_ASSERT_GROUP_MEMBER
#define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \
BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \
......
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (C) 2024 Intel Corporation */
#ifndef __LIBETH_CACHE_H
#define __LIBETH_CACHE_H
#include <linux/cache.h>
/**
* libeth_cacheline_group_assert - make sure cacheline group size is expected
* @type: type of the structure containing the group
* @grp: group name inside the struct
* @sz: expected group size
*/
#if defined(CONFIG_64BIT) && SMP_CACHE_BYTES == 64
#define libeth_cacheline_group_assert(type, grp, sz) \
static_assert(offsetof(type, __cacheline_group_end__##grp) - \
offsetofend(type, __cacheline_group_begin__##grp) == \
(sz))
#define __libeth_cacheline_struct_assert(type, sz) \
static_assert(sizeof(type) == (sz))
#else /* !CONFIG_64BIT || SMP_CACHE_BYTES != 64 */
#define libeth_cacheline_group_assert(type, grp, sz) \
static_assert(offsetof(type, __cacheline_group_end__##grp) - \
offsetofend(type, __cacheline_group_begin__##grp) <= \
(sz))
#define __libeth_cacheline_struct_assert(type, sz) \
static_assert(sizeof(type) <= (sz))
#endif /* !CONFIG_64BIT || SMP_CACHE_BYTES != 64 */
#define __libeth_cls1(sz1) SMP_CACHE_ALIGN(sz1)
#define __libeth_cls2(sz1, sz2) (SMP_CACHE_ALIGN(sz1) + SMP_CACHE_ALIGN(sz2))
#define __libeth_cls3(sz1, sz2, sz3) \
(SMP_CACHE_ALIGN(sz1) + SMP_CACHE_ALIGN(sz2) + SMP_CACHE_ALIGN(sz3))
#define __libeth_cls(...) \
CONCATENATE(__libeth_cls, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
/**
* libeth_cacheline_struct_assert - make sure CL-based struct size is expected
* @type: type of the struct
* @...: from 1 to 3 CL group sizes (read-mostly, read-write, cold)
*
* When a struct contains several CL groups, it's difficult to predict its size
* on different architectures. The macro instead takes sizes of all of the
* groups the structure contains and generates the final struct size.
*/
#define libeth_cacheline_struct_assert(type, ...) \
__libeth_cacheline_struct_assert(type, __libeth_cls(__VA_ARGS__)); \
static_assert(__alignof(type) >= SMP_CACHE_BYTES)
/**
* libeth_cacheline_set_assert - make sure CL-based struct layout is expected
* @type: type of the struct
* @ro: expected size of the read-mostly group
* @rw: expected size of the read-write group
* @c: expected size of the cold group
*
* Check that each group size is expected and then do final struct size check.
*/
#define libeth_cacheline_set_assert(type, ro, rw, c) \
libeth_cacheline_group_assert(type, read_mostly, ro); \
libeth_cacheline_group_assert(type, read_write, rw); \
libeth_cacheline_group_assert(type, cold, c); \
libeth_cacheline_struct_assert(type, ro, rw, c)
#endif /* __LIBETH_CACHE_H */
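Illustrative usage, not taken from this series: a hypothetical structure laid
out in the three groups the assertions above expect. The field set and the
16/8/8 group sizes are assumptions that hold for a 64-bit build; on other
configurations the macros deliberately relax the checks to upper bounds.

/* Made-up example pairing the <linux/cache.h> group markers with the
 * libeth assertion helpers; sizes annotated for a 64-bit build.
 */
struct example_rx_queue {
	__cacheline_group_begin_aligned(read_mostly);
	void __iomem *tail;	/* 8 bytes */
	u32 desc_count;		/* 4 bytes */
	u32 buf_len;		/* 4 bytes -> 16-byte group */
	__cacheline_group_end_aligned(read_mostly);

	__cacheline_group_begin_aligned(read_write);
	u32 next_to_use;
	u32 next_to_clean;	/* 8-byte group */
	__cacheline_group_end_aligned(read_write);

	__cacheline_group_begin_aligned(cold);
	dma_addr_t dma;		/* 8-byte group */
	__cacheline_group_end_aligned(cold);
};

/* Each group size is checked first, then the total struct size is derived
 * from the per-group sizes rounded up to SMP_CACHE_BYTES each.
 */
libeth_cacheline_set_assert(struct example_rx_queue, 16, 8, 8);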
@@ -17,6 +17,8 @@
#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM
/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */
#define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN)
/* Maximum supported L2-L4 header length */
#define LIBETH_MAX_HEAD roundup_pow_of_two(max(MAX_HEADER, 256))
/* Always use order-0 pages */
#define LIBETH_RX_PAGE_ORDER 0
@@ -43,6 +45,18 @@ struct libeth_fqe {
u32 truesize;
} __aligned_largest;
/**
* enum libeth_fqe_type - enum representing types of Rx buffers
* @LIBETH_FQE_MTU: buffer size is determined by MTU
* @LIBETH_FQE_SHORT: buffer size is smaller than MTU, for short frames
* @LIBETH_FQE_HDR: buffer size is ``LIBETH_MAX_HEAD``-sized, for headers
*/
enum libeth_fqe_type {
LIBETH_FQE_MTU = 0U,
LIBETH_FQE_SHORT,
LIBETH_FQE_HDR,
};
/**
* struct libeth_fq - structure representing a buffer (fill) queue
* @fp: hotpath part of the structure
@@ -50,6 +64,8 @@ struct libeth_fqe {
* @fqes: array of Rx buffers
* @truesize: size to allocate per buffer, w/overhead
* @count: number of descriptors/buffers the queue has
* @type: type of the buffers this queue has
* @hsplit: flag whether header split is enabled
* @buf_len: HW-writeable length per each buffer
* @nid: ID of the closest NUMA node with memory
*/
@@ -63,6 +79,9 @@ struct libeth_fq {
);
/* Cold fields */
enum libeth_fqe_type type:2;
bool hsplit:1;
u32 buf_len;
int nid;
};
......
@@ -129,6 +129,16 @@ struct page_pool_stats {
};
#endif
/* The whole frag API block must stay within one cacheline. On 32-bit systems,
* sizeof(long) == sizeof(int), so that the block size is ``3 * sizeof(long)``.
* On 64-bit systems, the actual size is ``2 * sizeof(long) + sizeof(int)``.
* The closest pow-2 to both of them is ``4 * sizeof(long)``, so just use that
* one for simplicity.
* Having it aligned to a cacheline boundary may be excessive and doesn't bring
* any good.
*/
#define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long))
struct page_pool {
struct page_pool_params_fast p;
@@ -142,19 +152,11 @@ struct page_pool {
bool system:1; /* This is a global percpu pool */
#endif
/* The following block must stay within one cacheline. On 32-bit
* systems, sizeof(long) == sizeof(int), so that the block size is
* ``3 * sizeof(long)``. On 64-bit systems, the actual size is
* ``2 * sizeof(long) + sizeof(int)``. The closest pow-2 to both of
* them is ``4 * sizeof(long)``, so just use that one for simplicity.
* Having it aligned to a cacheline boundary may be excessive and
* doesn't bring any good.
*/
__cacheline_group_begin(frag) __aligned(4 * sizeof(long));
__cacheline_group_begin_aligned(frag, PAGE_POOL_FRAG_GROUP_ALIGN);
long frag_users;
netmem_ref frag_page;
unsigned int frag_offset;
__cacheline_group_end(frag);
__cacheline_group_end_aligned(frag, PAGE_POOL_FRAG_GROUP_ALIGN);
struct delayed_work release_dw;
void (*disconnect)(void *pool);
......
@@ -178,7 +178,8 @@ static void page_pool_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, 4 * sizeof(long));
CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
PAGE_POOL_FRAG_GROUP_ALIGN);
}
static int page_pool_init(struct page_pool *pool,
......