Commit 24b8c193 authored by Jakub Kicinski

Merge branch '200GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue

Tony Nguyen says:

====================
idpf: XDP chapter II: convert Tx completion to libeth

Alexander Lobakin says:

XDP for idpf is currently 5 chapters:
* convert Rx to libeth;
* convert Tx completion to libeth (this);
* generic XDP and XSk code changes;
* actual XDP for idpf via libeth_xdp;
* XSk for idpf (^).

Part II does the following:
* adds generic libeth Tx completion routines;
* converts idpf to use generic libeth Tx comp routines;
* fixes Tx queue timeouts and robustifies Tx completion in general;
* fixes Tx event/descriptor flushes (writebacks).

Most idpf patches again remove more lines than they add.
Generic Tx completion helpers and structs are needed as libeth_xdp
(Ch. III) makes use of them. WB_ON_ITR is needed since XDPSQs won't
work without it at all. Tx queue timeout fixes are needed since,
without them, it's way easier to hit a Tx timeout event when
WB_ON_ITR is enabled.
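
As a rough illustration of what the conversion buys, a driver's Tx clean
loop after this series boils down to filling a &libeth_cq_pp on the stack
and calling libeth_tx_complete() once per buffer. This is only a sketch
using the libeth API added below; the queue type my_sq and its fields are
made up for the example:

#include <net/libeth/tx.h>

/* Hypothetical send queue built on top of libeth_sqe */
struct my_sq {
	struct libeth_sqe	*sqes;
	struct device		*dev;
	u32			desc_count;
	u32			next_to_clean;
};

/* Clean all buffers up to (but not including) @done_idx */
static u32 my_sq_clean(struct my_sq *sq, u32 done_idx, int napi_budget)
{
	struct libeth_sq_napi_stats ss = { };
	struct libeth_cq_pp cp = {
		.dev	= sq->dev,	/* device used for DMA unmapping */
		.ss	= &ss,		/* on-stack packet/byte counters */
		.napi	= napi_budget,	/* false only outside NAPI context */
	};
	u32 ntc = sq->next_to_clean;

	while (ntc != done_idx) {
		/* Unmaps/frees according to sqe->type and resets it to
		 * LIBETH_SQE_EMPTY; SKB entries also bump ss.packets/bytes.
		 */
		libeth_tx_complete(&sq->sqes[ntc], &cp);

		if (unlikely(++ntc == sq->desc_count))
			ntc = 0;
	}

	sq->next_to_clean = ntc;

	/* ss.packets / ss.bytes can now feed BQL and the queue stats */
	return ss.packets;
}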

* '200GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  idpf: enable WB_ON_ITR
  idpf: fix netdev Tx queue stop/wake
  idpf: refactor Tx completion routines
  netdevice: add netdev_tx_reset_subqueue() shorthand
  idpf: convert to libeth Tx buffer completion
  libeth: add Tx buffer completion helpers
====================

Link: https://patch.msgid.link/20240909205323.3110312-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents b2c8a506 9c4a27da
......@@ -97,8 +97,10 @@ static int idpf_intr_reg_init(struct idpf_vport *vport)
intr->dyn_ctl = idpf_get_reg_addr(adapter,
reg_vals[vec_id].dyn_ctl_reg);
intr->dyn_ctl_intena_m = PF_GLINT_DYN_CTL_INTENA_M;
intr->dyn_ctl_intena_msk_m = PF_GLINT_DYN_CTL_INTENA_MSK_M;
intr->dyn_ctl_itridx_s = PF_GLINT_DYN_CTL_ITR_INDX_S;
intr->dyn_ctl_intrvl_s = PF_GLINT_DYN_CTL_INTERVAL_S;
intr->dyn_ctl_wb_on_itr_m = PF_GLINT_DYN_CTL_WB_ON_ITR_M;
spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing,
IDPF_PF_ITR_IDX_SPACING);
......
......@@ -2,6 +2,7 @@
/* Copyright (C) 2023 Intel Corporation */
#include <net/libeth/rx.h>
#include <net/libeth/tx.h>
#include "idpf.h"
......@@ -224,6 +225,7 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,
/* record length, and DMA address */
dma_unmap_len_set(tx_buf, len, size);
dma_unmap_addr_set(tx_buf, dma, dma);
tx_buf->type = LIBETH_SQE_FRAG;
/* align size to end of page */
max_data += -dma & (IDPF_TX_MAX_READ_REQ_SIZE - 1);
......@@ -237,14 +239,17 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,
offsets,
max_data,
td_tag);
tx_desc++;
i++;
if (i == tx_q->desc_count) {
if (unlikely(++i == tx_q->desc_count)) {
tx_buf = &tx_q->tx_buf[0];
tx_desc = &tx_q->base_tx[0];
i = 0;
} else {
tx_buf++;
tx_desc++;
}
tx_buf->type = LIBETH_SQE_EMPTY;
dma += max_data;
size -= max_data;
......@@ -257,12 +262,14 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,
tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets,
size, td_tag);
tx_desc++;
i++;
if (i == tx_q->desc_count) {
if (unlikely(++i == tx_q->desc_count)) {
tx_buf = &tx_q->tx_buf[0];
tx_desc = &tx_q->base_tx[0];
i = 0;
} else {
tx_buf++;
tx_desc++;
}
size = skb_frag_size(frag);
......@@ -270,8 +277,6 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,
dma = skb_frag_dma_map(tx_q->dev, frag, 0, size,
DMA_TO_DEVICE);
tx_buf = &tx_q->tx_buf[i];
}
skb_tx_timestamp(first->skb);
......@@ -282,13 +287,13 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,
tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets,
size, td_tag);
IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i);
first->type = LIBETH_SQE_SKB;
first->rs_idx = i;
/* set next_to_watch value indicating a packet is present */
first->next_to_watch = tx_desc;
IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i);
nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx);
netdev_tx_sent_queue(nq, first->bytecount);
netdev_tx_sent_queue(nq, first->bytes);
idpf_tx_buf_hw_update(tx_q, i, netdev_xmit_more());
}
......@@ -306,8 +311,7 @@ idpf_tx_singleq_get_ctx_desc(struct idpf_tx_queue *txq)
struct idpf_base_tx_ctx_desc *ctx_desc;
int ntu = txq->next_to_use;
memset(&txq->tx_buf[ntu], 0, sizeof(struct idpf_tx_buf));
txq->tx_buf[ntu].ctx_entry = true;
txq->tx_buf[ntu].type = LIBETH_SQE_CTX;
ctx_desc = &txq->base_ctx[ntu];
......@@ -371,6 +375,10 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
IDPF_TX_DESCS_FOR_CTX)) {
idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false);
u64_stats_update_begin(&tx_q->stats_sync);
u64_stats_inc(&tx_q->q_stats.q_busy);
u64_stats_update_end(&tx_q->stats_sync);
return NETDEV_TX_BUSY;
}
......@@ -396,11 +404,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
first->skb = skb;
if (tso) {
first->gso_segs = offload.tso_segs;
first->bytecount = skb->len + ((first->gso_segs - 1) * offload.tso_hdr_len);
first->packets = offload.tso_segs;
first->bytes = skb->len + ((first->packets - 1) * offload.tso_hdr_len);
} else {
first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
first->gso_segs = 1;
first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
first->packets = 1;
}
idpf_tx_singleq_map(tx_q, first, &offload);
......@@ -420,10 +428,15 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,
int *cleaned)
{
unsigned int total_bytes = 0, total_pkts = 0;
struct libeth_sq_napi_stats ss = { };
struct idpf_base_tx_desc *tx_desc;
u32 budget = tx_q->clean_budget;
s16 ntc = tx_q->next_to_clean;
struct libeth_cq_pp cp = {
.dev = tx_q->dev,
.ss = &ss,
.napi = napi_budget,
};
struct idpf_netdev_priv *np;
struct idpf_tx_buf *tx_buf;
struct netdev_queue *nq;
......@@ -441,47 +454,26 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,
* such. We can skip this descriptor since there is no buffer
* to clean.
*/
if (tx_buf->ctx_entry) {
/* Clear this flag here to avoid stale flag values when
* this buffer is used for actual data in the future.
* There are cases where the tx_buf struct / the flags
* field will not be cleared before being reused.
*/
tx_buf->ctx_entry = false;
if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) {
tx_buf->type = LIBETH_SQE_EMPTY;
goto fetch_next_txq_desc;
}
/* if next_to_watch is not set then no work pending */
eop_desc = (struct idpf_base_tx_desc *)tx_buf->next_to_watch;
if (!eop_desc)
if (unlikely(tx_buf->type != LIBETH_SQE_SKB))
break;
/* prevent any other reads prior to eop_desc */
/* prevent any other reads prior to type */
smp_rmb();
eop_desc = &tx_q->base_tx[tx_buf->rs_idx];
/* if the descriptor isn't done, no work yet to do */
if (!(eop_desc->qw1 &
cpu_to_le64(IDPF_TX_DESC_DTYPE_DESC_DONE)))
break;
/* clear next_to_watch to prevent false hangs */
tx_buf->next_to_watch = NULL;
/* update the statistics for this packet */
total_bytes += tx_buf->bytecount;
total_pkts += tx_buf->gso_segs;
napi_consume_skb(tx_buf->skb, napi_budget);
/* unmap skb header data */
dma_unmap_single(tx_q->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
/* clear tx_buf data */
tx_buf->skb = NULL;
dma_unmap_len_set(tx_buf, len, 0);
libeth_tx_complete(tx_buf, &cp);
/* unmap remaining buffers */
while (tx_desc != eop_desc) {
......@@ -495,13 +487,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,
}
/* unmap any remaining paged data */
if (dma_unmap_len(tx_buf, len)) {
dma_unmap_page(tx_q->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
dma_unmap_len_set(tx_buf, len, 0);
}
libeth_tx_complete(tx_buf, &cp);
}
/* update budget only if we did something */
......@@ -521,11 +507,11 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,
ntc += tx_q->desc_count;
tx_q->next_to_clean = ntc;
*cleaned += total_pkts;
*cleaned += ss.packets;
u64_stats_update_begin(&tx_q->stats_sync);
u64_stats_add(&tx_q->q_stats.packets, total_pkts);
u64_stats_add(&tx_q->q_stats.bytes, total_bytes);
u64_stats_add(&tx_q->q_stats.packets, ss.packets);
u64_stats_add(&tx_q->q_stats.bytes, ss.bytes);
u64_stats_update_end(&tx_q->stats_sync);
np = netdev_priv(tx_q->netdev);
......@@ -533,7 +519,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,
dont_wake = np->state != __IDPF_VPORT_UP ||
!netif_carrier_ok(tx_q->netdev);
__netif_txq_completed_wake(nq, total_pkts, total_bytes,
__netif_txq_completed_wake(nq, ss.packets, ss.bytes,
IDPF_DESC_UNUSED(tx_q), IDPF_TX_WAKE_THRESH,
dont_wake);
......@@ -1134,8 +1120,10 @@ int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget)
&work_done);
/* If work not completed, return budget and polling will return */
if (!clean_complete)
if (!clean_complete) {
idpf_vport_intr_set_wb_on_itr(q_vector);
return budget;
}
work_done = min_t(int, work_done, budget - 1);
......@@ -1144,6 +1132,8 @@ int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget)
*/
if (likely(napi_complete_done(napi, work_done)))
idpf_vport_intr_update_itr_ena_irq(q_vector);
else
idpf_vport_intr_set_wb_on_itr(q_vector);
return work_done;
}
......@@ -2,10 +2,19 @@
/* Copyright (C) 2023 Intel Corporation */
#include <net/libeth/rx.h>
#include <net/libeth/tx.h>
#include "idpf.h"
#include "idpf_virtchnl.h"
struct idpf_tx_stash {
struct hlist_node hlist;
struct libeth_sqe buf;
};
#define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv)
LIBETH_SQE_CHECK_PRIV(u32);
static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs,
unsigned int count);
......@@ -60,42 +69,21 @@ void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue)
}
}
/**
* idpf_tx_buf_rel - Release a Tx buffer
* @tx_q: the queue that owns the buffer
* @tx_buf: the buffer to free
*/
static void idpf_tx_buf_rel(struct idpf_tx_queue *tx_q,
struct idpf_tx_buf *tx_buf)
{
if (tx_buf->skb) {
if (dma_unmap_len(tx_buf, len))
dma_unmap_single(tx_q->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
dev_kfree_skb_any(tx_buf->skb);
} else if (dma_unmap_len(tx_buf, len)) {
dma_unmap_page(tx_q->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
}
tx_buf->next_to_watch = NULL;
tx_buf->skb = NULL;
tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG;
dma_unmap_len_set(tx_buf, len, 0);
}
/**
* idpf_tx_buf_rel_all - Free any empty Tx buffers
* @txq: queue to be cleaned
*/
static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq)
{
struct libeth_sq_napi_stats ss = { };
struct idpf_buf_lifo *buf_stack;
u16 i;
struct idpf_tx_stash *stash;
struct libeth_cq_pp cp = {
.dev = txq->dev,
.ss = &ss,
};
struct hlist_node *tmp;
u32 i, tag;
/* Buffers already cleared, nothing to do */
if (!txq->tx_buf)
......@@ -103,7 +91,7 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq)
/* Free all the Tx buffer sk_buffs */
for (i = 0; i < txq->desc_count; i++)
idpf_tx_buf_rel(txq, &txq->tx_buf[i]);
libeth_tx_complete(&txq->tx_buf[i], &cp);
kfree(txq->tx_buf);
txq->tx_buf = NULL;
......@@ -115,6 +103,20 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq)
if (!buf_stack->bufs)
return;
/*
* If a Tx timeout occurred, there are potentially still bufs in the
* hash table, free them here.
*/
hash_for_each_safe(txq->stash->sched_buf_hash, tag, tmp, stash,
hlist) {
if (!stash)
continue;
libeth_tx_complete(&stash->buf, &cp);
hash_del(&stash->hlist);
idpf_buf_lifo_push(buf_stack, stash);
}
for (i = 0; i < buf_stack->size; i++)
kfree(buf_stack->bufs[i]);
......@@ -131,6 +133,7 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq)
static void idpf_tx_desc_rel(struct idpf_tx_queue *txq)
{
idpf_tx_buf_rel_all(txq);
netdev_tx_reset_subqueue(txq->netdev, txq->idx);
if (!txq->desc_ring)
return;
......@@ -203,10 +206,6 @@ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q)
if (!tx_q->tx_buf)
return -ENOMEM;
/* Initialize tx_bufs with invalid completion tags */
for (i = 0; i < tx_q->desc_count; i++)
tx_q->tx_buf[i].compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG;
if (!idpf_queue_has(FLOW_SCH_EN, tx_q))
return 0;
......@@ -1655,37 +1654,6 @@ static void idpf_tx_handle_sw_marker(struct idpf_tx_queue *tx_q)
wake_up(&vport->sw_marker_wq);
}
/**
* idpf_tx_splitq_clean_hdr - Clean TX buffer resources for header portion of
* packet
* @tx_q: tx queue to clean buffer from
* @tx_buf: buffer to be cleaned
* @cleaned: pointer to stats struct to track cleaned packets/bytes
* @napi_budget: Used to determine if we are in netpoll
*/
static void idpf_tx_splitq_clean_hdr(struct idpf_tx_queue *tx_q,
struct idpf_tx_buf *tx_buf,
struct idpf_cleaned_stats *cleaned,
int napi_budget)
{
napi_consume_skb(tx_buf->skb, napi_budget);
if (dma_unmap_len(tx_buf, len)) {
dma_unmap_single(tx_q->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
dma_unmap_len_set(tx_buf, len, 0);
}
/* clear tx_buf data */
tx_buf->skb = NULL;
cleaned->bytes += tx_buf->bytecount;
cleaned->packets += tx_buf->gso_segs;
}
/**
* idpf_tx_clean_stashed_bufs - clean bufs that were stored for
* out of order completions
......@@ -1696,33 +1664,28 @@ static void idpf_tx_splitq_clean_hdr(struct idpf_tx_queue *tx_q,
*/
static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq,
u16 compl_tag,
struct idpf_cleaned_stats *cleaned,
struct libeth_sq_napi_stats *cleaned,
int budget)
{
struct idpf_tx_stash *stash;
struct hlist_node *tmp_buf;
struct libeth_cq_pp cp = {
.dev = txq->dev,
.ss = cleaned,
.napi = budget,
};
/* Buffer completion */
hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf,
hlist, compl_tag) {
if (unlikely(stash->buf.compl_tag != (int)compl_tag))
if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != compl_tag))
continue;
if (stash->buf.skb) {
idpf_tx_splitq_clean_hdr(txq, &stash->buf, cleaned,
budget);
} else if (dma_unmap_len(&stash->buf, len)) {
dma_unmap_page(txq->dev,
dma_unmap_addr(&stash->buf, dma),
dma_unmap_len(&stash->buf, len),
DMA_TO_DEVICE);
dma_unmap_len_set(&stash->buf, len, 0);
}
hash_del(&stash->hlist);
libeth_tx_complete(&stash->buf, &cp);
/* Push shadow buf back onto stack */
idpf_buf_lifo_push(&txq->stash->buf_stack, stash);
hash_del(&stash->hlist);
}
}
......@@ -1737,8 +1700,7 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq,
{
struct idpf_tx_stash *stash;
if (unlikely(!dma_unmap_addr(tx_buf, dma) &&
!dma_unmap_len(tx_buf, len)))
if (unlikely(tx_buf->type <= LIBETH_SQE_CTX))
return 0;
stash = idpf_buf_lifo_pop(&txq->stash->buf_stack);
......@@ -1751,29 +1713,27 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq,
/* Store buffer params in shadow buffer */
stash->buf.skb = tx_buf->skb;
stash->buf.bytecount = tx_buf->bytecount;
stash->buf.gso_segs = tx_buf->gso_segs;
stash->buf.bytes = tx_buf->bytes;
stash->buf.packets = tx_buf->packets;
stash->buf.type = tx_buf->type;
stash->buf.nr_frags = tx_buf->nr_frags;
dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma));
dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len));
stash->buf.compl_tag = tx_buf->compl_tag;
idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf);
/* Add buffer to buf_hash table to be freed later */
hash_add(txq->stash->sched_buf_hash, &stash->hlist,
stash->buf.compl_tag);
idpf_tx_buf_compl_tag(&stash->buf));
memset(tx_buf, 0, sizeof(struct idpf_tx_buf));
/* Reinitialize buf_id portion of tag */
tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG;
tx_buf->type = LIBETH_SQE_EMPTY;
return 0;
}
#define idpf_tx_splitq_clean_bump_ntc(txq, ntc, desc, buf) \
do { \
(ntc)++; \
if (unlikely(!(ntc))) { \
ntc -= (txq)->desc_count; \
if (unlikely(++(ntc) == (txq)->desc_count)) { \
ntc = 0; \
buf = (txq)->tx_buf; \
desc = &(txq)->flex_tx[0]; \
} else { \
......@@ -1797,69 +1757,71 @@ do { \
* Separate packet completion events will be reported on the completion queue,
* and the buffers will be cleaned separately. The stats are not updated from
* this function when using flow-based scheduling.
*
* Furthermore, in flow scheduling mode, check to make sure there are enough
* reserve buffers to stash the packet. If there are not, return early, which
* will leave next_to_clean pointing to the packet that failed to be stashed.
*
* Return: false in the scenario above, true otherwise.
*/
static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
int napi_budget,
struct idpf_cleaned_stats *cleaned,
struct libeth_sq_napi_stats *cleaned,
bool descs_only)
{
union idpf_tx_flex_desc *next_pending_desc = NULL;
union idpf_tx_flex_desc *tx_desc;
s16 ntc = tx_q->next_to_clean;
u32 ntc = tx_q->next_to_clean;
struct libeth_cq_pp cp = {
.dev = tx_q->dev,
.ss = cleaned,
.napi = napi_budget,
};
struct idpf_tx_buf *tx_buf;
bool clean_complete = true;
tx_desc = &tx_q->flex_tx[ntc];
next_pending_desc = &tx_q->flex_tx[end];
tx_buf = &tx_q->tx_buf[ntc];
ntc -= tx_q->desc_count;
while (tx_desc != next_pending_desc) {
union idpf_tx_flex_desc *eop_desc;
u32 eop_idx;
/* If this entry in the ring was used as a context descriptor,
* its corresponding entry in the buffer ring will have an
* invalid completion tag since no buffer was used. We can
* skip this descriptor since there is no buffer to clean.
* its corresponding entry in the buffer ring is reserved. We
* can skip this descriptor since there is no buffer to clean.
*/
if (unlikely(tx_buf->compl_tag == IDPF_SPLITQ_TX_INVAL_COMPL_TAG))
if (tx_buf->type <= LIBETH_SQE_CTX)
goto fetch_next_txq_desc;
eop_desc = (union idpf_tx_flex_desc *)tx_buf->next_to_watch;
if (unlikely(tx_buf->type != LIBETH_SQE_SKB))
break;
/* clear next_to_watch to prevent false hangs */
tx_buf->next_to_watch = NULL;
eop_idx = tx_buf->rs_idx;
if (descs_only) {
if (idpf_stash_flow_sch_buffers(tx_q, tx_buf))
if (IDPF_TX_BUF_RSV_UNUSED(tx_q) < tx_buf->nr_frags) {
clean_complete = false;
goto tx_splitq_clean_out;
}
while (tx_desc != eop_desc) {
idpf_stash_flow_sch_buffers(tx_q, tx_buf);
while (ntc != eop_idx) {
idpf_tx_splitq_clean_bump_ntc(tx_q, ntc,
tx_desc, tx_buf);
if (dma_unmap_len(tx_buf, len)) {
if (idpf_stash_flow_sch_buffers(tx_q,
tx_buf))
goto tx_splitq_clean_out;
}
idpf_stash_flow_sch_buffers(tx_q, tx_buf);
}
} else {
idpf_tx_splitq_clean_hdr(tx_q, tx_buf, cleaned,
napi_budget);
libeth_tx_complete(tx_buf, &cp);
/* unmap remaining buffers */
while (tx_desc != eop_desc) {
while (ntc != eop_idx) {
idpf_tx_splitq_clean_bump_ntc(tx_q, ntc,
tx_desc, tx_buf);
/* unmap any remaining paged data */
if (dma_unmap_len(tx_buf, len)) {
dma_unmap_page(tx_q->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
dma_unmap_len_set(tx_buf, len, 0);
}
libeth_tx_complete(tx_buf, &cp);
}
}
......@@ -1868,8 +1830,9 @@ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
}
tx_splitq_clean_out:
ntc += tx_q->desc_count;
tx_q->next_to_clean = ntc;
return clean_complete;
}
#define idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, buf) \
......@@ -1895,57 +1858,68 @@ do { \
* this completion tag.
*/
static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag,
struct idpf_cleaned_stats *cleaned,
struct libeth_sq_napi_stats *cleaned,
int budget)
{
u16 idx = compl_tag & txq->compl_tag_bufid_m;
struct idpf_tx_buf *tx_buf = NULL;
u16 ntc = txq->next_to_clean;
u16 num_descs_cleaned = 0;
u16 orig_idx = idx;
struct libeth_cq_pp cp = {
.dev = txq->dev,
.ss = cleaned,
.napi = budget,
};
u16 ntc, orig_idx = idx;
tx_buf = &txq->tx_buf[idx];
while (tx_buf->compl_tag == (int)compl_tag) {
if (tx_buf->skb) {
idpf_tx_splitq_clean_hdr(txq, tx_buf, cleaned, budget);
} else if (dma_unmap_len(tx_buf, len)) {
dma_unmap_page(txq->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
dma_unmap_len_set(tx_buf, len, 0);
}
if (unlikely(tx_buf->type <= LIBETH_SQE_CTX ||
idpf_tx_buf_compl_tag(tx_buf) != compl_tag))
return false;
memset(tx_buf, 0, sizeof(struct idpf_tx_buf));
tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG;
if (tx_buf->type == LIBETH_SQE_SKB)
libeth_tx_complete(tx_buf, &cp);
num_descs_cleaned++;
idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf);
}
/* If we didn't clean anything on the ring for this completion, there's
* nothing more to do.
*/
if (unlikely(!num_descs_cleaned))
return false;
while (idpf_tx_buf_compl_tag(tx_buf) == compl_tag) {
libeth_tx_complete(tx_buf, &cp);
idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf);
}
/* Otherwise, if we did clean a packet on the ring directly, it's safe
* to assume that the descriptors starting from the original
* next_to_clean up until the previously cleaned packet can be reused.
* Therefore, we will go back in the ring and stash any buffers still
* in the ring into the hash table to be cleaned later.
*/
/*
* It's possible the packet we just cleaned was an out of order
* completion, which means we can stash the buffers starting from
* the original next_to_clean and reuse the descriptors. We need
* to compare the descriptor ring next_to_clean packet's "first" buffer
* to the "first" buffer of the packet we just cleaned to determine if
* this is the case. However, next_to_clean can point to either a
* reserved buffer that corresponds to a context descriptor used for the
* next_to_clean packet (TSO packet) or the "first" buffer (single
* packet). The orig_idx from the packet we just cleaned will always
* point to the "first" buffer. If next_to_clean points to a reserved
* buffer, let's bump ntc once and start the comparison from there.
*/
ntc = txq->next_to_clean;
tx_buf = &txq->tx_buf[ntc];
while (tx_buf != &txq->tx_buf[orig_idx]) {
idpf_stash_flow_sch_buffers(txq, tx_buf);
if (tx_buf->type == LIBETH_SQE_CTX)
idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, tx_buf);
}
/* Finally, update next_to_clean to reflect the work that was just done
* on the ring, if any. If the packet was only cleaned from the hash
* table, the ring will not be impacted, therefore we should not touch
* next_to_clean. The updated idx is used here
/*
* If ntc still points to a different "first" buffer, clean the
* descriptor ring and stash all of the buffers for later cleaning. If
* we cannot stash all of the buffers, next_to_clean will point to the
* "first" buffer of the packet that could not be stashed and cleaning
* will start there next time.
*/
if (unlikely(tx_buf != &txq->tx_buf[orig_idx] &&
!idpf_tx_splitq_clean(txq, orig_idx, budget, cleaned,
true)))
return true;
/*
* Otherwise, update next_to_clean to reflect the cleaning that was
* done above.
*/
txq->next_to_clean = idx;
......@@ -1965,7 +1939,7 @@ static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag,
*/
static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq,
struct idpf_splitq_tx_compl_desc *desc,
struct idpf_cleaned_stats *cleaned,
struct libeth_sq_napi_stats *cleaned,
int budget)
{
u16 compl_tag;
......@@ -1973,7 +1947,8 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq,
if (!idpf_queue_has(FLOW_SCH_EN, txq)) {
u16 head = le16_to_cpu(desc->q_head_compl_tag.q_head);
return idpf_tx_splitq_clean(txq, head, budget, cleaned, false);
idpf_tx_splitq_clean(txq, head, budget, cleaned, false);
return;
}
compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag);
......@@ -2008,7 +1983,7 @@ static bool idpf_tx_clean_complq(struct idpf_compl_queue *complq, int budget,
ntc -= complq->desc_count;
do {
struct idpf_cleaned_stats cleaned_stats = { };
struct libeth_sq_napi_stats cleaned_stats = { };
struct idpf_tx_queue *tx_q;
int rel_tx_qid;
u16 hw_head;
......@@ -2157,29 +2132,6 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc,
desc->flow.qw1.compl_tag = cpu_to_le16(params->compl_tag);
}
/**
* idpf_tx_maybe_stop_common - 1st level check for common Tx stop conditions
* @tx_q: the queue to be checked
* @size: number of descriptors we want to assure is available
*
* Returns 0 if stop is not needed
*/
int idpf_tx_maybe_stop_common(struct idpf_tx_queue *tx_q, unsigned int size)
{
struct netdev_queue *nq;
if (likely(IDPF_DESC_UNUSED(tx_q) >= size))
return 0;
u64_stats_update_begin(&tx_q->stats_sync);
u64_stats_inc(&tx_q->q_stats.q_busy);
u64_stats_update_end(&tx_q->stats_sync);
nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx);
return netif_txq_maybe_stop(nq, IDPF_DESC_UNUSED(tx_q), size, size);
}
/**
* idpf_tx_maybe_stop_splitq - 1st level check for Tx splitq stop conditions
* @tx_q: the queue to be checked
......@@ -2191,7 +2143,7 @@ static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q,
unsigned int descs_needed)
{
if (idpf_tx_maybe_stop_common(tx_q, descs_needed))
goto splitq_stop;
goto out;
/* If there are too many outstanding completions expected on the
* completion queue, stop the TX queue to give the device some time to
......@@ -2210,10 +2162,12 @@ static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q,
return 0;
splitq_stop:
netif_stop_subqueue(tx_q->netdev, tx_q->idx);
out:
u64_stats_update_begin(&tx_q->stats_sync);
u64_stats_inc(&tx_q->q_stats.q_busy);
u64_stats_update_end(&tx_q->stats_sync);
netif_stop_subqueue(tx_q->netdev, tx_q->idx);
return -EBUSY;
}
......@@ -2236,7 +2190,11 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val,
nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx);
tx_q->next_to_use = val;
idpf_tx_maybe_stop_common(tx_q, IDPF_TX_DESC_NEEDED);
if (idpf_tx_maybe_stop_common(tx_q, IDPF_TX_DESC_NEEDED)) {
u64_stats_update_begin(&tx_q->stats_sync);
u64_stats_inc(&tx_q->q_stats.q_busy);
u64_stats_update_end(&tx_q->stats_sync);
}
/* Force memory writes to complete before letting h/w
* know there are new descriptors to fetch. (Only
......@@ -2307,6 +2265,12 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb,
struct idpf_tx_buf *first, u16 idx)
{
struct libeth_sq_napi_stats ss = { };
struct libeth_cq_pp cp = {
.dev = txq->dev,
.ss = &ss,
};
u64_stats_update_begin(&txq->stats_sync);
u64_stats_inc(&txq->q_stats.dma_map_errs);
u64_stats_update_end(&txq->stats_sync);
......@@ -2316,7 +2280,7 @@ void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb,
struct idpf_tx_buf *tx_buf;
tx_buf = &txq->tx_buf[idx];
idpf_tx_buf_rel(txq, tx_buf);
libeth_tx_complete(tx_buf, &cp);
if (tx_buf == first)
break;
if (idx == 0)
......@@ -2395,6 +2359,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
dma = dma_map_single(tx_q->dev, skb->data, size, DMA_TO_DEVICE);
tx_buf = first;
first->nr_frags = 0;
params->compl_tag =
(tx_q->compl_tag_cur_gen << tx_q->compl_tag_gen_s) | i;
......@@ -2405,7 +2370,9 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
if (dma_mapping_error(tx_q->dev, dma))
return idpf_tx_dma_map_error(tx_q, skb, first, i);
tx_buf->compl_tag = params->compl_tag;
first->nr_frags++;
idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag;
tx_buf->type = LIBETH_SQE_FRAG;
/* record length, and DMA address */
dma_unmap_len_set(tx_buf, len, size);
......@@ -2459,14 +2426,15 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
idpf_tx_splitq_build_desc(tx_desc, params, td_cmd,
max_data);
tx_desc++;
i++;
if (i == tx_q->desc_count) {
if (unlikely(++i == tx_q->desc_count)) {
tx_buf = tx_q->tx_buf;
tx_desc = &tx_q->flex_tx[0];
i = 0;
tx_q->compl_tag_cur_gen =
IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q);
} else {
tx_buf++;
tx_desc++;
}
/* Since this packet has a buffer that is going to span
......@@ -2479,8 +2447,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
* simply pass over these holes and finish cleaning the
* rest of the packet.
*/
memset(&tx_q->tx_buf[i], 0, sizeof(struct idpf_tx_buf));
tx_q->tx_buf[i].compl_tag = params->compl_tag;
tx_buf->type = LIBETH_SQE_EMPTY;
/* Adjust the DMA offset and the remaining size of the
* fragment. On the first iteration of this loop,
......@@ -2504,13 +2471,15 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
break;
idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size);
tx_desc++;
i++;
if (i == tx_q->desc_count) {
if (unlikely(++i == tx_q->desc_count)) {
tx_buf = tx_q->tx_buf;
tx_desc = &tx_q->flex_tx[0];
i = 0;
tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q);
} else {
tx_buf++;
tx_desc++;
}
size = skb_frag_size(frag);
......@@ -2518,26 +2487,24 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
dma = skb_frag_dma_map(tx_q->dev, frag, 0, size,
DMA_TO_DEVICE);
tx_buf = &tx_q->tx_buf[i];
}
/* record SW timestamp if HW timestamp is not available */
skb_tx_timestamp(skb);
first->type = LIBETH_SQE_SKB;
/* write last descriptor with RS and EOP bits */
first->rs_idx = i;
td_cmd |= params->eop_cmd;
idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size);
i = idpf_tx_splitq_bump_ntu(tx_q, i);
/* set next_to_watch value indicating a packet is present */
first->next_to_watch = tx_desc;
tx_q->txq_grp->num_completions_pending++;
/* record bytecount for BQL */
nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx);
netdev_tx_sent_queue(nq, first->bytecount);
netdev_tx_sent_queue(nq, first->bytes);
idpf_tx_buf_hw_update(tx_q, i, netdev_xmit_more());
}
......@@ -2737,8 +2704,7 @@ idpf_tx_splitq_get_ctx_desc(struct idpf_tx_queue *txq)
struct idpf_flex_tx_ctx_desc *desc;
int i = txq->next_to_use;
memset(&txq->tx_buf[i], 0, sizeof(struct idpf_tx_buf));
txq->tx_buf[i].compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG;
txq->tx_buf[i].type = LIBETH_SQE_CTX;
/* grab the next descriptor */
desc = &txq->flex_ctx[i];
......@@ -2822,12 +2788,12 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb,
first->skb = skb;
if (tso) {
first->gso_segs = tx_params.offload.tso_segs;
first->bytecount = skb->len +
((first->gso_segs - 1) * tx_params.offload.tso_hdr_len);
first->packets = tx_params.offload.tso_segs;
first->bytes = skb->len +
((first->packets - 1) * tx_params.offload.tso_hdr_len);
} else {
first->gso_segs = 1;
first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
first->packets = 1;
first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
}
if (idpf_queue_has(FLOW_SCH_EN, tx_q)) {
......@@ -3749,6 +3715,7 @@ void idpf_vport_intr_update_itr_ena_irq(struct idpf_q_vector *q_vector)
/* net_dim() updates ITR out-of-band using a work item */
idpf_net_dim(q_vector);
q_vector->wb_on_itr = false;
intval = idpf_vport_intr_buildreg_itr(q_vector,
IDPF_NO_ITR_UPDATE_IDX, 0);
......@@ -4051,8 +4018,10 @@ static int idpf_vport_splitq_napi_poll(struct napi_struct *napi, int budget)
clean_complete &= idpf_tx_splitq_clean_all(q_vector, budget, &work_done);
/* If work not completed, return budget and polling will return */
if (!clean_complete)
if (!clean_complete) {
idpf_vport_intr_set_wb_on_itr(q_vector);
return budget;
}
work_done = min_t(int, work_done, budget - 1);
......@@ -4061,6 +4030,8 @@ static int idpf_vport_splitq_napi_poll(struct napi_struct *napi, int budget)
*/
if (likely(napi_complete_done(napi, work_done)))
idpf_vport_intr_update_itr_ena_irq(q_vector);
else
idpf_vport_intr_set_wb_on_itr(q_vector);
/* Switch to poll mode in the tear-down path after sending disable
* queues virtchnl message, as the interrupts will be disabled after
......
......@@ -127,11 +127,10 @@ do { \
*/
#define IDPF_TX_COMPLQ_PENDING(txq) \
(((txq)->num_completions_pending >= (txq)->complq->num_completions ? \
0 : U64_MAX) + \
0 : U32_MAX) + \
(txq)->num_completions_pending - (txq)->complq->num_completions)
#define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16
#define IDPF_SPLITQ_TX_INVAL_COMPL_TAG -1
/* Adjust the generation for the completion tag and wrap if necessary */
#define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \
((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \
......@@ -149,47 +148,7 @@ union idpf_tx_flex_desc {
struct idpf_flex_tx_sched_desc flow; /* flow based scheduling */
};
/**
* struct idpf_tx_buf
* @next_to_watch: Next descriptor to clean
* @skb: Pointer to the skb
* @dma: DMA address
* @len: DMA length
* @bytecount: Number of bytes
* @gso_segs: Number of GSO segments
* @compl_tag: Splitq only, unique identifier for a buffer. Used to compare
* with completion tag returned in buffer completion event.
* Because the completion tag is expected to be the same in all
* data descriptors for a given packet, and a single packet can
* span multiple buffers, we need this field to track all
* buffers associated with this completion tag independently of
* the buf_id. The tag consists of a N bit buf_id and M upper
* order "generation bits". See compl_tag_bufid_m and
* compl_tag_gen_s in struct idpf_queue. We'll use a value of -1
* to indicate the tag is not valid.
* @ctx_entry: Singleq only. Used to indicate the corresponding entry
* in the descriptor ring was used for a context descriptor and
* this buffer entry should be skipped.
*/
struct idpf_tx_buf {
void *next_to_watch;
struct sk_buff *skb;
DEFINE_DMA_UNMAP_ADDR(dma);
DEFINE_DMA_UNMAP_LEN(len);
unsigned int bytecount;
unsigned short gso_segs;
union {
int compl_tag;
bool ctx_entry;
};
};
struct idpf_tx_stash {
struct hlist_node hlist;
struct idpf_tx_buf buf;
};
#define idpf_tx_buf libeth_sqe
/**
* struct idpf_buf_lifo - LIFO for managing OOO completions
......@@ -390,9 +349,11 @@ struct idpf_vec_regs {
* struct idpf_intr_reg
* @dyn_ctl: Dynamic control interrupt register
* @dyn_ctl_intena_m: Mask for dyn_ctl interrupt enable
* @dyn_ctl_intena_msk_m: Mask for dyn_ctl interrupt enable mask
* @dyn_ctl_itridx_s: Register bit offset for ITR index
* @dyn_ctl_itridx_m: Mask for ITR index
* @dyn_ctl_intrvl_s: Register bit offset for ITR interval
* @dyn_ctl_wb_on_itr_m: Mask for WB on ITR feature
* @rx_itr: RX ITR register
* @tx_itr: TX ITR register
* @icr_ena: Interrupt cause register offset
......@@ -401,9 +362,11 @@ struct idpf_vec_regs {
struct idpf_intr_reg {
void __iomem *dyn_ctl;
u32 dyn_ctl_intena_m;
u32 dyn_ctl_intena_msk_m;
u32 dyn_ctl_itridx_s;
u32 dyn_ctl_itridx_m;
u32 dyn_ctl_intrvl_s;
u32 dyn_ctl_wb_on_itr_m;
void __iomem *rx_itr;
void __iomem *tx_itr;
void __iomem *icr_ena;
......@@ -424,6 +387,7 @@ struct idpf_intr_reg {
* @intr_reg: See struct idpf_intr_reg
* @napi: napi handler
* @total_events: Number of interrupts processed
* @wb_on_itr: whether WB on ITR is enabled
* @tx_dim: Data for TX net_dim algorithm
* @tx_itr_value: TX interrupt throttling rate
* @tx_intr_mode: Dynamic ITR or not
......@@ -454,6 +418,7 @@ struct idpf_q_vector {
__cacheline_group_begin_aligned(read_write);
struct napi_struct napi;
u16 total_events;
bool wb_on_itr;
struct dim tx_dim;
u16 tx_itr_value;
......@@ -472,7 +437,7 @@ struct idpf_q_vector {
cpumask_var_t affinity_mask;
__cacheline_group_end_aligned(cold);
};
libeth_cacheline_set_assert(struct idpf_q_vector, 104,
libeth_cacheline_set_assert(struct idpf_q_vector, 112,
424 + 2 * sizeof(struct dim),
8 + sizeof(cpumask_var_t));
......@@ -496,11 +461,6 @@ struct idpf_tx_queue_stats {
u64_stats_t dma_map_errs;
};
struct idpf_cleaned_stats {
u32 packets;
u32 bytes;
};
#define IDPF_ITR_DYNAMIC 1
#define IDPF_ITR_MAX 0x1FE0
#define IDPF_ITR_20K 0x0032
......@@ -688,7 +648,7 @@ struct idpf_tx_queue {
void *desc_ring;
};
struct idpf_tx_buf *tx_buf;
struct libeth_sqe *tx_buf;
struct idpf_txq_group *txq_grp;
struct device *dev;
void __iomem *tail;
......@@ -831,7 +791,7 @@ struct idpf_compl_queue {
u32 next_to_use;
u32 next_to_clean;
u32 num_completions;
aligned_u64 num_completions;
__cacheline_group_end_aligned(read_write);
__cacheline_group_begin_aligned(cold);
......@@ -963,7 +923,7 @@ struct idpf_txq_group {
struct idpf_compl_queue *complq;
u32 num_completions_pending;
aligned_u64 num_completions_pending;
};
static inline int idpf_q_vector_to_mem(const struct idpf_q_vector *q_vector)
......@@ -1033,6 +993,25 @@ static inline void idpf_tx_splitq_build_desc(union idpf_tx_flex_desc *desc,
idpf_tx_splitq_build_flow_desc(desc, params, td_cmd, size);
}
/**
* idpf_vport_intr_set_wb_on_itr - enable descriptor writeback on disabled interrupts
* @q_vector: pointer to queue vector struct
*/
static inline void idpf_vport_intr_set_wb_on_itr(struct idpf_q_vector *q_vector)
{
struct idpf_intr_reg *reg;
if (q_vector->wb_on_itr)
return;
q_vector->wb_on_itr = true;
reg = &q_vector->intr_reg;
writel(reg->dyn_ctl_wb_on_itr_m | reg->dyn_ctl_intena_msk_m |
(IDPF_NO_ITR_UPDATE_IDX << reg->dyn_ctl_itridx_s),
reg->dyn_ctl);
}
int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget);
void idpf_vport_init_num_qs(struct idpf_vport *vport,
struct virtchnl2_create_vport *vport_msg);
......@@ -1064,7 +1043,6 @@ void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb,
struct idpf_tx_buf *first, u16 ring_idx);
unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
struct sk_buff *skb);
int idpf_tx_maybe_stop_common(struct idpf_tx_queue *tx_q, unsigned int size);
void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue);
netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
struct idpf_tx_queue *tx_q);
......@@ -1073,4 +1051,12 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rxq,
u16 cleaned_count);
int idpf_tso(struct sk_buff *skb, struct idpf_tx_offload_params *off);
static inline bool idpf_tx_maybe_stop_common(struct idpf_tx_queue *tx_q,
u32 needed)
{
return !netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx,
IDPF_DESC_UNUSED(tx_q),
needed, needed);
}
#endif /* !_IDPF_TXRX_H_ */
......@@ -97,7 +97,9 @@ static int idpf_vf_intr_reg_init(struct idpf_vport *vport)
intr->dyn_ctl = idpf_get_reg_addr(adapter,
reg_vals[vec_id].dyn_ctl_reg);
intr->dyn_ctl_intena_m = VF_INT_DYN_CTLN_INTENA_M;
intr->dyn_ctl_intena_msk_m = VF_INT_DYN_CTLN_INTENA_MSK_M;
intr->dyn_ctl_itridx_s = VF_INT_DYN_CTLN_ITR_INDX_S;
intr->dyn_ctl_wb_on_itr_m = VF_INT_DYN_CTLN_WB_ON_ITR_M;
spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing,
IDPF_VF_ITR_IDX_SPACING);
......
......@@ -3566,6 +3566,17 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q)
#endif
}
/**
* netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue
* @dev: network device
* @qid: stack index of the queue to reset
*/
static inline void netdev_tx_reset_subqueue(const struct net_device *dev,
u32 qid)
{
netdev_tx_reset_queue(netdev_get_tx_queue(dev, qid));
}
/**
* netdev_reset_queue - reset the packets and bytes count of a network device
* @dev_queue: network device
......@@ -3575,7 +3586,7 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q)
*/
static inline void netdev_reset_queue(struct net_device *dev_queue)
{
netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0));
netdev_tx_reset_subqueue(dev_queue, 0);
}
/**
......
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (C) 2024 Intel Corporation */
#ifndef __LIBETH_TX_H
#define __LIBETH_TX_H
#include <linux/skbuff.h>
#include <net/libeth/types.h>
/* Tx buffer completion */
/**
* enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion
* @LIBETH_SQE_EMPTY: unused/empty, no action required
* @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required
* @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree()
* @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA
* @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats
*/
enum libeth_sqe_type {
LIBETH_SQE_EMPTY = 0U,
LIBETH_SQE_CTX,
LIBETH_SQE_SLAB,
LIBETH_SQE_FRAG,
LIBETH_SQE_SKB,
};
/**
* struct libeth_sqe - represents a Send Queue Element / Tx buffer
* @type: type of the buffer, see the enum above
* @rs_idx: index of the last buffer from the batch this one was sent in
* @raw: slab buffer to free via kfree()
* @skb: &sk_buff to consume
* @dma: DMA address to unmap
* @len: length of the mapped region to unmap
* @nr_frags: number of frags in the frame this buffer belongs to
* @packets: number of physical packets sent for this frame
* @bytes: number of physical bytes sent for this frame
* @priv: driver-private scratchpad
*/
struct libeth_sqe {
enum libeth_sqe_type type:32;
u32 rs_idx;
union {
void *raw;
struct sk_buff *skb;
};
DEFINE_DMA_UNMAP_ADDR(dma);
DEFINE_DMA_UNMAP_LEN(len);
u32 nr_frags;
u32 packets;
u32 bytes;
unsigned long priv;
} __aligned_largest;
/**
* LIBETH_SQE_CHECK_PRIV - check the driver's private SQE data
* @p: type or name of the object the driver wants to fit into &libeth_sqe
*
* Make sure the driver's private data fits into libeth_sqe::priv. To be used
* right after its declaration.
*/
#define LIBETH_SQE_CHECK_PRIV(p) \
static_assert(sizeof(p) <= sizeof_field(struct libeth_sqe, priv))
/**
* struct libeth_cq_pp - completion queue poll params
* @dev: &device to perform DMA unmapping
* @ss: onstack NAPI stats to fill
* @napi: whether it's called from the NAPI context
*
* libeth uses this structure to access objects needed for performing a full
* Tx complete operation without passing lots of arguments and changing the
* prototypes each time a new one is added.
*/
struct libeth_cq_pp {
struct device *dev;
struct libeth_sq_napi_stats *ss;
bool napi;
};
/**
* libeth_tx_complete - perform Tx completion for one SQE
* @sqe: SQE to complete
* @cp: poll params
*
* Do Tx complete for all the types of buffers, incl. freeing, unmapping,
* updating the stats etc.
*/
static inline void libeth_tx_complete(struct libeth_sqe *sqe,
const struct libeth_cq_pp *cp)
{
switch (sqe->type) {
case LIBETH_SQE_EMPTY:
return;
case LIBETH_SQE_SKB:
case LIBETH_SQE_FRAG:
case LIBETH_SQE_SLAB:
dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma),
dma_unmap_len(sqe, len), DMA_TO_DEVICE);
break;
default:
break;
}
switch (sqe->type) {
case LIBETH_SQE_SKB:
cp->ss->packets += sqe->packets;
cp->ss->bytes += sqe->bytes;
napi_consume_skb(sqe->skb, cp->napi);
break;
case LIBETH_SQE_SLAB:
kfree(sqe->raw);
break;
default:
break;
}
sqe->type = LIBETH_SQE_EMPTY;
}
#endif /* __LIBETH_TX_H */
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (C) 2024 Intel Corporation */
#ifndef __LIBETH_TYPES_H
#define __LIBETH_TYPES_H
#include <linux/types.h>
/**
* struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop
* @packets: completed frames counter
* @bytes: sum of bytes of completed frames above
* @raw: alias to access all the fields as an array
*/
struct libeth_sq_napi_stats {
union {
struct {
u32 packets;
u32 bytes;
};
DECLARE_FLEX_ARRAY(u32, raw);
};
};
#endif /* __LIBETH_TYPES_H */
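
To show how the two new headers above fit together from a driver's point of
view, here is a minimal xmit-side sketch (not part of the series): it keeps a
driver-private 32-bit completion tag in libeth_sqe::priv, validated with
LIBETH_SQE_CHECK_PRIV() the same way idpf does above, and fills the fields
libeth_tx_complete() later acts on. The helper names and queue layout are
made up for illustration:

#include <linux/dma-mapping.h>
#include <linux/if_ether.h>
#include <net/libeth/tx.h>

/* Driver-private scratchpad: a 32-bit completion tag, the same pattern as
 * idpf's idpf_tx_buf_compl_tag(). The macro asserts at build time that the
 * type fits into libeth_sqe::priv.
 */
#define my_sqe_tag(sqe)	(*(u32 *)&(sqe)->priv)
LIBETH_SQE_CHECK_PRIV(u32);

/* Head buffer of a (non-TSO) frame: consumed and counted on completion */
static void my_sqe_fill_skb(struct libeth_sqe *sqe, struct sk_buff *skb,
			    u32 tag, u32 last_desc_idx)
{
	sqe->type = LIBETH_SQE_SKB;
	sqe->skb = skb;
	sqe->rs_idx = last_desc_idx;
	sqe->packets = 1;
	sqe->bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
	my_sqe_tag(sqe) = tag;
}

/* Frag buffer: only DMA-unmapped on completion, no stats update */
static void my_sqe_fill_frag(struct libeth_sqe *sqe, dma_addr_t dma, u32 len,
			     u32 tag)
{
	sqe->type = LIBETH_SQE_FRAG;
	dma_unmap_addr_set(sqe, dma, dma);
	dma_unmap_len_set(sqe, len, len);
	my_sqe_tag(sqe) = tag;
}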