Commit 2b245cb2 authored by Anirudh Venkataramanan, committed by Jeff Kirsher

ice: Implement transmit and NAPI support

This patch implements ice_start_xmit (the handler for ndo_start_xmit) and
related functions. ice_start_xmit ultimately calls ice_tx_map, where the
Tx descriptor is built and posted to the hardware by bumping the ring tail.

This patch also implements ice_napi_poll, which is invoked when there is an
interrupt on the VSI's queues. The interrupt can be due to either a completed
Tx or an Rx event; in either case resources are reclaimed, and for an Rx event
the skb is additionally fetched and passed up to the network stack.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
parent cdedef59
@@ -60,6 +60,7 @@
(((val) << ICE_AQ_VSI_UP_TABLE_UP##i##_S) & \
ICE_AQ_VSI_UP_TABLE_UP##i##_M)
#define ICE_TX_DESC(R, i) (&(((struct ice_tx_desc *)((R)->desc))[i]))
#define ICE_RX_DESC(R, i) (&(((union ice_32b_rx_flex_desc *)((R)->desc))[i]))
#define ice_for_each_txq(vsi, i) \
......
@@ -131,6 +131,33 @@ enum ice_rx_flg64_bits {
ICE_RXFLG_RSVD = 63
};
/* for ice_32byte_rx_flex_desc.ptype_flexi_flags0 member */
#define ICE_RX_FLEX_DESC_PTYPE_M (0x3FF) /* 10-bits */
/* for ice_32byte_rx_flex_desc.pkt_length member */
#define ICE_RX_FLX_DESC_PKT_LEN_M (0x3FFF) /* 14-bits */
enum ice_rx_flex_desc_status_error_0_bits {
/* Note: These are predefined bit offsets */
ICE_RX_FLEX_DESC_STATUS0_DD_S = 0,
ICE_RX_FLEX_DESC_STATUS0_EOF_S,
ICE_RX_FLEX_DESC_STATUS0_HBO_S,
ICE_RX_FLEX_DESC_STATUS0_L3L4P_S,
ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S,
ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S,
ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S,
ICE_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S,
ICE_RX_FLEX_DESC_STATUS0_LPBK_S,
ICE_RX_FLEX_DESC_STATUS0_IPV6EXADD_S,
ICE_RX_FLEX_DESC_STATUS0_RXE_S,
ICE_RX_FLEX_DESC_STATUS0_CRCP_S,
ICE_RX_FLEX_DESC_STATUS0_RSS_VALID_S,
ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S,
ICE_RX_FLEX_DESC_STATUS0_XTRMD0_VALID_S,
ICE_RX_FLEX_DESC_STATUS0_XTRMD1_VALID_S,
ICE_RX_FLEX_DESC_STATUS0_LAST /* this entry must be last!!! */
};
#define ICE_RXQ_CTX_SIZE_DWORDS 8
#define ICE_RXQ_CTX_SZ (ICE_RXQ_CTX_SIZE_DWORDS * sizeof(u32))
@@ -201,6 +228,25 @@ struct ice_tx_desc {
__le64 cmd_type_offset_bsz;
};
enum ice_tx_desc_dtype_value {
ICE_TX_DESC_DTYPE_DATA = 0x0,
ICE_TX_DESC_DTYPE_CTX = 0x1,
/* DESC_DONE - HW has completed write-back of descriptor */
ICE_TX_DESC_DTYPE_DESC_DONE = 0xF,
};
#define ICE_TXD_QW1_CMD_S 4
#define ICE_TXD_QW1_CMD_M (0xFFFUL << ICE_TXD_QW1_CMD_S)
enum ice_tx_desc_cmd_bits {
ICE_TX_DESC_CMD_EOP = 0x0001,
ICE_TX_DESC_CMD_RS = 0x0002,
};
#define ICE_TXD_QW1_OFFSET_S 16
#define ICE_TXD_QW1_TX_BUF_SZ_S 34
#define ICE_TXD_QW1_L2TAG1_S 48
#define ICE_LAN_TXQ_MAX_QGRPS 127
#define ICE_LAN_TXQ_MAX_QDIS 1023
......
@@ -1258,6 +1258,23 @@ static int ice_vsi_alloc_arrays(struct ice_vsi *vsi, bool alloc_qvectors)
return -ENOMEM;
}
/**
* ice_msix_clean_rings - MSIX mode Interrupt Handler
* @irq: interrupt number
* @data: pointer to a q_vector
*/
static irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data)
{
struct ice_q_vector *q_vector = (struct ice_q_vector *)data;
if (!q_vector->tx.ring && !q_vector->rx.ring)
return IRQ_HANDLED;
napi_schedule(&q_vector->napi);
return IRQ_HANDLED;
}
/**
* ice_vsi_alloc - Allocates the next available struct vsi in the PF
* @pf: board private structure
@@ -1298,6 +1315,8 @@ static struct ice_vsi *ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type)
if (ice_vsi_alloc_arrays(vsi, true))
goto err_rings;
/* Setup default MSIX irq handler for VSI */
vsi->irq_handler = ice_msix_clean_rings;
break;
default:
dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", vsi->type);
@@ -1741,6 +1760,9 @@ static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, int v_idx)
if (cpu_online(v_idx))
cpumask_set_cpu(v_idx, &q_vector->affinity_mask);
if (vsi->netdev)
netif_napi_add(vsi->netdev, &q_vector->napi, ice_napi_poll,
NAPI_POLL_WEIGHT);
/* tie q_vector and vsi together */
vsi->q_vectors[v_idx] = q_vector;
@@ -2914,6 +2936,21 @@ static int ice_vsi_stop_tx_rx_rings(struct ice_vsi *vsi)
return 0;
}
/**
* ice_napi_enable_all - Enable NAPI for all q_vectors in the VSI
* @vsi: the VSI being configured
*/
static void ice_napi_enable_all(struct ice_vsi *vsi)
{
int q_idx;
if (!vsi->netdev)
return;
for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++)
napi_enable(&vsi->q_vectors[q_idx]->napi);
}
/**
* ice_up_complete - Finish the last steps of bringing up a connection
* @vsi: The VSI being configured
@@ -2939,6 +2976,7 @@ static int ice_up_complete(struct ice_vsi *vsi)
return err;
clear_bit(__ICE_DOWN, vsi->state);
ice_napi_enable_all(vsi);
ice_vsi_ena_irq(vsi);
if (vsi->port_info &&
@@ -2954,6 +2992,21 @@ static int ice_up_complete(struct ice_vsi *vsi)
return err;
}
/**
* ice_napi_disable_all - Disable NAPI for all q_vectors in the VSI
* @vsi: VSI having NAPI disabled
*/
static void ice_napi_disable_all(struct ice_vsi *vsi)
{
int q_idx;
if (!vsi->netdev)
return;
for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++)
napi_disable(&vsi->q_vectors[q_idx]->napi);
}
/**
* ice_down - Shutdown the connection
* @vsi: The VSI being stopped
@@ -2972,6 +3025,7 @@ static int ice_down(struct ice_vsi *vsi)
ice_vsi_dis_irq(vsi);
err = ice_vsi_stop_tx_rx_rings(vsi);
ice_napi_disable_all(vsi);
ice_for_each_txq(vsi, i)
ice_clean_tx_ring(vsi->tx_rings[i]);
@@ -3251,4 +3305,5 @@ static int ice_stop(struct net_device *netdev)
static const struct net_device_ops ice_netdev_ops = {
.ndo_open = ice_open,
.ndo_stop = ice_stop,
.ndo_start_xmit = ice_start_xmit,
};
@@ -7,6 +7,8 @@
#include <linux/mm.h>
#include "ice.h"
#define ICE_RX_HDR_SIZE 256
/**
* ice_unmap_and_free_tx_buf - Release a Tx buffer
* @ring: the ring that owns the buffer
@@ -92,6 +94,129 @@ void ice_free_tx_ring(struct ice_ring *tx_ring)
}
}
/**
* ice_clean_tx_irq - Reclaim resources after transmit completes
* @vsi: the VSI we care about
* @tx_ring: Tx ring to clean
* @napi_budget: Used to determine if we are in netpoll
*
* Returns true if there's any budget left (i.e. the clean is finished)
*/
static bool ice_clean_tx_irq(struct ice_vsi *vsi, struct ice_ring *tx_ring,
int napi_budget)
{
unsigned int total_bytes = 0, total_pkts = 0;
unsigned int budget = vsi->work_lmt;
s16 i = tx_ring->next_to_clean;
struct ice_tx_desc *tx_desc;
struct ice_tx_buf *tx_buf;
tx_buf = &tx_ring->tx_buf[i];
tx_desc = ICE_TX_DESC(tx_ring, i);
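/* i is biased below the true ring index ("i -= tx_ring->count" just
 * below), so the wrap check inside the loop is a cheap "!i" test; the
 * bias is removed again after the loop by adding tx_ring->count back.
 */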
i -= tx_ring->count;
do {
struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;
/* if next_to_watch is not set then there is no work pending */
if (!eop_desc)
break;
smp_rmb(); /* prevent any other reads prior to eop_desc */
/* if the descriptor isn't done, no work yet to do */
if (!(eop_desc->cmd_type_offset_bsz &
cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
break;
/* clear next_to_watch to prevent false hangs */
tx_buf->next_to_watch = NULL;
/* update the statistics for this packet */
total_bytes += tx_buf->bytecount;
total_pkts += tx_buf->gso_segs;
/* free the skb */
napi_consume_skb(tx_buf->skb, napi_budget);
/* unmap skb header data */
dma_unmap_single(tx_ring->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
/* clear tx_buf data */
tx_buf->skb = NULL;
dma_unmap_len_set(tx_buf, len, 0);
/* unmap remaining buffers */
while (tx_desc != eop_desc) {
tx_buf++;
tx_desc++;
i++;
if (unlikely(!i)) {
i -= tx_ring->count;
tx_buf = tx_ring->tx_buf;
tx_desc = ICE_TX_DESC(tx_ring, 0);
}
/* unmap any remaining paged data */
if (dma_unmap_len(tx_buf, len)) {
dma_unmap_page(tx_ring->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
dma_unmap_len_set(tx_buf, len, 0);
}
}
/* move us one more past the eop_desc for start of next pkt */
tx_buf++;
tx_desc++;
i++;
if (unlikely(!i)) {
i -= tx_ring->count;
tx_buf = tx_ring->tx_buf;
tx_desc = ICE_TX_DESC(tx_ring, 0);
}
prefetch(tx_desc);
/* update budget accounting */
budget--;
} while (likely(budget));
i += tx_ring->count;
tx_ring->next_to_clean = i;
u64_stats_update_begin(&tx_ring->syncp);
tx_ring->stats.bytes += total_bytes;
tx_ring->stats.pkts += total_pkts;
u64_stats_update_end(&tx_ring->syncp);
tx_ring->q_vector->tx.total_bytes += total_bytes;
tx_ring->q_vector->tx.total_pkts += total_pkts;
netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts,
total_bytes);
#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
if (unlikely(total_pkts && netif_carrier_ok(tx_ring->netdev) &&
(ICE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
/* Make sure that anybody stopping the queue after this
* sees the new next_to_clean.
*/
smp_mb();
if (__netif_subqueue_stopped(tx_ring->netdev,
tx_ring->q_index) &&
!test_bit(__ICE_DOWN, vsi->state)) {
netif_wake_subqueue(tx_ring->netdev,
tx_ring->q_index);
++tx_ring->tx_stats.restart_q;
}
}
return !!budget;
}
/**
* ice_setup_tx_ring - Allocate the Tx descriptors
* @tx_ring: the tx ring to set up
@@ -274,13 +399,17 @@ static bool ice_alloc_mapped_page(struct ice_ring *rx_ring,
dma_addr_t dma;
/* since we are recycling buffers we should seldom need to alloc */
if (likely(page))
if (likely(page)) {
rx_ring->rx_stats.page_reuse_count++;
return true;
}
/* alloc new page for storage */
page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!page))
if (unlikely(!page)) {
rx_ring->rx_stats.alloc_page_failed++;
return false;
}
/* map page for use */
dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
@@ -290,6 +419,7 @@ static bool ice_alloc_mapped_page(struct ice_ring *rx_ring,
*/
if (dma_mapping_error(rx_ring->dev, dma)) {
__free_pages(page, 0);
rx_ring->rx_stats.alloc_page_failed++;
return false;
}
@@ -359,3 +489,895 @@ bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count)
*/
return true;
}
/**
* ice_page_is_reserved - check if reuse is possible
* @page: page struct to check
*/
static bool ice_page_is_reserved(struct page *page)
{
return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
}
/**
* ice_add_rx_frag - Add contents of Rx buffer to sk_buff
* @rx_buf: buffer containing page to add
* @rx_desc: descriptor containing length of buffer written by hardware
* @skb: sk_buf to place the data into
*
* This function will add the data contained in rx_buf->page to the skb.
* This is done either through a direct copy if the data in the buffer is
* less than the skb header size, otherwise it will just attach the page as
* a frag to the skb.
*
* The function will then update the page offset if necessary and return
* true if the buffer can be reused by the adapter.
*/
static bool ice_add_rx_frag(struct ice_rx_buf *rx_buf,
union ice_32b_rx_flex_desc *rx_desc,
struct sk_buff *skb)
{
#if (PAGE_SIZE < 8192)
unsigned int truesize = ICE_RXBUF_2048;
#else
unsigned int last_offset = PAGE_SIZE - ICE_RXBUF_2048;
unsigned int truesize;
#endif /* PAGE_SIZE < 8192 */
struct page *page;
unsigned int size;
size = le16_to_cpu(rx_desc->wb.pkt_len) &
ICE_RX_FLX_DESC_PKT_LEN_M;
page = rx_buf->page;
#if (PAGE_SIZE >= 8192)
truesize = ALIGN(size, L1_CACHE_BYTES);
#endif /* PAGE_SIZE >= 8192 */
/* will the data fit in the skb we allocated? if so, just
* copy it as it is pretty small anyway
*/
if (size <= ICE_RX_HDR_SIZE && !skb_is_nonlinear(skb)) {
unsigned char *va = page_address(page) + rx_buf->page_offset;
memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
/* page is not reserved, we can reuse buffer as-is */
if (likely(!ice_page_is_reserved(page)))
return true;
/* this page cannot be reused so discard it */
__free_pages(page, 0);
return false;
}
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
rx_buf->page_offset, size, truesize);
/* avoid re-using remote pages */
if (unlikely(ice_page_is_reserved(page)))
return false;
#if (PAGE_SIZE < 8192)
/* if we are only owner of page we can reuse it */
if (unlikely(page_count(page) != 1))
return false;
/* flip page offset to other buffer */
rx_buf->page_offset ^= truesize;
#else
/* move offset up to the next cache line */
rx_buf->page_offset += truesize;
if (rx_buf->page_offset > last_offset)
return false;
#endif /* PAGE_SIZE < 8192 */
/* Even if we own the page, we are not allowed to use atomic_set()
* This would break get_page_unless_zero() users.
*/
get_page(rx_buf->page);
return true;
}
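The page_offset flip above is what lets a 4K page alternate between its two 2K halves, so one half can stay attached to an skb while the other is handed back to hardware. A minimal stand-alone sketch of just that bookkeeping (not part of the patch; the 2048 value mirrors ICE_RXBUF_2048 from ice_txrx.h):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* illustration only -- not driver code */
	unsigned int truesize = 2048;	/* ICE_RXBUF_2048 */
	unsigned int page_offset = 0;	/* first half of the 4K page */

	page_offset ^= truesize;	/* flip to the second half: 2048 */
	assert(page_offset == 2048);

	page_offset ^= truesize;	/* flip back to the first half: 0 */
	assert(page_offset == 0);

	printf("half-page flip ok\n");
	return 0;
}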
/**
* ice_reuse_rx_page - page flip buffer and store it back on the ring
* @rx_ring: rx descriptor ring to store buffers on
* @old_buf: donor buffer to have page reused
*
* Synchronizes page for reuse by the adapter
*/
static void ice_reuse_rx_page(struct ice_ring *rx_ring,
struct ice_rx_buf *old_buf)
{
u16 nta = rx_ring->next_to_alloc;
struct ice_rx_buf *new_buf;
new_buf = &rx_ring->rx_buf[nta];
/* update, and store next to alloc */
nta++;
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
/* transfer page from old buffer to new buffer */
*new_buf = *old_buf;
}
/**
* ice_fetch_rx_buf - Allocate skb and populate it
* @rx_ring: rx descriptor ring to transact packets on
* @rx_desc: descriptor containing info written by hardware
*
* This function allocates an skb on the fly, and populates it with the page
* data from the current receive descriptor, taking care to set up the skb
* correctly, as well as handling calling the page recycle function if
* necessary.
*/
static struct sk_buff *ice_fetch_rx_buf(struct ice_ring *rx_ring,
union ice_32b_rx_flex_desc *rx_desc)
{
struct ice_rx_buf *rx_buf;
struct sk_buff *skb;
struct page *page;
rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
page = rx_buf->page;
prefetchw(page);
skb = rx_buf->skb;
if (likely(!skb)) {
u8 *page_addr = page_address(page) + rx_buf->page_offset;
/* prefetch first cache line of first page */
prefetch(page_addr);
#if L1_CACHE_BYTES < 128
prefetch((void *)(page_addr + L1_CACHE_BYTES));
#endif /* L1_CACHE_BYTES */
/* allocate a skb to store the frags */
skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
ICE_RX_HDR_SIZE,
GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb)) {
rx_ring->rx_stats.alloc_buf_failed++;
return NULL;
}
/* we will be copying header into skb->data in
* pskb_may_pull so it is in our interest to prefetch
* it now to avoid a possible cache miss
*/
prefetchw(skb->data);
skb_record_rx_queue(skb, rx_ring->q_index);
} else {
/* we are reusing so sync this buffer for CPU use */
dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma,
rx_buf->page_offset,
ICE_RXBUF_2048,
DMA_FROM_DEVICE);
rx_buf->skb = NULL;
}
/* pull page into skb */
if (ice_add_rx_frag(rx_buf, rx_desc, skb)) {
/* hand second half of page back to the ring */
ice_reuse_rx_page(rx_ring, rx_buf);
rx_ring->rx_stats.page_reuse_count++;
} else {
/* we are not reusing the buffer so unmap it */
dma_unmap_page(rx_ring->dev, rx_buf->dma, PAGE_SIZE,
DMA_FROM_DEVICE);
}
/* clear contents of buffer_info */
rx_buf->page = NULL;
return skb;
}
/**
* ice_pull_tail - ice specific version of skb_pull_tail
* @skb: pointer to current skb being adjusted
*
* This function is an ice specific version of __pskb_pull_tail. The
* main difference between this version and the original function is that
* this function can make several assumptions about the state of things
* that allow for significant optimizations versus the standard function.
* As a result we can do things like drop a frag and maintain an accurate
* truesize for the skb.
*/
static void ice_pull_tail(struct sk_buff *skb)
{
struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
unsigned int pull_len;
unsigned char *va;
/* it is valid to use page_address instead of kmap since we are
* working with pages allocated out of the low memory pool per
* alloc_page(GFP_ATOMIC)
*/
va = skb_frag_address(frag);
/* we need the header to contain the greater of either ETH_HLEN or
* 60 bytes if the skb->len is less than 60 for skb_pad.
*/
pull_len = eth_get_headlen(va, ICE_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */
skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
/* update all of the pointers */
skb_frag_size_sub(frag, pull_len);
frag->page_offset += pull_len;
skb->data_len -= pull_len;
skb->tail += pull_len;
}
/**
* ice_cleanup_headers - Correct empty headers
* @skb: pointer to current skb being fixed
*
* Also address the case where we are pulling data in on pages only
* and as such no data is present in the skb header.
*
* In addition if skb is not at least 60 bytes we need to pad it so that
* it is large enough to qualify as a valid Ethernet frame.
*
* Returns true if an error was encountered and skb was freed.
*/
static bool ice_cleanup_headers(struct sk_buff *skb)
{
/* place header in linear portion of buffer */
if (skb_is_nonlinear(skb))
ice_pull_tail(skb);
/* if eth_skb_pad returns an error the skb was freed */
if (eth_skb_pad(skb))
return true;
return false;
}
/**
* ice_test_staterr - tests bits in Rx descriptor status and error fields
* @rx_desc: pointer to receive descriptor (in le64 format)
* @stat_err_bits: value to mask
*
* This function does some fast chicanery in order to return the
* value of the mask which is really only used for boolean tests.
* The status_error0 field doesn't need to be shifted because it begins
* at offset zero.
*/
static bool ice_test_staterr(union ice_32b_rx_flex_desc *rx_desc,
const u16 stat_err_bits)
{
return !!(rx_desc->wb.status_error0 &
cpu_to_le16(stat_err_bits));
}
/**
* ice_is_non_eop - process handling of non-EOP buffers
* @rx_ring: Rx ring being processed
* @rx_desc: Rx descriptor for current buffer
* @skb: Current socket buffer containing buffer in progress
*
* This function updates next to clean. If the buffer is an EOP buffer
* this function exits returning false, otherwise it will place the
* sk_buff in the next buffer to be chained and return true indicating
* that this is in fact a non-EOP buffer.
*/
static bool ice_is_non_eop(struct ice_ring *rx_ring,
union ice_32b_rx_flex_desc *rx_desc,
struct sk_buff *skb)
{
u32 ntc = rx_ring->next_to_clean + 1;
/* fetch, update, and store next to clean */
ntc = (ntc < rx_ring->count) ? ntc : 0;
rx_ring->next_to_clean = ntc;
prefetch(ICE_RX_DESC(rx_ring, ntc));
/* if we are the last buffer then there is nothing else to do */
#define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S)
if (likely(ice_test_staterr(rx_desc, ICE_RXD_EOF)))
return false;
/* place skb in next buffer to be received */
rx_ring->rx_buf[ntc].skb = skb;
rx_ring->rx_stats.non_eop_descs++;
return true;
}
/**
* ice_receive_skb - Send a completed packet up the stack
* @rx_ring: rx ring in play
* @skb: packet to send up
* @vlan_tag: vlan tag for packet
*
* This function sends the completed packet (via skb) up the stack using
* gro receive functions (with/without vlan tag)
*/
static void ice_receive_skb(struct ice_ring *rx_ring, struct sk_buff *skb,
u16 vlan_tag)
{
if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
(vlan_tag & VLAN_VID_MASK)) {
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
}
napi_gro_receive(&rx_ring->q_vector->napi, skb);
}
/**
* ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
* @rx_ring: rx descriptor ring to transact packets on
* @budget: Total limit on number of packets to process
*
* This function provides a "bounce buffer" approach to Rx interrupt
* processing. The advantage to this is that on systems that have
* expensive overhead for IOMMU access this provides a means of avoiding
* it by maintaining the mapping of the page to the system.
*
* Returns amount of work completed
*/
static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
{
unsigned int total_rx_bytes = 0, total_rx_pkts = 0;
u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
bool failure = false;
/* start the loop to process RX packets bounded by 'budget' */
while (likely(total_rx_pkts < (unsigned int)budget)) {
union ice_32b_rx_flex_desc *rx_desc;
struct sk_buff *skb;
u16 stat_err_bits;
u16 vlan_tag = 0;
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= ICE_RX_BUF_WRITE) {
failure = failure ||
ice_alloc_rx_bufs(rx_ring, cleaned_count);
cleaned_count = 0;
}
/* get the RX desc from RX ring based on 'next_to_clean' */
rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean);
/* status_error_len will always be zero for unused descriptors
* because it's cleared in cleanup, and overlaps with hdr_addr
* which is always zero because packet split isn't used. If the
* hardware wrote DD then it will be non-zero
*/
stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
if (!ice_test_staterr(rx_desc, stat_err_bits))
break;
/* This memory barrier is needed to keep us from reading
* any other fields out of the rx_desc until we know the
* DD bit is set.
*/
dma_rmb();
/* allocate (if needed) and populate skb */
skb = ice_fetch_rx_buf(rx_ring, rx_desc);
if (!skb)
break;
cleaned_count++;
/* skip if it is a non-EOP desc */
if (ice_is_non_eop(rx_ring, rx_desc, skb))
continue;
stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
if (unlikely(ice_test_staterr(rx_desc, stat_err_bits))) {
dev_kfree_skb_any(skb);
continue;
}
stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S);
if (ice_test_staterr(rx_desc, stat_err_bits))
vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1);
/* correct empty headers and pad skb if needed (to make valid
* ethernet frame)
*/
if (ice_cleanup_headers(skb)) {
skb = NULL;
continue;
}
/* probably a little skewed due to removing CRC */
total_rx_bytes += skb->len;
/* send completed skb up the stack */
ice_receive_skb(rx_ring, skb, vlan_tag);
/* update budget accounting */
total_rx_pkts++;
}
/* update queue and vector specific stats */
u64_stats_update_begin(&rx_ring->syncp);
rx_ring->stats.pkts += total_rx_pkts;
rx_ring->stats.bytes += total_rx_bytes;
u64_stats_update_end(&rx_ring->syncp);
rx_ring->q_vector->rx.total_pkts += total_rx_pkts;
rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
/* guarantee a trip back through this routine if there was a failure */
return failure ? budget : (int)total_rx_pkts;
}
/**
* ice_napi_poll - NAPI polling Rx/Tx cleanup routine
* @napi: napi struct with our devices info in it
* @budget: amount of work driver is allowed to do this pass, in packets
*
* This function will clean all queues associated with a q_vector.
*
* Returns the amount of work done
*/
int ice_napi_poll(struct napi_struct *napi, int budget)
{
struct ice_q_vector *q_vector =
container_of(napi, struct ice_q_vector, napi);
struct ice_vsi *vsi = q_vector->vsi;
struct ice_pf *pf = vsi->back;
bool clean_complete = true;
int budget_per_ring = 0;
struct ice_ring *ring;
int work_done = 0;
/* Since the actual Tx work is minimal, we can give the Tx a larger
* budget and be more aggressive about cleaning up the Tx descriptors.
*/
ice_for_each_ring(ring, q_vector->tx)
if (!ice_clean_tx_irq(vsi, ring, budget))
clean_complete = false;
/* Handle case where we are called by netpoll with a budget of 0 */
if (budget <= 0)
return budget;
/* We attempt to distribute budget to each Rx queue fairly, but don't
* allow the budget to go below 1 because that would exit polling early.
*/
if (q_vector->num_ring_rx)
budget_per_ring = max(budget / q_vector->num_ring_rx, 1);
ice_for_each_ring(ring, q_vector->rx) {
int cleaned;
cleaned = ice_clean_rx_irq(ring, budget_per_ring);
work_done += cleaned;
/* if we clean as many as budgeted, we must not be done */
if (cleaned >= budget_per_ring)
clean_complete = false;
}
/* If work not completed, return budget and polling will return */
if (!clean_complete)
return budget;
/* Work is done so exit the polling mode and re-enable the interrupt */
napi_complete_done(napi, work_done);
if (test_bit(ICE_FLAG_MSIX_ENA, pf->flags))
ice_irq_dynamic_ena(&vsi->back->hw, vsi, q_vector);
return 0;
}
/* helper function for building cmd/type/offset */
static __le64
build_ctob(u64 td_cmd, u64 td_offset, unsigned int size, u64 td_tag)
{
return cpu_to_le64(ICE_TX_DESC_DTYPE_DATA |
(td_cmd << ICE_TXD_QW1_CMD_S) |
(td_offset << ICE_TXD_QW1_OFFSET_S) |
((u64)size << ICE_TXD_QW1_TX_BUF_SZ_S) |
(td_tag << ICE_TXD_QW1_L2TAG1_S));
}
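To make the descriptor layout concrete, here is a stand-alone user-space sketch (not part of the patch) that packs quadword 1 the same way build_ctob() does, reusing the shifts and masks added in ice_lan_tx_rx.h above; the cpu_to_le64() conversion is left out for readability:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* values copied from the ice_lan_tx_rx.h hunk in this patch */
#define ICE_TX_DESC_DTYPE_DATA	0x0ULL
#define ICE_TXD_QW1_CMD_S	4
#define ICE_TXD_QW1_CMD_M	(0xFFFUL << ICE_TXD_QW1_CMD_S)
#define ICE_TXD_QW1_OFFSET_S	16
#define ICE_TXD_QW1_TX_BUF_SZ_S	34
#define ICE_TXD_QW1_L2TAG1_S	48
#define ICE_TX_DESC_CMD_EOP	0x0001
#define ICE_TX_DESC_CMD_RS	0x0002

static uint64_t build_ctob(uint64_t cmd, uint64_t offset, unsigned int size,
			   uint64_t tag)
{
	return ICE_TX_DESC_DTYPE_DATA | (cmd << ICE_TXD_QW1_CMD_S) |
	       (offset << ICE_TXD_QW1_OFFSET_S) |
	       ((uint64_t)size << ICE_TXD_QW1_TX_BUF_SZ_S) |
	       (tag << ICE_TXD_QW1_L2TAG1_S);
}

int main(void)
{
	/* last descriptor of a 1514-byte frame: EOP + RS, no offloads, no tag */
	uint64_t qw = build_ctob(ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS,
				 0, 1514, 0);

	printf("cmd_type_offset_bsz = %#" PRIx64 "\n", qw);
	printf("cmd field           = %#" PRIx64 "\n",
	       (qw & ICE_TXD_QW1_CMD_M) >> ICE_TXD_QW1_CMD_S);
	return 0;
}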
/**
* __ice_maybe_stop_tx - 2nd level check for tx stop conditions
* @tx_ring: the ring to be checked
* @size: the size buffer we want to assure is available
*
* Returns -EBUSY if a stop is needed, else 0
*/
static int __ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size)
{
netif_stop_subqueue(tx_ring->netdev, tx_ring->q_index);
/* Memory barrier before checking head and tail */
smp_mb();
/* Check again in a case another CPU has just made room available. */
if (likely(ICE_DESC_UNUSED(tx_ring) < size))
return -EBUSY;
/* A reprieve! - use start_subqueue because it doesn't call schedule */
netif_start_subqueue(tx_ring->netdev, tx_ring->q_index);
++tx_ring->tx_stats.restart_q;
return 0;
}
/**
* ice_maybe_stop_tx - 1st level check for tx stop conditions
* @tx_ring: the ring to be checked
* @size: the size buffer we want to assure is available
*
* Returns 0 if stop is not needed
*/
static int ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size)
{
if (likely(ICE_DESC_UNUSED(tx_ring) >= size))
return 0;
return __ice_maybe_stop_tx(tx_ring, size);
}
/**
* ice_tx_map - Build the Tx descriptor
* @tx_ring: ring to send buffer on
* @first: first buffer info buffer to use
*
* This function loops over the skb data pointed to by *first
* and gets a physical address for each memory location and programs
* it and the length into the transmit descriptor.
*/
static void ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first)
{
u64 td_offset = 0, td_tag = 0, td_cmd = 0;
u16 i = tx_ring->next_to_use;
struct skb_frag_struct *frag;
unsigned int data_len, size;
struct ice_tx_desc *tx_desc;
struct ice_tx_buf *tx_buf;
struct sk_buff *skb;
dma_addr_t dma;
skb = first->skb;
data_len = skb->data_len;
size = skb_headlen(skb);
tx_desc = ICE_TX_DESC(tx_ring, i);
dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
tx_buf = first;
for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
unsigned int max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
if (dma_mapping_error(tx_ring->dev, dma))
goto dma_error;
/* record length, and DMA address */
dma_unmap_len_set(tx_buf, len, size);
dma_unmap_addr_set(tx_buf, dma, dma);
/* align size to end of page */
max_data += -dma & (ICE_MAX_READ_REQ_SIZE - 1);
tx_desc->buf_addr = cpu_to_le64(dma);
/* account for data chunks larger than the hardware
* can handle
*/
while (unlikely(size > ICE_MAX_DATA_PER_TXD)) {
tx_desc->cmd_type_offset_bsz =
build_ctob(td_cmd, td_offset, max_data, td_tag);
tx_desc++;
i++;
if (i == tx_ring->count) {
tx_desc = ICE_TX_DESC(tx_ring, 0);
i = 0;
}
dma += max_data;
size -= max_data;
max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
tx_desc->buf_addr = cpu_to_le64(dma);
}
if (likely(!data_len))
break;
tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
size, td_tag);
tx_desc++;
i++;
if (i == tx_ring->count) {
tx_desc = ICE_TX_DESC(tx_ring, 0);
i = 0;
}
size = skb_frag_size(frag);
data_len -= size;
dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
DMA_TO_DEVICE);
tx_buf = &tx_ring->tx_buf[i];
}
/* record bytecount for BQL */
netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);
/* record SW timestamp if HW timestamp is not available */
skb_tx_timestamp(first->skb);
i++;
if (i == tx_ring->count)
i = 0;
/* write last descriptor with RS and EOP bits */
td_cmd |= (u64)(ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS);
tx_desc->cmd_type_offset_bsz =
build_ctob(td_cmd, td_offset, size, td_tag);
/* Force memory writes to complete before letting h/w know there
* are new descriptors to fetch.
*
* We also use this memory barrier to make certain all of the
* status bits have been updated before next_to_watch is written.
*/
wmb();
/* set next_to_watch value indicating a packet is present */
first->next_to_watch = tx_desc;
tx_ring->next_to_use = i;
ice_maybe_stop_tx(tx_ring, DESC_NEEDED);
/* notify HW of packet */
if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) {
writel(i, tx_ring->tail);
/* we need this if more than one processor can write to our tail
* at a time, it synchronizes IO on IA64/Altix systems
*/
mmiowb();
}
return;
dma_error:
/* clear dma mappings for failed tx_buf map */
for (;;) {
tx_buf = &tx_ring->tx_buf[i];
ice_unmap_and_free_tx_buf(tx_ring, tx_buf);
if (tx_buf == first)
break;
if (i == 0)
i = tx_ring->count;
i--;
}
tx_ring->next_to_use = i;
}
/**
* ice_txd_use_count - estimate the number of descriptors needed for Tx
* @size: transmit request size in bytes
*
* Due to hardware alignment restrictions (4K alignment), we need to
* assume that we can have no more than 12K of data per descriptor, even
* though each descriptor can take up to 16K - 1 bytes of aligned memory.
* Thus, we need to divide by 12K. But division is slow! Instead,
* we decompose the operation into shifts and one relatively cheap
* multiply operation.
*
* To divide by 12K, we first divide by 4K, then divide by 3:
* To divide by 4K, shift right by 12 bits
* To divide by 3, multiply by 85, then divide by 256
* (Divide by 256 is done by shifting right by 8 bits)
* Finally, we add one to round up. Because 256 isn't an exact multiple of
* 3, we'll underestimate near each multiple of 12K. This is actually more
* accurate as we have 4K - 1 of wiggle room that we can fit into the last
* segment. For our purposes this is accurate out to 1M which is orders of
* magnitude greater than our largest possible GSO size.
*
* This would then be implemented as:
* return (((size >> 12) * 85) >> 8) + 1;
*
* Since multiplication and division are commutative, we can reorder
* operations into:
* return ((size * 85) >> 20) + 1;
*/
static unsigned int ice_txd_use_count(unsigned int size)
{
return ((size * 85) >> 20) + 1;
}
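As a sanity check on the arithmetic above, a stand-alone sketch (not part of the patch) compares the multiply/shift estimate against a plain ceiling division by 12K (the ICE_MAX_DATA_PER_TXD_ALIGNED budget); the 12289-byte case shows the accepted underestimate just past a 12K boundary:

#include <stdio.h>

/* same formula as ice_txd_use_count() in this patch */
static unsigned int ice_txd_use_count(unsigned int size)
{
	return ((size * 85) >> 20) + 1;
}

int main(void)
{
	unsigned int sizes[] = { 1514, 9000, 12288, 12289, 65536 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned int exact = (sizes[i] + 12287) / 12288; /* ceil(size / 12K) */

		printf("size %6u: estimate %u, ceil %u\n",
		       sizes[i], ice_txd_use_count(sizes[i]), exact);
	}
	return 0;
}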
/**
* ice_xmit_desc_count - calculate number of tx descriptors needed
* @skb: send buffer
*
* Returns number of data descriptors needed for this skb.
*/
static unsigned int ice_xmit_desc_count(struct sk_buff *skb)
{
const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
unsigned int count = 0, size = skb_headlen(skb);
for (;;) {
count += ice_txd_use_count(size);
if (!nr_frags--)
break;
size = skb_frag_size(frag++);
}
return count;
}
/**
* __ice_chk_linearize - Check if there are more than 8 buffers per packet
* @skb: send buffer
*
* Note: This HW can't DMA more than 8 buffers to build a packet on the wire
* and so we need to figure out the cases where we need to linearize the skb.
*
* For TSO we need to count the TSO header and segment payload separately.
* As such we need to check cases where we have 7 fragments or more as we
* can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
* the segment payload in the first descriptor, and another 7 for the
* fragments.
*/
static bool __ice_chk_linearize(struct sk_buff *skb)
{
const struct skb_frag_struct *frag, *stale;
int nr_frags, sum;
/* no need to check if number of frags is less than 7 */
nr_frags = skb_shinfo(skb)->nr_frags;
if (nr_frags < (ICE_MAX_BUF_TXD - 1))
return false;
/* We need to walk through the list and validate that each group
* of 6 fragments totals at least gso_size.
*/
nr_frags -= ICE_MAX_BUF_TXD - 2;
frag = &skb_shinfo(skb)->frags[0];
/* Initialize size to the negative value of gso_size minus 1. We
* use this as the worst case scenario in which the frag ahead
* of us only provides one byte which is why we are limited to 6
* descriptors for a single transmit as the header and previous
* fragment are already consuming 2 descriptors.
*/
sum = 1 - skb_shinfo(skb)->gso_size;
/* Add size of frags 0 through 4 to create our initial sum */
sum += skb_frag_size(frag++);
sum += skb_frag_size(frag++);
sum += skb_frag_size(frag++);
sum += skb_frag_size(frag++);
sum += skb_frag_size(frag++);
/* Walk through fragments adding latest fragment, testing it, and
* then removing stale fragments from the sum.
*/
stale = &skb_shinfo(skb)->frags[0];
for (;;) {
sum += skb_frag_size(frag++);
/* if sum is negative we failed to make sufficient progress */
if (sum < 0)
return true;
if (!nr_frags--)
break;
sum -= skb_frag_size(stale++);
}
return false;
}
/**
* ice_chk_linearize - Check if there are more than 8 fragments per packet
* @skb: send buffer
* @count: number of buffers used
*
* Note: Our HW can't scatter-gather more than 8 fragments to build
* a packet on the wire and so we need to figure out the cases where we
* need to linearize the skb.
*/
static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count)
{
/* Both TSO and single send will work if count is less than 8 */
if (likely(count < ICE_MAX_BUF_TXD))
return false;
if (skb_is_gso(skb))
return __ice_chk_linearize(skb);
/* we can support up to 8 data buffers for a single send */
return count != ICE_MAX_BUF_TXD;
}
/**
* ice_xmit_frame_ring - Sends buffer on Tx ring
* @skb: send buffer
* @tx_ring: ring to send buffer on
*
* Returns NETDEV_TX_OK if sent, else an error code
*/
static netdev_tx_t
ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring)
{
struct ice_tx_buf *first;
unsigned int count;
count = ice_xmit_desc_count(skb);
if (ice_chk_linearize(skb, count)) {
if (__skb_linearize(skb))
goto out_drop;
count = ice_txd_use_count(skb->len);
tx_ring->tx_stats.tx_linearize++;
}
/* need: 1 descriptor per page * PAGE_SIZE/ICE_MAX_DATA_PER_TXD,
* + 1 desc for skb_head_len/ICE_MAX_DATA_PER_TXD,
* + 4 desc gap to avoid the cache line where head is,
* + 1 desc for context descriptor,
* otherwise try next time
*/
if (ice_maybe_stop_tx(tx_ring, count + 4 + 1)) {
tx_ring->tx_stats.tx_busy++;
return NETDEV_TX_BUSY;
}
/* record the location of the first descriptor for this packet */
first = &tx_ring->tx_buf[tx_ring->next_to_use];
first->skb = skb;
first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
first->gso_segs = 1;
ice_tx_map(tx_ring, first);
return NETDEV_TX_OK;
out_drop:
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
}
/**
* ice_start_xmit - Selects the correct VSI and Tx queue to send buffer
* @skb: send buffer
* @netdev: network interface device structure
*
* Returns NETDEV_TX_OK if sent, else an error code
*/
netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev)
{
struct ice_netdev_priv *np = netdev_priv(netdev);
struct ice_vsi *vsi = np->vsi;
struct ice_ring *tx_ring;
tx_ring = vsi->tx_rings[skb->queue_mapping];
/* hardware can't handle really short frames; hardware padding works
* beyond this point
*/
if (skb_put_padto(skb, ICE_MIN_TX_LEN))
return NETDEV_TX_OK;
return ice_xmit_frame_ring(skb, tx_ring);
}
@@ -7,8 +7,23 @@
#define ICE_DFLT_IRQ_WORK 256
#define ICE_RXBUF_2048 2048
#define ICE_MAX_CHAINED_RX_BUFS 5
#define ICE_MAX_BUF_TXD 8
#define ICE_MIN_TX_LEN 17
/* The size limit for a transmit buffer in a descriptor is (16K - 1).
* In order to align with the read requests we will align the value to
* the nearest 4K which represents our maximum read request size.
*/
#define ICE_MAX_READ_REQ_SIZE 4096
#define ICE_MAX_DATA_PER_TXD (16 * 1024 - 1)
#define ICE_MAX_DATA_PER_TXD_ALIGNED \
(~(ICE_MAX_READ_REQ_SIZE - 1) & ICE_MAX_DATA_PER_TXD)
#define ICE_RX_BUF_WRITE 16 /* Must be power of 2 */
#define ICE_MAX_TXQ_PER_TXQG 128
/* Tx Descriptors needed, worst case */
#define DESC_NEEDED (MAX_SKB_FRAGS + 4)
#define ICE_DESC_UNUSED(R) \
((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \
(R)->next_to_clean - (R)->next_to_use - 1)
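A small stand-alone sketch of the ICE_DESC_UNUSED() ring arithmetic (not part of the patch; the ring struct here is a stand-in carrying only the fields the macro reads), covering the non-wrapped and wrapped cases:

#include <stdio.h>

struct ring {
	unsigned short count;
	unsigned short next_to_use;
	unsigned short next_to_clean;
};

#define ICE_DESC_UNUSED(R)	\
	((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \
	 (R)->next_to_clean - (R)->next_to_use - 1)

int main(void)
{
	/* 8-entry ring, descriptors 2..5 posted and not yet cleaned */
	struct ring r = { .count = 8, .next_to_use = 6, .next_to_clean = 2 };

	printf("unused = %d\n", ICE_DESC_UNUSED(&r));	/* 8 + 2 - 6 - 1 = 3 */

	/* after a wrap: descriptors 6, 7 and 0 are in flight */
	r.next_to_use = 1;
	r.next_to_clean = 6;
	printf("unused = %d\n", ICE_DESC_UNUSED(&r));	/* 0 + 6 - 1 - 1 = 4 */

	return 0;
}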
@@ -30,6 +45,24 @@ struct ice_rx_buf {
unsigned int page_offset;
};
struct ice_q_stats {
u64 pkts;
u64 bytes;
};
struct ice_txq_stats {
u64 restart_q;
u64 tx_busy;
u64 tx_linearize;
};
struct ice_rxq_stats {
u64 non_eop_descs;
u64 alloc_page_failed;
u64 alloc_buf_failed;
u64 page_reuse_count;
};
/* this enum matches hardware bits and is meant to be used by DYN_CTLN
* registers and QINT registers or more generally anywhere in the manual
* mentioning ITR_INDX, ITR_NONE cannot be used as an index 'n' into any
@@ -94,6 +127,15 @@ struct ice_ring {
u16 next_to_clean;
bool ring_active; /* is ring online or not */
/* stats structs */
struct ice_q_stats stats;
struct u64_stats_sync syncp;
union {
struct ice_txq_stats tx_stats;
struct ice_rxq_stats rx_stats;
};
unsigned int size; /* length of descriptor ring in bytes */
dma_addr_t dma; /* physical address of ring */
struct rcu_head rcu; /* to avoid race on free */
@@ -121,10 +163,13 @@ struct ice_ring_container {
for (pos = (head).ring; pos; pos = pos->next)
bool ice_alloc_rx_bufs(struct ice_ring *rxr, u16 cleaned_count);
netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev);
void ice_clean_tx_ring(struct ice_ring *tx_ring);
void ice_clean_rx_ring(struct ice_ring *rx_ring);
int ice_setup_tx_ring(struct ice_ring *tx_ring);
int ice_setup_rx_ring(struct ice_ring *rx_ring);
void ice_free_tx_ring(struct ice_ring *tx_ring);
void ice_free_rx_ring(struct ice_ring *rx_ring);
int ice_napi_poll(struct napi_struct *napi, int budget);
#endif /* _ICE_TXRX_H_ */