Commit d5fbda61 authored by Arjun Vynipadath, committed by David S. Miller

cxgb4: Add support for FW_ETH_TX_PKT_VM_WR

The present TX work request (FW_ETH_TX_PKT_WR) can't be used for
host->VF communication, since it doesn't loop back the outgoing
packets to virtual interfaces on the same port. This can be done
using FW_ETH_TX_PKT_VM_WR.
This fix depends on ethtool_flags to determine which WR to use for
the TX path. Support for setting these flags by the user is added in
the next commit.

Based on the original work by : Casey Leedom <leedom@chelsio.com>
Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent b0e9a2fe
...@@ -522,6 +522,15 @@ enum { ...@@ -522,6 +522,15 @@ enum {
MAX_INGQ = MAX_ETH_QSETS + INGQ_EXTRAS, MAX_INGQ = MAX_ETH_QSETS + INGQ_EXTRAS,
}; };
/* Bit positions of the driver's private flags. */
enum {
PRIV_FLAG_PORT_TX_VM_BIT,
};
/* Flag tested against port_info's eth_flags in t4_start_xmit(): when set,
 * packets are sent through cxgb4_vf_eth_xmit(), which builds
 * FW_ETH_TX_PKT_VM_WR Work Requests, instead of cxgb4_eth_xmit().
 */
#define PRIV_FLAG_PORT_TX_VM BIT(PRIV_FLAG_PORT_TX_VM_BIT)
/* NOTE(review): presumably the masks of private flags valid at adapter
 * scope (none defined here) and at port scope -- confirm against the code
 * that lets the user set these flags.
 */
#define PRIV_FLAGS_ADAP 0
#define PRIV_FLAGS_PORT PRIV_FLAG_PORT_TX_VM
struct adapter; struct adapter;
struct sge_rspq; struct sge_rspq;
...@@ -558,6 +567,7 @@ struct port_info { ...@@ -558,6 +567,7 @@ struct port_info {
struct hwtstamp_config tstamp_config; struct hwtstamp_config tstamp_config;
bool ptp_enable; bool ptp_enable;
struct sched_table *sched_tbl; struct sched_table *sched_tbl;
u32 eth_flags;
}; };
struct dentry; struct dentry;
...@@ -868,6 +878,7 @@ struct adapter { ...@@ -868,6 +878,7 @@ struct adapter {
unsigned int flags; unsigned int flags;
unsigned int adap_idx; unsigned int adap_idx;
enum chip_type chip; enum chip_type chip;
u32 eth_flags;
int msg_enable; int msg_enable;
__be16 vxlan_port; __be16 vxlan_port;
...@@ -1335,7 +1346,7 @@ void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat); ...@@ -1335,7 +1346,7 @@ void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat);
void t4_free_sge_resources(struct adapter *adap); void t4_free_sge_resources(struct adapter *adap);
void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q); void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q);
irq_handler_t t4_intr_handler(struct adapter *adap); irq_handler_t t4_intr_handler(struct adapter *adap);
netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev);
int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp, int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
const struct pkt_gl *gl); const struct pkt_gl *gl);
int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb); int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb);
......
...@@ -3217,7 +3217,7 @@ static netdev_features_t cxgb_fix_features(struct net_device *dev, ...@@ -3217,7 +3217,7 @@ static netdev_features_t cxgb_fix_features(struct net_device *dev,
static const struct net_device_ops cxgb4_netdev_ops = { static const struct net_device_ops cxgb4_netdev_ops = {
.ndo_open = cxgb_open, .ndo_open = cxgb_open,
.ndo_stop = cxgb_close, .ndo_stop = cxgb_close,
.ndo_start_xmit = t4_eth_xmit, .ndo_start_xmit = t4_start_xmit,
.ndo_select_queue = cxgb_select_queue, .ndo_select_queue = cxgb_select_queue,
.ndo_get_stats64 = cxgb_get_stats, .ndo_get_stats64 = cxgb_get_stats,
.ndo_set_rx_mode = cxgb_set_rxmode, .ndo_set_rx_mode = cxgb_set_rxmode,
......
...@@ -1288,13 +1288,13 @@ static inline void t6_fill_tnl_lso(struct sk_buff *skb, ...@@ -1288,13 +1288,13 @@ static inline void t6_fill_tnl_lso(struct sk_buff *skb,
} }
/** /**
* t4_eth_xmit - add a packet to an Ethernet Tx queue * cxgb4_eth_xmit - add a packet to an Ethernet Tx queue
* @skb: the packet * @skb: the packet
* @dev: the egress net device * @dev: the egress net device
* *
* Add a packet to an SGE Ethernet Tx queue. Runs with softirqs disabled. * Add a packet to an SGE Ethernet Tx queue. Runs with softirqs disabled.
*/ */
netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
{ {
u32 wr_mid, ctrl0, op; u32 wr_mid, ctrl0, op;
u64 cntrl, *end, *sgl; u64 cntrl, *end, *sgl;
...@@ -1547,6 +1547,374 @@ out_free: dev_kfree_skb_any(skb); ...@@ -1547,6 +1547,374 @@ out_free: dev_kfree_skb_any(skb);
return NETDEV_TX_OK; return NETDEV_TX_OK;
} }
/* Sizing constants for SGE Egress Queues and for the headers of
 * FW_ETH_TX_PKT_VM_WR Work Requests.
 */
enum {
	/* Egress Queue sizes as well as producer and consumer indices are
	 * counted in Egress Context Units of EQ_UNIT bytes.  From the
	 * hardware's point of view a free list is an Egress Queue too (the
	 * host produces free buffers, the hardware consumes them) and each
	 * free list entry is a 64-bit PCI DMA address.
	 */
	EQ_UNIT = SGE_EQ_IDXSIZE,
	FL_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),
	TXD_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),

	/* Size, in 64-bit units, of the largest header we build: the Work
	 * Request itself, an LSO CPL and the TX Packet CPL.
	 */
	T4VF_ETHTXQ_MAX_HDR = (sizeof(struct cpl_tx_pkt_core) +
			       sizeof(struct cpl_tx_pkt_lso_core) +
			       sizeof(struct fw_eth_tx_pkt_vm_wr)) /
			      sizeof(__be64),
};
/**
 * t4vf_is_eth_imm - can an Ethernet packet be sent as immediate data?
 * @skb: the packet
 *
 * Reports whether an Ethernet packet is small enough to be carried
 * completely as immediate data.  The answer is currently always "no".
 */
static inline int t4vf_is_eth_imm(const struct sk_buff *skb)
{
	/* FW_ETH_TX_PKT_VM_WR, the firmware Work Request used on this path,
	 * has no room for immediate data.  The immediate-data support code
	 * is nevertheless kept around: removing it would tie our hands if
	 * the firmware is ever enhanced and would make the PF and VF drivers
	 * diverge more than necessary.
	 */
	return 0;
}
/**
 * t4vf_calc_tx_flits - calculate the number of flits for a packet TX WR
 * @skb: the packet
 *
 * Returns how many flits (units of sizeof(__be64) bytes) a TX Work Request
 * for the given Ethernet packet occupies, WR and CPL headers included.
 */
static inline unsigned int t4vf_calc_tx_flits(const struct sk_buff *skb)
{
	unsigned int hdr_flits;

	/* A packet that qualifies for immediate data only needs the TX
	 * Packet header followed by the packet bytes themselves.
	 */
	if (t4vf_is_eth_imm(skb))
		return DIV_ROUND_UP(skb->len + sizeof(struct cpl_tx_pkt),
				    sizeof(__be64));

	/* Everything else is described by a Scatter/Gather list covering the
	 * skb body and its fragments, preceded by the headers: the firmware
	 * Work Request header, then either a plain TX Packet CPL or, for
	 * Large Send Offload, an LSO CPL wrapping the TX Packet CPL.
	 */
	if (skb_shinfo(skb)->gso_size)
		hdr_flits = (sizeof(struct fw_eth_tx_pkt_vm_wr) +
			     sizeof(struct cpl_tx_pkt_lso_core) +
			     sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
	else
		hdr_flits = (sizeof(struct fw_eth_tx_pkt_vm_wr) +
			     sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);

	return sgl_len(skb_shinfo(skb)->nr_frags + 1) + hdr_flits;
}
/**
 * cxgb4_vf_eth_xmit - add a packet to an Ethernet TX queue
 * @skb: the packet
 * @dev: the egress net device
 *
 * Add a packet to an SGE Ethernet TX queue using a FW_ETH_TX_PKT_VM_WR
 * Work Request. Runs with softirqs disabled.
 *
 * Returns NETDEV_TX_BUSY when the TX ring has no room for the Work Request.
 * Otherwise returns NETDEV_TX_OK, which also covers the cases where the
 * packet is freed without being sent (shorter than the copied header,
 * longer than the MTU allows for a non-GSO packet, or DMA mapping failure).
 */
static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
struct net_device *dev)
{
dma_addr_t addr[MAX_SKB_FRAGS + 1];
const struct skb_shared_info *ssi;
struct fw_eth_tx_pkt_vm_wr *wr;
int qidx, credits, max_pkt_len;
struct cpl_tx_pkt_core *cpl;
const struct port_info *pi;
unsigned int flits, ndesc;
struct sge_eth_txq *txq;
struct adapter *adapter;
u64 cntrl, *end;
u32 wr_mid;
/* Number of leading packet bytes copied into the Work Request header:
 * the combined size of its ethmacdst, ethmacsrc, ethtype and vlantci
 * fields.
 */
const size_t fw_hdr_copy_len = sizeof(wr->ethmacdst) +
sizeof(wr->ethmacsrc) +
sizeof(wr->ethtype) +
sizeof(wr->vlantci);
/* The chip minimum packet length is 10 octets but the firmware
 * command that we are using requires that we copy the Ethernet header
 * (including the VLAN tag) into the header so we reject anything
 * smaller than that ...
 */
if (unlikely(skb->len < fw_hdr_copy_len))
goto out_free;
/* Discard the packet if the length is greater than mtu. The check is
 * skipped for GSO packets (non-zero gso_size).
 */
max_pkt_len = ETH_HLEN + dev->mtu;
if (skb_vlan_tag_present(skb))
max_pkt_len += VLAN_HLEN;
if (!skb_shinfo(skb)->gso_size && (unlikely(skb->len > max_pkt_len)))
goto out_free;
/* Figure out which TX Queue we're going to use. */
pi = netdev_priv(dev);
adapter = pi->adapter;
qidx = skb_get_queue_mapping(skb);
/* Only warns: an out-of-range qidx is still used to index ethtxq below. */
WARN_ON(qidx >= pi->nqsets);
txq = &adapter->sge.ethtxq[pi->first_qset + qidx];
/* Take this opportunity to reclaim any TX Descriptors whose DMA
 * transfers have completed.
 */
cxgb4_reclaim_completed_tx(adapter, &txq->q, true);
/* Calculate the number of flits and TX Descriptors we're going to
 * need along with how many TX Descriptors will be left over after
 * we inject our Work Request.
 */
flits = t4vf_calc_tx_flits(skb);
ndesc = flits_to_desc(flits);
credits = txq_avail(&txq->q) - ndesc;
if (unlikely(credits < 0)) {
/* Not enough room for this packet's Work Request. Stop the
 * TX Queue and return a "busy" condition. The queue will get
 * started later on when the firmware informs us that space
 * has opened up.
 */
eth_txq_stop(txq);
dev_err(adapter->pdev_dev,
"%s: TX ring %u full while queue awake!\n",
dev->name, qidx);
return NETDEV_TX_BUSY;
}
if (!t4vf_is_eth_imm(skb) &&
unlikely(cxgb4_map_skb(adapter->pdev_dev, skb, addr) < 0)) {
/* We need to map the skb into PCI DMA space (because it can't
 * be in-lined directly into the Work Request) and the mapping
 * operation failed. Record the error and drop the packet.
 */
txq->mapping_err++;
goto out_free;
}
/* The Work Request length field is filled with half the flit count,
 * rounded up. NOTE(review): presumably LEN16 counts 16-byte units,
 * i.e. two 8-byte flits each -- confirm.
 */
wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
if (unlikely(credits < ETHTXQ_STOP_THRES)) {
/* After we're done injecting the Work Request for this
 * packet, we'll be below our "stop threshold" so stop the TX
 * Queue now and schedule a request for an SGE Egress Queue
 * Update message. The queue will get started later on when
 * the firmware processes this Work Request and sends us an
 * Egress Queue Status Update message indicating that space
 * has opened up.
 */
eth_txq_stop(txq);
wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
}
/* Start filling in our Work Request. Note that we do _not_ handle
 * the WR Header wrapping around the TX Descriptor Ring. If our
 * maximum header size ever exceeds one TX Descriptor, we'll need to
 * do something else here.
 */
WARN_ON(DIV_ROUND_UP(T4VF_ETHTXQ_MAX_HDR, TXD_PER_EQ_UNIT) > 1);
wr = (void *)&txq->q.desc[txq->q.pidx];
wr->equiq_to_len16 = cpu_to_be32(wr_mid);
wr->r3[0] = cpu_to_be32(0);
wr->r3[1] = cpu_to_be32(0);
/* Copy the first fw_hdr_copy_len bytes of the packet's linear data
 * into the Work Request, starting at its ethmacdst field.
 */
skb_copy_from_linear_data(skb, (void *)wr->ethmacdst, fw_hdr_copy_len);
/* end points flits 64-bit words past the start of the Work Request. */
end = (u64 *)wr + flits;
/* If this is a Large Send Offload packet we'll put in an LSO CPL
 * message with an encapsulated TX Packet CPL message. Otherwise we
 * just use a TX Packet CPL message.
 */
ssi = skb_shinfo(skb);
if (ssi->gso_size) {
struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0;
int l3hdr_len = skb_network_header_len(skb);
/* Bytes between the basic Ethernet header and the network header. */
int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
wr->op_immdlen =
cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
FW_WR_IMMDLEN_V(sizeof(*lso) +
sizeof(*cpl)));
/* Fill in the LSO CPL message. The Ethernet and IP header lengths
 * are passed divided by four.
 */
lso->lso_ctrl =
cpu_to_be32(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
LSO_FIRST_SLICE_F |
LSO_LAST_SLICE_F |
LSO_IPV6_V(v6) |
LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
LSO_IPHDR_LEN_V(l3hdr_len / 4) |
LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
lso->ipid_ofst = cpu_to_be16(0);
lso->mss = cpu_to_be16(ssi->gso_size);
lso->seqno_offset = cpu_to_be32(0);
/* T4 takes the raw length; later chips wrap it in LSO_T5_XFER_SIZE_V(). */
if (is_t4(adapter->params.chip))
lso->len = cpu_to_be32(skb->len);
else
lso->len = cpu_to_be32(LSO_T5_XFER_SIZE_V(skb->len));
/* Set up TX Packet CPL pointer, control word and perform
 * accounting.
 */
cpl = (void *)(lso + 1);
/* Chips up to T5 and chips after T5 use different macros to encode
 * the Ethernet header length.
 */
if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5)
cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len);
else
cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len);
cntrl |= TXPKT_CSUM_TYPE_V(v6 ?
TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
TXPKT_IPHDR_LEN_V(l3hdr_len);
txq->tso++;
txq->tx_cso += ssi->gso_segs;
} else {
int len;
/* t4vf_is_eth_imm() currently always returns false, so len is
 * sizeof(*cpl) in practice.
 */
len = (t4vf_is_eth_imm(skb)
? skb->len + sizeof(*cpl)
: sizeof(*cpl));
wr->op_immdlen =
cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
FW_WR_IMMDLEN_V(len));
/* Set up TX Packet CPL pointer, control word and perform
 * accounting.
 */
cpl = (void *)(wr + 1);
if (skb->ip_summed == CHECKSUM_PARTIAL) {
cntrl = hwcsum(adapter->params.chip, skb) |
TXPKT_IPCSUM_DIS_F;
txq->tx_cso++;
} else {
cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
}
}
/* If there's a VLAN tag present, add that to the list of things to
 * do in this Work Request.
 */
if (skb_vlan_tag_present(skb)) {
txq->vlan_ins++;
cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
}
/* Fill in the TX Packet CPL message header. */
cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) |
TXPKT_INTF_V(pi->port_id) |
TXPKT_PF_V(0));
cpl->pack = cpu_to_be16(0);
cpl->len = cpu_to_be16(skb->len);
cpl->ctrl1 = cpu_to_be64(cntrl);
/* Fill in the body of the TX Packet CPL message with either in-lined
 * data or a Scatter/Gather List.
 */
if (t4vf_is_eth_imm(skb)) {
/* In-line the packet's data and free the skb since we don't
 * need it any longer.
 */
cxgb4_inline_tx_skb(skb, &txq->q, cpl + 1);
dev_consume_skb_any(skb);
} else {
/* Write the skb's Scatter/Gather list into the TX Packet CPL
 * message and retain a pointer to the skb so we can free it
 * later when its DMA completes. (We store the skb pointer
 * in the Software Descriptor corresponding to the last TX
 * Descriptor used by the Work Request.)
 *
 * The retained skb will be freed when the corresponding TX
 * Descriptors are reclaimed after their DMAs complete.
 * However, this could take quite a while since, in general,
 * the hardware is set up to be lazy about sending DMA
 * completion notifications to us and we mostly perform TX
 * reclaims in the transmit routine.
 *
 * This is good for performance but means that we rely on new
 * TX packets arriving to run the destructors of completed
 * packets, which open up space in their sockets' send queues.
 * Sometimes we do not get such new packets causing TX to
 * stall. A single UDP transmitter is a good example of this
 * situation. We have a clean up timer that periodically
 * reclaims completed packets but it doesn't run often enough
 * (nor do we want it to) to prevent lengthy stalls. A
 * solution to this problem is to run the destructor early,
 * after the packet is queued but before it's DMAd. A con is
 * that we lie to socket memory accounting, but the amount of
 * extra memory is reasonable (limited by the number of TX
 * descriptors), the packets do actually get freed quickly by
 * new packets almost always, and for protocols like TCP that
 * wait for acks to really free up the data the extra memory
 * is even less. On the positive side we run the destructors
 * on the sending CPU rather than on a potentially different
 * completing CPU, usually a good thing.
 *
 * Run the destructor before telling the DMA engine about the
 * packet to make sure it doesn't complete and get freed
 * prematurely.
 */
struct ulptx_sgl *sgl = (struct ulptx_sgl *)(cpl + 1);
struct sge_txq *tq = &txq->q;
int last_desc;
/* If the Work Request header was an exact multiple of our TX
 * Descriptor length, then it's possible that the starting SGL
 * pointer lines up exactly with the end of our TX Descriptor
 * ring. If that's the case, wrap around to the beginning
 * here ...
 */
if (unlikely((void *)sgl == (void *)tq->stat)) {
sgl = (void *)tq->desc;
end = (void *)((void *)tq->desc +
((void *)end - (void *)tq->stat));
}
cxgb4_write_sgl(skb, tq, sgl, end, 0, addr);
skb_orphan(skb);
/* Index of the last TX Descriptor used by this Work Request, wrapped
 * around the end of the ring if necessary.
 */
last_desc = tq->pidx + ndesc - 1;
if (last_desc >= tq->size)
last_desc -= tq->size;
tq->sdesc[last_desc].skb = skb;
tq->sdesc[last_desc].sgl = sgl;
}
/* Advance our internal TX Queue state, tell the hardware about
 * the new TX descriptors and return success.
 */
txq_advance(&txq->q, ndesc);
cxgb4_ring_tx_db(adapter, &txq->q, ndesc);
return NETDEV_TX_OK;
out_free:
/* An error of some sort happened. Free the TX skb and tell the
 * OS that we've "dealt" with the packet ...
 */
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
}
/**
 * t4_start_xmit - transmit entry point selecting the Work Request type
 * @skb: the packet
 * @dev: the egress net device
 *
 * Hands the packet to cxgb4_vf_eth_xmit() when PRIV_FLAG_PORT_TX_VM is set
 * in the port's eth_flags, and to cxgb4_eth_xmit() otherwise.  Returns
 * whatever the chosen transmit routine returns.
 */
netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct port_info *port = netdev_priv(dev);

	/* The VM Work Request path is expected to be the uncommon one. */
	return unlikely(port->eth_flags & PRIV_FLAG_PORT_TX_VM) ?
		cxgb4_vf_eth_xmit(skb, dev) : cxgb4_eth_xmit(skb, dev);
}
/** /**
* reclaim_completed_tx_imm - reclaim completed control-queue Tx descs * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
* @q: the SGE control Tx queue * @q: the SGE control Tx queue
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment