Commit 2c010730 authored by Erez Shitrit's avatar Erez Shitrit Committed by Doug Ledford

IB/ipoib: Handle QP in SQE state

As the result of a completion error the QP can moved to SQE state by
the hardware. Since it's not the Error state, there are no flushes
and hence the driver doesn't know about that.

The fix creates a task that after completion with error which is not a
flush tracks the QP state and if it is in SQE state moves it back to RTS.
Signed-off-by: default avatarErez Shitrit <erezsh@mellanox.com>
Signed-off-by: default avatarOr Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: default avatarDoug Ledford <dledford@redhat.com>
parent 3fd0605c
...@@ -299,6 +299,11 @@ struct ipoib_neigh_table { ...@@ -299,6 +299,11 @@ struct ipoib_neigh_table {
struct completion deleted; struct completion deleted;
}; };
struct ipoib_qp_state_validate {
struct work_struct work;
struct ipoib_dev_priv *priv;
};
/* /*
* Device private locking: network stack tx_lock protects members used * Device private locking: network stack tx_lock protects members used
* in TX fast path, lock protects everything else. lock nests inside * in TX fast path, lock protects everything else. lock nests inside
......
...@@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, ...@@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
} }
} }
/*
* As the result of a completion error the QP Can be transferred to SQE states.
* The function checks if the (send)QP is in SQE state and
* moves it back to RTS state, that in order to have it functional again.
*/
static void ipoib_qp_state_validate_work(struct work_struct *work)
{
struct ipoib_qp_state_validate *qp_work =
container_of(work, struct ipoib_qp_state_validate, work);
struct ipoib_dev_priv *priv = qp_work->priv;
struct ib_qp_attr qp_attr;
struct ib_qp_init_attr query_init_attr;
int ret;
ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
if (ret) {
ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
__func__, ret);
goto free_res;
}
pr_info("%s: QP: 0x%x is in state: %d\n",
__func__, priv->qp->qp_num, qp_attr.qp_state);
/* currently support only in SQE->RTS transition*/
if (qp_attr.qp_state == IB_QPS_SQE) {
qp_attr.qp_state = IB_QPS_RTS;
ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
if (ret) {
pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
ret, priv->qp->qp_num);
goto free_res;
}
pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
__func__, priv->qp->qp_num);
} else {
pr_warn("QP (%d) will stay in state: %d\n",
priv->qp->qp_num, qp_attr.qp_state);
}
free_res:
kfree(qp_work);
}
static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{ {
struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_dev_priv *priv = netdev_priv(dev);
...@@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
netif_wake_queue(dev); netif_wake_queue(dev);
if (wc->status != IB_WC_SUCCESS && if (wc->status != IB_WC_SUCCESS &&
wc->status != IB_WC_WR_FLUSH_ERR) wc->status != IB_WC_WR_FLUSH_ERR) {
struct ipoib_qp_state_validate *qp_work;
ipoib_warn(priv, "failed send event " ipoib_warn(priv, "failed send event "
"(status=%d, wrid=%d vend_err %x)\n", "(status=%d, wrid=%d vend_err %x)\n",
wc->status, wr_id, wc->vendor_err); wc->status, wr_id, wc->vendor_err);
qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
if (!qp_work) {
ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
__func__, priv->qp->qp_num);
return;
}
INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
qp_work->priv = priv;
queue_work(priv->wq, &qp_work->work);
}
} }
static int poll_tx(struct ipoib_dev_priv *priv) static int poll_tx(struct ipoib_dev_priv *priv)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment