Commit 82fffff4 authored by Liang Zhen, committed by Greg Kroah-Hartman

staging: lustre: check wr_id returned by ib_poll_cq

If ib_poll_cq returned a positive value without initialising ib_wc::wr_id
(a bug in the driver), then o2iblnd would run into an unpredictable
situation because ib_wc::wr_id may refer to a stale tx/rx pointer on the
stack.

It indicates a bug in the HCA driver if this happens; ko2iblnd should
output a console error and then close the current connection.

This patch could also be helpful for LU-5271
Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-519
Reviewed-on: http://review.whamcloud.com/12747
Reviewed-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent a70d69ae
...@@ -762,9 +762,10 @@ kiblnd_queue2str(kib_conn_t *conn, struct list_head *q) ...@@ -762,9 +762,10 @@ kiblnd_queue2str(kib_conn_t *conn, struct list_head *q)
/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the */ /* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the */
/* lowest bits of the work request id to stash the work item type. */ /* lowest bits of the work request id to stash the work item type. */
#define IBLND_WID_TX 0 #define IBLND_WID_INVAL 0
#define IBLND_WID_RDMA 1 #define IBLND_WID_TX 1
#define IBLND_WID_RX 2 #define IBLND_WID_RX 2
#define IBLND_WID_RDMA 3
#define IBLND_WID_MASK 3UL #define IBLND_WID_MASK 3UL
static inline __u64 static inline __u64
......
...@@ -768,7 +768,6 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit) ...@@ -768,7 +768,6 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
int ver = conn->ibc_version; int ver = conn->ibc_version;
int rc; int rc;
int done; int done;
struct ib_send_wr *bad_wrq;
LASSERT(tx->tx_queued); LASSERT(tx->tx_queued);
/* We rely on this for QP sizing */ /* We rely on this for QP sizing */
...@@ -852,7 +851,14 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit) ...@@ -852,7 +851,14 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
/* close_conn will launch failover */ /* close_conn will launch failover */
rc = -ENETDOWN; rc = -ENETDOWN;
} else { } else {
rc = ib_post_send(conn->ibc_cmid->qp, &tx->tx_wrq->wr, &bad_wrq); struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
LASSERTF(wrq->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
"bad wr_id %llx, opc %d, flags %d, peer: %s\n",
wrq->wr_id, wrq->opcode, wrq->send_flags,
libcfs_nid2str(conn->ibc_peer->ibp_nid));
wrq = NULL;
rc = ib_post_send(conn->ibc_cmid->qp, &tx->tx_wrq->wr, &wrq);
} }
conn->ibc_last_send = jiffies; conn->ibc_last_send = jiffies;
...@@ -3420,6 +3426,8 @@ kiblnd_scheduler(void *arg) ...@@ -3420,6 +3426,8 @@ kiblnd_scheduler(void *arg)
spin_unlock_irqrestore(&sched->ibs_lock, flags); spin_unlock_irqrestore(&sched->ibs_lock, flags);
wc.wr_id = IBLND_WID_INVAL;
rc = ib_poll_cq(conn->ibc_cq, 1, &wc); rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
if (!rc) { if (!rc) {
rc = ib_req_notify_cq(conn->ibc_cq, rc = ib_req_notify_cq(conn->ibc_cq,
...@@ -3437,6 +3445,15 @@ kiblnd_scheduler(void *arg) ...@@ -3437,6 +3445,15 @@ kiblnd_scheduler(void *arg)
rc = ib_poll_cq(conn->ibc_cq, 1, &wc); rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
} }
if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) {
LCONSOLE_ERROR("ib_poll_cq (rc: %d) returned invalid wr_id, opcode %d, status: %d, vendor_err: %d, conn: %s status: %d\nplease upgrade firmware and OFED or contact vendor.\n",
rc, wc.opcode, wc.status,
wc.vendor_err,
libcfs_nid2str(conn->ibc_peer->ibp_nid),
conn->ibc_state);
rc = -EINVAL;
}
if (rc < 0) { if (rc < 0) {
CWARN("%s: ib_poll_cq failed: %d, closing connection\n", CWARN("%s: ib_poll_cq failed: %d, closing connection\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid), libcfs_nid2str(conn->ibc_peer->ibp_nid),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment