Commit 2ec8e662 authored by Joachim Fenkes's avatar Joachim Fenkes Committed by Roland Dreier

IB/ehca: Prevent RDMA-related connection failures on some eHCA2 hardware

Some HW revisions of eHCA2 may cause an RC connection to break if they
received RDMA Reads over that connection before.  This can be
prevented by assuring that, after the first RDMA Read, the QP receives
a new RDMA Read every few million link packets.

Include code into the driver that inserts an empty (size 0) RDMA Read
into the message stream every now and then if the consumer doesn't
post them frequently enough.
Signed-off-by: default avatarJoachim Fenkes <fenkes@de.ibm.com>
Signed-off-by: default avatarRoland Dreier <rolandd@cisco.com>
parent bbdd267e
...@@ -183,6 +183,11 @@ struct ehca_qp { ...@@ -183,6 +183,11 @@ struct ehca_qp {
u32 mm_count_squeue; u32 mm_count_squeue;
u32 mm_count_rqueue; u32 mm_count_rqueue;
u32 mm_count_galpa; u32 mm_count_galpa;
/* unsolicited ack circumvention */
int unsol_ack_circ;
int mtu_shift;
u32 message_count;
u32 packet_count;
}; };
#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ) #define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
......
...@@ -592,9 +592,7 @@ static struct ehca_qp *internal_create_qp( ...@@ -592,9 +592,7 @@ static struct ehca_qp *internal_create_qp(
goto create_qp_exit1; goto create_qp_exit1;
} }
if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) /* Always signal by WQE so we can hide circ. WQEs */
parms.sigtype = HCALL_SIGT_EVERY;
else
parms.sigtype = HCALL_SIGT_BY_WQE; parms.sigtype = HCALL_SIGT_BY_WQE;
/* UD_AV CIRCUMVENTION */ /* UD_AV CIRCUMVENTION */
...@@ -618,6 +616,10 @@ static struct ehca_qp *internal_create_qp( ...@@ -618,6 +616,10 @@ static struct ehca_qp *internal_create_qp(
parms.squeue.max_sge = max_send_sge; parms.squeue.max_sge = max_send_sge;
parms.rqueue.max_sge = max_recv_sge; parms.rqueue.max_sge = max_recv_sge;
/* RC QPs need one more SWQE for unsolicited ack circumvention */
if (qp_type == IB_QPT_RC)
parms.squeue.max_wr++;
if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) { if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
if (HAS_SQ(my_qp)) if (HAS_SQ(my_qp))
ehca_determine_small_queue( ehca_determine_small_queue(
...@@ -650,6 +652,8 @@ static struct ehca_qp *internal_create_qp( ...@@ -650,6 +652,8 @@ static struct ehca_qp *internal_create_qp(
parms.squeue.act_nr_sges = 1; parms.squeue.act_nr_sges = 1;
parms.rqueue.act_nr_sges = 1; parms.rqueue.act_nr_sges = 1;
} }
/* hide the extra WQE */
parms.squeue.act_nr_wqes--;
break; break;
case IB_QPT_UD: case IB_QPT_UD:
case IB_QPT_GSI: case IB_QPT_GSI:
...@@ -1294,6 +1298,8 @@ static int internal_modify_qp(struct ib_qp *ibqp, ...@@ -1294,6 +1298,8 @@ static int internal_modify_qp(struct ib_qp *ibqp,
} }
if (attr_mask & IB_QP_PATH_MTU) { if (attr_mask & IB_QP_PATH_MTU) {
/* store ld(MTU) */
my_qp->mtu_shift = attr->path_mtu + 7;
mqpcb->path_mtu = attr->path_mtu; mqpcb->path_mtu = attr->path_mtu;
update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1); update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
} }
......
...@@ -50,6 +50,9 @@ ...@@ -50,6 +50,9 @@
#include "hcp_if.h" #include "hcp_if.h"
#include "hipz_fns.h" #include "hipz_fns.h"
/* in RC traffic, insert an empty RDMA READ every this many packets */
#define ACK_CIRC_THRESHOLD 2000000
static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue, static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
struct ehca_wqe *wqe_p, struct ehca_wqe *wqe_p,
struct ib_recv_wr *recv_wr) struct ib_recv_wr *recv_wr)
...@@ -81,7 +84,7 @@ static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue, ...@@ -81,7 +84,7 @@ static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
if (ehca_debug_level) { if (ehca_debug_level) {
ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p", ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
ipz_rqueue); ipz_rqueue);
ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe"); ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
} }
return 0; return 0;
...@@ -135,7 +138,8 @@ static void trace_send_wr_ud(const struct ib_send_wr *send_wr) ...@@ -135,7 +138,8 @@ static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
static inline int ehca_write_swqe(struct ehca_qp *qp, static inline int ehca_write_swqe(struct ehca_qp *qp,
struct ehca_wqe *wqe_p, struct ehca_wqe *wqe_p,
const struct ib_send_wr *send_wr) const struct ib_send_wr *send_wr,
int hidden)
{ {
u32 idx; u32 idx;
u64 dma_length; u64 dma_length;
...@@ -176,7 +180,9 @@ static inline int ehca_write_swqe(struct ehca_qp *qp, ...@@ -176,7 +180,9 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
wqe_p->wr_flag = 0; wqe_p->wr_flag = 0;
if (send_wr->send_flags & IB_SEND_SIGNALED) if ((send_wr->send_flags & IB_SEND_SIGNALED ||
qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
&& !hidden)
wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM; wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
if (send_wr->opcode == IB_WR_SEND_WITH_IMM || if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
...@@ -199,7 +205,7 @@ static inline int ehca_write_swqe(struct ehca_qp *qp, ...@@ -199,7 +205,7 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8; wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
wqe_p->local_ee_context_qkey = remote_qkey; wqe_p->local_ee_context_qkey = remote_qkey;
if (!send_wr->wr.ud.ah) { if (unlikely(!send_wr->wr.ud.ah)) {
ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp); ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
return -EINVAL; return -EINVAL;
} }
...@@ -255,6 +261,15 @@ static inline int ehca_write_swqe(struct ehca_qp *qp, ...@@ -255,6 +261,15 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
} /* eof idx */ } /* eof idx */
wqe_p->u.nud.atomic_1st_op_dma_len = dma_length; wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
/* unsolicited ack circumvention */
if (send_wr->opcode == IB_WR_RDMA_READ) {
/* on RDMA read, switch on and reset counters */
qp->message_count = qp->packet_count = 0;
qp->unsol_ack_circ = 1;
} else
/* else estimate #packets */
qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
break; break;
default: default:
...@@ -355,51 +370,83 @@ static inline void map_ib_wc_status(u32 cqe_status, ...@@ -355,51 +370,83 @@ static inline void map_ib_wc_status(u32 cqe_status,
*wc_status = IB_WC_SUCCESS; *wc_status = IB_WC_SUCCESS;
} }
int ehca_post_send(struct ib_qp *qp, static inline int post_one_send(struct ehca_qp *my_qp,
struct ib_send_wr *send_wr, struct ib_send_wr *cur_send_wr,
struct ib_send_wr **bad_send_wr) struct ib_send_wr **bad_send_wr,
int hidden)
{ {
struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
struct ib_send_wr *cur_send_wr;
struct ehca_wqe *wqe_p; struct ehca_wqe *wqe_p;
int wqe_cnt = 0; int ret;
int ret = 0;
unsigned long flags;
/* LOCK the QUEUE */
spin_lock_irqsave(&my_qp->spinlock_s, flags);
/* loop processes list of send reqs */
for (cur_send_wr = send_wr; cur_send_wr != NULL;
cur_send_wr = cur_send_wr->next) {
u64 start_offset = my_qp->ipz_squeue.current_q_offset; u64 start_offset = my_qp->ipz_squeue.current_q_offset;
/* get pointer next to free WQE */ /* get pointer next to free WQE */
wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue); wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
if (unlikely(!wqe_p)) { if (unlikely(!wqe_p)) {
/* too many posted work requests: queue overflow */ /* too many posted work requests: queue overflow */
if (bad_send_wr) if (bad_send_wr)
*bad_send_wr = cur_send_wr; *bad_send_wr = cur_send_wr;
if (wqe_cnt == 0) { ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
ret = -ENOMEM; "qp_num=%x", my_qp->ib_qp.qp_num);
ehca_err(qp->device, "Too many posted WQEs " return -ENOMEM;
"qp_num=%x", qp->qp_num);
}
goto post_send_exit0;
} }
/* write a SEND WQE into the QUEUE */ /* write a SEND WQE into the QUEUE */
ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr); ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, hidden);
/* /*
* if something failed, * if something failed,
* reset the free entry pointer to the start value * reset the free entry pointer to the start value
*/ */
if (unlikely(ret)) { if (unlikely(ret)) {
my_qp->ipz_squeue.current_q_offset = start_offset; my_qp->ipz_squeue.current_q_offset = start_offset;
if (bad_send_wr)
*bad_send_wr = cur_send_wr; *bad_send_wr = cur_send_wr;
if (wqe_cnt == 0) { ehca_err(my_qp->ib_qp.device, "Could not write WQE "
ret = -EINVAL; "qp_num=%x", my_qp->ib_qp.qp_num);
ehca_err(qp->device, "Could not write WQE " return -EINVAL;
"qp_num=%x", qp->qp_num); }
return 0;
}
int ehca_post_send(struct ib_qp *qp,
struct ib_send_wr *send_wr,
struct ib_send_wr **bad_send_wr)
{
struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
struct ib_send_wr *cur_send_wr;
int wqe_cnt = 0;
int ret = 0;
unsigned long flags;
/* LOCK the QUEUE */
spin_lock_irqsave(&my_qp->spinlock_s, flags);
/* Send an empty extra RDMA read if:
* 1) there has been an RDMA read on this connection before
* 2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
* 3) we can be sure that any previous extra RDMA read has been
* processed so we don't overflow the SQ
*/
if (unlikely(my_qp->unsol_ack_circ &&
my_qp->packet_count > ACK_CIRC_THRESHOLD &&
my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
/* insert an empty RDMA READ to fix up the remote QP state */
struct ib_send_wr circ_wr;
memset(&circ_wr, 0, sizeof(circ_wr));
circ_wr.opcode = IB_WR_RDMA_READ;
post_one_send(my_qp, &circ_wr, NULL, 1); /* ignore retcode */
wqe_cnt++;
ehca_dbg(qp->device, "posted circ wr qp_num=%x", qp->qp_num);
my_qp->message_count = my_qp->packet_count = 0;
} }
/* loop processes list of send reqs */
for (cur_send_wr = send_wr; cur_send_wr != NULL;
cur_send_wr = cur_send_wr->next) {
ret = post_one_send(my_qp, cur_send_wr, bad_send_wr, 0);
if (unlikely(ret)) {
/* if one or more WQEs were successful, don't fail */
if (wqe_cnt)
ret = 0;
goto post_send_exit0; goto post_send_exit0;
} }
wqe_cnt++; wqe_cnt++;
...@@ -410,6 +457,7 @@ int ehca_post_send(struct ib_qp *qp, ...@@ -410,6 +457,7 @@ int ehca_post_send(struct ib_qp *qp,
post_send_exit0: post_send_exit0:
iosync(); /* serialize GAL register access */ iosync(); /* serialize GAL register access */
hipz_update_sqa(my_qp, wqe_cnt); hipz_update_sqa(my_qp, wqe_cnt);
my_qp->message_count += wqe_cnt;
spin_unlock_irqrestore(&my_qp->spinlock_s, flags); spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment