Commit 25d97dd5 authored by Vennila Megavannan, committed by Greg Kroah-Hartman

staging/rdma/hfi1: Prevent host software lock up

If packets stop egressing the hardware link, software can lock up.

Implement a timeout for send context halt recovery.  This patch increases the
timeout for packet egress to 500 us, and the timer resets to zero if the packet
occupancy changes.  On timeout, the link is also bounced.
Reviewed-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Vennila Megavannan <vennila.megavannan@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent dc060245
...@@ -922,10 +922,12 @@ void sc_disable(struct send_context *sc) ...@@ -922,10 +922,12 @@ void sc_disable(struct send_context *sc)
static void sc_wait_for_packet_egress(struct send_context *sc, int pause) static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
{ {
struct hfi1_devdata *dd = sc->dd; struct hfi1_devdata *dd = sc->dd;
u64 reg; u64 reg = 0;
u64 reg_prev;
u32 loop = 0; u32 loop = 0;
while (1) { while (1) {
reg_prev = reg;
reg = read_csr(dd, sc->hw_context * 8 + reg = read_csr(dd, sc->hw_context * 8 +
SEND_EGRESS_CTXT_STATUS); SEND_EGRESS_CTXT_STATUS);
/* done if egress is stopped */ /* done if egress is stopped */
...@@ -934,11 +936,17 @@ static void sc_wait_for_packet_egress(struct send_context *sc, int pause) ...@@ -934,11 +936,17 @@ static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
reg = packet_occupancy(reg); reg = packet_occupancy(reg);
if (reg == 0) if (reg == 0)
break; break;
if (loop > 100) { /* counter is reset if occupancy count changes */
if (reg != reg_prev)
loop = 0;
if (loop > 500) {
/* timed out - bounce the link */
dd_dev_err(dd, dd_dev_err(dd,
"%s: context %u(%u) timeout waiting for packets to egress, remaining count %u\n", "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
__func__, sc->sw_index, __func__, sc->sw_index,
sc->hw_context, (u32)reg); sc->hw_context, (u32)reg);
queue_work(dd->pport->hfi1_wq,
&dd->pport->link_bounce_work);
break; break;
} }
loop++; loop++;
......
...@@ -303,17 +303,26 @@ static void sdma_wait_for_packet_egress(struct sdma_engine *sde, ...@@ -303,17 +303,26 @@ static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
u64 off = 8 * sde->this_idx; u64 off = 8 * sde->this_idx;
struct hfi1_devdata *dd = sde->dd; struct hfi1_devdata *dd = sde->dd;
int lcnt = 0; int lcnt = 0;
u64 reg_prev;
u64 reg = 0;
while (1) { while (1) {
u64 reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS); reg_prev = reg;
reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK; reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT; reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
if (reg == 0) if (reg == 0)
break; break;
if (lcnt++ > 100) { /* counter is reset if occupancy count changes */
dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u\n", if (reg != reg_prev)
lcnt = 0;
if (lcnt++ > 500) {
/* timed out - bounce the link */
dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
__func__, sde->this_idx, (u32)reg); __func__, sde->this_idx, (u32)reg);
queue_work(dd->pport->hfi1_wq,
&dd->pport->link_bounce_work);
break; break;
} }
udelay(1); udelay(1);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment