Commit 9a25a30e authored by David S. Miller

Merge branch 'mlx4_en-fixes'

Tariq Toukan says:

====================
mlx4_en fixes

This patchset by Moshe contains fixes to the mlx4 Eth driver,
addressing issues in restart flow.

Patch 1 protects the restart task from being rescheduled while active.
  Please queue for -stable >= v2.6.
Patch 2 reconstructs SQs stuck in error state, and adds prints for improved
  debuggability.
  Please queue for -stable >= v3.12.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 299bcb55 ba603d9d
...@@ -1378,8 +1378,10 @@ static void mlx4_en_tx_timeout(struct net_device *dev, unsigned int txqueue) ...@@ -1378,8 +1378,10 @@ static void mlx4_en_tx_timeout(struct net_device *dev, unsigned int txqueue)
tx_ring->cons, tx_ring->prod); tx_ring->cons, tx_ring->prod);
priv->port_stats.tx_timeout++; priv->port_stats.tx_timeout++;
en_dbg(DRV, priv, "Scheduling watchdog\n"); if (!test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state)) {
queue_work(mdev->workqueue, &priv->watchdog_task); en_dbg(DRV, priv, "Scheduling port restart\n");
queue_work(mdev->workqueue, &priv->restart_task);
}
} }
...@@ -1733,6 +1735,7 @@ int mlx4_en_start_port(struct net_device *dev) ...@@ -1733,6 +1735,7 @@ int mlx4_en_start_port(struct net_device *dev)
mlx4_en_deactivate_cq(priv, cq); mlx4_en_deactivate_cq(priv, cq);
goto tx_err; goto tx_err;
} }
clear_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &tx_ring->state);
if (t != TX_XDP) { if (t != TX_XDP) {
tx_ring->tx_queue = netdev_get_tx_queue(dev, i); tx_ring->tx_queue = netdev_get_tx_queue(dev, i);
tx_ring->recycle_ring = NULL; tx_ring->recycle_ring = NULL;
...@@ -1829,6 +1832,7 @@ int mlx4_en_start_port(struct net_device *dev) ...@@ -1829,6 +1832,7 @@ int mlx4_en_start_port(struct net_device *dev)
local_bh_enable(); local_bh_enable();
} }
clear_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state);
netif_tx_start_all_queues(dev); netif_tx_start_all_queues(dev);
netif_device_attach(dev); netif_device_attach(dev);
...@@ -1999,7 +2003,7 @@ void mlx4_en_stop_port(struct net_device *dev, int detach) ...@@ -1999,7 +2003,7 @@ void mlx4_en_stop_port(struct net_device *dev, int detach)
static void mlx4_en_restart(struct work_struct *work) static void mlx4_en_restart(struct work_struct *work)
{ {
struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
watchdog_task); restart_task);
struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_dev *mdev = priv->mdev;
struct net_device *dev = priv->dev; struct net_device *dev = priv->dev;
...@@ -2377,7 +2381,7 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) ...@@ -2377,7 +2381,7 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
if (netif_running(dev)) { if (netif_running(dev)) {
mutex_lock(&mdev->state_lock); mutex_lock(&mdev->state_lock);
if (!mdev->device_up) { if (!mdev->device_up) {
/* NIC is probably restarting - let watchdog task reset /* NIC is probably restarting - let restart task reset
* the port */ * the port */
en_dbg(DRV, priv, "Change MTU called with card down!?\n"); en_dbg(DRV, priv, "Change MTU called with card down!?\n");
} else { } else {
...@@ -2386,7 +2390,9 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) ...@@ -2386,7 +2390,9 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
if (err) { if (err) {
en_err(priv, "Failed restarting port:%d\n", en_err(priv, "Failed restarting port:%d\n",
priv->port); priv->port);
queue_work(mdev->workqueue, &priv->watchdog_task); if (!test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING,
&priv->state))
queue_work(mdev->workqueue, &priv->restart_task);
} }
} }
mutex_unlock(&mdev->state_lock); mutex_unlock(&mdev->state_lock);
...@@ -2792,7 +2798,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) ...@@ -2792,7 +2798,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
if (err) { if (err) {
en_err(priv, "Failed starting port %d for XDP change\n", en_err(priv, "Failed starting port %d for XDP change\n",
priv->port); priv->port);
queue_work(mdev->workqueue, &priv->watchdog_task); if (!test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state))
queue_work(mdev->workqueue, &priv->restart_task);
} }
} }
...@@ -3165,7 +3172,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, ...@@ -3165,7 +3172,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev); priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
spin_lock_init(&priv->stats_lock); spin_lock_init(&priv->stats_lock);
INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode); INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode);
INIT_WORK(&priv->watchdog_task, mlx4_en_restart); INIT_WORK(&priv->restart_task, mlx4_en_restart);
INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate); INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate);
INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats); INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats);
INIT_DELAYED_WORK(&priv->service_task, mlx4_en_service_task); INIT_DELAYED_WORK(&priv->service_task, mlx4_en_service_task);
......
...@@ -392,6 +392,35 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) ...@@ -392,6 +392,35 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
return cnt; return cnt;
} }
/* Dump diagnostic information for a TX completion error CQE and, unless a
 * port restart is already pending, schedule the restart task to recover the
 * stuck send queue.
 *
 * @priv:      per-port private data (supplies the workqueue/restart task)
 * @err_cqe:   the error CQE pulled off the TX CQ
 * @cqe_index: consumer index of the error CQE within the CQ (log only)
 * @ring:      TX ring the CQ belongs to
 */
static void mlx4_en_handle_err_cqe(struct mlx4_en_priv *priv, struct mlx4_err_cqe *err_cqe,
u16 cqe_index, struct mlx4_en_tx_ring *ring)
{
struct mlx4_en_dev *mdev = priv->mdev;
struct mlx4_en_tx_info *tx_info;
struct mlx4_en_tx_desc *tx_desc;
u16 wqe_index;
int desc_size;
/* Log the syndromes and a raw hex dump of the error CQE for debuggability. */
en_err(priv, "CQE error - cqn 0x%x, ci 0x%x, vendor syndrome: 0x%x syndrome: 0x%x\n",
ring->sp_cqn, cqe_index, err_cqe->vendor_err_syndrome, err_cqe->syndrome);
print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, sizeof(*err_cqe),
false);
/* Locate the WQE the error CQE refers to and dump its descriptor too.
 * nr_txbb is the descriptor length in TXBB units, hence the shift to bytes.
 */
wqe_index = be16_to_cpu(err_cqe->wqe_index) & ring->size_mask;
tx_info = &ring->tx_info[wqe_index];
desc_size = tx_info->nr_txbb << LOG_TXBB_SIZE;
en_err(priv, "Related WQE - qpn 0x%x, wqe index 0x%x, wqe size 0x%x\n", ring->qpn,
wqe_index, desc_size);
tx_desc = ring->buf + (wqe_index << LOG_TXBB_SIZE);
print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, tx_desc, desc_size, false);
/* Atomically claim the restart; if another path already scheduled it,
 * the diagnostics above are all this call contributes.
 */
if (test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state))
return;
en_err(priv, "Scheduling port restart\n");
queue_work(mdev->workqueue, &priv->restart_task);
}
int mlx4_en_process_tx_cq(struct net_device *dev, int mlx4_en_process_tx_cq(struct net_device *dev,
struct mlx4_en_cq *cq, int napi_budget) struct mlx4_en_cq *cq, int napi_budget)
{ {
...@@ -438,13 +467,10 @@ int mlx4_en_process_tx_cq(struct net_device *dev, ...@@ -438,13 +467,10 @@ int mlx4_en_process_tx_cq(struct net_device *dev,
dma_rmb(); dma_rmb();
if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
MLX4_CQE_OPCODE_ERROR)) { MLX4_CQE_OPCODE_ERROR))
struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe; if (!test_and_set_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &ring->state))
mlx4_en_handle_err_cqe(priv, (struct mlx4_err_cqe *)cqe, index,
en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n", ring);
cqe_err->vendor_err_syndrome,
cqe_err->syndrome);
}
/* Skip over last polled CQE */ /* Skip over last polled CQE */
new_index = be16_to_cpu(cqe->wqe_index) & size_mask; new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
......
...@@ -271,6 +271,10 @@ struct mlx4_en_page_cache { ...@@ -271,6 +271,10 @@ struct mlx4_en_page_cache {
} buf[MLX4_EN_CACHE_SIZE]; } buf[MLX4_EN_CACHE_SIZE];
}; };
/* Flag bits for mlx4_en_tx_ring::state. */
enum {
/* Set when an error CQE was observed on this ring and recovery has been
 * triggered; cleared when the ring is (re)activated in mlx4_en_start_port().
 * Prevents handling the same stuck SQ more than once per restart cycle.
 */
MLX4_EN_TX_RING_STATE_RECOVERING,
};
struct mlx4_en_priv; struct mlx4_en_priv;
struct mlx4_en_tx_ring { struct mlx4_en_tx_ring {
...@@ -317,6 +321,7 @@ struct mlx4_en_tx_ring { ...@@ -317,6 +321,7 @@ struct mlx4_en_tx_ring {
* Only queue_stopped might be used if BQL is not properly working. * Only queue_stopped might be used if BQL is not properly working.
*/ */
unsigned long queue_stopped; unsigned long queue_stopped;
unsigned long state;
struct mlx4_hwq_resources sp_wqres; struct mlx4_hwq_resources sp_wqres;
struct mlx4_qp sp_qp; struct mlx4_qp sp_qp;
struct mlx4_qp_context sp_context; struct mlx4_qp_context sp_context;
...@@ -530,6 +535,10 @@ struct mlx4_en_stats_bitmap { ...@@ -530,6 +535,10 @@ struct mlx4_en_stats_bitmap {
struct mutex mutex; /* for mutual access to stats bitmap */ struct mutex mutex; /* for mutual access to stats bitmap */
}; };
/* Flag bits for mlx4_en_priv::state. */
enum {
/* Set (via test_and_set_bit) before queueing restart_task so the restart
 * work is never scheduled while one is already pending/active; cleared in
 * mlx4_en_start_port() once the port is back up.
 */
MLX4_EN_STATE_FLAG_RESTARTING,
};
struct mlx4_en_priv { struct mlx4_en_priv {
struct mlx4_en_dev *mdev; struct mlx4_en_dev *mdev;
struct mlx4_en_port_profile *prof; struct mlx4_en_port_profile *prof;
...@@ -595,7 +604,7 @@ struct mlx4_en_priv { ...@@ -595,7 +604,7 @@ struct mlx4_en_priv {
struct mlx4_en_cq *rx_cq[MAX_RX_RINGS]; struct mlx4_en_cq *rx_cq[MAX_RX_RINGS];
struct mlx4_qp drop_qp; struct mlx4_qp drop_qp;
struct work_struct rx_mode_task; struct work_struct rx_mode_task;
struct work_struct watchdog_task; struct work_struct restart_task;
struct work_struct linkstate_task; struct work_struct linkstate_task;
struct delayed_work stats_task; struct delayed_work stats_task;
struct delayed_work service_task; struct delayed_work service_task;
...@@ -641,6 +650,7 @@ struct mlx4_en_priv { ...@@ -641,6 +650,7 @@ struct mlx4_en_priv {
u32 pflags; u32 pflags;
u8 rss_key[MLX4_EN_RSS_KEY_SIZE]; u8 rss_key[MLX4_EN_RSS_KEY_SIZE];
u8 rss_hash_fn; u8 rss_hash_fn;
unsigned long state;
}; };
enum mlx4_en_wol { enum mlx4_en_wol {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment