Commit 3178308a authored by Rahul Rameshbabu, committed by Saeed Mahameed

net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs

Use a map structure for associating CQEs containing port timestamping
information with the appropriate skb. Track order of WQEs submitted using a
FIFO. Check if the corresponding port timestamping CQEs from the lookup
values in the FIFO are considered dropped due to time elapsed. Return the
lookup value to a freelist after consuming the skb. Reuse the freed lookup
in future WQE submission iterations.

The map structure uses an integer identifier for the key and returns an skb
corresponding to that identifier. Embed the integer identifier in the WQE
submitted to the WQ for the transmit path when the SQ is a PTP (port
timestamping) SQ. The embedded identifier can then be queried using a field
in the CQE of the corresponding port timestamping CQ. In the port
timestamping napi_poll context, the identifier is queried from the CQE
polled from CQ and used to lookup the corresponding skb from the WQE submit
path. The skb reference is removed from map and then embedded with the port
HW timestamp information from the CQE and eventually consumed.

The metadata freelist FIFO is an array containing integer identifiers that
can be pushed and popped in the FIFO. The purpose of this structure is
bookkeeping what identifier values can safely be used in a subsequent WQE
submission and should not contain identifiers that have still not been
reaped by processing a corresponding CQE completion on the port
timestamping CQ.

The ts_cqe_pending_list structure is a combination of an array and linked
list. The array is pre-populated with the nodes that will be added to the
tail of, and removed from, the linked list. Each node contains the unique
identifier value associated with the values submitted in the WQEs and
retrieved in the port timestamping CQEs. When a WQE is submitted, the node
in the array corresponding to the identifier popped from the metadata
freelist is added to the end of the CQE pending list and is marked as
"in-use". The node is removed from the linked list under two conditions.
The first condition is that the corresponding port timestamping CQE is
polled in the PTP napi_poll context. The second condition is that more than
a second has elapsed since the DMA timestamp value corresponding to the WQE
submission. When the first condition occurs, the "in-use" bit in the linked
list node is cleared, and the resources corresponding to the WQE submission
are then released. The second condition, however, indicates that the port
timestamping CQE will likely never be delivered. It is not impossible, though
highly improbable, for the device to post the CQE after an arbitrarily long
delay. In order to be resilient to this improbable case, resources
related to the corresponding WQE submission are still kept, the identifier
value is not returned to the freelist, and the "in-use" bit is cleared on
the node to indicate that it's no longer part of the linked list of "likely
to be delivered" port timestamping CQE identifiers. A count for the number
of port timestamping CQEs considered highly likely to never be delivered by
the device is maintained. This count gets decremented in the unlikely event
a port timestamping CQE considered unlikely to ever be delivered is polled
in the PTP napi_poll context.
Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
parent b608dd67
......@@ -683,6 +683,12 @@ the software port.
time protocol.
- Error
* - `ptp_cq[i]_late_cqe`
- Number of times a CQE has been delivered on the PTP timestamping CQ after
it was no longer expected, i.e. after the timeout beyond which the device
typically guarantees it will not post the CQE had already elapsed.
- Error
.. [#ring_global] The corresponding ring and global counters do not share the
same name (i.e. do not follow the common naming scheme).
......
......@@ -7,18 +7,36 @@
#include "en.h"
#include "en_stats.h"
#include "en/txrx.h"
#include <linux/ktime.h>
#include <linux/ptp_classify.h>
#include <linux/time64.h>
/* Presumably the channel index reserved for the PTP channel -- TODO confirm. */
#define MLX5E_PTP_CHANNEL_IX 0
/* Presumably an upper bound on log2 of the PTP SQ size (8 => 256) -- TODO confirm. */
#define MLX5E_PTP_MAX_LOG_SQ_SIZE (8U)
/*
 * One second, in nanoseconds. Per the commit description, a port timestamping
 * CQE is treated as unlikely to be delivered once more than this much time has
 * elapsed since the DMA timestamp of the corresponding WQE submission.
 */
#define MLX5E_PTP_TS_CQE_UNDELIVERED_TIMEOUT (1 * NSEC_PER_SEC)
/*
 * Ring buffer of u8 metadata identifiers (the "metadata freelist").
 *
 * Identifiers are stored at data[mask & pc++] on push and read from
 * data[mask & cc++] on pop; the ring is considered empty when pc == cc.
 *
 * NOTE(review): pc and cc are u8 and emptiness is tested as pc == cc, so a
 * ring holding 256 entries would be indistinguishable from an empty one.
 * Confirm that the allocated capacity is always below 256.
 */
struct mlx5e_ptp_metadata_fifo {
/* Consumer counter: advanced on every pop. */
u8 cc;
/* Producer counter: advanced on every push. */
u8 pc;
/* ANDed with the counters to form the slot index into data[]. */
u8 mask;
/* Backing storage for the identifiers; allocated elsewhere. */
u8 *data;
};
/*
 * Lookup table from a metadata identifier to the skb submitted with it.
 *
 * data[] is indexed directly by the u8 identifier; a slot is expected to be
 * NULL when a new skb is stored for that identifier.
 */
struct mlx5e_ptp_metadata_map {
/*
 * Per the commit description: count of port timestamping CQEs that are
 * considered unlikely to ever be delivered by the device.
 */
u16 undelivered_counter;
/* Presumably the number of slots in data[] -- TODO confirm at allocation site. */
u16 capacity;
/* Array of skb pointers, indexed by metadata identifier. */
struct sk_buff **data;
};
/*
 * State of a PTP (port timestamping) send queue.
 *
 * NOTE(review): this listing comes from a diff view without +/- markers, so
 * members of the pre-change layout (e.g. the skb_fifo* members) may be shown
 * next to the members that replace them (metadata_freelist, metadata_map).
 * Confirm the final layout against the tree.
 */
struct mlx5e_ptpsq {
struct mlx5e_txqsq txqsq;
/* Presumably the CQ on which port timestamping CQEs arrive -- TODO confirm. */
struct mlx5e_cq ts_cq;
u16 skb_fifo_cc;
u16 skb_fifo_pc;
struct mlx5e_skb_fifo skb_fifo;
struct mlx5e_ptp_cq_stats *cq_stats;
u16 ts_cqe_ctr_mask;
/* Tracks identifiers whose port timestamping CQE is still expected. */
struct mlx5e_ptp_port_ts_cqe_list *ts_cqe_pending_list;
/* Identifiers currently free to be embedded in a new WQE. */
struct mlx5e_ptp_metadata_fifo metadata_freelist;
/* Identifier -> skb lookup for WQEs that are in flight. */
struct mlx5e_ptp_metadata_map metadata_map;
};
enum {
......@@ -69,12 +87,35 @@ static inline bool mlx5e_use_ptpsq(struct sk_buff *skb)
fk.ports.dst == htons(PTP_EV_PORT));
}
static inline bool mlx5e_ptpsq_fifo_has_room(struct mlx5e_txqsq *sq)
static inline void mlx5e_ptp_metadata_fifo_push(struct mlx5e_ptp_metadata_fifo *fifo, u8 metadata)
{
if (!sq->ptpsq)
return true;
fifo->data[fifo->mask & fifo->pc++] = metadata;
}
/*
 * Take the next metadata identifier out of @fifo.
 *
 * The consumer counter is advanced unconditionally; the caller is responsible
 * for only popping when the fifo is not empty.
 *
 * Return: the identifier stored at the current consumer position.
 */
static inline u8
mlx5e_ptp_metadata_fifo_pop(struct mlx5e_ptp_metadata_fifo *fifo)
{
	u8 slot = fifo->mask & fifo->cc;

	fifo->cc++;

	return fifo->data[slot];
}
return mlx5e_skb_fifo_has_room(&sq->ptpsq->skb_fifo);
/*
 * Record @skb in @map under the identifier @metadata.
 *
 * Warns once if the slot for @metadata is already occupied, then stores
 * @skb in it regardless.
 */
static inline void
mlx5e_ptp_metadata_map_put(struct mlx5e_ptp_metadata_map *map,
			   struct sk_buff *skb, u8 metadata)
{
	struct sk_buff **slot = &map->data[metadata];

	WARN_ON_ONCE(*slot);
	*slot = skb;
}
/*
 * Tell whether the metadata freelist of @ptpsq has no identifiers left.
 *
 * A NULL @ptpsq (the SQ is not a PTP SQ) is the expected common case and is
 * reported as "not empty".
 *
 * Return: true when the freelist producer and consumer counters are equal,
 * false otherwise or when @ptpsq is NULL.
 */
static inline bool mlx5e_ptpsq_metadata_freelist_empty(struct mlx5e_ptpsq *ptpsq)
{
	if (likely(!ptpsq))
		return false;

	return ptpsq->metadata_freelist.pc == ptpsq->metadata_freelist.cc;
}
int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params,
......@@ -89,6 +130,8 @@ void mlx5e_ptp_free_rx_fs(struct mlx5e_flow_steering *fs,
const struct mlx5e_profile *profile);
int mlx5e_ptp_rx_manage_fs(struct mlx5e_priv *priv, bool set);
void mlx5e_ptpsq_track_metadata(struct mlx5e_ptpsq *ptpsq, u8 metadata);
enum {
MLX5E_SKB_CB_CQE_HWTSTAMP = BIT(0),
MLX5E_SKB_CB_PORT_HWTSTAMP = BIT(1),
......
......@@ -2061,7 +2061,8 @@ static int set_pflag_tx_port_ts(struct net_device *netdev, bool enable)
struct mlx5e_params new_params;
int err;
if (!MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn))
if (!MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn) ||
!MLX5_CAP_GEN_2(mdev, ts_cqe_metadata_size2wqe_counter))
return -EOPNOTSUPP;
/* Don't allow changing the PTP state if HTB offload is active, because
......
......@@ -2142,9 +2142,7 @@ static const struct counter_desc ptp_cq_stats_desc[] = {
{ MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, err_cqe) },
{ MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, abort) },
{ MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, abort_abs_diff_ns) },
{ MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, resync_cqe) },
{ MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, resync_event) },
{ MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, ooo_cqe_drop) },
{ MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, late_cqe) },
};
static const struct counter_desc ptp_rq_stats_desc[] = {
......
......@@ -449,9 +449,7 @@ struct mlx5e_ptp_cq_stats {
u64 err_cqe;
u64 abort;
u64 abort_abs_diff_ns;
u64 resync_cqe;
u64 resync_event;
u64 ooo_cqe_drop;
u64 late_cqe;
};
struct mlx5e_rep_stats {
......
......@@ -372,7 +372,7 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
const struct mlx5e_tx_attr *attr,
const struct mlx5e_tx_wqe_attr *wqe_attr, u8 num_dma,
struct mlx5e_tx_wqe_info *wi, struct mlx5_wqe_ctrl_seg *cseg,
bool xmit_more)
struct mlx5_wqe_eth_seg *eseg, bool xmit_more)
{
struct mlx5_wq_cyc *wq = &sq->wq;
bool send_doorbell;
......@@ -394,11 +394,16 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
mlx5e_tx_check_stop(sq);
if (unlikely(sq->ptpsq)) {
if (unlikely(sq->ptpsq &&
(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) {
u8 metadata_index = be32_to_cpu(eseg->flow_table_metadata);
mlx5e_skb_cb_hwtstamp_init(skb);
mlx5e_skb_fifo_push(&sq->ptpsq->skb_fifo, skb);
mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index);
mlx5e_ptp_metadata_map_put(&sq->ptpsq->metadata_map, skb,
metadata_index);
if (!netif_tx_queue_stopped(sq->txq) &&
!mlx5e_skb_fifo_has_room(&sq->ptpsq->skb_fifo)) {
mlx5e_ptpsq_metadata_freelist_empty(sq->ptpsq)) {
netif_tx_stop_queue(sq->txq);
sq->stats->stopped++;
}
......@@ -483,13 +488,16 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb,
if (unlikely(num_dma < 0))
goto err_drop;
mlx5e_txwqe_complete(sq, skb, attr, wqe_attr, num_dma, wi, cseg, xmit_more);
mlx5e_txwqe_complete(sq, skb, attr, wqe_attr, num_dma, wi, cseg, eseg, xmit_more);
return;
err_drop:
	stats->dropped++;
	/*
	 * Give the metadata identifier taken for this WQE back to the freelist
	 * before releasing the skb: the check below reads skb_shinfo(skb), so it
	 * must not run after dev_kfree_skb_any() (use-after-free otherwise).
	 */
	if (unlikely(sq->ptpsq && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
		mlx5e_ptp_metadata_fifo_push(&sq->ptpsq->metadata_freelist,
					     be32_to_cpu(eseg->flow_table_metadata));
	dev_kfree_skb_any(skb);
	mlx5e_tx_flush(sq);
}
......@@ -645,9 +653,9 @@ void mlx5e_tx_mpwqe_ensure_complete(struct mlx5e_txqsq *sq)
/*
 * Embed a port timestamping identifier into the Ethernet segment of a WQE.
 *
 * Only acts on skbs that request a hardware TX timestamp (SKBTX_HW_TSTAMP);
 * the identifier is written big-endian into eseg->flow_table_metadata.
 *
 * NOTE(review): the two conditionals below look like the pre-change and
 * post-change versions of the same statement, shown together because the
 * diff view carries no +/- markers. Confirm which one is in the tree.
 */
static void mlx5e_cqe_ts_id_eseg(struct mlx5e_ptpsq *ptpsq, struct sk_buff *skb,
struct mlx5_wqe_eth_seg *eseg)
{
/* Identifier derived from the skb fifo producer counter, masked to the CQE counter width. */
if (ptpsq->ts_cqe_ctr_mask && unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
eseg->flow_table_metadata = cpu_to_be32(ptpsq->skb_fifo_pc &
ptpsq->ts_cqe_ctr_mask);
/*
 * Identifier popped from the metadata freelist. The pop does not check for
 * an empty freelist; presumably the TX queue is stopped before that can
 * happen -- TODO confirm against the callers.
 */
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
eseg->flow_table_metadata =
cpu_to_be32(mlx5e_ptp_metadata_fifo_pop(&ptpsq->metadata_freelist));
}
static void mlx5e_txwqe_build_eseg(struct mlx5e_priv *priv, struct mlx5e_txqsq *sq,
......@@ -766,7 +774,7 @@ void mlx5e_txqsq_wake(struct mlx5e_txqsq *sq)
{
if (netif_tx_queue_stopped(sq->txq) &&
mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, sq->stop_room) &&
mlx5e_ptpsq_fifo_has_room(sq) &&
!mlx5e_ptpsq_metadata_freelist_empty(sq->ptpsq) &&
!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) {
netif_tx_wake_queue(sq->txq);
sq->stats->wake++;
......@@ -1031,7 +1039,7 @@ void mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
if (unlikely(num_dma < 0))
goto err_drop;
mlx5e_txwqe_complete(sq, skb, &attr, &wqe_attr, num_dma, wi, cseg, xmit_more);
mlx5e_txwqe_complete(sq, skb, &attr, &wqe_attr, num_dma, wi, cseg, eseg, xmit_more);
return;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment