Commit e80d6556, authored by Shahar Shitrit, committed by Jakub Kicinski
Browse files

net/mlx5e: Fix potential deadlock by deferring RX timeout recovery



mlx5e_reporter_rx_timeout() is currently invoked synchronously
in the driver's open error flow. This causes the thread holding
priv->state_lock to attempt acquiring the devlink lock, which
can result in a circular dependency with other devlink operations.

For example:

- Devlink health diagnose flow:
  - __devlink_nl_pre_doit() acquires the devlink lock.
  - devlink_nl_health_reporter_diagnose_doit() invokes the
    driver's diagnose callback.
  - mlx5e_rx_reporter_diagnose() then attempts to acquire
    priv->state_lock.

- Driver open flow:
  - mlx5e_open() acquires priv->state_lock.
  - If an error occurs, devlink_health_reporter may be called,
    attempting to acquire the devlink lock.

To prevent this circular locking scenario, defer the RX timeout
recovery by scheduling it via a workqueue. This ensures that the
recovery work acquires locks in a consistent order: first the
devlink lock, then priv->state_lock.

Additionally, make the recovery work acquire the netdev instance
lock to safely synchronize with the open/close channel flows,
similar to mlx5e_tx_timeout_work. Repeatedly attempt to acquire
the netdev instance lock until it is taken or the target RQ is no
longer active, as indicated by the MLX5E_STATE_CHANNELS_ACTIVE bit.

Fixes: 32c57fb2 ("net/mlx5e: Report and recover from rx timeout")
Signed-off-by: Shahar Shitrit <shshitrit@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1753256672-337784-4-git-send-email-tariqt@nvidia.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 6d19c44b
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -728,6 +728,7 @@ struct mlx5e_rq {
	struct xsk_buff_pool  *xsk_pool;

	struct work_struct     recover_work;
	struct work_struct     rx_timeout_work;

	/* control */
	struct mlx5_wq_ctrl    wq_ctrl;
+7 −0
Original line number Diff line number Diff line
@@ -170,16 +170,23 @@ static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx)
/* Recovery callback for an RX timeout on a single RQ.
 *
 * @ctx: the affected RQ, passed as an opaque pointer (struct mlx5e_rq *).
 *
 * Holds priv->state_lock for the whole recovery. Asks
 * mlx5e_health_channel_eq_recover() to recover the EQ that backs the RQ's
 * completion queue; if that fails and the RQ has an ICOSQ, the ICOSQ's
 * MLX5E_SQ_STATE_ENABLED bit is cleared.
 *
 * Returns the result of mlx5e_health_channel_eq_recover().
 */
static int mlx5e_rx_reporter_timeout_recover(void *ctx)
{
	struct mlx5e_rq *rq = ctx;
	struct mlx5e_priv *priv = rq->priv;
	struct mlx5_eq_comp *eq;
	int err;

	mutex_lock(&priv->state_lock);

	/* Read the EQ only once state_lock is held, as the original does. */
	eq = rq->cq.mcq.eq;
	err = mlx5e_health_channel_eq_recover(rq->netdev, eq, rq->cq.ch_stats);
	if (err) {
		if (rq->icosq)
			clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state);
	}

	mutex_unlock(&priv->state_lock);

	return err;
}

+25 −1
Original line number Diff line number Diff line
@@ -707,6 +707,27 @@ static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work)
	mlx5e_reporter_rq_cqe_err(rq);
}

/* Work handler for the rx_timeout_work item embedded in struct mlx5e_rq.
 *
 * @timeout_work: the work item; the owning RQ is recovered via container_of().
 *
 * Reports the RX timeout through mlx5e_reporter_rx_timeout() while holding
 * the netdev instance lock, or gives up silently if the channels stop being
 * active before the lock can be taken.
 */
static void mlx5e_rq_timeout_work(struct work_struct *timeout_work)
{
	struct mlx5e_rq *rq;

	rq = container_of(timeout_work, struct mlx5e_rq, rx_timeout_work);

	/* The netdev instance lock synchronizes this work with the channel
	 * close and reopen flows. Keep retrying with trylock rather than
	 * blocking: if MLX5E_STATE_CHANNELS_ACTIVE is found cleared while the
	 * lock is still contended, the channels are going down for another
	 * reason and there is nothing left to recover.
	 */
	for (;;) {
		if (netdev_trylock(rq->netdev))
			break;

		if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
			return;

		/* Back off before the next attempt. */
		msleep(20);
	}

	mlx5e_reporter_rx_timeout(rq);
	netdev_unlock(rq->netdev);
}

static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq)
{
	rq->wqe_overflow.page = alloc_page(GFP_KERNEL);
@@ -830,6 +851,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,

	rqp->wq.db_numa_node = node;
	INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work);
	INIT_WORK(&rq->rx_timeout_work, mlx5e_rq_timeout_work);

	if (params->xdp_prog)
		bpf_prog_inc(params->xdp_prog);
@@ -1204,7 +1226,8 @@ int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
	netdev_warn(rq->netdev, "Failed to get min RX wqes on Channel[%d] RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n",
		    rq->ix, rq->rqn, mlx5e_rqwq_get_cur_sz(rq), min_wqes);

	mlx5e_reporter_rx_timeout(rq);
	queue_work(rq->priv->wq, &rq->rx_timeout_work);

	return -ETIMEDOUT;
}

@@ -1375,6 +1398,7 @@ void mlx5e_close_rq(struct mlx5e_rq *rq)
	if (rq->dim)
		cancel_work_sync(&rq->dim->work);
	cancel_work_sync(&rq->recover_work);
	cancel_work_sync(&rq->rx_timeout_work);
	mlx5e_destroy_rq(rq);
	mlx5e_free_rx_descs(rq);
	mlx5e_free_rq(rq);