Commit 8f7b0030 authored by Cosmin Ratiu's avatar Cosmin Ratiu Committed by Jakub Kicinski
Browse files

net/mlx5e: Convert mlx5 netdevs to instance locking



This patch convert mlx5 to use the new netdev instance lock in addition
to the pre-existing state_lock (and the RTNL).

mlx5e_priv.state_lock was already used throughout mlx5 to protect
against concurrent state modifications on the same netdev, usually in
addition to the RTNL. The new netdev instance lock will eventually
replace it, but for now, it is acquired in addition to the existing
locks in the order RTNL -> instance lock -> state_lock.

All three netdev types handled by mlx5 are converted to the new style of
locking, because they share a lot of code related to initializing
channels and dealing with NAPI, so it's better to convert all three
rather than introduce different assumptions deep in the call stack
depending on the type of device.

Because of the nature of the call graphs in mlx5, it wasn't possible to
incrementally convert parts of the driver to use the new lock, since
either all call paths into NAPI have to possess the new lock if the
*_locked variants are used, or none of them can have the lock.

One area which required extra care is the interaction between closing
channels and devlink health reporter tasks.
Previously, the recovery tasks were unconditionally acquiring the
RTNL, which could lead to deadlocks in these scenarios:

T1: mlx5e_close (== .ndo_stop(), has RTNL) -> mlx5e_close_locked
-> mlx5e_close_channels -> mlx5e_ptp_close
-> mlx5e_ptp_close_queues -> mlx5e_ptp_close_txqsqs
-> mlx5e_ptp_close_txqsq
-> cancel_work_sync(&ptpsq->report_unhealthy_work) waits for

T2: mlx5e_ptpsq_unhealthy_work -> mlx5e_reporter_tx_ptpsq_unhealthy
-> mlx5e_health_report -> devlink_health_report
-> devlink_health_reporter_recover
-> mlx5e_tx_reporter_ptpsq_unhealthy_recover which does:
rtnl_lock();   => Deadlock.

Another similar instance of this is:
T1: mlx5e_close (== .ndo_stop(), has RTNL) -> mlx5e_close_locked
-> mlx5e_close_channels -> mlx5e_ptp_close
-> mlx5e_ptp_close_queues -> mlx5e_ptp_close_txqsqs
-> mlx5e_ptp_close_txqsq
-> cancel_work_sync(&sq->recover_work) waits for

T2: mlx5e_tx_err_cqe_work -> mlx5e_reporter_tx_err_cqe
-> mlx5e_health_report -> devlink_health_report
-> devlink_health_reporter_recover
-> mlx5e_tx_reporter_err_cqe_recover which does:
rtnl_lock();   => Another deadlock.

Fix that by using the same pattern previously done in
mlx5e_tx_timeout_work, where the RTNL was repeatedly tried to be
acquired until either:
a) it is successfully acquired or
b) there's no need for the work to be done any more (channel is being
closed).

Now, for all three recovery tasks, the instance lock is repeatedly tried
to be acquired until successful or the channel/SQ is closed.
As a side-effect, drop the !test_bit(MLX5E_STATE_OPENED, &priv->state)
check from mlx5e_tx_timeout_work, it's weaker than
!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state) and unnecessary.

Future patches will introduce new call paths (from netdev queue
management ops) which can close channels (and call cancel_work_sync on
the recovery tasks) without the RTNL lock and only with the netdev
instance lock.

Signed-off-by: default avatarCosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: default avatarCarolina Jubran <cjubran@nvidia.com>
Reviewed-by: default avatarDragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: default avatarTariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1747829342-1018757-6-git-send-email-tariqt@nvidia.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent d7d4f9f7
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -114,6 +114,7 @@ int mlx5e_health_recover_channels(struct mlx5e_priv *priv)
	int err = 0;

	rtnl_lock();
	netdev_lock(priv->netdev);
	mutex_lock(&priv->state_lock);

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
@@ -123,6 +124,7 @@ int mlx5e_health_recover_channels(struct mlx5e_priv *priv)

out:
	mutex_unlock(&priv->state_lock);
	netdev_unlock(priv->netdev);
	rtnl_unlock();

	return err;
+20 −5
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include "en/fs_tt_redirect.h"
#include <linux/list.h>
#include <linux/spinlock.h>
#include <net/netdev_lock.h>

struct mlx5e_ptp_fs {
	struct mlx5_flow_handle *l2_rule;
@@ -449,8 +450,22 @@ static void mlx5e_ptpsq_unhealthy_work(struct work_struct *work)
{
	struct mlx5e_ptpsq *ptpsq =
		container_of(work, struct mlx5e_ptpsq, report_unhealthy_work);
	struct mlx5e_txqsq *sq = &ptpsq->txqsq;

	/* Recovering the PTP SQ means re-enabling NAPI, which requires the
	 * netdev instance lock. However, SQ closing has to wait for this work
	 * task to finish while also holding the same lock. So either get the
	 * lock or find that the SQ is no longer enabled and thus this work is
	 * not relevant anymore.
	 */
	while (!netdev_trylock(sq->netdev)) {
		if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
			return;
		msleep(20);
	}

	mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq);
	netdev_unlock(sq->netdev);
}

static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
@@ -892,7 +907,7 @@ int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params,
	if (err)
		goto err_free;

	netif_napi_add(netdev, &c->napi, mlx5e_ptp_napi_poll);
	netif_napi_add_locked(netdev, &c->napi, mlx5e_ptp_napi_poll);

	mlx5e_ptp_build_params(c, cparams, params);

@@ -910,7 +925,7 @@ int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params,
	return 0;

err_napi_del:
	netif_napi_del(&c->napi);
	netif_napi_del_locked(&c->napi);
err_free:
	kvfree(cparams);
	kvfree(c);
@@ -920,7 +935,7 @@ int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params,
void mlx5e_ptp_close(struct mlx5e_ptp *c)
{
	mlx5e_ptp_close_queues(c);
	netif_napi_del(&c->napi);
	netif_napi_del_locked(&c->napi);

	kvfree(c);
}
@@ -929,7 +944,7 @@ void mlx5e_ptp_activate_channel(struct mlx5e_ptp *c)
{
	int tc;

	napi_enable(&c->napi);
	napi_enable_locked(&c->napi);

	if (test_bit(MLX5E_PTP_STATE_TX, c->state)) {
		for (tc = 0; tc < c->num_tc; tc++)
@@ -957,7 +972,7 @@ void mlx5e_ptp_deactivate_channel(struct mlx5e_ptp *c)
			mlx5e_deactivate_txqsq(&c->ptpsq[tc].txqsq);
	}

	napi_disable(&c->napi);
	napi_disable_locked(&c->napi);
}

int mlx5e_ptp_get_rqn(struct mlx5e_ptp *c, u32 *rqn)
+0 −4
Original line number Diff line number Diff line
@@ -107,9 +107,7 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
	mlx5e_reset_txqsq_cc_pc(sq);
	sq->stats->recover++;
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	rtnl_lock();
	mlx5e_activate_txqsq(sq);
	rtnl_unlock();

	if (sq->channel)
		mlx5e_trigger_napi_icosq(sq->channel);
@@ -176,7 +174,6 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)

	priv = ptpsq->txqsq.priv;

	rtnl_lock();
	mutex_lock(&priv->state_lock);
	chs = &priv->channels;
	netdev = priv->netdev;
@@ -196,7 +193,6 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
		netif_carrier_on(netdev);

	mutex_unlock(&priv->state_lock);
	rtnl_unlock();

	return err;
}
+7 −5
Original line number Diff line number Diff line
@@ -149,7 +149,7 @@ static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv)
	t->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey);
	t->stats    = &priv->trap_stats.ch;

	netif_napi_add(netdev, &t->napi, mlx5e_trap_napi_poll);
	netif_napi_add_locked(netdev, &t->napi, mlx5e_trap_napi_poll);

	err = mlx5e_open_trap_rq(priv, t);
	if (unlikely(err))
@@ -164,7 +164,7 @@ static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv)
err_close_trap_rq:
	mlx5e_close_trap_rq(&t->rq);
err_napi_del:
	netif_napi_del(&t->napi);
	netif_napi_del_locked(&t->napi);
	kvfree(t);
	return ERR_PTR(err);
}
@@ -173,13 +173,13 @@ void mlx5e_close_trap(struct mlx5e_trap *trap)
{
	mlx5e_tir_destroy(&trap->tir);
	mlx5e_close_trap_rq(&trap->rq);
	netif_napi_del(&trap->napi);
	netif_napi_del_locked(&trap->napi);
	kvfree(trap);
}

static void mlx5e_activate_trap(struct mlx5e_trap *trap)
{
	napi_enable(&trap->napi);
	napi_enable_locked(&trap->napi);
	mlx5e_activate_rq(&trap->rq);
	mlx5e_trigger_napi_sched(&trap->napi);
}
@@ -189,7 +189,7 @@ void mlx5e_deactivate_trap(struct mlx5e_priv *priv)
	struct mlx5e_trap *trap = priv->en_trap;

	mlx5e_deactivate_rq(&trap->rq);
	napi_disable(&trap->napi);
	napi_disable_locked(&trap->napi);
}

static struct mlx5e_trap *mlx5e_add_trap_queue(struct mlx5e_priv *priv)
@@ -285,6 +285,7 @@ int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_
	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	netdev_lock(priv->netdev);
	switch (trap_ctx->action) {
	case DEVLINK_TRAP_ACTION_TRAP:
		err = mlx5e_handle_action_trap(priv, trap_ctx->id);
@@ -297,6 +298,7 @@ int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_
			    trap_ctx->action);
		err = -EINVAL;
	}
	netdev_unlock(priv->netdev);
	return err;
}

+2 −0
Original line number Diff line number Diff line
@@ -1147,6 +1147,7 @@ static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state)
	bool reset = true;
	int err;

	netdev_lock(priv->netdev);
	mutex_lock(&priv->state_lock);

	new_params = priv->channels.params;
@@ -1162,6 +1163,7 @@ static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state)
				       &trust_state, reset);

	mutex_unlock(&priv->state_lock);
	netdev_unlock(priv->netdev);

	return err;
}
Loading