Commit aed763ab authored by Cosmin Ratiu's avatar Cosmin Ratiu Committed by Jakub Kicinski
Browse files

net/mlx5: Fix deadlock between devlink lock and esw->wq



esw->work_queue executes esw_functions_changed_event_handler ->
esw_vfs_changed_event_handler and acquires the devlink lock.

.eswitch_mode_set (acquires devlink lock in devlink_nl_pre_doit) ->
mlx5_devlink_eswitch_mode_set -> mlx5_eswitch_disable_locked ->
mlx5_eswitch_event_handler_unregister -> flush_workqueue deadlocks
when esw_vfs_changed_event_handler executes.

Fix that by no longer flushing the work to avoid the deadlock, and using
a generation counter to keep track of work relevance. This avoids an old
handler manipulating an esw that has undergone one or more mode changes:
- the counter is incremented in mlx5_eswitch_event_handler_unregister.
- the counter is read and passed to the ephemeral mlx5_host_work struct.
- the work handler takes the devlink lock and bails out if the current
  generation is different than the one it was scheduled to operate on.
- mlx5_eswitch_cleanup does the final draining before destroying the wq.

No longer flushing the workqueue has the side effect of maybe no longer
cancelling pending vport_change_handler work items, but that's ok since
those are disabled elsewhere:
- mlx5_eswitch_disable_locked disables the vport eq notifier.
- mlx5_esw_vport_disable disarms the HW EQ notification and marks
  vport->enabled under state_lock to false to prevent pending vport
  handler from doing anything.
- mlx5_eswitch_cleanup destroys the workqueue and makes sure all events
  are disabled/finished.

Fixes: f1bc646c ("net/mlx5: Use devl_ API in mlx5_esw_offloads_devlink_port_register")
Signed-off-by: default avatarCosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: default avatarMoshe Shemesh <moshe@nvidia.com>
Reviewed-by: default avatarDragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: default avatarSimon Horman <horms@kernel.org>
Signed-off-by: default avatarTariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20260305081019.1811100-1-tariqt@nvidia.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent 55f854dd
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -1072,10 +1072,11 @@ static void mlx5_eswitch_event_handler_register(struct mlx5_eswitch *esw)

static void mlx5_eswitch_event_handler_unregister(struct mlx5_eswitch *esw)
{
	if (esw->mode == MLX5_ESWITCH_OFFLOADS && mlx5_eswitch_is_funcs_handler(esw->dev))
	if (esw->mode == MLX5_ESWITCH_OFFLOADS &&
	    mlx5_eswitch_is_funcs_handler(esw->dev)) {
		mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);

	flush_workqueue(esw->work_queue);
		atomic_inc(&esw->esw_funcs.generation);
	}
}

static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw)
+2 −0
Original line number Diff line number Diff line
@@ -335,10 +335,12 @@ struct esw_mc_addr { /* SRIOV only */
struct mlx5_host_work {
	struct work_struct	work;
	struct mlx5_eswitch	*esw;
	int			work_gen;
};

struct mlx5_esw_functions {
	struct mlx5_nb		nb;
	atomic_t		generation;
	bool			host_funcs_disabled;
	u16			num_vfs;
	u16			num_ec_vfs;
+13 −5
Original line number Diff line number Diff line
@@ -3582,22 +3582,28 @@ static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw)
}

static void
esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, const u32 *out)
esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
			      const u32 *out)
{
	struct devlink *devlink;
	bool host_pf_disabled;
	u16 new_num_vfs;

	devlink = priv_to_devlink(esw->dev);
	devl_lock(devlink);

	/* Stale work from one or more mode changes ago. Bail out. */
	if (work_gen != atomic_read(&esw->esw_funcs.generation))
		goto unlock;

	new_num_vfs = MLX5_GET(query_esw_functions_out, out,
			       host_params_context.host_num_of_vfs);
	host_pf_disabled = MLX5_GET(query_esw_functions_out, out,
				    host_params_context.host_pf_disabled);

	if (new_num_vfs == esw->esw_funcs.num_vfs || host_pf_disabled)
		return;
		goto unlock;

	devlink = priv_to_devlink(esw->dev);
	devl_lock(devlink);
	/* Number of VFs can only change from "0 to x" or "x to 0". */
	if (esw->esw_funcs.num_vfs > 0) {
		mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
@@ -3612,6 +3618,7 @@ esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, const u32 *out)
		}
	}
	esw->esw_funcs.num_vfs = new_num_vfs;
unlock:
	devl_unlock(devlink);
}

@@ -3628,7 +3635,7 @@ static void esw_functions_changed_event_handler(struct work_struct *work)
	if (IS_ERR(out))
		goto out;

	esw_vfs_changed_event_handler(esw, out);
	esw_vfs_changed_event_handler(esw, host_work->work_gen, out);
	kvfree(out);
out:
	kfree(host_work);
@@ -3648,6 +3655,7 @@ int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type
	esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);

	host_work->esw = esw;
	host_work->work_gen = atomic_read(&esw_funcs->generation);

	INIT_WORK(&host_work->work, esw_functions_changed_event_handler);
	queue_work(esw->work_queue, &host_work->work);