Commit 7997bca6 authored by Jakub Kicinski
Browse files

Merge branch 'mlx5-misc-fixes-2026-02-18'

Tariq Toukan says:

====================
mlx5 misc fixes 2026-02-18

This patchset provides misc bug fixes from the team to the mlx5
core and Eth drivers.
====================

Link: https://patch.msgid.link/20260218072904.1764634-1-tariqt@nvidia.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents e6834a4c 57a94d4b
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -180,7 +180,8 @@ static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
}

/* Use this function to get max num channels (rxqs/txqs) only to create netdev */
static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
static inline unsigned int
mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
{
	return is_kdump_kernel() ?
		MLX5E_MIN_NUM_CHANNELS :
+0 −14
Original line number Diff line number Diff line
@@ -457,22 +457,8 @@ static void mlx5e_ptpsq_unhealthy_work(struct work_struct *work)
{
	struct mlx5e_ptpsq *ptpsq =
		container_of(work, struct mlx5e_ptpsq, report_unhealthy_work);
	struct mlx5e_txqsq *sq = &ptpsq->txqsq;

	/* Recovering the PTP SQ means re-enabling NAPI, which requires the
	 * netdev instance lock. However, SQ closing has to wait for this work
	 * task to finish while also holding the same lock. So either get the
	 * lock or find that the SQ is no longer enabled and thus this work is
	 * not relevant anymore.
	 */
	while (!netdev_trylock(sq->netdev)) {
		if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
			return;
		msleep(20);
	}

	mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq);
	netdev_unlock(sq->netdev);
}

static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
+13 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Mellanox Technologies.

#include <net/netdev_lock.h>

#include "health.h"
#include "params.h"
#include "txrx.h"
@@ -177,6 +179,16 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx)
	rq = ctx;
	priv = rq->priv;

	/* Acquire netdev instance lock to synchronize with channel close and
	 * reopen flows. Either successfully obtain the lock, or detect that
	 * channels are closing for another reason, making this work no longer
	 * necessary.
	 */
	while (!netdev_trylock(rq->netdev)) {
		if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
			return 0;
		msleep(20);
	}
	mutex_lock(&priv->state_lock);

	eq = rq->cq.mcq.eq;
@@ -186,6 +198,7 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx)
		clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state);

	mutex_unlock(&priv->state_lock);
	netdev_unlock(rq->netdev);

	return err;
}
+48 −4
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2019 Mellanox Technologies. */

#include <net/netdev_lock.h>

#include "health.h"
#include "en/ptp.h"
#include "en/devlink.h"
@@ -79,6 +81,18 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
		return 0;

	/* Recovering queues means re-enabling NAPI, which requires the netdev
	 * instance lock. However, SQ closing flows have to wait for work tasks
	 * to finish while also holding the netdev instance lock. So either get
	 * the lock or find that the SQ is no longer enabled and thus this work
	 * is not relevant anymore.
	 */
	while (!netdev_trylock(dev)) {
		if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
			return 0;
		msleep(20);
	}

	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
	if (err) {
		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
@@ -114,9 +128,11 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
	else
		mlx5e_trigger_napi_sched(sq->cq.napi);

	netdev_unlock(dev);
	return 0;
out:
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	netdev_unlock(dev);
	return err;
}

@@ -137,10 +153,24 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
	sq = to_ctx->sq;
	eq = sq->cq.mcq.eq;
	priv = sq->priv;

	/* Recovering the TX queues implies re-enabling NAPI, which requires
	 * the netdev instance lock.
	 * However, channel closing flows have to wait for this work to finish
	 * while holding the same lock. So either get the lock or find that
	 * channels are being closed for other reason and this work is not
	 * relevant anymore.
	 */
	while (!netdev_trylock(sq->netdev)) {
		if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
			return 0;
		msleep(20);
	}

	err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
	if (!err) {
		to_ctx->status = 0; /* this sq recovered */
		return err;
		goto out;
	}

	mutex_lock(&priv->state_lock);
@@ -148,7 +178,7 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
	mutex_unlock(&priv->state_lock);
	if (!err) {
		to_ctx->status = 1; /* all channels recovered */
		return err;
		goto out;
	}

	to_ctx->status = err;
@@ -156,7 +186,8 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
	netdev_err(priv->netdev,
		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
		   err);

out:
	netdev_unlock(sq->netdev);
	return err;
}

@@ -173,10 +204,22 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
		return 0;

	priv = ptpsq->txqsq.priv;
	netdev = priv->netdev;

	/* Recovering the PTP SQ means re-enabling NAPI, which requires the
	 * netdev instance lock. However, SQ closing has to wait for this work
	 * task to finish while also holding the same lock. So either get the
	 * lock or find that the SQ is no longer enabled and thus this work is
	 * not relevant anymore.
	 */
	while (!netdev_trylock(netdev)) {
		if (!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))
			return 0;
		msleep(20);
	}

	mutex_lock(&priv->state_lock);
	chs = &priv->channels;
	netdev = priv->netdev;

	carrier_ok = netif_carrier_ok(netdev);
	netif_carrier_off(netdev);
@@ -193,6 +236,7 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
		netif_carrier_on(netdev);

	mutex_unlock(&priv->state_lock);
	netdev_unlock(netdev);

	return err;
}
+3 −7
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

#include <linux/iopoll.h>
#include <linux/math64.h>
#include "lib/aso.h"
#include "en/tc/post_act.h"
@@ -115,7 +116,6 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev,
	struct mlx5e_flow_meters *flow_meters;
	u8 cir_man, cir_exp, cbs_man, cbs_exp;
	struct mlx5_aso_wqe *aso_wqe;
	unsigned long expires;
	struct mlx5_aso *aso;
	u64 rate, burst;
	u8 ds_cnt;
@@ -187,12 +187,8 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev,
	mlx5_aso_post_wqe(aso, true, &aso_wqe->ctrl);

	/* With newer FW, the wait for the first ASO WQE is more than 2us, put the wait 10ms. */
	expires = jiffies + msecs_to_jiffies(10);
	do {
		err = mlx5_aso_poll_cq(aso, true);
		if (err)
			usleep_range(2, 10);
	} while (err && time_is_after_jiffies(expires));
	read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
			  false, aso, true);
	mutex_unlock(&flow_meters->aso_lock);

	return err;
Loading