Commit 7df48e36 authored by Linus Torvalds
Browse files
Pull rdma fixes from Jason Gunthorpe:

 - Quite a few irdma bug fixes, several of them user-triggerable

 - Fix an all-zero source MAC (SMAC) in the Ethernet header in ionic

 - Tolerate FW errors for RAAS in bng_re

 - Avoid a use-after-free (UAF) in efa when printing error events

 - Better handle pool exhaustion in the new bvec paths

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  RDMA/irdma: Harden depth calculation functions
  RDMA/irdma: Return EINVAL for invalid arp index error
  RDMA/irdma: Fix deadlock during netdev reset with active connections
  RDMA/irdma: Remove reset check from irdma_modify_qp_to_err()
  RDMA/irdma: Clean up unnecessary dereference of event->cm_node
  RDMA/irdma: Remove a NOP wait_event() in irdma_modify_qp_roce()
  RDMA/irdma: Update ibqp state to error if QP is already in error state
  RDMA/irdma: Initialize free_qp completion before using it
  RDMA/efa: Fix possible deadlock
  RDMA/rw: Fix MR pool exhaustion in bvec RDMA READ path
  RDMA/rw: Fall back to direct SGE on MR pool exhaustion
  RDMA/efa: Fix use of completion ctx after free
  RDMA/bng_re: Fix silent failure in HWRM version query
  RDMA/ionic: Preserve and set Ethernet source MAC after ib_ud_header_init()
  RDMA/irdma: Fix double free related to rereg_user_mr
parents 8af4fad5 e37afcb5
Loading
Loading
Loading
Loading
+27 −10
Original line number Diff line number Diff line
@@ -608,14 +608,29 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
	if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
		ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
				sg_offset, remote_addr, rkey, dir);
	} else if (sg_cnt > 1) {
		/*
		 * If MR init succeeded or failed for a reason other
		 * than pool exhaustion, that result is final.
		 *
		 * Pool exhaustion (-EAGAIN) from the max_sgl_rd
		 * optimization is recoverable: fall back to
		 * direct SGE posting. iWARP and force_mr require
		 * MRs unconditionally, so -EAGAIN is terminal.
		 */
		if (ret != -EAGAIN ||
		    rdma_protocol_iwarp(qp->device, port_num) ||
		    unlikely(rdma_rw_force_mr))
			goto out;
	}

	if (sg_cnt > 1)
		ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
				remote_addr, rkey, dir);
	} else {
	else
		ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
				remote_addr, rkey, dir);
	}

out:
	if (ret < 0)
		goto out_unmap_sg;
	return ret;
@@ -686,14 +701,16 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		return ret;

	/*
	 * IOVA mapping not available. Check if MR registration provides
	 * better performance than multiple SGE entries.
	 * IOVA not available; fall back to the map_wrs path, which maps
	 * each bvec as a direct SGE. This is always correct: the MR path
	 * is a throughput optimization, not a correctness requirement.
	 * (iWARP, which does require MRs, is handled by the check above.)
	 *
	 * The rdma_rw_io_needs_mr() gate is not used here because nr_bvec
	 * is a raw page count that overstates DMA entry demand -- the bvec
	 * caller has no post-DMA-coalescing segment count, and feeding the
	 * inflated count into the MR path exhausts the pool on RDMA READs.
	 */
	if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec))
		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
						nr_bvec, &iter, remote_addr,
						rkey, dir);

	return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter,
			remote_addr, rkey, dir);
}
+9 −5
Original line number Diff line number Diff line
@@ -210,7 +210,7 @@ static int bng_re_stats_ctx_alloc(struct bng_re_dev *rdev)
	return rc;
}

static void bng_re_query_hwrm_version(struct bng_re_dev *rdev)
static int bng_re_query_hwrm_version(struct bng_re_dev *rdev)
{
	struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
	struct hwrm_ver_get_output ver_get_resp = {};
@@ -230,7 +230,7 @@ static void bng_re_query_hwrm_version(struct bng_re_dev *rdev)
	if (rc) {
		ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x",
			  rc);
		return;
		return rc;
	}

	cctx = rdev->chip_ctx;
@@ -244,6 +244,8 @@ static void bng_re_query_hwrm_version(struct bng_re_dev *rdev)

	if (!cctx->hwrm_cmd_max_timeout)
		cctx->hwrm_cmd_max_timeout = BNG_ROCE_FW_MAX_TIMEOUT;

	return 0;
}

static void bng_re_dev_uninit(struct bng_re_dev *rdev)
@@ -306,13 +308,15 @@ static int bng_re_dev_init(struct bng_re_dev *rdev)
		goto msix_ctx_fail;
	}

	bng_re_query_hwrm_version(rdev);
	rc = bng_re_query_hwrm_version(rdev);
	if (rc)
		goto destroy_chip_ctx;

	rc = bng_re_alloc_fw_channel(&rdev->bng_res, &rdev->rcfw);
	if (rc) {
		ibdev_err(&rdev->ibdev,
			  "Failed to allocate RCFW Channel: %#x\n", rc);
		goto alloc_fw_chl_fail;
		goto destroy_chip_ctx;
	}

	/* Allocate nq record memory */
@@ -391,7 +395,7 @@ static int bng_re_dev_init(struct bng_re_dev *rdev)
	kfree(rdev->nqr);
nq_alloc_fail:
	bng_re_free_rcfw_channel(&rdev->rcfw);
alloc_fw_chl_fail:
destroy_chip_ctx:
	bng_re_destroy_chip_ctx(rdev);
msix_ctx_fail:
	bnge_unregister_dev(rdev->aux_dev);
+40 −48
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
/*
 * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved.
 * Copyright 2018-2026 Amazon.com, Inc. or its affiliates. All rights reserved.
 */

#include <linux/log2.h>
@@ -310,23 +310,19 @@ static inline struct efa_comp_ctx *efa_com_get_comp_ctx_by_cmd_id(struct efa_com
	return &aq->comp_ctx[ctx_id];
}

static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq,
static void __efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq,
				       struct efa_comp_ctx *comp_ctx,
				       struct efa_admin_aq_entry *cmd,
				       size_t cmd_size_in_bytes,
				       struct efa_admin_acq_entry *comp,
				       size_t comp_size_in_bytes)
{
	struct efa_admin_aq_entry *aqe;
	struct efa_comp_ctx *comp_ctx;
	u16 queue_size_mask;
	u16 cmd_id;
	u16 ctx_id;
	u16 pi;

	comp_ctx = efa_com_alloc_comp_ctx(aq);
	if (!comp_ctx)
		return ERR_PTR(-EINVAL);

	queue_size_mask = aq->depth - 1;
	pi = aq->sq.pc & queue_size_mask;
	ctx_id = efa_com_get_comp_ctx_id(aq, comp_ctx);
@@ -360,8 +356,6 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu

	/* barrier not needed in case of writel */
	writel(aq->sq.pc, aq->sq.db_addr);

	return comp_ctx;
}

static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq)
@@ -394,28 +388,25 @@ static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq)
	return 0;
}

static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq,
static int efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq,
				    struct efa_comp_ctx *comp_ctx,
				    struct efa_admin_aq_entry *cmd,
				    size_t cmd_size_in_bytes,
				    struct efa_admin_acq_entry *comp,
				    size_t comp_size_in_bytes)
{
	struct efa_comp_ctx *comp_ctx;

	spin_lock(&aq->sq.lock);
	if (!test_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state)) {
		ibdev_err_ratelimited(aq->efa_dev, "Admin queue is closed\n");
		spin_unlock(&aq->sq.lock);
		return ERR_PTR(-ENODEV);
		return -ENODEV;
	}

	comp_ctx = __efa_com_submit_admin_cmd(aq, cmd, cmd_size_in_bytes, comp,
	__efa_com_submit_admin_cmd(aq, comp_ctx, cmd, cmd_size_in_bytes, comp,
				   comp_size_in_bytes);
	spin_unlock(&aq->sq.lock);
	if (IS_ERR(comp_ctx))
		clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);

	return comp_ctx;
	return 0;
}

static int efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq,
@@ -512,7 +503,6 @@ static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_c
{
	unsigned long timeout;
	unsigned long flags;
	int err;

	timeout = jiffies + usecs_to_jiffies(aq->completion_timeout);

@@ -532,24 +522,20 @@ static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_c
			atomic64_inc(&aq->stats.no_completion);

			clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
			err = -ETIME;
			goto out;
			return -ETIME;
		}

		msleep(aq->poll_interval);
	}

	err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status);
out:
	efa_com_dealloc_comp_ctx(aq, comp_ctx);
	return err;
	return efa_com_comp_status_to_errno(
		comp_ctx->user_cqe->acq_common_descriptor.status);
}

static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *comp_ctx,
							struct efa_com_admin_queue *aq)
{
	unsigned long flags;
	int err;

	wait_for_completion_timeout(&comp_ctx->wait_event,
				    usecs_to_jiffies(aq->completion_timeout));
@@ -585,14 +571,11 @@ static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *com
				aq->cq.cc);

		clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
		err = -ETIME;
		goto out;
		return -ETIME;
	}

	err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status);
out:
	efa_com_dealloc_comp_ctx(aq, comp_ctx);
	return err;
	return efa_com_comp_status_to_errno(
		comp_ctx->user_cqe->acq_common_descriptor.status);
}

/*
@@ -642,30 +625,39 @@ int efa_com_cmd_exec(struct efa_com_admin_queue *aq,
	ibdev_dbg(aq->efa_dev, "%s (opcode %d)\n",
		  efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
		  cmd->aq_common_descriptor.opcode);
	comp_ctx = efa_com_submit_admin_cmd(aq, cmd, cmd_size, comp, comp_size);
	if (IS_ERR(comp_ctx)) {

	comp_ctx = efa_com_alloc_comp_ctx(aq);
	if (!comp_ctx) {
		clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
		up(&aq->avail_cmds);
		return -EINVAL;
	}

	err = efa_com_submit_admin_cmd(aq, comp_ctx, cmd, cmd_size, comp, comp_size);
	if (err) {
		ibdev_err_ratelimited(
			aq->efa_dev,
			"Failed to submit command %s (opcode %u) err %pe\n",
			"Failed to submit command %s (opcode %u) err %d\n",
			efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
			cmd->aq_common_descriptor.opcode, comp_ctx);
			cmd->aq_common_descriptor.opcode, err);

		efa_com_dealloc_comp_ctx(aq, comp_ctx);
		up(&aq->avail_cmds);
		atomic64_inc(&aq->stats.cmd_err);
		return PTR_ERR(comp_ctx);
		return err;
	}

	err = efa_com_wait_and_process_admin_cq(comp_ctx, aq);
	if (err) {
		ibdev_err_ratelimited(
			aq->efa_dev,
			"Failed to process command %s (opcode %u) comp_status %d err %d\n",
			"Failed to process command %s (opcode %u) err %d\n",
			efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
			cmd->aq_common_descriptor.opcode,
			comp_ctx->user_cqe->acq_common_descriptor.status, err);
			cmd->aq_common_descriptor.opcode, err);
		atomic64_inc(&aq->stats.cmd_err);
	}

	efa_com_dealloc_comp_ctx(aq, comp_ctx);
	up(&aq->avail_cmds);

	return err;
+3 −1
Original line number Diff line number Diff line
@@ -508,6 +508,7 @@ static int ionic_build_hdr(struct ionic_ibdev *dev,
{
	const struct ib_global_route *grh;
	enum rdma_network_type net;
	u8 smac[ETH_ALEN];
	u16 vlan;
	int rc;

@@ -518,7 +519,7 @@ static int ionic_build_hdr(struct ionic_ibdev *dev,

	grh = rdma_ah_read_grh(attr);

	rc = rdma_read_gid_l2_fields(grh->sgid_attr, &vlan, &hdr->eth.smac_h[0]);
	rc = rdma_read_gid_l2_fields(grh->sgid_attr, &vlan, smac);
	if (rc)
		return rc;

@@ -536,6 +537,7 @@ static int ionic_build_hdr(struct ionic_ibdev *dev,
	if (rc)
		return rc;

	ether_addr_copy(hdr->eth.smac_h, smac);
	ether_addr_copy(hdr->eth.dmac_h, attr->roce.dmac);

	if (net == RDMA_NETWORK_IPV4) {
+16 −13
Original line number Diff line number Diff line
@@ -2241,11 +2241,12 @@ irdma_make_cm_node(struct irdma_cm_core *cm_core, struct irdma_device *iwdev,
	int oldarpindex;
	int arpindex;
	struct net_device *netdev = iwdev->netdev;
	int ret;

	/* create an hte and cm_node for this instance */
	cm_node = kzalloc_obj(*cm_node, GFP_ATOMIC);
	if (!cm_node)
		return NULL;
		return ERR_PTR(-ENOMEM);

	/* set our node specific transport info */
	cm_node->ipv4 = cm_info->ipv4;
@@ -2348,8 +2349,10 @@ irdma_make_cm_node(struct irdma_cm_core *cm_core, struct irdma_device *iwdev,
			arpindex = -EINVAL;
	}

	if (arpindex < 0)
	if (arpindex < 0) {
		ret = -EINVAL;
		goto err;
	}

	ether_addr_copy(cm_node->rem_mac,
			iwdev->rf->arp_table[arpindex].mac_addr);
@@ -2360,7 +2363,7 @@ irdma_make_cm_node(struct irdma_cm_core *cm_core, struct irdma_device *iwdev,
err:
	kfree(cm_node);

	return NULL;
	return ERR_PTR(ret);
}

static void irdma_destroy_connection(struct irdma_cm_node *cm_node)
@@ -3021,8 +3024,8 @@ static int irdma_create_cm_node(struct irdma_cm_core *cm_core,

	/* create a CM connection node */
	cm_node = irdma_make_cm_node(cm_core, iwdev, cm_info, NULL);
	if (!cm_node)
		return -ENOMEM;
	if (IS_ERR(cm_node))
		return PTR_ERR(cm_node);

	/* set our node side to client (active) side */
	cm_node->tcp_cntxt.client = 1;
@@ -3219,9 +3222,9 @@ void irdma_receive_ilq(struct irdma_sc_vsi *vsi, struct irdma_puda_buf *rbuf)
		cm_info.cm_id = listener->cm_id;
		cm_node = irdma_make_cm_node(cm_core, iwdev, &cm_info,
					     listener);
		if (!cm_node) {
		if (IS_ERR(cm_node)) {
			ibdev_dbg(&cm_core->iwdev->ibdev,
				  "CM: allocate node failed\n");
				  "CM: allocate node failed ret=%ld\n", PTR_ERR(cm_node));
			refcount_dec(&listener->refcnt);
			return;
		}
@@ -4239,21 +4242,21 @@ static void irdma_cm_event_handler(struct work_struct *work)
		irdma_cm_event_reset(event);
		break;
	case IRDMA_CM_EVENT_CONNECTED:
		if (!event->cm_node->cm_id ||
		    event->cm_node->state != IRDMA_CM_STATE_OFFLOADED)
		if (!cm_node->cm_id ||
		    cm_node->state != IRDMA_CM_STATE_OFFLOADED)
			break;
		irdma_cm_event_connected(event);
		break;
	case IRDMA_CM_EVENT_MPA_REJECT:
		if (!event->cm_node->cm_id ||
		if (!cm_node->cm_id ||
		    cm_node->state == IRDMA_CM_STATE_OFFLOADED)
			break;
		irdma_send_cm_event(cm_node, cm_node->cm_id,
				    IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
		break;
	case IRDMA_CM_EVENT_ABORTED:
		if (!event->cm_node->cm_id ||
		    event->cm_node->state == IRDMA_CM_STATE_OFFLOADED)
		if (!cm_node->cm_id ||
		    cm_node->state == IRDMA_CM_STATE_OFFLOADED)
			break;
		irdma_event_connect_error(event);
		break;
@@ -4263,7 +4266,7 @@ static void irdma_cm_event_handler(struct work_struct *work)
		break;
	}

	irdma_rem_ref_cm_node(event->cm_node);
	irdma_rem_ref_cm_node(cm_node);
	kfree(event);
}

Loading