Commit 7ea76944 authored by Paolo Abeni's avatar Paolo Abeni
Browse files

Merge branch 'net-smc-make-wr-buffer-count-configurable'

Halil Pasic says:

====================
net/smc: make wr buffer count configurable

The current value of SMC_WR_BUF_CNT is 16 which leads to heavy
contention on the wr_tx_wait workqueue of the SMC-R linkgroup and its
spinlock when many connections are competing for the work request
buffers. Currently up to 256 connections per linkgroup are supported.

To make things worse, when a buffer finally becomes available and
smc_wr_tx_put_slot() signals the linkgroup's wr_tx_wait wq, all the
waiters get woken up because WQ_FLAG_EXCLUSIVE is not used. Most of the
time only a single one can proceed, and the rest contend on the
spinlock of the wq to go to sleep again.

Addressing this by simply bumping SMC_WR_BUF_CNT to 256 was deemed
risky, because the large-ish physically contiguous allocation could fail
and lead to TCP fall-backs. For reference see this discussion thread on
"[PATCH net-next] net/smc: increase SMC_WR_BUF_CNT" (in archive
https://lists.openwall.net/netdev/2024/11/05/186), which concludes with
the agreement to try to come up with something smarter, which is what
this series aims for.

Additionally, if for some reason it is known that heavy contention is not
to be expected, going with something like 256 work request buffers is
wasteful. To address these concerns, make the number of work requests
configurable, and introduce a back-off logic which handles -ENOMEM from
smc_wr_alloc_link_mem() gracefully.

v5: https://lore.kernel.org/netdev/20250929000001.1752206-1-pasic@linux.ibm.com/
v4: https://lore.kernel.org/netdev/20250927232144.3478161-1-pasic@linux.ibm.com/
v3: https://lore.kernel.org/netdev/20250921214440.325325-1-pasic@linux.ibm.com/
v2: https://lore.kernel.org/netdev/20250908220150.3329433-1-pasic@linux.ibm.com/
v1: https://lore.kernel.org/all/20250904211254.1057445-1-pasic@linux.ibm.com/
====================

Link: https://patch.msgid.link/20251027224856.2970019-1-pasic@linux.ibm.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents ea7d0d60 8f736087
Loading
Loading
Loading
Loading
+40 −0
Original line number Diff line number Diff line
@@ -71,3 +71,43 @@ smcr_max_conns_per_lgr - INTEGER
	acceptable value ranges from 16 to 255. Only for SMC-R v2.1 and later.

	Default: 255

smcr_max_send_wr - INTEGER
	So-called work request buffers are SMCR link (and RDMA queue pair) level
	resources necessary for performing RDMA operations. Since up to 255
	connections can share a link group and thus also a link and the number
	of the work request buffers is decided when the link is allocated,
	depending on the workload it can be a bottleneck in a sense that threads
	have to wait for work request buffers to become available. Before the
	introduction of this control the maximal number of work request buffers
	available on the send path used to be hard coded to 16. With this control
	it becomes configurable. The acceptable range is between 2 and 2048.

	Please be aware that all the buffers need to be allocated as a physically
	contiguous array in which each element is a single buffer and has the size
	of SMC_WR_BUF_SIZE (48) bytes. If the allocation fails, we keep retrying
	with half of the buffer count until it is either successful or (unlikely)
	we dip below the old hard coded value which is 16 where we give up much
	like before having this control.

	Default: 16

smcr_max_recv_wr - INTEGER
	So-called work request buffers are SMCR link (and RDMA queue pair) level
	resources necessary for performing RDMA operations. Since up to 255
	connections can share a link group and thus also a link and the number
	of the work request buffers is decided when the link is allocated,
	depending on the workload it can be a bottleneck in a sense that threads
	have to wait for work request buffers to become available. Before the
	introduction of this control the maximal number of work request buffers
	available on the receive path used to be hard coded to 48. With this control
	it becomes configurable. The acceptable range is between 2 and 2048.

	Please be aware that all the buffers need to be allocated as a physically
	contiguous array in which each element is a single buffer and has the size
	of SMC_WR_BUF_SIZE (48) bytes. If the allocation fails, we keep retrying
	with half of the buffer count until it is either successful or (unlikely)
	we dip below the old hard coded value which is 48 where we give up much
	like before having this control.

	Default: 48
+2 −0
Original line number Diff line number Diff line
@@ -24,5 +24,7 @@ struct netns_smc {
	int				sysctl_rmem;
	int				sysctl_max_links_per_lgr;
	int				sysctl_max_conns_per_lgr;
	unsigned int			sysctl_smcr_max_send_wr;
	unsigned int			sysctl_smcr_max_recv_wr;
};
#endif
+24 −10
Original line number Diff line number Diff line
@@ -810,6 +810,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
	lnk->clearing = 0;
	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
	lnk->link_id = smcr_next_link_id(lgr);
	lnk->max_send_wr = lgr->max_send_wr;
	lnk->max_recv_wr = lgr->max_recv_wr;
	lnk->lgr = lgr;
	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
	lnk->link_idx = link_idx;
@@ -836,27 +838,39 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
	rc = smc_llc_link_init(lnk);
	if (rc)
		goto out;
	rc = smc_wr_alloc_link_mem(lnk);
	if (rc)
		goto clear_llc_lnk;
	rc = smc_ib_create_protection_domain(lnk);
	if (rc)
		goto free_link_mem;
		goto clear_llc_lnk;
	do {
		rc = smc_ib_create_queue_pair(lnk);
		if (rc)
			goto dealloc_pd;
		rc = smc_wr_alloc_link_mem(lnk);
		if (!rc)
			break;
		else if (rc != -ENOMEM) /* give up */
			goto destroy_qp;
		/* retry with smaller ... */
		lnk->max_send_wr /= 2;
		lnk->max_recv_wr /= 2;
		/* ... unless dropping below old SMC_WR_BUF_CNT based defaults */
		if (lnk->max_send_wr < 16 || lnk->max_recv_wr < 48)
			goto destroy_qp;
		smc_ib_destroy_queue_pair(lnk);
	} while (1);

	rc = smc_wr_create_link(lnk);
	if (rc)
		goto destroy_qp;
		goto free_link_mem;
	lnk->state = SMC_LNK_ACTIVATING;
	return 0;

free_link_mem:
	smc_wr_free_link_mem(lnk);
destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk, false);
out:
+8 −0
Original line number Diff line number Diff line
@@ -34,6 +34,8 @@
					 * distributions may modify it to a value between
					 * 16-255 as needed.
					 */
#define SMCR_MAX_SEND_WR_DEF	16	/* Default number of work requests per send queue */
#define SMCR_MAX_RECV_WR_DEF	48	/* Default number of work requests per recv queue */

struct smc_lgr_list {			/* list of link group definition */
	struct list_head	list;
@@ -173,6 +175,8 @@ struct smc_link {
	struct completion	llc_testlink_resp; /* wait for rx of testlink */
	int			llc_testlink_time; /* testlink interval */
	atomic_t		conn_cnt; /* connections on this link */
	u16			max_send_wr;
	u16			max_recv_wr;
};

/* For now we just allow one parallel link per link group. The SMC protocol
@@ -366,6 +370,10 @@ struct smc_link_group {
						/* max conn can be assigned to lgr */
			u8			max_links;
						/* max links can be added in lgr */
			u16			max_send_wr;
						/* number of WR buffers on send */
			u16			max_recv_wr;
						/* number of WR buffers on recv */
		};
		struct { /* SMC-D */
			struct smcd_gid		peer_gid;
+5 −5
Original line number Diff line number Diff line
@@ -669,11 +669,6 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = lnk->wr_rx_sge_cnt,
			.max_inline_data = 0,
@@ -683,6 +678,11 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
	};
	int rc;

	/* include unsolicited rdma_writes as well,
	 * there are max. 2 RDMA_WRITE per 1 WR_SEND
	 */
	qp_attr.cap.max_send_wr = 3 * lnk->lgr->max_send_wr;
	qp_attr.cap.max_recv_wr = lnk->lgr->max_recv_wr;
	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
Loading