Commit a1f53d5f authored by Gerd Rausch's avatar Gerd Rausch Committed by Jakub Kicinski
Browse files

net/rds: Use the first lane until RDS_EXTHDR_NPATHS arrives



Instead of just blocking the sender until "c_npaths" is known
(it gets updated upon the receipt of a MPRDS PONG message),
simply use the first lane (cp_index#0).

But just using the first lane isn't enough.

As soon as we enqueue messages on a different lane, we'd run the risk
of out-of-order delivery of RDS messages.

Earlier messages enqueued on "cp_index == 0" could be delivered later
than more recent messages enqueued on "cp_index > 0", mostly because of
possible head of line blocking issues causing the first lane to be
slower.

To avoid that, we simply take a snapshot of "cp_next_tx_seq" at the
time we're about to fan-out to more lanes.

Then we delay the transmission of messages enqueued on other lanes
with "cp_index > 0" until cp_index#0 caught up with the delivery of
new messages (from "cp_send_queue") as well as in-flight
messages (from "cp_retrans") that haven't been acknowledged yet
by the receiver.

We also add a new counter "mprds_catchup_tx0_retries" to keep track
of how many times "rds_send_xmit" had to suspend activities,
because it was waiting for the first lane to catch up.

Signed-off-by: default avatarGerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: default avatarAllison Henderson <allison.henderson@oracle.com>
Link: https://patch.msgid.link/20260203055723.1085751-8-achender@kernel.org


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent 9d30ad8a
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -170,6 +170,8 @@ struct rds_connection {

	u32			c_my_gen_num;
	u32			c_peer_gen_num;

	u64			c_cp0_mprds_catchup_tx_seq;
};

static inline
@@ -749,6 +751,7 @@ struct rds_statistics {
	u64	s_recv_bytes_added_to_socket;
	u64	s_recv_bytes_removed_from_socket;
	u64	s_send_stuck_rm;
	u64	s_mprds_catchup_tx0_retries;
};

/* af_rds.c */
+20 −3
Original line number Diff line number Diff line
@@ -208,6 +208,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
	} buffer;
	bool new_with_sport_idx = false;
	u32 new_peer_gen_num = 0;
	int new_npaths;

	new_npaths = conn->c_npaths;

	while (1) {
		len = sizeof(buffer);
@@ -217,7 +220,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
		/* Process extension header here */
		switch (type) {
		case RDS_EXTHDR_NPATHS:
			conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
			new_npaths = min_t(int, RDS_MPATH_WORKERS,
					   be16_to_cpu(buffer.rds_npaths));
			break;
		case RDS_EXTHDR_GEN_NUM:
@@ -233,8 +236,22 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
	}

	conn->c_with_sport_idx = new_with_sport_idx;

	if (new_npaths > 1 && new_npaths != conn->c_npaths) {
		/* We're about to fan-out.
		 * Make sure that messages from cp_index#0
		 * are sent prior to handling other lanes.
		 */
		struct rds_conn_path *cp0 = conn->c_path;
		unsigned long flags;

		spin_lock_irqsave(&cp0->cp_lock, flags);
		conn->c_cp0_mprds_catchup_tx_seq = cp0->cp_next_tx_seq;
		spin_unlock_irqrestore(&cp0->cp_lock, flags);
	}
	/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
	conn->c_npaths = max_t(int, conn->c_npaths, 1);
	conn->c_npaths = max_t(int, new_npaths, 1);

	conn->c_ping_triggered = 0;
	rds_conn_peer_gen_update(conn, new_peer_gen_num);

+66 −36
Original line number Diff line number Diff line
@@ -119,6 +119,57 @@ static void release_in_xmit(struct rds_conn_path *cp)
		wake_up_all(&cp->cp_waitq);
}

/*
 * Helper function for multipath fanout to ensure lane 0 transmits queued
 * messages before other lanes to prevent out-of-order delivery.
 *
 * Returns true if lane 0 still has messages or false otherwise
 */
static bool rds_mprds_cp0_catchup(struct rds_connection *conn)
{
	struct rds_conn_path *cp0 = conn->c_path;
	struct rds_message *rm0;
	unsigned long flags;
	bool ret = false;

	spin_lock_irqsave(&cp0->cp_lock, flags);

	/* the oldest / first message in the retransmit queue
	 * has to be at or beyond c_cp0_mprds_catchup_tx_seq
	 */
	if (!list_empty(&cp0->cp_retrans)) {
		rm0 = list_entry(cp0->cp_retrans.next, struct rds_message,
				 m_conn_item);
		if (be64_to_cpu(rm0->m_inc.i_hdr.h_sequence) <
		    conn->c_cp0_mprds_catchup_tx_seq) {
			/* the retransmit queue of cp_index#0 has not
			 * quite caught up yet
			 */
			ret = true;
			goto unlock;
		}
	}

	/* the oldest / first message of the send queue
	 * has to be at or beyond c_cp0_mprds_catchup_tx_seq
	 */
	rm0 = cp0->cp_xmit_rm;
	if (!rm0 && !list_empty(&cp0->cp_send_queue))
		rm0 = list_entry(cp0->cp_send_queue.next, struct rds_message,
				 m_conn_item);
	if (rm0 && be64_to_cpu(rm0->m_inc.i_hdr.h_sequence) <
	    conn->c_cp0_mprds_catchup_tx_seq) {
		/* the send queue of cp_index#0 has not quite
		 * caught up yet
		 */
		ret = true;
	}

unlock:
	spin_unlock_irqrestore(&cp0->cp_lock, flags);
	return ret;
}

/*
 * We're making the conscious trade-off here to only send one message
 * down the connection at a time.
@@ -248,6 +299,14 @@ int rds_send_xmit(struct rds_conn_path *cp)
			if (batch_count >= send_batch_count)
				goto over_batch;

			/* make sure cp_index#0 caught up during fan-out in
			 * order to avoid lane races
			 */
			if (cp->cp_index > 0 && rds_mprds_cp0_catchup(conn)) {
				rds_stats_inc(s_mprds_catchup_tx0_retries);
				goto over_batch;
			}

			spin_lock_irqsave(&cp->cp_lock, flags);

			if (!list_empty(&cp->cp_send_queue)) {
@@ -1042,39 +1101,6 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
	return ret;
}

static int rds_send_mprds_hash(struct rds_sock *rs,
			       struct rds_connection *conn, int nonblock)
{
	int hash;

	if (conn->c_npaths == 0)
		hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
	else
		hash = RDS_MPATH_HASH(rs, conn->c_npaths);
	if (conn->c_npaths == 0 && hash != 0) {
		rds_send_ping(conn, 0);

		/* The underlying connection is not up yet.  Need to wait
		 * until it is up to be sure that the non-zero c_path can be
		 * used.  But if we are interrupted, we have to use the zero
		 * c_path in case the connection ends up being non-MP capable.
		 */
		if (conn->c_npaths == 0) {
			/* Cannot wait for the connection be made, so just use
			 * the base c_path.
			 */
			if (nonblock)
				return 0;
			if (wait_event_interruptible(conn->c_hs_waitq,
						     conn->c_npaths != 0))
				hash = 0;
		}
		if (conn->c_npaths == 1)
			hash = 0;
	}
	return hash;
}

static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
{
	struct rds_rdma_args *args;
@@ -1304,10 +1330,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
		rs->rs_conn = conn;
	}

	if (conn->c_trans->t_mp_capable)
		cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)];
	else
	if (conn->c_trans->t_mp_capable) {
		/* Use c_path[0] until we learn that
		 * the peer supports more (c_npaths > 1)
		 */
		cpath = &conn->c_path[RDS_MPATH_HASH(rs, conn->c_npaths ? : 1)];
	} else {
		cpath = &conn->c_path[0];
	}

	rm->m_conn_path = cpath;

+1 −0
Original line number Diff line number Diff line
@@ -79,6 +79,7 @@ static const char *const rds_stat_names[] = {
	"recv_bytes_added_to_sock",
	"recv_bytes_freed_fromsock",
	"send_stuck_rm",
	"mprds_catchup_tx0_retries",
};

void rds_stats_info_copy(struct rds_info_iterator *iter,