Commit a20a6992, authored by Gerd Rausch, committed by Jakub Kicinski
Browse files

net/rds: Encode cp_index in TCP source port



Upon "sendmsg", RDS/TCP selects a backend connection based
on a hash calculated from the source-port ("RDS_MPATH_HASH").

However, "rds_tcp_accept_one" accepts connections
in the order they arrive, which is non-deterministic.

Therefore the mapping of the sender's "cp->cp_index"
to that of the receiver changes if the backend
connections are dropped and reconnected.

However, connection state that is preserved across reconnects
(e.g. "cp_next_rx_seq") relies on that sender<->receiver
mapping never changing.

So we make sure that client and server of the TCP connection
have the exact same "cp->cp_index" across reconnects by
encoding "cp->cp_index" in the lower three bits of the
client's TCP source port.

A new extension "RDS_EXTHDR_SPORT_IDX" is introduced,
that allows the server to tell the difference between
clients that do the "cp->cp_index" encoding, and
legacy clients that pick source ports randomly.

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Link: https://patch.msgid.link/20260203055723.1085751-3-achender@kernel.org


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 46f257ee
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -47,6 +47,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
[RDS_EXTHDR_NPATHS]	= sizeof(__be16),
[RDS_EXTHDR_GEN_NUM]	= sizeof(__be32),
[RDS_EXTHDR_SPORT_IDX]	= 1,
};

void rds_message_addref(struct rds_message *rm)
+3 −0
Original line number Diff line number Diff line
@@ -147,6 +147,7 @@ struct rds_connection {
				c_ping_triggered:1,
				c_pad_to_32:29;
	int			c_npaths;
	bool			c_with_sport_idx;
	struct rds_connection	*c_passive;
	struct rds_transport	*c_trans;

@@ -278,8 +279,10 @@ struct rds_ext_header_rdma_bytes {
 */
#define RDS_EXTHDR_NPATHS	5
#define RDS_EXTHDR_GEN_NUM	6
#define RDS_EXTHDR_SPORT_IDX    8

#define __RDS_EXTHDR_MAX	16 /* for now */

#define RDS_RX_MAX_TRACES	(RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
#define	RDS_MSG_RX_HDR		0
#define	RDS_MSG_RX_START	1
+7 −0
Original line number Diff line number Diff line
@@ -204,7 +204,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
		struct rds_ext_header_version version;
		__be16 rds_npaths;
		__be32 rds_gen_num;
		u8 dummy;
	} buffer;
	bool new_with_sport_idx = false;
	u32 new_peer_gen_num = 0;

	while (1) {
@@ -221,11 +223,16 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
		case RDS_EXTHDR_GEN_NUM:
			new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
			break;
		case RDS_EXTHDR_SPORT_IDX:
			new_with_sport_idx = true;
			break;
		default:
			pr_warn_ratelimited("ignoring unknown exthdr type "
					     "0x%x\n", type);
		}
	}

	conn->c_with_sport_idx = new_with_sport_idx;
	/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
	conn->c_npaths = max_t(int, conn->c_npaths, 1);
	conn->c_ping_triggered = 0;
+4 −0
Original line number Diff line number Diff line
@@ -1457,12 +1457,16 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
	    cp->cp_conn->c_trans->t_mp_capable) {
		__be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
		__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
		u8 dummy = 0;

		rds_message_add_extension(&rm->m_inc.i_hdr,
					  RDS_EXTHDR_NPATHS, &npaths);
		rds_message_add_extension(&rm->m_inc.i_hdr,
					  RDS_EXTHDR_GEN_NUM,
					  &my_gen_num);
		rds_message_add_extension(&rm->m_inc.i_hdr,
					  RDS_EXTHDR_SPORT_IDX,
					  &dummy);
	}
	spin_unlock_irqrestore(&cp->cp_lock, flags);

+1 −0
Original line number Diff line number Diff line
@@ -34,6 +34,7 @@ struct rds_tcp_connection {
	 */
	struct mutex		t_conn_path_lock;
	struct socket		*t_sock;
	u32			t_client_port_group;
	struct rds_tcp_net	*t_rtn;
	void			*t_orig_write_space;
	void			*t_orig_data_ready;
Loading