Commit 46f257ee authored by Shamir Rabinovitch, committed by Jakub Kicinski
Browse files

net/rds: new extension header: rdma bytes



Introduce a new extension header type, RDS_EXTHDR_RDMA_BYTES, that lets
an RDMA initiator report RDMA byte counts to its target.
Currently, RDMA operations cannot precisely account for how many bytes a
peer just transferred via RDMA, which limits per-connection statistics
and future policy (e.g., monitoring or rate/cgroup accounting of RDMA
traffic).

This patch extends rds_message_add_extension() to accept multiple
extensions, and adds a new RDS header flag, RDS_FLAG_EXTHDR_EXTENSION,
along with a new extension header, struct rds_ext_header_rdma_bytes.

Signed-off-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Signed-off-by: Guangyu Sun <guangyu.sun@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Link: https://patch.msgid.link/20260203055723.1085751-2-achender@kernel.org


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent acd21dd2
Loading
Loading
Loading
Loading
+33 −7
Original line number Diff line number Diff line
@@ -577,16 +577,42 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
		/* If it has a RDMA op, tell the peer we did it. This is
		 * used by the peer to release use-once RDMA MRs. */
		if (rm->rdma.op_active) {
			struct rds_ext_header_rdma ext_hdr;
			struct rds_ext_header_rdma ext_hdr = {};
			struct rds_ext_header_rdma_bytes
				rdma_bytes_ext_hdr = {};

			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
			rds_message_add_extension(&rm->m_inc.i_hdr,
					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
			if (rds_message_add_extension(&rm->m_inc.i_hdr,
						      RDS_EXTHDR_RDMA,
						      &ext_hdr)) {
				/* prepare the rdma bytes ext header */
				rdma_bytes_ext_hdr.h_rflags =
					rm->rdma.op_write ?
					RDS_FLAG_RDMA_WR_BYTES :
					RDS_FLAG_RDMA_RD_BYTES;
				rdma_bytes_ext_hdr.h_rdma_bytes =
					cpu_to_be32(rm->rdma.op_bytes);
			} else {
				rdsdebug("RDS_EXTHDR_RDMA dropped");
			}

			if (rds_message_add_extension(&rm->m_inc.i_hdr,
						      RDS_EXTHDR_RDMA_BYTES,
						      &rdma_bytes_ext_hdr)) {
				/* rdma bytes ext header was added successfully,
				 * notify the remote side via flag in header
				 */
				rm->m_inc.i_hdr.h_flags |=
					RDS_FLAG_EXTHDR_EXTENSION;
			} else {
				rdsdebug("RDS_EXTHDR_RDMA_BYTES dropped");
			}
		}
		if (rm->m_rdma_cookie) {
			rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
		if (rm->m_rdma_cookie &&
		    !rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
				rds_rdma_cookie_key(rm->m_rdma_cookie),
					rds_rdma_cookie_offset(rm->m_rdma_cookie));
				rds_rdma_cookie_offset(rm->m_rdma_cookie))) {
			rdsdebug("RDS_EXTHDR_RDMA_DEST dropped\n");
		}

		/* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
+52 −13
Original line number Diff line number Diff line
@@ -44,6 +44,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_VERSION]	= sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA]	= sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST]	= sizeof(struct rds_ext_header_rdma_dest),
[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
[RDS_EXTHDR_NPATHS]	= sizeof(__be16),
[RDS_EXTHDR_GEN_NUM]	= sizeof(__be32),
};
@@ -191,31 +192,69 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
	hdr->h_sport = sport;
	hdr->h_dport = dport;
	hdr->h_sequence = cpu_to_be64(seq);
	hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
	/* see rds_find_next_ext_space for reason why we memset the
	 * ext header
	 */
	memset(hdr->h_exthdr, RDS_EXTHDR_NONE, RDS_HEADER_EXT_SPACE);
}
EXPORT_SYMBOL_GPL(rds_message_populate_header);

int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
			      const void *data, unsigned int len)
/*
 * Find the next place we can add an RDS header extension with
 * specific length. Extension headers are pushed one after the
 * other. In the following, the number after the colon is the number
 * of bytes:
 *
 * [ type1:1 dta1:len1 [ type2:1 dta2:len2 ] ... ] RDS_EXTHDR_NONE
 *
 * If the extension headers fill the complete extension header space
 * (16 bytes), the trailing RDS_EXTHDR_NONE is omitted.
 */
static int rds_find_next_ext_space(struct rds_header *hdr, unsigned int len,
				   u8 **ext_start)
{
	unsigned int ext_len = sizeof(u8) + len;
	unsigned char *dst;
	unsigned int ext_len;
	unsigned int type;
	int ind = 0;

	/* For now, refuse to add more than one extension header */
	if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
	while ((ind + 1 + len) <= RDS_HEADER_EXT_SPACE) {
		if (hdr->h_exthdr[ind] == RDS_EXTHDR_NONE) {
			*ext_start = hdr->h_exthdr + ind;
			return 0;
		}

		type = hdr->h_exthdr[ind];

		ext_len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
		WARN_ONCE(!ext_len, "Unknown ext hdr type %d\n", type);
		if (!ext_len)
			return -EINVAL;

		/* ind points to a valid ext hdr with known length */
		ind += 1 + ext_len;
	}

	/* no room for extension */
	return -ENOSPC;
}

/* The ext hdr space is prefilled with zero from the kzalloc() */
int rds_message_add_extension(struct rds_header *hdr,
			      unsigned int type, const void *data)
{
	unsigned char *dst;
	unsigned int len;

	if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
	len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
	if (!len)
		return 0;

	if (ext_len >= RDS_HEADER_EXT_SPACE)
	if (rds_find_next_ext_space(hdr, len, &dst))
		return 0;
	dst = hdr->h_exthdr;

	*dst++ = type;
	memcpy(dst, data, len);

	dst[len] = RDS_EXTHDR_NONE;
	return 1;
}
EXPORT_SYMBOL_GPL(rds_message_add_extension);
@@ -272,7 +311,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o

	ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
	ext_hdr.h_rdma_offset = cpu_to_be32(offset);
	return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
	return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr);
}
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);

+20 −5
Original line number Diff line number Diff line
@@ -186,6 +186,7 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net)
#define RDS_FLAG_CONG_BITMAP		0x01
#define RDS_FLAG_ACK_REQUIRED		0x02
#define RDS_FLAG_RETRANSMITTED		0x04
#define RDS_FLAG_EXTHDR_EXTENSION	0x20
#define RDS_MAX_ADV_CREDIT		255

/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
@@ -258,6 +259,20 @@ struct rds_ext_header_rdma_dest {
	__be32			h_rdma_offset;
};

/*
 * This extension header tells the peer about delivered RDMA byte count.
 */
#define RDS_EXTHDR_RDMA_BYTES	4

/* Payload carried by an RDS_EXTHDR_RDMA_BYTES extension header. */
struct rds_ext_header_rdma_bytes {
	__be32		h_rdma_bytes;	/* byte count, stored big-endian */
	u8		h_rflags;	/* direction of RDMA, write or read:
					 * RDS_FLAG_RDMA_WR_BYTES or
					 * RDS_FLAG_RDMA_RD_BYTES
					 */
	u8		h_pad[3];	/* NOTE(review): presumably padding to
					 * round the payload up to 8 bytes;
					 * confirm against the receive path
					 */
};

#define RDS_FLAG_RDMA_WR_BYTES	0x01
#define RDS_FLAG_RDMA_RD_BYTES	0x02

/* Extension header announcing number of paths.
 * Implicit length = 2 bytes.
 */
@@ -871,7 +886,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
				 __be16 dport, u64 seq);
int rds_message_add_extension(struct rds_header *hdr,
			      unsigned int type, const void *data, unsigned int len);
			      unsigned int type, const void *data);
int rds_message_next_extension(struct rds_header *hdr,
			       unsigned int *pos, void *buf, unsigned int *buflen);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
+2 −4
Original line number Diff line number Diff line
@@ -1459,12 +1459,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
		__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);

		rds_message_add_extension(&rm->m_inc.i_hdr,
					  RDS_EXTHDR_NPATHS, &npaths,
					  sizeof(npaths));
					  RDS_EXTHDR_NPATHS, &npaths);
		rds_message_add_extension(&rm->m_inc.i_hdr,
					  RDS_EXTHDR_GEN_NUM,
					  &my_gen_num,
					  sizeof(u32));
					  &my_gen_num);
	}
	spin_unlock_irqrestore(&cp->cp_lock, flags);