Commit eeaedc54 authored by David Howells, committed by Jakub Kicinski
Browse files

rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899)



Implement path-MTU probing (along the lines of RFC8899) by padding some of
the PING ACKs we send.  PING ACKs get their own individual responses quite
apart from the acking of data (though, as ACKs, they fulfil that role
also).

The probing concentrates on packet sizes that correspond to how many
subpackets can be stuffed inside a jumbo packet as jumbo DATA packets are
just aggregations of individual DATA packets and can be split easily for
retransmission purposes.

If we want to perform probing, we advertise this by setting the maximum
number of jumbo subpackets to 0 in the ack trailer when we send an ACK and
see if the peer is also advertising the service.  A value of 0 is
interpreted by non-supporting Rx stacks as an indication that jumbo packets
aren't supported.

The MTU sizes advertised in the ACK trailer AF_RXRPC transmits are pegged
at a maximum of 1444 unless pmtud is supported by both sides.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20241204074710.990092-10-dhowells@redhat.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 420f8af5
Loading
Loading
Loading
Loading
+124 −0
Original line number Diff line number Diff line
@@ -364,6 +364,7 @@
	EM(rxrpc_propose_ack_ping_for_lost_ack,	"LostAck") \
	EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \
	EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \
	EM(rxrpc_propose_ack_ping_for_mtu_probe, "MTUProb") \
	EM(rxrpc_propose_ack_ping_for_old_rtt,	"OldRtt ") \
	EM(rxrpc_propose_ack_ping_for_params,	"Params ") \
	EM(rxrpc_propose_ack_ping_for_rtt,	"Rtt    ") \
@@ -478,6 +479,11 @@
	EM(rxrpc_txbuf_see_send_more,		"SEE SEND+  ")	\
	E_(rxrpc_txbuf_see_unacked,		"SEE UNACKED")

/*
 * Reasons attached to a path-MTU reduction.  The list is expanded into
 * enum rxrpc_pmtud_reduce_trace and is also the symbol table that the
 * rxrpc_pmtud_reduce tracepoint hands to __print_symbolic().
 */
#define rxrpc_pmtud_reduce_traces \
	EM(rxrpc_pmtud_reduce_ack,		"Ack  ")	\
	EM(rxrpc_pmtud_reduce_icmp,		"Icmp ")	\
	E_(rxrpc_pmtud_reduce_route,		"Route")

/*
 * Generate enums for tracing information.
 */
@@ -498,6 +504,7 @@ enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte);
enum rxrpc_conn_trace		{ rxrpc_conn_traces } __mode(byte);
enum rxrpc_local_trace		{ rxrpc_local_traces } __mode(byte);
enum rxrpc_peer_trace		{ rxrpc_peer_traces } __mode(byte);
enum rxrpc_pmtud_reduce_trace	{ rxrpc_pmtud_reduce_traces } __mode(byte);
enum rxrpc_propose_ack_outcome	{ rxrpc_propose_ack_outcomes } __mode(byte);
enum rxrpc_propose_ack_trace	{ rxrpc_propose_ack_traces } __mode(byte);
enum rxrpc_receive_trace	{ rxrpc_receive_traces } __mode(byte);
@@ -534,6 +541,7 @@ rxrpc_congest_changes;
rxrpc_congest_modes;
rxrpc_conn_traces;
rxrpc_local_traces;
rxrpc_pmtud_reduce_traces;
rxrpc_propose_ack_traces;
rxrpc_receive_traces;
rxrpc_recvmsg_traces;
@@ -2040,6 +2048,122 @@ TRACE_EVENT(rxrpc_sack,
		      __entry->sack)
	    );

/*
 * Trace the path-MTU probe state as seen from a call (presumably logged when
 * a probe is transmitted - confirm against the caller).  Records the peer and
 * call debug IDs, the probe serial held in call->conn->pmtud_probe and the
 * peer's trial/good/bad probe sizes.  Note that the sizes are printed in the
 * order good-trial-bad, which differs from the order the fields are declared.
 *
 * NOTE(review): the three sizes are stored as unsigned short whereas the
 * struct rxrpc_peer members they are copied from are unsigned int, so a value
 * above 65535 would be truncated in the trace - presumably probe sizes never
 * get that large; confirm.
 */
TRACE_EVENT(rxrpc_pmtud_tx,
	    TP_PROTO(struct rxrpc_call *call),

	    TP_ARGS(call),

	    TP_STRUCT__entry(
		    __field(unsigned int,	peer_debug_id)
		    __field(unsigned int,	call_debug_id)
		    __field(rxrpc_serial_t,	ping_serial)
		    __field(unsigned short,	pmtud_trial)
		    __field(unsigned short,	pmtud_good)
		    __field(unsigned short,	pmtud_bad)
			     ),

	    TP_fast_assign(
		    __entry->peer_debug_id = call->peer->debug_id;
		    __entry->call_debug_id = call->debug_id;
		    __entry->ping_serial = call->conn->pmtud_probe;
		    __entry->pmtud_trial = call->peer->pmtud_trial;
		    __entry->pmtud_good = call->peer->pmtud_good;
		    __entry->pmtud_bad = call->peer->pmtud_bad;
			   ),

	    TP_printk("P=%08x c=%08x pr=%08x %u-%u-%u",
		      __entry->peer_debug_id,
		      __entry->call_debug_id,
		      __entry->ping_serial,
		      __entry->pmtud_good,
		      __entry->pmtud_trial,
		      __entry->pmtud_bad)
	    );

/*
 * Trace path-MTU probe state on a connection together with a response serial
 * supplied by the caller (presumably logged when a probe response is
 * received - confirm against the caller).  Records the peer debug ID, the ID
 * of the call used for the probe (conn->pmtud_call), the probe serial
 * (conn->pmtud_probe), the response serial, and the peer's max_data and
 * pmtud_jumbo values.
 *
 * NOTE(review): max_data is stored as unsigned short whereas
 * struct rxrpc_peer::max_data is unsigned int, so a value above 65535 would
 * be truncated in the trace - presumably it never gets that large; confirm.
 */
TRACE_EVENT(rxrpc_pmtud_rx,
	    TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial),

	    TP_ARGS(conn, resp_serial),

	    TP_STRUCT__entry(
		    __field(unsigned int,	peer_debug_id)
		    __field(unsigned int,	call_debug_id)
		    __field(rxrpc_serial_t,	ping_serial)
		    __field(rxrpc_serial_t,	resp_serial)
		    __field(unsigned short,	max_data)
		    __field(u8,			jumbo_max)
			     ),

	    TP_fast_assign(
		    __entry->peer_debug_id = conn->peer->debug_id;
		    __entry->call_debug_id = conn->pmtud_call;
		    __entry->ping_serial = conn->pmtud_probe;
		    __entry->resp_serial = resp_serial;
		    __entry->max_data = conn->peer->max_data;
		    __entry->jumbo_max = conn->peer->pmtud_jumbo;
			   ),

	    TP_printk("P=%08x c=%08x pr=%08x rr=%08x max=%u jm=%u",
		      __entry->peer_debug_id,
		      __entry->call_debug_id,
		      __entry->ping_serial,
		      __entry->resp_serial,
		      __entry->max_data,
		      __entry->jumbo_max)
	    );

/*
 * Trace a path-MTU probe being treated as lost.  Records the peer debug ID,
 * the ID of the call used for the probe (conn->pmtud_call), the probe serial
 * (conn->pmtud_probe) and a response serial supplied by the caller.  The
 * response serial may be 0: rxrpc_clean_up_connection() passes 0 when it
 * tears down a connection that still has a probe serial recorded.
 */
TRACE_EVENT(rxrpc_pmtud_lost,
	    TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial),

	    TP_ARGS(conn, resp_serial),

	    TP_STRUCT__entry(
		    __field(unsigned int,	peer_debug_id)
		    __field(unsigned int,	call_debug_id)
		    __field(rxrpc_serial_t,	ping_serial)
		    __field(rxrpc_serial_t,	resp_serial)
			     ),

	    TP_fast_assign(
		    __entry->peer_debug_id = conn->peer->debug_id;
		    __entry->call_debug_id = conn->pmtud_call;
		    __entry->ping_serial = conn->pmtud_probe;
		    __entry->resp_serial = resp_serial;
			   ),

	    TP_printk("P=%08x c=%08x pr=%08x rr=%08x",
		      __entry->peer_debug_id,
		      __entry->call_debug_id,
		      __entry->ping_serial,
		      __entry->resp_serial)
	    );

/*
 * Trace a reduction of the path MTU for a peer.  Records the peer debug ID,
 * a serial number and max_data value supplied by the caller, and the reason,
 * which is printed symbolically using the rxrpc_pmtud_reduce_traces table.
 * Output order is: peer, reason, serial (r=), max_data (m=).
 */
TRACE_EVENT(rxrpc_pmtud_reduce,
	    TP_PROTO(struct rxrpc_peer *peer, rxrpc_serial_t serial,
		     unsigned int max_data, enum rxrpc_pmtud_reduce_trace reason),

	    TP_ARGS(peer, serial, max_data, reason),

	    TP_STRUCT__entry(
		    __field(unsigned int,	peer_debug_id)
		    __field(rxrpc_serial_t,	serial)
		    __field(unsigned int,	max_data)
		    __field(enum rxrpc_pmtud_reduce_trace, reason)
			     ),

	    TP_fast_assign(
		    __entry->peer_debug_id = peer->debug_id;
		    __entry->serial = serial;
		    __entry->max_data = max_data;
		    __entry->reason = reason;
			   ),

	    TP_printk("P=%08x %s r=%08x m=%u",
		      __entry->peer_debug_id,
		      __print_symbolic(__entry->reason, rxrpc_pmtud_reduce_traces),
		      __entry->serial, __entry->max_data)
	    );

#undef EM
#undef E_

+21 −4
Original line number Diff line number Diff line
@@ -344,13 +344,25 @@ struct rxrpc_peer {
	time64_t		last_tx_at;	/* Last time packet sent here */
	seqlock_t		service_conn_lock;
	spinlock_t		lock;		/* access lock */
	unsigned int		if_mtu;		/* interface MTU for this peer */
	unsigned int		mtu;		/* network MTU for this peer */
	unsigned int		maxdata;	/* data size (MTU - hdrsize) */
	unsigned short		hdrsize;	/* header size (IP + UDP + RxRPC) */
	int			debug_id;	/* debug ID for printks */
	struct sockaddr_rxrpc	srx;		/* remote address */

	/* Path MTU discovery [RFC8899] */
	unsigned int		pmtud_trial;	/* Current MTU probe size */
	unsigned int		pmtud_good;	/* Largest working MTU probe we've tried */
	unsigned int		pmtud_bad;	/* Smallest non-working MTU probe we've tried */
	bool			pmtud_lost;	/* T if MTU probe was lost */
	bool			pmtud_probing;	/* T if we have an active probe outstanding */
	bool			pmtud_pending;	/* T if a call to this peer should send a probe */
	u8			pmtud_jumbo;	/* Max jumbo packets for the MTU */
	bool			ackr_adv_pmtud;	/* T if the peer advertises path-MTU */
	unsigned int		ackr_max_data;	/* Maximum data advertised by peer */
	seqcount_t		mtu_lock;	/* Lockless MTU access management */
	unsigned int		if_mtu;		/* Local interface MTU (- hdrsize) for this peer */
	unsigned int		max_data;	/* Maximum packet data capacity for this peer */
	unsigned short		hdrsize;	/* header size (IP + UDP + RxRPC) */
	unsigned short		tx_seg_max;	/* Maximum number of transmissable segments */

	/* calculated RTT cache */
#define RXRPC_RTT_CACHE_SIZE 32
	spinlock_t		rtt_input_lock;	/* RTT lock for input routine */
@@ -531,6 +543,8 @@ struct rxrpc_connection {
	int			debug_id;	/* debug ID for printks */
	rxrpc_serial_t		tx_serial;	/* Outgoing packet serial number counter */
	unsigned int		hi_serial;	/* highest serial number received */
	rxrpc_serial_t		pmtud_probe;	/* Serial of MTU probe (or 0) */
	unsigned int		pmtud_call;	/* ID of call used for probe */
	u32			service_id;	/* Service ID, possibly upgraded */
	u32			security_level;	/* Security level selected */
	u8			security_ix;	/* security type */
@@ -1155,6 +1169,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
 */
void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
		    rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why);
void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call);
int rxrpc_send_abort_packet(struct rxrpc_call *);
void rxrpc_send_conn_abort(struct rxrpc_connection *conn);
void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb);
@@ -1166,6 +1181,8 @@ void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb);
 */
void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *);
void rxrpc_peer_keepalive_worker(struct work_struct *);
void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
				 bool sendmsg_fail);

/*
 * peer_object.c
+5 −0
Original line number Diff line number Diff line
@@ -483,6 +483,11 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
			rxrpc_disconnect_call(call);
		if (call->security)
			call->security->free_call_crypto(call);
	} else {
		if (skb &&
		    call->peer->ackr_adv_pmtud &&
		    call->peer->pmtud_pending)
			rxrpc_send_probe_for_pmtud(call);
	}
	if (call->acks_hard_ack != call->tx_bottom)
		rxrpc_shrink_call_tx_buffer(call);
+11 −6
Original line number Diff line number Diff line
@@ -92,7 +92,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
	struct rxrpc_acktrailer trailer;
	size_t len;
	int ret, ioc;
	u32 serial, mtu, call_id, padding;
	u32 serial, max_mtu, if_mtu, call_id, padding;

	_enter("%d", conn->debug_id);

@@ -150,8 +150,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
		break;

	case RXRPC_PACKET_TYPE_ACK:
		mtu = conn->peer->if_mtu;
		mtu -= conn->peer->hdrsize;
		if_mtu = conn->peer->if_mtu - conn->peer->hdrsize;
		if (conn->peer->ackr_adv_pmtud) {
			max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu);
		} else {
			if_mtu = umin(1444, if_mtu);
			max_mtu = if_mtu;
		}
		pkt.ack.bufferSpace	= 0;
		pkt.ack.maxSkew		= htons(skb ? skb->priority : 0);
		pkt.ack.firstPacket	= htonl(chan->last_seq + 1);
@@ -159,10 +164,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
		pkt.ack.serial		= htonl(skb ? sp->hdr.serial : 0);
		pkt.ack.reason		= skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE;
		pkt.ack.nAcks		= 0;
		trailer.maxMTU		= htonl(rxrpc_rx_mtu);
		trailer.ifMTU		= htonl(mtu);
		trailer.maxMTU		= htonl(max_mtu);
		trailer.ifMTU		= htonl(if_mtu);
		trailer.rwind		= htonl(rxrpc_rx_window_size);
		trailer.jumbo_max	= htonl(rxrpc_rx_jumbo_max);
		trailer.jumbo_max	= 0;
		pkt.whdr.flags		|= RXRPC_SLOW_START_OK;
		padding			= 0;
		iov[0].iov_len += sizeof(pkt.ack);
+6 −0
Original line number Diff line number Diff line
@@ -321,6 +321,12 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
	list_del_init(&conn->proc_link);
	write_unlock(&rxnet->conn_lock);

	if (conn->pmtud_probe) {
		trace_rxrpc_pmtud_lost(conn, 0);
		conn->peer->pmtud_probing = false;
		conn->peer->pmtud_pending = true;
	}

	rxrpc_purge_queue(&conn->rx_queue);

	rxrpc_kill_client_conn(conn);
Loading