Commit 74f7c523 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'mptcp-receive-path-improvement'

Matthieu Baerts says:

====================
mptcp: receive path improvement

This series includes several changes to the MPTCP RX path. The main
goals are improving the RX performances, and increase the long term
maintainability.

Some changes reflects recent(ish) improvements introduced in the TCP
stack: patch 1, 2 and 3 are the MPTCP counter part of SKB deferral free
and auto-tuning improvements. Note that patch 3 could possibly fix
additional issues, and overall such patch should protect from similar
issues to arise in the future.

Patches 4-7 are aimed at introducing the socket backlog usage which will
be done in a later series to process the packets received by the
different subflows while the msk socket is owned.

Patch 8 is not related to the RX path, but it contains additional tests
for new features recently introduced in net-next.
====================

Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-0-5da266aa9c1a@kernel.org


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents f017c1f7 c912f935
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -370,6 +370,7 @@ void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, int *karg);
enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcvbuf_grow(struct sock *sk);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
+1 −1
Original line number Diff line number Diff line
@@ -891,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
	}
}

static void tcp_rcvbuf_grow(struct sock *sk)
void tcp_rcvbuf_grow(struct sock *sk)
{
	const struct net *net = sock_net(sk);
	struct tcp_sock *tp = tcp_sk(sk);
+95 −92
Original line number Diff line number Diff line
@@ -142,22 +142,33 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
	__kfree_skb(skb);
}

static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
			       struct sk_buff *from)
static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
				 struct sk_buff *from, bool *fragstolen,
				 int *delta)
{
	bool fragstolen;
	int delta;
	int limit = READ_ONCE(sk->sk_rcvbuf);

	if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) ||
	    MPTCP_SKB_CB(from)->offset ||
	    ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) ||
	    !skb_try_coalesce(to, from, &fragstolen, &delta))
	    ((to->len + from->len) > (limit >> 3)) ||
	    !skb_try_coalesce(to, from, fragstolen, delta))
		return false;

	pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
		 to->len, MPTCP_SKB_CB(from)->end_seq);
	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
	return true;
}

static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
			       struct sk_buff *from)
{
	bool fragstolen;
	int delta;

	if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta))
		return false;

	/* note the fwd memory can reach a negative value after accounting
	 * for the delta, but the later skb free will restore a non
@@ -179,6 +190,35 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
	return mptcp_try_coalesce((struct sock *)msk, to, from);
}

/* "inspired" by tcp_rcvbuf_grow(), main difference:
 * - mptcp does not maintain a msk-level window clamp
 * - returns true when  the receive buffer is actually updated
 */
static bool mptcp_rcvbuf_grow(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	const struct net *net = sock_net(sk);
	int rcvwin, rcvbuf, cap;

	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		return false;

	rcvwin = msk->rcvq_space.space << 1;

	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
		rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;

	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);

	rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
	if (rcvbuf > sk->sk_rcvbuf) {
		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
		return true;
	}
	return false;
}

/* "inspired" by tcp_data_queue_ofo(), main differences:
 * - use mptcp seqs
 * - don't cope with sacks
@@ -292,29 +332,16 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
end:
	skb_condense(skb);
	skb_set_owner_r(skb, sk);
	/* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
	if (sk->sk_socket)
		mptcp_rcvbuf_grow(sk);
}

static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
			     struct sk_buff *skb, unsigned int offset,
			     size_t copy_len)
static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
			   int copy_len)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *tail;
	bool has_rxtstamp;

	__skb_unlink(skb, &ssk->sk_receive_queue);

	skb_ext_reset(skb);
	skb_orphan(skb);

	/* try to fetch required memory from subflow */
	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
		goto drop;
	}

	has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;

	/* the skb map_seq accounts for the skb offset:
	 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
@@ -326,6 +353,24 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
	MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
	MPTCP_SKB_CB(skb)->cant_coalesce = 0;

	__skb_unlink(skb, &ssk->sk_receive_queue);

	skb_ext_reset(skb);
	skb_dst_drop(skb);
}

static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
{
	u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sk_buff *tail;

	/* try to fetch required memory from subflow */
	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
		goto drop;
	}

	if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
		/* in sequence */
		msk->bytes_received += copy_len;
@@ -646,7 +691,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
		if (offset < skb->len) {
			size_t len = skb->len - offset;

			ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret;
			mptcp_init_skb(ssk, skb, offset, len);
			skb_orphan(skb);
			ret = __mptcp_move_skb(sk, skb) || ret;
			seq += len;

			if (unlikely(map_remaining < len)) {
@@ -767,12 +814,8 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)

	moved = __mptcp_move_skbs_from_subflow(msk, ssk);
	__mptcp_ofo_queue(msk);
	if (unlikely(ssk->sk_err)) {
		if (!sock_owned_by_user(sk))
			__mptcp_error_report(sk);
		else
			__set_bit(MPTCP_ERROR_REPORT,  &msk->cb_flags);
	}
	if (unlikely(ssk->sk_err))
		__mptcp_subflow_error_report(sk, ssk);

	/* If the moves have caught up with the DATA_FIN sequence number
	 * it's time to ack the DATA_FIN and change socket state, but
@@ -784,18 +827,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
	return moved;
}

static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
{
	if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
		WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
}

static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	__mptcp_rcvbuf_update(sk, ssk);

	/* Wake-up the reader only for in-sequence data */
	if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
		sk->sk_data_ready(sk);
@@ -1943,12 +1978,13 @@ static int __mptcp_recvmsg_mskq(struct sock *sk,
		}

		if (!(flags & MSG_PEEK)) {
			/* avoid the indirect call, we know the destructor is sock_wfree */
			/* avoid the indirect call, we know the destructor is sock_rfree */
			skb->destructor = NULL;
			skb->sk = NULL;
			atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
			sk_mem_uncharge(sk, skb->truesize);
			__skb_unlink(skb, &sk->sk_receive_queue);
			__kfree_skb(skb);
			skb_attempt_defer_free(skb);
			msk->bytes_consumed += count;
		}

@@ -2013,26 +2049,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
		goto new_measure;

	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		u64 rcvwin, grow;
		int rcvbuf;

		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);

		do_div(grow, msk->rcvq_space.space);
		rcvwin += (grow << 1);

		rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));

		if (rcvbuf > sk->sk_rcvbuf) {
			u32 window_clamp;

			window_clamp = mptcp_win_from_space(sk, rcvbuf);
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
	msk->rcvq_space.space = msk->rcvq_space.copied;
	if (mptcp_rcvbuf_grow(sk)) {

		/* Make subflows follow along.  If we do not do this, we
		 * get drops at subflow level if skbs can't be moved to
@@ -2045,16 +2063,12 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)

			ssk = mptcp_subflow_tcp_sock(subflow);
			slow = lock_sock_fast(ssk);
				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
				WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
				if (tcp_can_send_ack(ssk))
					tcp_cleanup_rbuf(ssk, 1);
			tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
			tcp_rcvbuf_grow(ssk);
			unlock_sock_fast(ssk, slow);
		}
	}
	}

	msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
	msk->rcvq_space.copied = 0;
	msk->rcvq_space.time = mstamp;
@@ -2083,11 +2097,6 @@ static bool __mptcp_move_skbs(struct sock *sk)
	if (list_empty(&msk->conn_list))
		return false;

	/* verify we can move any data from the subflow, eventually updating */
	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		mptcp_for_each_subflow(msk, subflow)
			__mptcp_rcvbuf_update(sk, subflow->tcp_sock);

	subflow = list_first_entry(&msk->conn_list,
				   struct mptcp_subflow_context, node);
	for (;;) {
@@ -2205,14 +2214,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN) {
				/* race breaker: the shutdown could be after the
				 * previous receive queue check
				 */
				if (__mptcp_move_skbs(sk))
					continue;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			}

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
+2 −2
Original line number Diff line number Diff line
@@ -341,8 +341,8 @@ struct mptcp_sock {
	struct mptcp_pm_data	pm;
	struct mptcp_sched_ops	*sched;
	struct {
		u32	space;	/* bytes copied in last measurement window */
		u32	copied; /* bytes copied in this measurement window */
		int	space;	/* bytes copied in last measurement window */
		int	copied; /* bytes copied in this measurement window */
		u64	time;	/* start time of measurement window */
		u64	rtt_us; /* last maximum rtt of subflows */
	} rcvq_space;
+69 −0
Original line number Diff line number Diff line
@@ -2320,6 +2320,74 @@ signal_address_tests()
	fi
}

laminar_endp_tests()
{
	# no laminar endpoints: routing rules are used
	if reset_with_tcp_filter "without a laminar endpoint" ns1 10.0.2.2 REJECT &&
	   mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then
		pm_nl_set_limits $ns1 0 2
		pm_nl_set_limits $ns2 2 2
		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
		run_tests $ns1 $ns2 10.0.1.1
		join_syn_tx=1 \
			chk_join_nr 0 0 0
		chk_add_nr 1 1
	fi

	# laminar endpoints: this endpoint is used
	if reset_with_tcp_filter "with a laminar endpoint" ns1 10.0.2.2 REJECT &&
	   mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then
		pm_nl_set_limits $ns1 0 2
		pm_nl_set_limits $ns2 2 2
		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
		pm_nl_add_endpoint $ns2 10.0.3.2 flags laminar
		run_tests $ns1 $ns2 10.0.1.1
		chk_join_nr 1 1 1
		chk_add_nr 1 1
	fi

	# laminar endpoints: these endpoints are used
	if reset_with_tcp_filter "with multiple laminar endpoints" ns1 10.0.2.2 REJECT &&
	   mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then
		pm_nl_set_limits $ns1 0 2
		pm_nl_set_limits $ns2 2 2
		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
		pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
		pm_nl_add_endpoint $ns2 dead:beef:3::2 flags laminar
		pm_nl_add_endpoint $ns2 10.0.3.2 flags laminar
		pm_nl_add_endpoint $ns2 10.0.4.2 flags laminar
		run_tests $ns1 $ns2 10.0.1.1
		chk_join_nr 2 2 2
		chk_add_nr 2 2
	fi

	# laminar endpoints: only one endpoint is used
	if reset_with_tcp_filter "single laminar endpoint" ns1 10.0.2.2 REJECT &&
	   mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then
		pm_nl_set_limits $ns1 0 2
		pm_nl_set_limits $ns2 2 2
		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
		pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
		pm_nl_add_endpoint $ns2 10.0.3.2 flags laminar
		run_tests $ns1 $ns2 10.0.1.1
		chk_join_nr 1 1 1
		chk_add_nr 2 2
	fi

	# laminar endpoints: subflow and laminar flags
	if reset_with_tcp_filter "sublow + laminar endpoints" ns1 10.0.2.2 REJECT &&
	   mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then
		pm_nl_set_limits $ns1 0 4
		pm_nl_set_limits $ns2 2 4
		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
		pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,laminar
		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,laminar
		run_tests $ns1 $ns2 10.0.1.1
		chk_join_nr 1 1 1
		chk_add_nr 1 1
	fi
}

link_failure_tests()
{
	# accept and use add_addr with additional subflows and link loss
@@ -4109,6 +4177,7 @@ all_tests_sorted=(
	f@subflows_tests
	e@subflows_error_tests
	s@signal_address_tests
	L@laminar_endp_tests
	l@link_failure_tests
	t@add_addr_timeout_tests
	r@remove_tests
Loading