Commit 22af030f authored by Jakub Kicinski
Browse files

Merge branch 'mptcp-rx-path-refactor'

Matthieu Baerts says:

====================
mptcp: rx path refactor

Paolo worked on this RX path refactor for these two main reasons:

- Currently, the MPTCP RX path introduces quite a bit of 'exceptional'
  accounting/locking processing WRT plain TCP, adding to the
  implementation complexity in a miserable way.

- The performance gap WRT plain TCP for single subflow connections is
  quite measurable.

The present refactor addresses both the above items: most of the
additional complexity is dropped, and single-stream performance
increases measurably, from 55Gbps to 71Gbps in Paolo's loopback test.
As a reference, plain TCP was around 84Gbps on the same host.

The above comes at a price: the patches are invasive, even in subtle ways.

Note: patch 5/7 removes the sk_forward_alloc_get() helper, which caused
some trivial modifications in different places in the net tree: sockets,
IPv4, sched. That's why a few more people have been Cc'd here. Feel free
to only look at this patch 5/7.
====================

Link: https://patch.msgid.link/20250218-net-next-mptcp-rx-path-refactor-v1-0-4a47d90d7998@kernel.org


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 9a6c2b2b e0ca4057
Loading
Loading
Loading
Loading
+0 −13
Original line number Diff line number Diff line
@@ -1285,10 +1285,6 @@ struct proto {
	unsigned int		inuse_idx;
#endif

#if IS_ENABLED(CONFIG_MPTCP)
	int			(*forward_alloc_get)(const struct sock *sk);
#endif

	bool			(*stream_memory_free)(const struct sock *sk, int wake);
	bool			(*sock_is_readable)(struct sock *sk);
	/* Memory pressure */
@@ -1349,15 +1345,6 @@ int sock_load_diag_module(int family, int protocol);

INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));

/* Return the forward_alloc value for @sk.
 *
 * When CONFIG_MPTCP is enabled and the socket's protocol provides a
 * forward_alloc_get() callback, the result of that callback is returned.
 * Otherwise sk->sk_forward_alloc is read with READ_ONCE().
 */
static inline int sk_forward_alloc_get(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP)
	if (sk->sk_prot->forward_alloc_get)
		return sk->sk_prot->forward_alloc_get(sk);
#endif
	return READ_ONCE(sk->sk_forward_alloc);
}

static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
	if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
+1 −1
Original line number Diff line number Diff line
@@ -3882,7 +3882,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
	mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
+1 −1
Original line number Diff line number Diff line
@@ -153,7 +153,7 @@ void inet_sock_destruct(struct sock *sk)
	WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	WARN_ON_ONCE(sk->sk_wmem_queued);
	WARN_ON_ONCE(sk_forward_alloc_get(sk));
	WARN_ON_ONCE(sk->sk_forward_alloc);

	kfree(rcu_dereference_protected(inet->inet_opt, 1));
	dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
+1 −1
Original line number Diff line number Diff line
@@ -282,7 +282,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
		struct inet_diag_meminfo minfo = {
			.idiag_rmem = sk_rmem_alloc_get(sk),
			.idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
			.idiag_fmem = sk_forward_alloc_get(sk),
			.idiag_fmem = READ_ONCE(sk->sk_forward_alloc),
			.idiag_tmem = sk_wmem_alloc_get(sk),
		};

+4 −23
Original line number Diff line number Diff line
@@ -40,17 +40,17 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
	tp->copied_seq += skb->len;
	subflow->ssn_offset += skb->len;

	/* initialize a dummy sequence number, we will update it at MPC
	 * completion, if needed
	 */
	/* Only the sequence delta is relevant */
	MPTCP_SKB_CB(skb)->map_seq = -skb->len;
	MPTCP_SKB_CB(skb)->end_seq = 0;
	MPTCP_SKB_CB(skb)->offset = 0;
	MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
	MPTCP_SKB_CB(skb)->cant_coalesce = 1;

	mptcp_data_lock(sk);
	DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk));

	mptcp_set_owner_r(skb, sk);
	skb_set_owner_r(skb, sk);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	mptcp_sk(sk)->bytes_received += skb->len;

@@ -58,22 +58,3 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf

	mptcp_data_unlock(sk);
}

/* Shift the MPTCP sequence numbers of the last skb on the msk receive
 * queue, if there is one, by msk->ack_seq.
 *
 * @msk:     MPTCP socket whose receive queue tail is inspected and whose
 *           ack_seq is added to the skb's map_seq and end_seq.
 * @subflow: not referenced in the body.
 * @mp_opt:  not referenced in the body.
 *
 * NOTE(review): presumably the tail skb is the one queued by
 * mptcp_fastopen_subflow_synack_set_params() with map_seq = -skb->len and
 * end_seq = 0 -- confirm against the callers.
 */
void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
				     const struct mptcp_options_received *mp_opt)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb;

	skb = skb_peek_tail(&sk->sk_receive_queue);
	if (skb) {
		/* end_seq is expected to be zero before the shift; warn once if not */
		WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq);
		pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk,
			 MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq,
			 MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq);
		/* Apply the same ack_seq delta to both ends of the mapping */
		MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq;
		MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq;
	}

	pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq);
}
Loading