Commit 8846f9a0 authored by Jakub Kicinski
Browse files

Merge branch 'mptcp-features-and-fixes-for-v6-7'

Mat Martineau says:

====================
mptcp: Features and fixes for v6.7

Patch 1 adds a configurable timeout for the MPTCP connection when all
subflows are closed, to support break-before-make use cases.

Patch 2 is a fix for a 1-byte error in rx data counters with MPTCP
fastopen connections.

Patch 3 is a minor code cleanup.

Patches 4 & 5 add handling of rcvlowat for MPTCP sockets, with a
prerequisite patch to use a common scaling ratio between TCP and MPTCP.

Patch 6 improves efficiency of memory copying in MPTCP transmit code.

Patch 7 refactors syncing of socket options from the MPTCP socket to
its subflows.

Patches 8 & 9 help the MPTCP packet scheduler perform well by changing
the handling of notsent_lowat in subflows and how available buffer space
is calculated for MPTCP-level sends.
====================

Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents aad36cd3 8005184f
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -25,6 +25,17 @@ add_addr_timeout - INTEGER (seconds)

	Default: 120

close_timeout - INTEGER (seconds)
	Set the make-after-break timeout: in the absence of any close or
	shutdown syscall, MPTCP sockets keep their state unchanged for
	this long after the last subflow is removed, before moving to
	TCP_CLOSE.

	The default value matches TCP_TIMEWAIT_LEN. This is a per-namespace
	sysctl.

	Default: 60

checksum_enabled - BOOLEAN
	Control whether DSS checksum can be enabled.

+7 −5
Original line number Diff line number Diff line
@@ -1489,13 +1489,15 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
	return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
}

static inline void tcp_scaling_ratio_init(struct sock *sk)
{
/* Assume a conservative default of 1200 bytes of payload per 4K page.
 * This may be adjusted later in tcp_measure_rcv_mss().
 */
	tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
				    SKB_TRUESIZE(4096);
#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
				   SKB_TRUESIZE(4096))

static inline void tcp_scaling_ratio_init(struct sock *sk)
{
	tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
}

/* Note: caller must be prepared to deal with negative returns */
+16 −0
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ struct mptcp_pernet {
#endif

	unsigned int add_addr_timeout;
	unsigned int close_timeout;
	unsigned int stale_loss_cnt;
	u8 mptcp_enabled;
	u8 checksum_enabled;
@@ -65,6 +66,13 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net)
	return mptcp_get_pernet(net)->stale_loss_cnt;
}

unsigned int mptcp_close_timeout(const struct sock *sk)
{
	if (sock_flag(sk, SOCK_DEAD))
		return TCP_TIMEWAIT_LEN;
	return mptcp_get_pernet(sock_net(sk))->close_timeout;
}

int mptcp_get_pm_type(const struct net *net)
{
	return mptcp_get_pernet(net)->pm_type;
@@ -79,6 +87,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
	pernet->mptcp_enabled = 1;
	pernet->add_addr_timeout = TCP_RTO_MAX;
	pernet->close_timeout = TCP_TIMEWAIT_LEN;
	pernet->checksum_enabled = 0;
	pernet->allow_join_initial_addr_port = 1;
	pernet->stale_loss_cnt = 4;
@@ -141,6 +150,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
		.mode = 0644,
		.proc_handler = proc_dostring,
	},
	{
		.procname = "close_timeout",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{}
};

@@ -163,6 +178,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
	table[4].data = &pernet->stale_loss_cnt;
	table[5].data = &pernet->pm_type;
	table[6].data = &pernet->scheduler;
	table[7].data = &pernet->close_timeout;

	hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
				     ARRAY_SIZE(mptcp_sysctl_table));
+1 −0
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf

	mptcp_set_owner_r(skb, sk);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	mptcp_sk(sk)->bytes_received += skb->len;

	sk->sk_data_ready(sk);

+45 −24
Original line number Diff line number Diff line
@@ -121,8 +121,6 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
		ret = __mptcp_socket_create(msk);
		if (ret)
			return ERR_PTR(ret);

		mptcp_sockopt_sync(msk, msk->first);
	}

	return msk->first;
@@ -863,9 +861,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)

	/* Wake-up the reader only for in-sequence data */
	mptcp_data_lock(sk);
	if (move_skbs_to_msk(msk, ssk))
	if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
		sk->sk_data_ready(sk);

	mptcp_data_unlock(sk);
}

@@ -893,6 +890,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
	mptcp_sockopt_sync_locked(msk, ssk);
	mptcp_subflow_joined(msk, ssk);
	mptcp_stop_tout_timer(sk);
	__mptcp_propagate_sndbuf(sk, ssk);
	return true;
}

@@ -1079,15 +1077,16 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
	struct mptcp_sock *msk = mptcp_sk(sk);
	bool first = true;

	sk_stream_moderate_sndbuf(sk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (first)
			tcp_enter_memory_pressure(ssk);
		sk_stream_moderate_sndbuf(ssk);

		first = false;
	}
	__mptcp_sync_sndbuf(sk);
}

/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
@@ -1761,6 +1760,18 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
	return ret;
}

/* Copy @copy bytes from the iterator @from into the buffer @to.
 *
 * The nocache variant of the copy helper is selected when @sk advertises
 * NETIF_F_NOCACHE_COPY in sk_route_caps; otherwise the regular one is used.
 *
 * Returns 0 when the full amount was copied, -EFAULT otherwise.
 */
static int do_copy_data_nocache(struct sock *sk, int copy,
				struct iov_iter *from, char *to)
{
	if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
		return copy_from_iter_full_nocache(to, copy, from) ? 0 : -EFAULT;

	return copy_from_iter_full(to, copy, from) ? 0 : -EFAULT;
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1834,11 +1845,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
		if (!sk_wmem_schedule(sk, total_ts))
			goto wait_for_memory;

		if (copy_page_from_iter(dfrag->page, offset, psize,
					&msg->msg_iter) != psize) {
			ret = -EFAULT;
		ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
					   page_address(dfrag->page) + offset);
		if (ret)
			goto do_error;
		}

		/* data successfully copied into the write queue */
		sk_forward_alloc_add(sk, -total_ts);
@@ -1922,6 +1932,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
			if (!(flags & MSG_PEEK)) {
				MPTCP_SKB_CB(skb)->offset += count;
				MPTCP_SKB_CB(skb)->map_seq += count;
				msk->bytes_consumed += count;
			}
			break;
		}
@@ -1932,6 +1943,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
			WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
			__skb_unlink(skb, &msk->receive_queue);
			__kfree_skb(skb);
			msk->bytes_consumed += count;
		}

		if (copied >= len)
@@ -2391,8 +2403,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
	if (msk->in_accept_queue && msk->first == ssk &&
	    (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
		/* ensure later check in mptcp_worker() will dispose the msk */
		mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
		sock_set_flag(sk, SOCK_DEAD);
		mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1));
		lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
		mptcp_subflow_drop_ctx(ssk);
		goto out_release;
@@ -2448,6 +2460,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
		WRITE_ONCE(msk->first, NULL);

out:
	__mptcp_sync_sndbuf(sk);
	if (need_push)
		__mptcp_push_pending(sk, 0);

@@ -2516,7 +2529,7 @@ static bool mptcp_close_tout_expired(const struct sock *sk)
		return false;

	return time_after32(tcp_jiffies32,
		  inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN);
		  inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk));
}

static void mptcp_check_fastclose(struct mptcp_sock *msk)
@@ -2659,7 +2672,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
		return;

	close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
			TCP_TIMEWAIT_LEN;
			mptcp_close_timeout(sk);

	/* the close timeout takes precedence on the fail one, and here at least one of
	 * them is active
@@ -2755,6 +2768,7 @@ static void __mptcp_init_sock(struct sock *sk)
	msk->rmem_fwd_alloc = 0;
	WRITE_ONCE(msk->rmem_released, 0);
	msk->timer_ival = TCP_RTO_MIN;
	msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;

	WRITE_ONCE(msk->first, NULL);
	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -2964,16 +2978,9 @@ void __mptcp_unaccepted_force_close(struct sock *sk)
	__mptcp_destroy_sock(sk);
}

static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
static __poll_t mptcp_check_readable(struct sock *sk)
{
	/* Concurrent splices from sk_receive_queue into receive_queue will
	 * always show at least one non-empty queue when checked in this order.
	 */
	if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
	    skb_queue_empty_lockless(&msk->receive_queue))
		return 0;

	return EPOLLIN | EPOLLRDNORM;
	return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0;
}

static void mptcp_check_listen_stop(struct sock *sk)
@@ -3011,7 +3018,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
		goto cleanup;
	}

	if (mptcp_check_readable(msk) || timeout < 0) {
	if (mptcp_data_avail(msk) || timeout < 0) {
		/* If the msk has read data, or the caller explicitly ask it,
		 * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
		 */
@@ -3138,6 +3145,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
	msk->snd_data_fin_enable = false;
	msk->rcv_fastclose = false;
	msk->use_64bit_ack = false;
	msk->bytes_consumed = 0;
	WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
	mptcp_pm_data_reset(msk);
	mptcp_ca_reset(sk);
@@ -3219,7 +3227,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
	 * uses the correct data
	 */
	mptcp_copy_inaddrs(nsk, ssk);
	mptcp_propagate_sndbuf(nsk, ssk);
	__mptcp_propagate_sndbuf(nsk, ssk);

	mptcp_rcv_space_init(msk, ssk);
	bh_unlock_sock(nsk);
@@ -3397,6 +3405,8 @@ static void mptcp_release_cb(struct sock *sk)
			__mptcp_set_connected(sk);
		if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
			__mptcp_error_report(sk);
		if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
			__mptcp_sync_sndbuf(sk);
	}

	__mptcp_update_rmem(sk);
@@ -3441,6 +3451,14 @@ void mptcp_subflow_process_delegated(struct sock *ssk, long status)
			__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
		mptcp_data_unlock(sk);
	}
	if (status & BIT(MPTCP_DELEGATE_SNDBUF)) {
		mptcp_data_lock(sk);
		if (!sock_owned_by_user(sk))
			__mptcp_sync_sndbuf(sk);
		else
			__set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
		mptcp_data_unlock(sk);
	}
	if (status & BIT(MPTCP_DELEGATE_ACK))
		schedule_3rdack_retransmission(ssk);
}
@@ -3525,6 +3543,7 @@ bool mptcp_finish_join(struct sock *ssk)
	/* active subflow, already present inside the conn_list */
	if (!list_empty(&subflow->node)) {
		mptcp_subflow_joined(msk, ssk);
		mptcp_propagate_sndbuf(parent, ssk);
		return true;
	}

@@ -3909,7 +3928,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
		mask |= mptcp_check_readable(msk);
		mask |= mptcp_check_readable(sk);
		if (shutdown & SEND_SHUTDOWN)
			mask |= EPOLLOUT | EPOLLWRNORM;
		else
@@ -3947,6 +3966,7 @@ static const struct proto_ops mptcp_stream_ops = {
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.set_rcvlowat	   = mptcp_set_rcvlowat,
};

static struct inet_protosw mptcp_protosw = {
@@ -4048,6 +4068,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
#ifdef CONFIG_COMPAT
	.compat_ioctl	   = inet6_compat_ioctl,
#endif
	.set_rcvlowat	   = mptcp_set_rcvlowat,
};

static struct proto mptcp_v6_prot;
Loading