Commit b78fcd0a authored by David S. Miller
Browse files

Merge branch 'mptcp-lowat-sockopt'



Matthieu Baerts says:

====================
mptcp: add TCP_NOTSENT_LOWAT sockopt support

Patch 3 does the magic of adding TCP_NOTSENT_LOWAT support; all the
other ones are minor cleanups spotted along the way while working on
the new feature.

Note that this feature relies on the existing accounting for snd_nxt.
Such accounting is not 110% accurate as it tracks the most recent
sequence number queued to any subflow, and not the actual sequence
number sent on the wire. Paolo experimented a lot, trying to implement
the latter, and in the end it proved to be both "too complex" and "not
necessary".

The complexity arises from the need for an additional lock and a lot of
refactoring to introduce such protections without adding significant
overhead. Additionally, snd_nxt is currently used and exposed with the
current semantics by the internal packet scheduling. Introducing a
different tracking will still require us to keep the old one.

More interestingly, a more accurate tracking may not be strictly
necessary: as the MPTCP socket enqueues data to the subflows only up
to the available send window, any enqueued data is sent on the wire
instantly, without any blocking operation, short of a drop in the tx
path at the nft or TC layer.
====================

Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
parents 26b5df99 7f71a337
Loading
Loading
Loading
Loading
+37 −17
Original line number Diff line number Diff line
@@ -1692,15 +1692,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
	}
}

static void mptcp_set_nospace(struct sock *sk)
{
	/* enable autotune */
	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

	/* will be cleared on avail space */
	set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
}

static int mptcp_disconnect(struct sock *sk, int flags);

static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
@@ -1771,6 +1762,30 @@ static int do_copy_data_nocache(struct sock *sk, int copy,
	return 0;
}

/* Return how many more bytes may be queued on @sk right now:
 * 0 when nothing can be queued, UINT_MAX when the not-sent low-water
 * mark imposes no bound.
 *
 * This open-codes sk_stream_memory_free() plus the sent limit
 * computation, avoiding indirect calls in the fast-path. It is called
 * under the msk socket lock, so a bunch of ONCE annotations can be
 * avoided.
 */
static u32 mptcp_send_limit(const struct sock *sk)
{
	const struct mptcp_sock *msk = mptcp_sk(sk);
	u32 lowat, pending;

	/* queued write memory already reached the send buffer size */
	if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf))
		return 0;

	lowat = mptcp_notsent_lowat(sk);
	if (lowat == UINT_MAX)
		return UINT_MAX;

	/* bytes queued at the MPTCP level but not yet passed snd_nxt */
	pending = msk->write_seq - msk->snd_nxt;

	return pending < lowat ? lowat - pending : 0;
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1815,6 +1830,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
		struct mptcp_data_frag *dfrag;
		bool dfrag_collapsed;
		size_t psize, offset;
		u32 copy_limit;

		/* ensure fitting the notsent_lowat() constraint */
		copy_limit = mptcp_send_limit(sk);
		if (!copy_limit)
			goto wait_for_memory;

		/* reuse tail pfrag, if possible, or carve a new one from the
		 * page allocator
@@ -1822,9 +1843,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
		dfrag = mptcp_pending_tail(sk);
		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
		if (!dfrag_collapsed) {
			if (!sk_stream_memory_free(sk))
				goto wait_for_memory;

			if (!mptcp_page_frag_refill(sk, pfrag))
				goto wait_for_memory;

@@ -1839,6 +1857,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
		offset = dfrag->offset + dfrag->data_len;
		psize = pfrag->size - offset;
		psize = min_t(size_t, psize, msg_data_left(msg));
		psize = min_t(size_t, psize, copy_limit);
		total_ts = psize + frag_truesize;

		if (!sk_wmem_schedule(sk, total_ts))
@@ -1874,7 +1893,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
		continue;

wait_for_memory:
		mptcp_set_nospace(sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		__mptcp_push_pending(sk, msg->msg_flags);
		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
@@ -3769,6 +3788,7 @@ static struct proto mptcp_prot = {
	.unhash		= mptcp_unhash,
	.get_port	= mptcp_get_port,
	.forward_alloc_get	= mptcp_forward_alloc_get,
	.stream_memory_free	= mptcp_stream_memory_free,
	.sockets_allocated	= &mptcp_sockets_allocated,

	.memory_allocated	= &tcp_memory_allocated,
@@ -3942,12 +3962,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;

	if (sk_stream_is_writeable(sk))
	if (__mptcp_stream_is_writeable(sk, 1))
		return EPOLLOUT | EPOLLWRNORM;

	mptcp_set_nospace(sk);
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
	if (sk_stream_is_writeable(sk))
	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
	smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */
	if (__mptcp_stream_is_writeable(sk, 1))
		return EPOLLOUT | EPOLLWRNORM;

	return 0;
+32 −10
Original line number Diff line number Diff line
@@ -113,10 +113,9 @@
#define MPTCP_RST_TRANSIENT	BIT(0)

/* MPTCP socket atomic flags */
#define MPTCP_NOSPACE		1
#define MPTCP_WORK_RTX		2
#define MPTCP_FALLBACK_DONE	4
#define MPTCP_WORK_CLOSE_SUBFLOW 5
#define MPTCP_WORK_RTX		1
#define MPTCP_FALLBACK_DONE	2
#define MPTCP_WORK_CLOSE_SUBFLOW 3

/* MPTCP socket release cb flags */
#define MPTCP_PUSH_PENDING	1
@@ -308,6 +307,7 @@ struct mptcp_sock {
			in_accept_queue:1,
			free_first:1,
			rcvspace_init:1;
	u32		notsent_lowat;
	struct work_struct work;
	struct sk_buff  *ooo_last_skb;
	struct rb_root  out_of_order_queue;
@@ -808,15 +808,37 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
	       READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
}

/* Return the not-sent low-water mark for @sk: the per-socket value when it
 * is non-zero, otherwise the netns-wide tcp_notsent_lowat sysctl.
 */
static inline u32 mptcp_notsent_lowat(const struct sock *sk)
{
	u32 lowat = READ_ONCE(mptcp_sk(sk)->notsent_lowat);

	if (lowat)
		return lowat;

	/* no per-socket setting: fall back to the namespace default */
	return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_notsent_lowat);
}

/* Tell whether the amount of not-yet-sent data on @sk, left-shifted by
 * @wake, is still below the not-sent low-water mark.
 */
static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct mptcp_sock *msk = mptcp_sk(sk);
	u32 unsent = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);

	return (unsent << wake) < mptcp_notsent_lowat(sk);
}

static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
{
	return mptcp_stream_memory_free(sk, wake) &&
	       __sk_stream_is_writeable(sk, wake);
}

/* Invoke sk_stream_write_space() on @sk when the checks below pass.
 *
 * NOTE(review): the indentation and brace layout suggest this span
 * interleaves pre- and post-change lines from a diff view (the
 * sk_stream_is_writeable()/MPTCP_NOSPACE checks next to the
 * mptcp_stream_memory_free() check) — confirm against the real tree
 * before relying on the control flow exactly as written here.
 */
static inline void mptcp_write_space(struct sock *sk)
{
	if (sk_stream_is_writeable(sk)) {
	/* pairs with memory barrier in mptcp_poll */
	smp_mb();
		if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
	if (mptcp_stream_memory_free(sk, 1))
		sk_stream_write_space(sk);
}
}

static inline void __mptcp_sync_sndbuf(struct sock *sk)
{
+32 −39
Original line number Diff line number Diff line
@@ -624,20 +624,11 @@ static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t
	return ret;
}

static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval,
					 unsigned int optlen)
static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	int val;

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	lock_sock(sk);
	sockopt_seq_inc(msk);
	msk->cork = !!val;
	mptcp_for_each_subflow(msk, subflow) {
@@ -649,25 +640,15 @@ static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optva
	}
	if (!val)
		mptcp_check_and_set_pending(sk);
	release_sock(sk);

	return 0;
}

static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval,
					    unsigned int optlen)
static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	int val;

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	lock_sock(sk);
	sockopt_seq_inc(msk);
	msk->nodelay = !!val;
	mptcp_for_each_subflow(msk, subflow) {
@@ -679,8 +660,6 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t op
	}
	if (val)
		mptcp_check_and_set_pending(sk);
	release_sock(sk);

	return 0;
}

@@ -803,25 +782,10 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
	int ret, val;

	switch (optname) {
	case TCP_INQ:
		ret = mptcp_get_int_option(msk, optval, optlen, &val);
		if (ret)
			return ret;
		if (val < 0 || val > 1)
			return -EINVAL;

		lock_sock(sk);
		msk->recvmsg_inq = !!val;
		release_sock(sk);
		return 0;
	case TCP_ULP:
		return -EOPNOTSUPP;
	case TCP_CONGESTION:
		return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
	case TCP_CORK:
		return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen);
	case TCP_NODELAY:
		return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
	case TCP_DEFER_ACCEPT:
		/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
		mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
@@ -834,7 +798,34 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
						      optval, optlen);
	}

	return -EOPNOTSUPP;
	ret = mptcp_get_int_option(msk, optval, optlen, &val);
	if (ret)
		return ret;

	lock_sock(sk);
	switch (optname) {
	case TCP_INQ:
		if (val < 0 || val > 1)
			ret = -EINVAL;
		else
			msk->recvmsg_inq = !!val;
		break;
	case TCP_NOTSENT_LOWAT:
		WRITE_ONCE(msk->notsent_lowat, val);
		mptcp_write_space(sk);
		break;
	case TCP_CORK:
		ret = __mptcp_setsockopt_sol_tcp_cork(msk, val);
		break;
	case TCP_NODELAY:
		ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val);
		break;
	default:
		ret = -ENOPROTOOPT;
	}

	release_sock(sk);
	return ret;
}

int mptcp_setsockopt(struct sock *sk, int level, int optname,
@@ -1349,6 +1340,8 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
		return mptcp_put_int_option(msk, optval, optlen, msk->cork);
	case TCP_NODELAY:
		return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
	case TCP_NOTSENT_LOWAT:
		return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
	}
	return -EOPNOTSUPP;
}