Commit 358105ab authored by Jakub Kicinski
Browse files

Merge branch 'tcp-dccp-refine-source-port-selection'

Eric Dumazet says:

====================
tcp/dccp: refine source port selection

This patch series leverages the IP_LOCAL_PORT_RANGE socket option:
when it is set on a socket, source port selection at connect() time
no longer favors even ports.

This should lower the time taken by connect() on hosts that have
many active connections to the same destination.
====================

Link: https://lore.kernel.org/r/20231214192939.1962891-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 758a8d5b 20718485
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -356,7 +356,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in
	*low = range & 0xffff;
	*high = range >> 16;
}
void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);

#ifdef CONFIG_SYSCTL
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
+16 −5
Original line number Diff line number Diff line
@@ -117,16 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk)
	return !sk->sk_rcv_saddr;
}

void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
/**
 *	inet_sk_get_local_port_range - fetch ephemeral ports range
 *	@sk: socket
 *	@low: pointer to low port
 *	@high: pointer to high port
 *
 *	Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range)
 *	Range can be overridden if socket got IP_LOCAL_PORT_RANGE option.
 *	Returns true if IP_LOCAL_PORT_RANGE was set on this socket.
 */
bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct net *net = sock_net(sk);
	int lo, hi, sk_lo, sk_hi;
	bool local_range = false;
	u32 sk_range;

	inet_get_local_port_range(net, &lo, &hi);
	inet_get_local_port_range(sock_net(sk), &lo, &hi);

	sk_range = READ_ONCE(inet->local_port_range);
	sk_range = READ_ONCE(inet_sk(sk)->local_port_range);
	if (unlikely(sk_range)) {
		sk_lo = sk_range & 0xffff;
		sk_hi = sk_range >> 16;
@@ -135,10 +144,12 @@ void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
			lo = sk_lo;
		if (lo <= sk_hi && sk_hi <= hi)
			hi = sk_hi;
		local_range = true;
	}

	*low = lo;
	*high = hi;
	return local_range;
}
EXPORT_SYMBOL(inet_sk_get_local_port_range);

+16 −11
Original line number Diff line number Diff line
@@ -1012,7 +1012,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
	bool tb_created = false;
	u32 remaining, offset;
	int ret, i, low, high;
	int l3mdev;
	bool local_ports;
	int step, l3mdev;
	u32 index;

	if (port) {
@@ -1024,10 +1025,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,

	l3mdev = inet_sk_bound_l3mdev(sk);

	inet_sk_get_local_port_range(sk, &low, &high);
	local_ports = inet_sk_get_local_port_range(sk, &low, &high);
	step = local_ports ? 1 : 2;

	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (likely(remaining > 1))
	if (!local_ports && remaining > 1)
		remaining &= ~1U;

	get_random_sleepable_once(table_perturb,
@@ -1040,10 +1043,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	if (!local_ports)
		offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
	for (i = 0; i < remaining; i += step, port += step) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
@@ -1083,10 +1087,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		cond_resched();
	}

	if (!local_ports) {
		offset++;
		if ((offset & 1) && remaining > 1)
			goto other_parity_scan;

	}
	return -EADDRNOTAVAIL;

ok:
@@ -1109,8 +1114,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
	 * on low contention the randomness is maximal and on high contention
	 * it may be inexistent.
	 */
	i = max_t(int, i, get_random_u32_below(8) * 2);
	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
	i = max_t(int, i, get_random_u32_below(8) * step);
	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, tb2, port);