Commit d57f4b87, authored by Jakub Sitnicki, committed by Paolo Abeni
Browse files

tcp: Update bind bucket state on port release



Today, once an inet_bind_bucket enters a state where fastreuse >= 0 or
fastreuseport >= 0 after a socket is explicitly bound to a port, it remains
in that state until all sockets are removed and the bucket is destroyed.

In this state, the bucket is skipped during ephemeral port selection in
connect(). For applications using a reduced ephemeral port
range (IP_LOCAL_PORT_RANGE socket option), this can cause faster port
exhaustion since blocked buckets are excluded from reuse.

The reason the bucket state isn't updated on port release is unclear.
Possibly a performance trade-off to avoid scanning bucket owners, or just
an oversight.

Fix it by recalculating the bucket state when a socket releases a port. To
limit overhead, each inet_bind2_bucket stores its own (fastreuse,
fastreuseport) state. On port release, only the relevant port-addr bucket
is scanned, and the overall inet_bind_bucket state is then derived from the
per-port-addr (inet_bind2_bucket) states.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250917-update-bind-bucket-state-on-unhash-v5-1-57168b661b47@cloudflare.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent 3afb106f
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -316,8 +316,9 @@ int inet_csk_listen_start(struct sock *sk);
void inet_csk_listen_stop(struct sock *sk);

/* update the fast reuse flag when adding a socket */
void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
			       struct sock *sk);
void inet_csk_update_fastreuse(const struct sock *sk,
			       struct inet_bind_bucket *tb,
			       struct inet_bind2_bucket *tb2);

struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);

+2 −0
Original line number Diff line number Diff line
@@ -108,6 +108,8 @@ struct inet_bind2_bucket {
	struct hlist_node	bhash_node;
	/* List of sockets hashed to this bucket */
	struct hlist_head	owners;
	signed char		fastreuse;
	signed char		fastreuseport;
};

static inline struct net *ib_net(const struct inet_bind_bucket *ib)
+2 −1
Original line number Diff line number Diff line
@@ -70,7 +70,8 @@ struct inet_timewait_sock {
	unsigned int		tw_transparent  : 1,
				tw_flowlabel	: 20,
				tw_usec_ts	: 1,
				tw_pad		: 2,	/* 2 bits hole */
				tw_connect_bind	: 1,
				tw_pad		: 1,	/* 1 bit hole */
				tw_tos		: 8;
	u32			tw_txhash;
	u32			tw_priority;
+4 −0
Original line number Diff line number Diff line
@@ -1494,6 +1494,10 @@ static inline int __sk_prot_rehash(struct sock *sk)

#define SOCK_BINDADDR_LOCK	4
#define SOCK_BINDPORT_LOCK	8
/**
 * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time
 */
#define SOCK_CONNECT_BIND	16

struct socket_alloc {
	struct socket socket;
+8 −4
Original line number Diff line number Diff line
@@ -423,7 +423,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
}

static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
				     struct sock *sk)
				     const struct sock *sk)
{
	if (tb->fastreuseport <= 0)
		return 0;
@@ -453,8 +453,9 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
				    ipv6_only_sock(sk), true, false);
}

void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
			       struct sock *sk)
void inet_csk_update_fastreuse(const struct sock *sk,
			       struct inet_bind_bucket *tb,
			       struct inet_bind2_bucket *tb2)
{
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;

@@ -501,6 +502,9 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
			tb->fastreuseport = 0;
		}
	}

	tb2->fastreuse = tb->fastreuse;
	tb2->fastreuseport = tb->fastreuseport;
}

/* Obtain a reference to a local port for the given sock,
@@ -582,7 +586,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
	}

success:
	inet_csk_update_fastreuse(tb, sk);
	inet_csk_update_fastreuse(sk, tb, tb2);

	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, tb2, port);
Loading