Commit 5f123035 authored by David S. Miller
Browse files

Merge branch 'tcp-refactor-bhash2'

Kuniyuki Iwashima says:

====================
tcp: Refactor bhash2 and remove sk_bind2_node.

This series refactors the code around bhash2 and removes two bhash2-specific
fields: sock.sk_bind2_node and inet_timewait_sock.tw_bind2_node.

  patch 1      : optimise bind() for non-wildcard v4-mapped-v6 address
  patch 2 -  4 : optimise bind() conflict tests
  patch 5 - 12 : Link bhash2 to bhash and unlink sk from bhash2 to
                 remove sk_bind2_node

Patch 8 will trigger a false-positive checkpatch error.

v2: resend of https://lore.kernel.org/netdev/20231213082029.35149-1-kuniyu@amazon.com/
  * Rebase on latest net-next
  * Patch 11
    * Add change in inet_diag_dump_icsk() for recent bhash dump patch

v1: https://lore.kernel.org/netdev/20231023190255.39190-1-kuniyu@amazon.com/


====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 67f440c0 8191792c
Loading
Loading
Loading
Loading
+8 −13
Original line number Diff line number Diff line
@@ -88,7 +88,7 @@ struct inet_bind_bucket {
	unsigned short		fast_sk_family;
	bool			fast_ipv6_only;
	struct hlist_node	node;
	struct hlist_head	owners;
	struct hlist_head	bhash2;
};

struct inet_bind2_bucket {
@@ -96,22 +96,17 @@ struct inet_bind2_bucket {
	int			l3mdev;
	unsigned short		port;
#if IS_ENABLED(CONFIG_IPV6)
	unsigned short		family;
#endif
	union {
#if IS_ENABLED(CONFIG_IPV6)
	unsigned short		addr_type;
	struct in6_addr		v6_rcv_saddr;
#endif
#define rcv_saddr		v6_rcv_saddr.s6_addr32[3]
#else
	__be32			rcv_saddr;
	};
#endif
	/* Node in the bhash2 inet_bind_hashbucket chain */
	struct hlist_node	node;
	struct hlist_node	bhash_node;
	/* List of sockets hashed to this bucket */
	struct hlist_head	owners;
	/* bhash has twsk in owners, but bhash2 has twsk in
	 * deathrow not to add a member in struct sock_common.
	 */
	struct hlist_head	deathrow;
};

static inline struct net *ib_net(const struct inet_bind_bucket *ib)
@@ -241,7 +236,7 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
struct inet_bind2_bucket *
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
			 struct inet_bind_hashbucket *head,
			 unsigned short port, int l3mdev,
			 struct inet_bind_bucket *tb,
			 const struct sock *sk);

void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
+0 −4
Original line number Diff line number Diff line
@@ -75,13 +75,9 @@ struct inet_timewait_sock {
	struct timer_list	tw_timer;
	struct inet_bind_bucket	*tw_tb;
	struct inet_bind2_bucket	*tw_tb2;
	struct hlist_node		tw_bind2_node;
};
#define tw_tclass tw_tos

/* Iterate @__tw over every entry on the hlist @list, following the
 * tw_bind2_node link of struct inet_timewait_sock.
 */
#define twsk_for_each_bound_bhash2(__tw, list) \
	hlist_for_each_entry(__tw, list, tw_bind2_node)

static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
{
	return (struct inet_timewait_sock *)sk;
+0 −5
Original line number Diff line number Diff line
@@ -784,11 +784,6 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
					cpu_to_be32(0x0000ffff))) == 0UL;
}

/* Return true when @a is a v4-mapped IPv6 address (as decided by
 * ipv6_addr_v4mapped()) and its last 32-bit word, i.e. the embedded IPv4
 * address, satisfies ipv4_is_zeronet().
 */
static inline bool ipv6_addr_v4mapped_any(const struct in6_addr *a)
{
	return ipv6_addr_v4mapped(a) && ipv4_is_zeronet(a->s6_addr32[3]);
}

static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a)
{
	return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]);
+0 −14
Original line number Diff line number Diff line
@@ -352,7 +352,6 @@ struct sk_filter;
  *	@sk_txtime_report_errors: set report errors mode for SO_TXTIME
  *	@sk_txtime_unused: unused txtime flags
  *	@ns_tracker: tracker for netns reference
  *	@sk_bind2_node: bind node in the bhash2 table
  */
struct sock {
	/*
@@ -544,7 +543,6 @@ struct sock {
#endif
	struct rcu_head		sk_rcu;
	netns_tracker		ns_tracker;
	struct hlist_node	sk_bind2_node;
};

enum sk_pacing {
@@ -873,16 +871,6 @@ static inline void sk_add_bind_node(struct sock *sk,
	hlist_add_head(&sk->sk_bind_node, list);
}

/* Unlink @sk from the hlist it is on through its sk_bind2_node member. */
static inline void __sk_del_bind2_node(struct sock *sk)
{
	__hlist_del(&sk->sk_bind2_node);
}

/* Insert @sk at the head of @list, linked through its sk_bind2_node member. */
static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
{
	hlist_add_head(&sk->sk_bind2_node, list);
}

#define sk_for_each(__sk, list) \
	hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
@@ -900,8 +888,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
	hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
	hlist_for_each_entry(__sk, list, sk_bind_node)
/* Iterate @__sk over every entry on the hlist @list, following the
 * sk_bind2_node link of struct sock.
 */
#define sk_for_each_bound_bhash2(__sk, list) \
	hlist_for_each_entry(__sk, list, sk_bind2_node)

/**
 * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
+34 −39
Original line number Diff line number Diff line
@@ -159,8 +159,11 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk)
	if (sk->sk_family == AF_INET6) {
		int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);

		return addr_type != IPV6_ADDR_ANY &&
			addr_type != IPV6_ADDR_MAPPED;
		if (addr_type == IPV6_ADDR_ANY)
			return false;

		if (addr_type != IPV6_ADDR_MAPPED)
			return true;
	}
#endif
	return sk->sk_rcv_saddr != htonl(INADDR_ANY);
@@ -213,18 +216,9 @@ static bool inet_bhash2_conflict(const struct sock *sk,
				 bool relax, bool reuseport_cb_ok,
				 bool reuseport_ok)
{
	struct inet_timewait_sock *tw2;
	struct sock *sk2;

	sk_for_each_bound_bhash2(sk2, &tb2->owners) {
		if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
					   reuseport_cb_ok, reuseport_ok))
			return true;
	}

	twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) {
		sk2 = (struct sock *)tw2;

	sk_for_each_bound(sk2, &tb2->owners) {
		if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
					   reuseport_cb_ok, reuseport_ok))
			return true;
@@ -233,15 +227,20 @@ static bool inet_bhash2_conflict(const struct sock *sk,
	return false;
}

/* Walk every socket bound under the bhash bucket @__tb.
 *
 * The outer loop visits each bhash2 bucket linked on (__tb)->bhash2 through
 * its bhash_node member; the inner loop visits each socket on that bucket's
 * owners list.
 *
 * @__sk:  socket cursor, assigned each bound socket in turn
 * @__tb2: bhash2 bucket cursor (a struct inet_bind2_bucket pointer)
 * @__tb:  bhash bucket whose bhash2 list is walked
 *
 * The inner loop must use the @__sk argument rather than a hard-coded local
 * name, otherwise the macro only works for callers whose cursor happens to
 * be called "sk2".
 *
 * This expands to two nested loops, so a "break" in the body leaves only the
 * inner one; use "return" or "goto" to abandon the whole walk.
 */
#define sk_for_each_bound_bhash(__sk, __tb2, __tb)			\
	hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node)	\
		sk_for_each_bound(__sk, &(__tb2)->owners)

/* This should be called only when the tb and tb2 hashbuckets' locks are held */
static int inet_csk_bind_conflict(const struct sock *sk,
				  const struct inet_bind_bucket *tb,
				  const struct inet_bind2_bucket *tb2, /* may be null */
				  bool relax, bool reuseport_ok)
{
	bool reuseport_cb_ok;
	struct sock_reuseport *reuseport_cb;
	kuid_t uid = sock_i_uid((struct sock *)sk);
	struct sock_reuseport *reuseport_cb;
	bool reuseport_cb_ok;
	struct sock *sk2;

	rcu_read_lock();
	reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
@@ -249,32 +248,29 @@ static int inet_csk_bind_conflict(const struct sock *sk,
	reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
	rcu_read_unlock();

	/*
	 * Unlike other sk lookup places we do not check
	/* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if
	 * ipv4) should have been checked already. We need to do these two
	 * checks separately because their spinlocks have to be acquired/released
	 * independently of each other, to prevent possible deadlocks
	 */
	if (inet_use_bhash2_on_bind(sk))
		return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax,
						   reuseport_cb_ok, reuseport_ok);

	/* Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners and tb2->owners list belong
	 * to the same net - the one this bucket belongs to.
	 */
	sk_for_each_bound_bhash(sk2, tb2, tb) {
		if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok))
			continue;

	if (!inet_use_bhash2_on_bind(sk)) {
		struct sock *sk2;

		sk_for_each_bound(sk2, &tb->owners)
			if (inet_bind_conflict(sk, sk2, uid, relax,
					       reuseport_cb_ok, reuseport_ok) &&
			    inet_rcv_saddr_equal(sk, sk2, true))
		if (inet_rcv_saddr_equal(sk, sk2, true))
			return true;

		return false;
	}

	/* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if
	 * ipv4) should have been checked already. We need to do these two
	 * checks separately because their spinlocks have to be acquired/released
	 * independently of each other, to prevent possible deadlocks
	 */
	return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok,
					   reuseport_ok);
	return false;
}

/* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or
@@ -457,7 +453,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
	kuid_t uid = sock_i_uid(sk);
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;

	if (hlist_empty(&tb->owners)) {
	if (hlist_empty(&tb->bhash2)) {
		tb->fastreuse = reuse;
		if (sk->sk_reuseport) {
			tb->fastreuseport = FASTREUSEPORT_ANY;
@@ -549,7 +545,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
	}

	if (!found_port) {
		if (!hlist_empty(&tb->owners)) {
		if (!hlist_empty(&tb->bhash2)) {
			if (sk->sk_reuse == SK_FORCE_REUSE ||
			    (tb->fastreuse > 0 && reuse) ||
			    sk_reuseport_match(tb, sk))
@@ -569,7 +565,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)

	if (!tb2) {
		tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep,
					       net, head2, port, l3mdev, sk);
					       net, head2, tb, sk);
		if (!tb2)
			goto fail_unlock;
		bhash2_created = true;
@@ -591,11 +587,10 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)

fail_unlock:
	if (ret) {
		if (bhash2_created)
			inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2);
		if (bhash_created)
			inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
		if (bhash2_created)
			inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep,
						  tb2);
	}
	if (head2_lock_acquired)
		spin_unlock(&head2->lock);
Loading