Commit a853c609 authored by Eric Dumazet's avatar Eric Dumazet Committed by Jakub Kicinski
Browse files

inetpeer: do not get a refcount in inet_getpeer()



All inet_getpeer() callers except ip4_frag_init() don't need
to acquire a permanent refcount on the inetpeer.

They can switch to full RCU protection.

Move the refcount_inc_not_zero() into ip4_frag_init(),
so that all the other callers no longer have to
perform a pair of expensive atomic operations on
a possibly contended cache line.

inet_putpeer() no longer needs to be exported.

After this patch, my DUT can receive 8,400,000 UDP packets
per second targeting closed ports, using 50% less cpu cycles
than before.

Also change two calls to l3mdev_master_ifindex() by
l3mdev_master_ifindex_rcu() (Ido ideas)

Fixes: 8c2bd38b ("icmp: change the order of rate limits")
Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20241215175629.1248773-5-edumazet@google.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent 50b362f2
Loading
Loading
Loading
Loading
+4 −5
Original line number Diff line number Diff line
@@ -312,7 +312,6 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
	struct dst_entry *dst = &rt->dst;
	struct inet_peer *peer;
	bool rc = true;
	int vif;

	if (!apply_ratelimit)
		return true;
@@ -321,12 +320,12 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
		goto out;

	vif = l3mdev_master_ifindex(dst->dev);
	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif);
	rcu_read_lock();
	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
			       l3mdev_master_ifindex_rcu(dst->dev));
	rc = inet_peer_xrlim_allow(peer,
				   READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
	if (peer)
		inet_putpeer(peer);
	rcu_read_unlock();
out:
	if (!rc)
		__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
+2 −6
Original line number Diff line number Diff line
@@ -109,8 +109,6 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
		p = rb_entry(parent, struct inet_peer, rb_node);
		cmp = inetpeer_addr_cmp(daddr, &p->daddr);
		if (cmp == 0) {
			if (!refcount_inc_not_zero(&p->refcnt))
				break;
			now = jiffies;
			if (READ_ONCE(p->dtime) != now)
				WRITE_ONCE(p->dtime, now);
@@ -169,6 +167,7 @@ static void inet_peer_gc(struct inet_peer_base *base,
	}
}

/* Must be called under RCU : No refcount change is done here. */
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
			       const struct inetpeer_addr *daddr)
{
@@ -179,10 +178,8 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
	/* Attempt a lockless lookup first.
	 * Because of a concurrent writer, we might not find an existing entry.
	 */
	rcu_read_lock();
	seq = read_seqbegin(&base->lock);
	p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
	rcu_read_unlock();

	if (p)
		return p;
@@ -200,7 +197,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
		if (p) {
			p->daddr = *daddr;
			p->dtime = (__u32)jiffies;
			refcount_set(&p->refcnt, 2);
			refcount_set(&p->refcnt, 1);
			atomic_set(&p->rid, 0);
			p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
			p->rate_tokens = 0;
@@ -228,7 +225,6 @@ void inet_putpeer(struct inet_peer *p)
	if (refcount_dec_and_test(&p->refcnt))
		kfree_rcu(p, rcu);
}
EXPORT_SYMBOL_GPL(inet_putpeer);

/*
 *	Check transmit rate limitation for given message.
+10 −5
Original line number Diff line number Diff line
@@ -82,15 +82,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
	struct ipq *qp = container_of(q, struct ipq, q);
	struct net *net = q->fqdir->net;

	const struct frag_v4_compare_key *key = a;
	struct net *net = q->fqdir->net;
	struct inet_peer *p = NULL;

	q->key.v4 = *key;
	qp->ecn = 0;
	qp->peer = q->fqdir->max_dist ?
		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif) :
		NULL;
	if (q->fqdir->max_dist) {
		rcu_read_lock();
		p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif);
		if (p && !refcount_inc_not_zero(&p->refcnt))
			p = NULL;
		rcu_read_unlock();
	}
	qp->peer = p;
}

static void ip4_frag_free(struct inet_frag_queue *q)
+8 −7
Original line number Diff line number Diff line
@@ -870,11 +870,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
	if (!peer) {
		rcu_read_unlock();
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
@@ -893,7 +893,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
		goto out_unlock;
	}

	/* Check for load limit; set rate_last to the latest sent
@@ -914,8 +914,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
	}
out_put_peer:
	inet_putpeer(peer);
out_unlock:
	rcu_read_unlock();
}

static int ip_error(struct sk_buff *skb)
@@ -975,9 +975,9 @@ static int ip_error(struct sk_buff *skb)
		break;
	}

	rcu_read_lock();
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev));

			       l3mdev_master_ifindex_rcu(skb->dev));
	send = true;
	if (peer) {
		now = jiffies;
@@ -989,8 +989,9 @@ static int ip_error(struct sk_buff *skb)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	rcu_read_unlock();

	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

+2 −2
Original line number Diff line number Diff line
@@ -222,10 +222,10 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
		if (rt->rt6i_dst.plen < 128)
			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr);
		res = inet_peer_xrlim_allow(peer, tmo);
		if (peer)
			inet_putpeer(peer);
		rcu_read_unlock();
	}
	if (!res)
		__ICMP6_INC_STATS(net, ip6_dst_idev(dst),
Loading