Commit 9de76f55 authored by Jakub Kicinski
Browse files

Merge branch 'gro-inline-tcp6_gro_-receive-complete'

Eric Dumazet says:

====================
gro: inline tcp6_gro_{receive,complete}

On some platforms, the GRO stack is too deep and causes CPU stalls.

Decreasing the call depth by one shows a 1.5% gain on Zen2 CPUs.
(32 RX queues, 100Gbit NIC, RFS enabled, tcp_rr with 128 threads and 10,000 flows)

We can go further by inlining ipv6_gro_{receive,complete}
and take care of IPv4 if there is interest.

Note: the two temporary __always_inline annotations will be replaced with
      inline_for_performance when/if available.

Cumulative size increase for this series (of 3):

$ scripts/bloat-o-meter -t vmlinux.0 vmlinux.3
add/remove: 2/2 grow/shrink: 5/1 up/down: 1572/-471 (1101)
Function                                     old     new   delta
ipv6_gro_receive                            1069    1846    +777
ipv6_gro_complete                            433     733    +300
tcp6_check_fraglist_gro                        -     272    +272
tcp6_gro_complete                            227     306     +79
tcp4_gro_complete                            325     397     +72
ipv6_offload_init                            218     274     +56
__pfx_tcp6_check_fraglist_gro                  -      16     +16
__pfx___skb_incr_checksum_unnecessary         32       -     -32
__skb_incr_checksum_unnecessary              186       -    -186
tcp6_gro_receive                             959     706    -253
Total: Before=22592724, After=22593825, chg +0.00%
====================

Link: https://patch.msgid.link/20260120164903.1912995-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents a4674aa5 b8d9b7da
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -4763,7 +4763,7 @@ static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb)
	}
}

static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
static __always_inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
		if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
+2 −3
Original line number Diff line number Diff line
@@ -405,9 +405,8 @@ INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
							   struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));

INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *,
							   struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int));
struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *);
int udp6_gro_complete(struct sk_buff *, int);

#define indirect_call_gro_receive_inet(cb, f2, f1, head, skb)	\
({								\
+0 −2
Original line number Diff line number Diff line
@@ -2324,8 +2324,6 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
				struct tcphdr *th);
INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
#ifdef CONFIG_INET
void tcp_gro_complete(struct sk_buff *skb);
#else
+1 −1
Original line number Diff line number Diff line
@@ -45,7 +45,7 @@ obj-$(CONFIG_IPV6_FOU) += fou6.o

obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
obj-$(CONFIG_INET) += output_core.o protocol.o \
			ip6_offload.o tcpv6_offload.o exthdrs_offload.o
			ip6_offload.o exthdrs_offload.o

obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o

+22 −21
Original line number Diff line number Diff line
@@ -19,23 +19,7 @@
#include <net/gso.h>

#include "ip6_offload.h"

/* All GRO functions are always builtin, except UDP over ipv6, which lays in
 * ipv6 module, as it depends on UDPv6 lookup function, so we need special care
 * when ipv6 is built as a module
 */
#if IS_BUILTIN(CONFIG_IPV6)
#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__)
#else
#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
#endif

#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb)	\
({								\
	unlikely(gro_recursion_inc_test(skb)) ?			\
		NAPI_GRO_CB(skb)->flush |= 1, NULL :		\
		INDIRECT_CALL_L4(cb, f2, f1, head, skb);	\
})
#include "tcpv6_offload.c"

static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
{
@@ -298,9 +282,19 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,

	skb_gro_postpull_rcsum(skb, iph, nlen);

	pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive,
					 ops->callbacks.gro_receive, head, skb);
	if (unlikely(gro_recursion_inc_test(skb))) {
		flush = 1;
		goto out;
	}

	if (likely(proto == IPPROTO_TCP))
		pp = tcp6_gro_receive(head, skb);
#if IS_BUILTIN(CONFIG_IPV6)
	else if (likely(proto == IPPROTO_UDP))
		pp = udp6_gro_receive(head, skb);
#endif
	else
		pp = ops->callbacks.gro_receive(head, skb);
out:
	skb_gro_flush_final(skb, pp, flush);

@@ -379,11 +373,18 @@ INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
	}

	nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);

	if (likely(ops == &net_hotdata.tcpv6_offload))
		return tcp6_gro_complete(skb, nhoff);
#if IS_BUILTIN(CONFIG_IPV6)
	if (ops == &net_hotdata.udpv6_offload)
		return udp6_gro_complete(skb, nhoff);
#endif

	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
		goto out;

	err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete,
			       udp6_gro_complete, skb, nhoff);
	err = ops->callbacks.gro_complete(skb, nhoff);

out:
	return err;
Loading