Commit f2db7b80 authored by Eric Dumazet, committed by Jakub Kicinski
Browse files

net/sched: refine indirect call mitigation in tc_wrapper.h



Some modern CPUs disable the X86_FEATURE_RETPOLINE feature,
even though a direct call can still be beneficial.

Even when IBRS is present, an indirect call is more expensive
than a direct one:

Direct Calls:
  Compilers can perform powerful optimizations like inlining,
  where the function body is directly inserted at the call site,
  eliminating call overhead entirely.

Indirect Calls:
  Inlining is much harder, if not impossible, because the compiler
  doesn't know the target function at compile time.
  Techniques like Indirect Call Promotion can help by using
  profile-guided optimization to turn frequently taken indirect calls
  into conditional direct calls, but they still add complexity
  and potential overhead compared to a truly direct call.

In this patch, I split tc_skip_wrapper in two different
static keys, one for tc_act() (tc_skip_wrapper_act)
and one for tc_classify() (tc_skip_wrapper_cls).

Then I enable the tc_skip_wrapper_cls only if the count
of builtin classifiers is above one.

I enable tc_skip_wrapper_act only if the count of builtin
actions is above one.

In our production kernels, we only have CONFIG_NET_CLS_BPF=y
and CONFIG_NET_ACT_BPF=y. Others are modules or are not compiled.

Tested on AMD Turin CPUs, cls_bpf_classify() cost went
from 1% down to 0.18%, and FDO will be able to inline
it in tcf_classify() for further gains.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Pedro Tammela <pctammela@mojatatu.com>
Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Link: https://patch.msgid.link/20260307133601.3863071-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent e8eb33d6
Loading
Loading
Loading
Loading
+42 −5
Original line number Diff line number Diff line
@@ -12,7 +12,8 @@

#define TC_INDIRECT_SCOPE

extern struct static_key_false tc_skip_wrapper;
extern struct static_key_false tc_skip_wrapper_act;
extern struct static_key_false tc_skip_wrapper_cls;

/* TC Actions */
#ifdef CONFIG_NET_CLS_ACT
@@ -46,7 +47,7 @@ TC_INDIRECT_ACTION_DECLARE(tunnel_key_act);
static inline int tc_act(struct sk_buff *skb, const struct tc_action *a,
			   struct tcf_result *res)
{
	if (static_branch_likely(&tc_skip_wrapper))
	if (static_branch_likely(&tc_skip_wrapper_act))
		goto skip;

#if IS_BUILTIN(CONFIG_NET_ACT_GACT)
@@ -153,7 +154,7 @@ TC_INDIRECT_FILTER_DECLARE(u32_classify);
static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
				struct tcf_result *res)
{
	if (static_branch_likely(&tc_skip_wrapper))
	if (static_branch_likely(&tc_skip_wrapper_cls))
		goto skip;

#if IS_BUILTIN(CONFIG_NET_CLS_BPF)
@@ -202,8 +203,44 @@ static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
/*
 * tc_wrapper_init - decide whether tc_act() / tc_classify() take the
 * "skip" branch guarded by their static keys.
 *
 * Only has an effect when CONFIG_X86 is set; otherwise the body is empty.
 *
 * When X86_FEATURE_RETPOLINE is enabled on the running CPU, neither key is
 * touched. Otherwise tc_skip_wrapper_cls is enabled only if more than one
 * classifier is built in, and tc_skip_wrapper_act only if more than one
 * action is built in. With zero or one built-in entry the key stays off,
 * so the corresponding helper does not jump to its skip label.
 */
static inline void tc_wrapper_init(void)
{
#ifdef CONFIG_X86
	/*
	 * NOTE(review): the next two lines use the old single tc_skip_wrapper
	 * key and look like the pre-patch lines of this diff view (the +/-
	 * markers were lost) -- confirm against the real tree.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE))
		static_branch_enable(&tc_skip_wrapper);
	/* Number of classifiers built into the kernel image (not modules). */
	int cnt_cls = IS_BUILTIN(CONFIG_NET_CLS_BPF) +
		IS_BUILTIN(CONFIG_NET_CLS_U32)  +
		IS_BUILTIN(CONFIG_NET_CLS_FLOWER) +
		IS_BUILTIN(CONFIG_NET_CLS_FW) +
		IS_BUILTIN(CONFIG_NET_CLS_MATCHALL) +
		IS_BUILTIN(CONFIG_NET_CLS_BASIC) +
		IS_BUILTIN(CONFIG_NET_CLS_CGROUP) +
		IS_BUILTIN(CONFIG_NET_CLS_FLOW) +
		IS_BUILTIN(CONFIG_NET_CLS_ROUTE4);

	/* Number of actions built into the kernel image (not modules). */
	int cnt_act = IS_BUILTIN(CONFIG_NET_ACT_GACT) +
		IS_BUILTIN(CONFIG_NET_ACT_MIRRED) +
		IS_BUILTIN(CONFIG_NET_ACT_PEDIT) +
		IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT) +
		IS_BUILTIN(CONFIG_NET_ACT_SKBMOD) +
		IS_BUILTIN(CONFIG_NET_ACT_POLICE) +
		IS_BUILTIN(CONFIG_NET_ACT_BPF) +
		IS_BUILTIN(CONFIG_NET_ACT_CONNMARK) +
		IS_BUILTIN(CONFIG_NET_ACT_CSUM) +
		IS_BUILTIN(CONFIG_NET_ACT_CT) +
		IS_BUILTIN(CONFIG_NET_ACT_CTINFO) +
		IS_BUILTIN(CONFIG_NET_ACT_GATE) +
		IS_BUILTIN(CONFIG_NET_ACT_MPLS) +
		IS_BUILTIN(CONFIG_NET_ACT_NAT) +
		IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY) +
		IS_BUILTIN(CONFIG_NET_ACT_VLAN) +
		IS_BUILTIN(CONFIG_NET_ACT_IFE) +
		IS_BUILTIN(CONFIG_NET_ACT_SIMP) +
		IS_BUILTIN(CONFIG_NET_ACT_SAMPLE);

	/* Retpoline in use: leave both keys disabled. */
	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
		return;

	/* Take the skip branch in tc_classify() only with 2+ built-in classifiers. */
	if (cnt_cls > 1)
		static_branch_enable(&tc_skip_wrapper_cls);

	/* Take the skip branch in tc_act() only with 2+ built-in actions. */
	if (cnt_act > 1)
		static_branch_enable(&tc_skip_wrapper_act);
#endif
}

+2 −1
Original line number Diff line number Diff line
@@ -2479,7 +2479,8 @@ static struct pernet_operations psched_net_ops = {
};

#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper_act);
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper_cls);
#endif

static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = {