Commit 661f951e authored by Peter Zijlstra
Browse files

sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask()

Leon [1] and Vinicius [2] noted a topology_span_sane() warning during
their testing starting from v6.16-rc1. Debug that followed pointed to
the tl->mask() for the NODE domain being incorrectly resolved to that of
the highest NUMA domain.

tl->mask() for NODE is set to the sd_numa_mask() which depends on the
global "sched_domains_curr_level" hack. "sched_domains_curr_level" is
set to the "tl->numa_level" during tl traversal in build_sched_domains()
calling sd_init() but was not reset before topology_span_sane().

Since "sched_domains_curr_level" still reflected the last "tl->numa_level"
value set during build_sched_domains(), topology_span_sane() for the NODE
domain trips when the span of the last NUMA domain overlaps.

Instead of replicating the "sched_domains_curr_level" hack, get rid of
it entirely and instead, pass the entire "sched_domain_topology_level"
object to tl->cpumask() function to prevent such mishap in the future.

sd_numa_mask() now directly references "tl->numa_level" instead of
relying on the global "sched_domains_curr_level" hack to index into
sched_domains_numa_masks[].

The original warning was reproducible on the following NUMA topology
reported by Leon:

    $ sudo numactl -H
    available: 5 nodes (0-4)
    node 0 cpus: 0 1
    node 0 size: 2927 MB
    node 0 free: 1603 MB
    node 1 cpus: 2 3
    node 1 size: 3023 MB
    node 1 free: 3008 MB
    node 2 cpus: 4 5
    node 2 size: 3023 MB
    node 2 free: 3007 MB
    node 3 cpus: 6 7
    node 3 size: 3023 MB
    node 3 free: 3002 MB
    node 4 cpus: 8 9
    node 4 size: 3022 MB
    node 4 free: 2718 MB
    node distances:
    node   0   1   2   3   4
      0:  10  39  38  37  36
      1:  39  10  38  37  36
      2:  38  38  10  37  36
      3:  37  37  37  10  36
      4:  36  36  36  36  10

The above topology can be mimicked using the following QEMU cmd that was
used to reproduce the warning and test the fix:

     sudo qemu-system-x86_64 -enable-kvm -cpu host \
     -m 20G -smp cpus=10,sockets=10 -machine q35 \
     -object memory-backend-ram,size=4G,id=m0 \
     -object memory-backend-ram,size=4G,id=m1 \
     -object memory-backend-ram,size=4G,id=m2 \
     -object memory-backend-ram,size=4G,id=m3 \
     -object memory-backend-ram,size=4G,id=m4 \
     -numa node,cpus=0-1,memdev=m0,nodeid=0 \
     -numa node,cpus=2-3,memdev=m1,nodeid=1 \
     -numa node,cpus=4-5,memdev=m2,nodeid=2 \
     -numa node,cpus=6-7,memdev=m3,nodeid=3 \
     -numa node,cpus=8-9,memdev=m4,nodeid=4 \
     -numa dist,src=0,dst=1,val=39 \
     -numa dist,src=0,dst=2,val=38 \
     -numa dist,src=0,dst=3,val=37 \
     -numa dist,src=0,dst=4,val=36 \
     -numa dist,src=1,dst=0,val=39 \
     -numa dist,src=1,dst=2,val=38 \
     -numa dist,src=1,dst=3,val=37 \
     -numa dist,src=1,dst=4,val=36 \
     -numa dist,src=2,dst=0,val=38 \
     -numa dist,src=2,dst=1,val=38 \
     -numa dist,src=2,dst=3,val=37 \
     -numa dist,src=2,dst=4,val=36 \
     -numa dist,src=3,dst=0,val=37 \
     -numa dist,src=3,dst=1,val=37 \
     -numa dist,src=3,dst=2,val=37 \
     -numa dist,src=3,dst=4,val=36 \
     -numa dist,src=4,dst=0,val=36 \
     -numa dist,src=4,dst=1,val=36 \
     -numa dist,src=4,dst=2,val=36 \
     -numa dist,src=4,dst=3,val=36 \
     ...

  [ prateek: Moved common functions to include/linux/sched/topology.h,
    reuse the common bits for s390 and ppc, commit message ]

Closes: https://lore.kernel.org/lkml/20250610110701.GA256154@unreal/ [1]
Fixes: ccf74128 ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") # ce29a7da, f55dac1d
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Tested-by: Valentin Schneider <vschneid@redhat.com> # x86
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com> # powerpc
Link: https://lore.kernel.org/lkml/a3de98387abad28592e6ab591f3ff6107fe01dc1.1755893468.git.tim.c.chen@linux.intel.com/ [2]
parent 8fd5485f
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -971,6 +971,10 @@ config SCHED_SMT
	  when dealing with POWER5 cpus at a cost of slightly increased
	  overhead in some places. If unsure say N here.

config SCHED_MC
	def_bool y
	depends on SMP

config PPC_DENORMALISATION
	bool "PowerPC denormalisation exception handling"
	depends on PPC_BOOK3S_64
+2 −0
Original line number Diff line number Diff line
@@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu)
#ifdef CONFIG_SMP
#include <asm/cputable.h>

struct cpumask *cpu_coregroup_mask(int cpu);

#ifdef CONFIG_PPC64
#include <asm/smp.h>

+11 −16
Original line number Diff line number Diff line
@@ -1028,19 +1028,19 @@ static int powerpc_shared_proc_flags(void)
 * We can't just pass cpu_l2_cache_mask() directly because
 * returns a non-const pointer and the compiler barfs on that.
 */
static const struct cpumask *shared_cache_mask(int cpu)
static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu)
{
	return per_cpu(cpu_l2_cache_map, cpu);
}

#ifdef CONFIG_SCHED_SMT
static const struct cpumask *smallcore_smt_mask(int cpu)
static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu)
{
	return cpu_smallcore_mask(cpu);
}
#endif

static struct cpumask *cpu_coregroup_mask(int cpu)
struct cpumask *cpu_coregroup_mask(int cpu)
{
	return per_cpu(cpu_coregroup_map, cpu);
}
@@ -1054,11 +1054,6 @@ static bool has_coregroup_support(void)
	return coregroup_enabled;
}

static const struct cpumask *cpu_mc_mask(int cpu)
{
	return cpu_coregroup_mask(cpu);
}

static int __init init_big_cores(void)
{
	int cpu;
@@ -1448,7 +1443,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask)
		return false;
	}

	cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
	cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));

	/* Update l2-cache mask with all the CPUs that are part of submask */
	or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
@@ -1538,7 +1533,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask)
		return;
	}

	cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
	cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));

	/* Update coregroup mask with all the CPUs that are part of submask */
	or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask);
@@ -1601,7 +1596,7 @@ static void add_cpu_to_masks(int cpu)

	/* If chip_id is -1; limit the cpu_core_mask to within PKG */
	if (chip_id == -1)
		cpumask_and(mask, mask, cpu_cpu_mask(cpu));
		cpumask_and(mask, mask, cpu_node_mask(cpu));

	for_each_cpu(i, mask) {
		if (chip_id == cpu_to_chip_id(i)) {
@@ -1701,22 +1696,22 @@ static void __init build_sched_topology(void)
	if (has_big_cores) {
		pr_info("Big cores detected but using small core scheduling\n");
		powerpc_topology[i++] =
			SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
			SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT);
	} else {
		powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
		powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT);
	}
#endif
	if (shared_caches) {
		powerpc_topology[i++] =
			SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
			SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE);
	}

	if (has_coregroup_support()) {
		powerpc_topology[i++] =
			SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
			SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC);
	}

	powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);
	powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG);

	/* There must be one trailing NULL entry left.  */
	BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
+7 −13
Original line number Diff line number Diff line
@@ -509,33 +509,27 @@ int topology_cpu_init(struct cpu *cpu)
	return rc;
}

static const struct cpumask *cpu_thread_mask(int cpu)
{
	return &cpu_topology[cpu].thread_mask;
}


const struct cpumask *cpu_coregroup_mask(int cpu)
{
	return &cpu_topology[cpu].core_mask;
}

static const struct cpumask *cpu_book_mask(int cpu)
static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu)
{
	return &cpu_topology[cpu].book_mask;
}

static const struct cpumask *cpu_drawer_mask(int cpu)
static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu)
{
	return &cpu_topology[cpu].drawer_mask;
}

static struct sched_domain_topology_level s390_topology[] = {
	SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
	SDTL_INIT(cpu_book_mask, NULL, BOOK),
	SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
	SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
	SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
	SDTL_INIT(tl_book_mask, NULL, BOOK),
	SDTL_INIT(tl_drawer_mask, NULL, DRAWER),
	SDTL_INIT(tl_pkg_mask, NULL, PKG),
	{ NULL, },
};

+4 −4
Original line number Diff line number Diff line
@@ -479,14 +479,14 @@ static int x86_cluster_flags(void)
static bool x86_has_numa_in_package;

static struct sched_domain_topology_level x86_topology[] = {
	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
	SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#ifdef CONFIG_SCHED_CLUSTER
	SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS),
	SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
	SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC),
	SDTL_INIT(tl_mc_mask, x86_core_flags, MC),
#endif
	SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG),
	SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG),
	{ NULL },
};

Loading