Commit bf76f23a authored by Linus Torvalds
Browse files

Merge tag 'sched-core-2025-07-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "Core scheduler changes:

   - Better tracking of the maximum lag of tasks in the presence of
     different slice durations, for better handling of lag in the fair
     scheduler (Vincent Guittot)

   - Clean up and standardize #if/#else/#endif markers throughout the
     entire scheduler code base (Ingo Molnar)

   - Make SMP unconditional: build the SMP scheduler's data structures
     and logic on UP kernels too, even though they are not used, to
     simplify the scheduler and remove around 200 #ifdef/[#else]/#endif
     blocks from the scheduler (Ingo Molnar)

   - Reorganize cgroup bandwidth control interface handling for better
     interfacing with sched_ext (Tejun Heo)

  Balancing:

   - Bump sd->max_newidle_lb_cost when newidle balance fails (Chris
     Mason)

   - Remove sched_domain_topology_level::flags to simplify the code
     (Prateek Nayak)

   - Simplify and clean up build_sched_topology() (Li Chen)

   - Optimize build_sched_topology() on large machines (Li Chen)

  Real-time scheduling:

   - Add initial version of proxy execution: a mechanism for
     mutex-owning tasks to inherit the scheduling context of higher
     priority waiters.

     Currently limited to a single runqueue, conditional on
     CONFIG_EXPERT, and subject to other limitations (John Stultz,
     Peter Zijlstra, Valentin Schneider)

   - Deadline scheduler (Juri Lelli):
      - Fix dl_servers initialization order (Juri Lelli)
      - Fix DL scheduler's root domain reinitialization logic (Juri
        Lelli)
      - Fix accounting bugs after global limits change (Juri Lelli)
      - Fix scalability regression by implementing less aggressive
        dl_server handling (Peter Zijlstra)

  PSI:

   - Improve scalability by optimizing psi_group_change() cpu_clock()
     usage (Peter Zijlstra)

  Rust changes:

   - Make Task, CondVar and PollCondVar methods inline to avoid
     unnecessary function calls (Kunwu Chan, Panagiotis Foliadis)

   - Add might_sleep() support for Rust code: Rust's "#[track_caller]"
     mechanism is used so that Rust's might_sleep() doesn't need to be
     defined as a macro (Fujita Tomonori)

   - Introduce file_from_location() (Boqun Feng)

  Debugging & instrumentation:

   - Make clangd usable with scheduler source code files again (Peter
     Zijlstra)

   - tools: Add root_domains_dump.py which dumps root domains info (Juri
     Lelli)

   - tools: Add dl_bw_dump.py for printing bandwidth accounting info
     (Juri Lelli)

  Misc cleanups & fixes:

   - Remove play_idle() (Feng Lee)

   - Fix check_preemption_disabled() (Sebastian Andrzej Siewior)

   - Do not call __put_task_struct() on RT if pi_blocked_on is set (Luis
     Claudio R. Goncalves)

   - Correct the comment in place_entity() (wang wei)"

* tag 'sched-core-2025-07-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (84 commits)
  sched/idle: Remove play_idle()
  sched: Do not call __put_task_struct() on rt if pi_blocked_on is set
  sched: Start blocked_on chain processing in find_proxy_task()
  sched: Fix proxy/current (push,pull)ability
  sched: Add an initial sketch of the find_proxy_task() function
  sched: Fix runtime accounting w/ split exec & sched contexts
  sched: Move update_curr_task logic into update_curr_se
  locking/mutex: Add p->blocked_on wrappers for correctness checks
  locking/mutex: Rework task_struct::blocked_on
  sched: Add CONFIG_SCHED_PROXY_EXEC & boot argument to enable/disable
  sched/topology: Remove sched_domain_topology_level::flags
  x86/smpboot: avoid SMT domain attach/destroy if SMT is not enabled
  x86/smpboot: moves x86_topology to static initialize and truncate
  x86/smpboot: remove redundant CONFIG_SCHED_SMT
  smpboot: introduce SDTL_INIT() helper to tidy sched topology setup
  tools/sched: Add dl_bw_dump.py for printing bandwidth accounting info
  tools/sched: Add root_domains_dump.py which dumps root domains info
  sched/deadline: Fix accounting after global limits change
  sched/deadline: Reset extra_bw to max_bw when clearing root domains
  sched/deadline: Initialize dl_servers after SMP
  ...
parents 14bed9bc 1b5f1454
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -6410,6 +6410,11 @@
	sa1100ir	[NET]
			See drivers/net/irda/sa1100_ir.c.

	sched_proxy_exec= [KNL]
			Enables or disables "proxy execution" style
			solution to mutex-based priority inversion.
			Format: <bool>

	sched_verbose	[KNL,EARLY] Enables verbose scheduler debug messages.

	schedstats=	[KNL,X86] Enable or disable scheduled statistics.
+1 −0
Original line number Diff line number Diff line
@@ -22319,6 +22319,7 @@ F: include/linux/wait.h
F:	include/uapi/linux/sched.h
F:	kernel/fork.c
F:	kernel/sched/
F:	tools/sched/
SCHEDULER - SCHED_EXT
R:	Tejun Heo <tj@kernel.org>
+10 −15
Original line number Diff line number Diff line
@@ -1700,28 +1700,23 @@ static void __init build_sched_topology(void)
#ifdef CONFIG_SCHED_SMT
	if (has_big_cores) {
		pr_info("Big cores detected but using small core scheduling\n");
		powerpc_topology[i++] = (struct sched_domain_topology_level){
			smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
		};
		powerpc_topology[i++] =
			SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
	} else {
		powerpc_topology[i++] = (struct sched_domain_topology_level){
			cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
		};
		powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
	}
#endif
	if (shared_caches) {
		powerpc_topology[i++] = (struct sched_domain_topology_level){
			shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE)
		};
		powerpc_topology[i++] =
			SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
	}

	if (has_coregroup_support()) {
		powerpc_topology[i++] = (struct sched_domain_topology_level){
			cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC)
		};
		powerpc_topology[i++] =
			SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
	}
	powerpc_topology[i++] = (struct sched_domain_topology_level){
		cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG)
	};

	powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);

	/* There must be one trailing NULL entry left.  */
	BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
+5 −5
Original line number Diff line number Diff line
@@ -531,11 +531,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu)
}

static struct sched_domain_topology_level s390_topology[] = {
	{ cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
	{ cpu_book_mask, SD_INIT_NAME(BOOK) },
	{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
	SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
	SDTL_INIT(cpu_book_mask, NULL, BOOK),
	SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
	{ NULL, },
};

+24 −27
Original line number Diff line number Diff line
@@ -478,44 +478,41 @@ static int x86_cluster_flags(void)
 */
static bool x86_has_numa_in_package;

static struct sched_domain_topology_level x86_topology[6];

static void __init build_sched_topology(void)
{
	int i = 0;

#ifdef CONFIG_SCHED_SMT
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
	};
#endif
static struct sched_domain_topology_level x86_topology[] = {
	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#ifdef CONFIG_SCHED_CLUSTER
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
	};
	SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
	};
	SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC),
#endif
	SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG),
	{ NULL },
};

static void __init build_sched_topology(void)
{
	struct sched_domain_topology_level *topology = x86_topology;

	/*
	 * When there is NUMA topology inside the package skip the PKG domain
	 * since the NUMA domains will auto-magically create the right spanning
	 * domains based on the SLIT.
	 * When there is NUMA topology inside the package invalidate the
	 * PKG domain since the NUMA domains will auto-magically create the
	 * right spanning domains based on the SLIT.
	 */
	if (!x86_has_numa_in_package) {
		x86_topology[i++] = (struct sched_domain_topology_level){
			cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
		};
	if (x86_has_numa_in_package) {
		unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;

		memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
	}

	/*
	 * There must be one trailing NULL entry left.
	 * Drop the SMT domains if there is only one thread per-core
	 * since it'll get degenerated by the scheduler anyways.
	 */
	BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
	if (cpu_smt_num_threads <= 1)
		++topology;

	set_sched_topology(x86_topology);
	set_sched_topology(topology);
}

void set_cpu_sibling_map(int cpu)
Loading