Commit 6d2c10e8 authored by Linus Torvalds
Browse files

Merge tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "Scalability and load-balancing improvements:

   - Enable scheduler feature NEXT_BUDDY (Mel Gorman)

   - Reimplement NEXT_BUDDY to align with EEVDF goals (Mel Gorman)

   - Skip sched_balance_running cmpxchg when balance is not due (Tim
     Chen)

   - Implement generic code for architecture specific sched domain NUMA
     distances (Tim Chen)

   - Optimize the NUMA distances of the sched-domains builds of Intel
     Granite Rapids (GNR) and Clearwater Forest (CWF) platforms (Tim
     Chen)

   - Implement proportional newidle balance: a randomized algorithm that
     runs newidle balancing proportional to its success rate. (Peter
     Zijlstra)

  Scheduler infrastructure changes:

   - Implement the 'sched_change' scoped_guard() pattern for the entire
     scheduler (Peter Zijlstra)

   - More broadly utilize the sched_change guard (Peter Zijlstra)

   - Add support to pick functions to take runqueue-flags (Joel
     Fernandes)

   - Provide and use set_need_resched_current() (Peter Zijlstra)

  Fair scheduling enhancements:

   - Forfeit vruntime on yield (Fernand Sieber)

   - Only update stats for allowed CPUs when looking for dst group (Adam
     Li)

  CPU-core scheduling enhancements:

   - Optimize core cookie matching check (Fernand Sieber)

  Deadline scheduler fixes:

   - Only set free_cpus for online runqueues (Doug Berger)

   - Fix dl_server time accounting (Peter Zijlstra)

   - Fix dl_server stop condition (Peter Zijlstra)

  Proxy scheduling fixes:

   - Yield the donor task (Fernand Sieber)

  Fixes and cleanups:

   - Fix do_set_cpus_allowed() locking (Peter Zijlstra)

   - Fix migrate_disable_switch() locking (Peter Zijlstra)

   - Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
     (Hao Jia)

   - Increase sched_tick_remote timeout (Phil Auld)

   - sched/deadline: Use cpumask_weight_and() in dl_bw_cpus() (Shrikanth
     Hegde)

   - sched/deadline: Clean up select_task_rq_dl() (Shrikanth Hegde)"

* tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits)
  sched: Provide and use set_need_resched_current()
  sched/fair: Proportional newidle balance
  sched/fair: Small cleanup to update_newidle_cost()
  sched/fair: Small cleanup to sched_balance_newidle()
  sched/fair: Revert max_newidle_lb_cost bump
  sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals
  sched/fair: Enable scheduler feature NEXT_BUDDY
  sched: Increase sched_tick_remote timeout
  sched/fair: Have SD_SERIALIZE affect newidle balancing
  sched/fair: Skip sched_balance_running cmpxchg when balance is not due
  sched/deadline: Minor cleanup in select_task_rq_dl()
  sched/deadline: Use cpumask_weight_and() in dl_bw_cpus
  sched/deadline: Document dl_server
  sched/deadline: Fix dl_server stop condition
  sched/deadline: Fix dl_server time accounting
  sched/core: Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
  sched/eevdf: Fix min_vruntime vs avg_vruntime
  sched/core: Add comment explaining force-idle vruntime snapshots
  sched/core: Optimize core cookie matching check
  sched/proxy: Yield the donor task
  ...
parents 6c26fbe8 c04507ac
Loading
Loading
Loading
Loading
+1 −2
Original line number Diff line number Diff line
@@ -199,8 +199,7 @@ static void pfault_interrupt(struct ext_code ext_code,
			 * return to userspace schedule() to block.
			 */
			__set_current_state(TASK_UNINTERRUPTIBLE);
			set_tsk_need_resched(tsk);
			set_preempt_need_resched();
			set_need_resched_current();
		}
	}
out:
+2 −0
Original line number Diff line number Diff line
@@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick

/*
 * Architecture-specific NUMA distance between nodes @from and @to, used
 * for the purpose of building sched domains.
 */
extern int arch_sched_node_distance(int from, int to);

#endif /* _ASM_X86_TOPOLOGY_H */
+70 −0
Original line number Diff line number Diff line
@@ -515,6 +515,76 @@ static void __init build_sched_topology(void)
	set_sched_topology(topology);
}

#ifdef CONFIG_NUMA
/* Cached result of avg_remote_numa_distance(); 0 means "not computed yet". */
static int sched_avg_remote_distance;

/*
 * Return the average node_distance() over all ordered pairs of N_CPU nodes
 * whose distance is at least REMOTE_DISTANCE. The value is computed on the
 * first call and cached; when no such pair exists, REMOTE_DISTANCE is used.
 */
static int avg_remote_numa_distance(void)
{
	int src, dst;
	int sum = 0, count = 0;

	/* A positive cached value means the average was already computed. */
	if (sched_avg_remote_distance > 0)
		return sched_avg_remote_distance;

	for_each_node_state(src, N_CPU) {
		for_each_node_state(dst, N_CPU) {
			int dist = node_distance(src, dst);

			/* Only distances at or above REMOTE_DISTANCE count. */
			if (dist < REMOTE_DISTANCE)
				continue;

			count++;
			sum += dist;
		}
	}

	sched_avg_remote_distance = count ? sum / count : REMOTE_DISTANCE;

	return sched_avg_remote_distance;
}

/*
 * Return the distance between nodes @from and @to to be used when building
 * sched domains.
 *
 * On every model except the two listed below this is plain node_distance().
 * On those models, distances at or above REMOTE_DISTANCE are replaced by a
 * single averaged remote distance when NUMA-in-package is in use and there
 * is more than one package.
 */
int arch_sched_node_distance(int from, int to)
{
	int d = node_distance(from, to);

	/* Only GNR and CWF get their remote distances flattened. */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_GRANITERAPIDS_X:
	case INTEL_ATOM_DARKMONT_X:
		break;
	default:
		return d;
	}

	/* Nothing to trim: no NUMA in package, one package, or not remote. */
	if (!x86_has_numa_in_package)
		return d;
	if (topology_max_packages() == 1)
		return d;
	if (d < REMOTE_DISTANCE)
		return d;

	/*
	 * SNC can produce many distinct remote NUMA distances, which in
	 * turn produce NUMA domain levels that mix local nodes with only
	 * part of the remote nodes.
	 *
	 * For sched domain construction, drop the fine-grained distances
	 * to nodes in the remote package so that all of those nodes land
	 * in the same sched group. This keeps the NUMA domains simple and
	 * avoids extra levels mixing different remote nodes and local
	 * nodes.
	 *
	 * GNR and CWF are not expected in systems with more than 2
	 * packages or more than 2 hops between packages. With more than 2
	 * packages a single average would not be appropriate, because the
	 * average distance to each remote package could differ.
	 */
	WARN_ONCE(topology_max_packages() > 2,
		  "sched: Expect only up to 2 packages for GNR or CWF, "
		  "but saw %d packages when building sched domains.",
		  topology_max_packages());

	return avg_remote_numa_distance();
}
#endif /* CONFIG_NUMA */

void set_cpu_sibling_map(int cpu)
{
	bool has_smt = __max_threads_per_core > 1;
+5 −0
Original line number Diff line number Diff line
@@ -348,6 +348,11 @@ _label: \
/*
 * Emit a static const bool named class_<_name>_is_conditional holding
 * @_is_cond. It is marked __maybe_unused so that a class which never reads
 * the flag does not trigger an unused-variable warning.
 */
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond)	\
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond

/*
 * Declare class @_name as unconditional: its class_<_name>_is_conditional
 * flag is false, and a class_<_name>_lock_ptr() helper is provided that
 * ignores its argument and always returns the non-NULL value (void *)1.
 */
#define DEFINE_CLASS_IS_UNCONDITIONAL(_name)		\
	__DEFINE_CLASS_IS_CONDITIONAL(_name, false);	\
	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
	{ return (void *)1; }

#define __GUARD_IS_ERR(_ptr)                                       \
	({                                                         \
		unsigned long _rc = (__force unsigned long)(_ptr); \
+20 −13
Original line number Diff line number Diff line
@@ -637,8 +637,8 @@ struct sched_rt_entity {
#endif
} __randomize_layout;

typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
struct rq_flags;
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf);

struct sched_dl_entity {
	struct rb_node			rb_node;
@@ -685,20 +685,22 @@ struct sched_dl_entity {
	 *
	 * @dl_server tells if this is a server entity.
	 *
	 * @dl_defer tells if this is a deferred or regular server. For
	 * now only defer server exists.
	 *
	 * @dl_defer_armed tells if the deferrable server is waiting
	 * for the replenishment timer to activate it.
	 *
	 * @dl_server_active tells if the dlserver is active (started).
	 * dlserver is started on first cfs enqueue on an idle runqueue
	 * and is stopped when a dequeue results in 0 cfs tasks on the
	 * runqueue. In other words, dlserver is active only when cpu's
	 * runqueue has at least one cfs task.
	 *
	 * @dl_defer tells if this is a deferred or regular server. For
	 * now only defer server exists.
	 *
	 * @dl_defer_armed tells if the deferrable server is waiting
	 * for the replenishment timer to activate it.
	 *
	 * @dl_defer_running tells if the deferrable server is actually
	 * running, skipping the defer phase.
	 *
	 * @dl_defer_idle tracks idle state
	 */
	unsigned int			dl_throttled      : 1;
	unsigned int			dl_yielded        : 1;
@@ -709,6 +711,7 @@ struct sched_dl_entity {
	unsigned int			dl_defer	  : 1;
	unsigned int			dl_defer_armed	  : 1;
	unsigned int			dl_defer_running  : 1;
	unsigned int			dl_defer_idle     : 1;

	/*
	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -730,9 +733,6 @@ struct sched_dl_entity {
	 * dl_server_update().
	 *
	 * @rq the runqueue this server is for
	 *
	 * @server_has_tasks() returns true if @server_pick return a
	 * runnable task.
	 */
	struct rq			*rq;
	dl_server_pick_f		server_pick_task;
@@ -1861,8 +1861,8 @@ extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);

/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);

/**
 * set_cpus_allowed_ptr - set CPU affinity mask of a task
@@ -2058,6 +2058,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}

/*
 * Mark the current task as needing a reschedule.
 *
 * Sets the need-resched flag on current via set_tsk_need_resched() and
 * then calls set_preempt_need_resched(). Callers are expected to have
 * interrupts disabled; lockdep_assert_irqs_disabled() checks this.
 */
static inline void set_need_resched_current(void)
{
	lockdep_assert_irqs_disabled();
	set_tsk_need_resched(current);
	set_preempt_need_resched();
}

/*
 * cond_resched() and cond_resched_lock(): latency reduction via
 * explicit rescheduling in places that are safe. The return
Loading