Commit 89c572e2 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched-core-2024-03-11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Fix inconsistency in misfit task load-balancing

 - Fix CPU isolation bugs in the task-wakeup logic

 - Rework and unify the sched_use_asym_prio() and sched_asym_prefer()
   logic

 - Clean up and simplify ->avg_* accesses

 - Misc cleanups and fixes

* tag 'sched-core-2024-03-11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/topology: Rename SD_SHARE_PKG_RESOURCES to SD_SHARE_LLC
  sched/fair: Check the SD_ASYM_PACKING flag in sched_use_asym_prio()
  sched/fair: Rework sched_use_asym_prio() and sched_asym_prefer()
  sched/fair: Remove unused parameter from sched_asym()
  sched/topology: Remove duplicate descriptions from TOPOLOGY_SD_FLAGS
  sched/fair: Simplify the update_sd_pick_busiest() logic
  sched/fair: Do strict inequality check for busiest misfit task group
  sched/fair: Remove unnecessary goto in update_sd_lb_stats()
  sched/fair: Take the scheduling domain into account in select_idle_core()
  sched/fair: Take the scheduling domain into account in select_idle_smt()
  sched/fair: Add READ_ONCE() and use existing helper function to access ->avg_irq
  sched/fair: Use existing helper functions to access ->avg_rt and ->avg_dl
  sched/core: Simplify code by removing duplicate #ifdefs
parents a5b1a017 54de4427
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -984,7 +984,7 @@ static bool shared_caches __ro_after_init;
/* cpumask of CPUs with asymmetric SMT dependency */
static int powerpc_smt_flags(void)
{
	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;

	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
@@ -1010,9 +1010,9 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack);
static int powerpc_shared_cache_flags(void)
{
	if (static_branch_unlikely(&splpar_asym_pack))
		return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
		return SD_SHARE_LLC | SD_ASYM_PACKING;

	return SD_SHARE_PKG_RESOURCES;
	return SD_SHARE_LLC;
}

static int powerpc_shared_proc_flags(void)
+2 −2
Original line number Diff line number Diff line
@@ -117,13 +117,13 @@ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)

/*
 * Domain members share CPU package resources (i.e. caches)
 * Domain members share CPU Last Level Caches
 *
 * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
 *               the same cache(s).
 * NEEDS_GROUPS: Caches are shared between groups.
 */
SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Only a single load balancing instance
+3 −3
Original line number Diff line number Diff line
@@ -38,21 +38,21 @@ extern const struct sd_flag_debug sd_flag_debug[];
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
{
	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
	return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
}
#endif

#ifdef CONFIG_SCHED_CLUSTER
static inline int cpu_cluster_flags(void)
{
	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
	return SD_CLUSTER | SD_SHARE_LLC;
}
#endif

#ifdef CONFIG_SCHED_MC
static inline int cpu_core_flags(void)
{
	return SD_SHARE_PKG_RESOURCES;
	return SD_SHARE_LLC;
}
#endif

+1 −3
Original line number Diff line number Diff line
@@ -1792,7 +1792,6 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css);
#endif

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_UCLAMP_TASK
#ifdef CONFIG_UCLAMP_TASK_GROUP
static void uclamp_update_root_tg(void)
{
@@ -1898,7 +1897,6 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
	return result;
}
#endif
#endif

static int uclamp_validate(struct task_struct *p,
			   const struct sched_attr *attr)
@@ -2065,7 +2063,7 @@ static void __init init_uclamp(void)
	}
}

#else /* CONFIG_UCLAMP_TASK */
#else /* !CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline int uclamp_validate(struct task_struct *p,
+46 −64
Original line number Diff line number Diff line
@@ -7289,7 +7289,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
		if (!available_idle_cpu(cpu)) {
			idle = false;
			if (*idle_cpu == -1) {
				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
					*idle_cpu = cpu;
					break;
				}
@@ -7297,7 +7297,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
			}
			break;
		}
		if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
		if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
			*idle_cpu = cpu;
	}

@@ -7311,13 +7311,19 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
/*
 * Scan the local SMT mask for idle CPUs.
 */
static int select_idle_smt(struct task_struct *p, int target)
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	int cpu;

	for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
		if (cpu == target)
			continue;
		/*
		 * Check if the CPU is in the LLC scheduling domain of @target.
		 * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
		 */
		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
			continue;
		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
			return cpu;
	}
@@ -7341,7 +7347,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
	return __select_idle_cpu(core, p);
}

static inline int select_idle_smt(struct task_struct *p, int target)
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}
@@ -7591,7 +7597,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
		has_idle_core = test_idle_cores(target);

		if (!has_idle_core && cpus_share_cache(prev, target)) {
			i = select_idle_smt(p, prev);
			i = select_idle_smt(p, sd, prev);
			if ((unsigned int)i < nr_cpumask_bits)
				return i;
		}
@@ -9237,19 +9243,17 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)

static inline bool others_have_blocked(struct rq *rq)
{
	if (READ_ONCE(rq->avg_rt.util_avg))
	if (cpu_util_rt(rq))
		return true;

	if (READ_ONCE(rq->avg_dl.util_avg))
	if (cpu_util_dl(rq))
		return true;

	if (thermal_load_avg(rq))
		return true;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if (READ_ONCE(rq->avg_irq.util_avg))
	if (cpu_util_irq(rq))
		return true;
#endif

	return false;
}
@@ -9506,8 +9510,8 @@ static unsigned long scale_rt_capacity(int cpu)
	 * avg_thermal.load_avg tracks thermal pressure and the weighted
	 * average uses the actual delta max capacity(load).
	 */
	used = READ_ONCE(rq->avg_rt.util_avg);
	used += READ_ONCE(rq->avg_dl.util_avg);
	used = cpu_util_rt(rq);
	used += cpu_util_dl(rq);
	used += thermal_load_avg(rq);

	if (unlikely(used >= max))
@@ -9740,51 +9744,49 @@ group_type group_classify(unsigned int imbalance_pct,
 */
static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
{
	if (!(sd->flags & SD_ASYM_PACKING))
		return false;

	if (!sched_smt_active())
		return true;

	return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
}

static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
{
	/*
	 * First check if @dst_cpu can do asym_packing load balance. Only do it
	 * if it has higher priority than @src_cpu.
	 */
	return sched_use_asym_prio(sd, dst_cpu) &&
		sched_asym_prefer(dst_cpu, src_cpu);
}

/**
 * sched_asym - Check if the destination CPU can do asym_packing load balance
 * sched_group_asym - Check if the destination CPU can do asym_packing balance
 * @env:	The load balancing environment
 * @sds:	Load-balancing data with statistics of the local group
 * @sgs:	Load-balancing statistics of the candidate busiest group
 * @group:	The candidate busiest group
 *
 * @env::dst_cpu can do asym_packing if it has higher priority than the
 * preferred CPU of @group.
 *
 * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
 * can do asym_packing balance only if all its SMT siblings are idle. Also, it
 * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
 * imbalances in the number of CPUS are dealt with in find_busiest_group().
 *
 * If we are balancing load within an SMT core, or at PKG domain level, always
 * proceed.
 *
 * Return: true if @env::dst_cpu can do with asym_packing load balance. False
 * otherwise.
 */
static inline bool
sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs,
	   struct sched_group *group)
sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
{
	/* Ensure that the whole local core is idle, if applicable. */
	if (!sched_use_asym_prio(env->sd, env->dst_cpu))
		return false;

	/*
	 * CPU priorities does not make sense for SMT cores with more than one
	 * CPU priorities do not make sense for SMT cores with more than one
	 * busy sibling.
	 */
	if (group->flags & SD_SHARE_CPUCAPACITY) {
		if (sgs->group_weight - sgs->idle_cpus != 1)
	if ((group->flags & SD_SHARE_CPUCAPACITY) &&
	    (sgs->group_weight - sgs->idle_cpus != 1))
		return false;
	}

	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
	return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
}

/* One group has more than one SMT CPU while the other group does not */
@@ -9938,11 +9940,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
	sgs->group_weight = group->group_weight;

	/* Check if dst CPU is idle and preferred to this group */
	if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
	    env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
	    sched_asym(env, sds, sgs, group)) {
	if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
	    sched_group_asym(env, sgs, group))
		sgs->group_asym_packing = 1;
	}

	/* Check for loaded SMT group to be balanced to dst CPU */
	if (!local_group && smt_balance(env, sgs, group))
@@ -10006,9 +10006,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
	switch (sgs->group_type) {
	case group_overloaded:
		/* Select the overloaded group with highest avg_load. */
		if (sgs->avg_load <= busiest->avg_load)
			return false;
		break;
		return sgs->avg_load > busiest->avg_load;

	case group_imbalanced:
		/*
@@ -10019,18 +10017,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,

	case group_asym_packing:
		/* Prefer to move from lowest priority CPU's work */
		if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
			return false;
		break;
		return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);

	case group_misfit_task:
		/*
		 * If we have more than one misfit sg go with the biggest
		 * misfit.
		 */
		if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
			return false;
		break;
		return sgs->group_misfit_task_load > busiest->group_misfit_task_load;

	case group_smt_balance:
		/*
@@ -10182,10 +10176,8 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
	 * be computed and tested before calling idle_cpu_without().
	 */

#ifdef CONFIG_SMP
	if (rq->ttwu_pending)
		return 0;
#endif

	return 1;
}
@@ -10578,16 +10570,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd

		update_sg_lb_stats(env, sds, sg, sgs, &sg_status);

		if (local_group)
			goto next_group;


		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
			sds->busiest = sg;
			sds->busiest_stat = *sgs;
		}

next_group:
		/* Now, start updating sd_lb_stats */
		sds->total_load += sgs->group_load;
		sds->total_capacity += sgs->group_capacity;
@@ -10691,7 +10678,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
	 */
	if (local->group_type == group_has_spare) {
		if ((busiest->group_type > group_fully_busy) &&
		    !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
		    !(env->sd->flags & SD_SHARE_LLC)) {
			/*
			 * If busiest is overloaded, try to fill spare
			 * capacity. This might end up creating spare capacity
@@ -11038,10 +11025,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
		 * If balancing between cores, let lower priority CPUs help
		 * SMT cores with more than one busy sibling.
		 */
		if ((env->sd->flags & SD_ASYM_PACKING) &&
		    sched_use_asym_prio(env->sd, i) &&
		    sched_asym_prefer(i, env->dst_cpu) &&
		    nr_running == 1)
		if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
			continue;

		switch (env->migration_type) {
@@ -11137,8 +11121,7 @@ asym_active_balance(struct lb_env *env)
	 * the lower priority @env::dst_cpu help it. Do not follow
	 * CPU priority.
	 */
	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
	       sched_use_asym_prio(env->sd, env->dst_cpu) &&
	return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
	       (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
		!sched_use_asym_prio(env->sd, env->src_cpu));
}
@@ -11910,8 +11893,7 @@ static void nohz_balancer_kick(struct rq *rq)
		 * preferred CPU must be idle.
		 */
		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
			if (sched_use_asym_prio(sd, i) &&
			    sched_asym_prefer(i, cpu)) {
			if (sched_asym(sd, i, cpu)) {
				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
				goto unlock;
			}
Loading