Commit 02baaa67 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull sched_ext updates from Tejun Heo:

 - Improve recovery from misbehaving BPF schedulers.

   When a scheduler puts many tasks with varying affinity restrictions
   on a shared DSQ, CPUs scanning through tasks they cannot run can
   overwhelm the system, causing lockups.

   Bypass mode now uses per-CPU DSQs with a load balancer to avoid this,
   and hooks into the hardlockup detector to attempt recovery.

   Add scx_cpu0 example scheduler to demonstrate this scenario.

 - Add lockless peek operation for DSQs to reduce lock contention for
   schedulers that need to query queue state during load balancing.

 - Allow scx_bpf_reenqueue_local() to be called from anywhere in
   preparation for deprecating cpu_acquire/release() callbacks in favor
   of generic BPF hooks.

 - Prepare for hierarchical scheduler support: add
   scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
   make scx_bpf_dsq_insert*() return bool, and wrap kfunc args in
   structs for future aux__prog parameter.

 - Implement cgroup_set_idle() callback to notify BPF schedulers when a
   cgroup's idle state changes.

 - Fix migration tasks being incorrectly downgraded from
   stop_sched_class to rt_sched_class across sched_ext enable/disable.
   Applied late as the fix is low risk and the bug subtle but needs
   stable backporting.

 - Various fixes and cleanups including cgroup exit ordering,
   SCX_KICK_WAIT reliability, and backward compatibility improvements.

* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
  sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
  sched_ext: tools: Removing duplicate targets during non-cross compilation
  sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
  sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
  sched_ext: Update comments replacing breather with aborting mechanism
  sched_ext: Implement load balancer for bypass mode
  sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
  sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
  sched_ext: Add scx_cpu0 example scheduler
  sched_ext: Hook up hardlockup detector
  sched_ext: Make handle_lockup() propagate scx_verror() result
  sched_ext: Refactor lockup handlers into handle_lockup()
  sched_ext: Make scx_exit() and scx_vexit() return bool
  sched_ext: Exit dispatch and move operations immediately when aborting
  sched_ext: Simplify breather mechanism with scx_aborting flag
  sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
  sched_ext: Refactor do_enqueue_task() local and global DSQ paths
  sched_ext: Use shorter slice in bypass mode
  sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
  sched_ext: Minor cleanups to scx_task_iter
  ...
parents 8449d325 1dd6c84f
Loading
Loading
Loading
Loading
+25 −2
Original line number Diff line number Diff line
@@ -17,7 +17,18 @@
enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,

	/*
	 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses
	 * to set the slice for a task that is selected for execution.
	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
	 * refill has been triggered.
	 *
	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
	 * mode. As making forward progress for all tasks is the main goal of
	 * the bypass mode, a shorter slice is used.
	 */
	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
	SCX_SLICE_BYPASS	=  5 * 1000000, /*  5ms */
	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
};

@@ -46,6 +57,7 @@ enum scx_dsq_id_flags {
	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
};
@@ -58,6 +70,7 @@ enum scx_dsq_id_flags {
 */
struct scx_dispatch_q {
	raw_spinlock_t		lock;
	struct task_struct __rcu *first_task; /* lockless peek at head */
	struct list_head	list;	/* tasks in dispatch order */
	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
	u32			nr;
@@ -136,6 +149,13 @@ struct scx_dsq_list_node {
	u32			priv;		/* can be used by iter cursor */
};

#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv)				\
	(struct scx_dsq_list_node) {						\
		.node = LIST_HEAD_INIT((__node).node),				\
		.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags),			\
		.priv = (__priv),						\
	}

/*
 * The following is embedded in task_struct and contains all fields necessary
 * for a task to be scheduled by SCX.
@@ -207,16 +227,18 @@ struct sched_ext_entity {
	struct list_head	tasks_node;
};

void sched_ext_free(struct task_struct *p);
void sched_ext_dead(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);
void scx_softlockup(u32 dur_s);
bool scx_hardlockup(int cpu);
bool scx_rcu_cpu_stall(void);

#else	/* !CONFIG_SCHED_CLASS_EXT */

static inline void sched_ext_free(struct task_struct *p) {}
static inline void sched_ext_dead(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {}
static inline bool scx_hardlockup(int cpu) { return false; }
static inline bool scx_rcu_cpu_stall(void) { return false; }

#endif	/* CONFIG_SCHED_CLASS_EXT */
@@ -228,6 +250,7 @@ struct scx_task_group {
	u64			bw_period_us;
	u64			bw_quota_us;
	u64			bw_burst_us;
	bool			idle;
#endif
};

+39 −0
Original line number Diff line number Diff line
@@ -45,6 +45,45 @@ TRACE_EVENT(sched_ext_event,
	)
);

TRACE_EVENT(sched_ext_bypass_lb,

	TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced,
		 __u32 before_min, __u32 before_max,
		 __u32 after_min, __u32 after_max),

	TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced,
		before_min, before_max, after_min, after_max),

	TP_STRUCT__entry(
		__field(	__u32,		node		)
		__field(	__u32,		nr_cpus		)
		__field(	__u32,		nr_tasks	)
		__field(	__u32,		nr_balanced	)
		__field(	__u32,		before_min	)
		__field(	__u32,		before_max	)
		__field(	__u32,		after_min	)
		__field(	__u32,		after_max	)
	),

	TP_fast_assign(
		__entry->node		= node;
		__entry->nr_cpus	= nr_cpus;
		__entry->nr_tasks	= nr_tasks;
		__entry->nr_balanced	= nr_balanced;
		__entry->before_min	= before_min;
		__entry->before_max	= before_max;
		__entry->after_min	= after_min;
		__entry->after_max	= after_max;
	),

	TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u",
		  __entry->node, __entry->nr_cpus,
		  __entry->nr_tasks, __entry->nr_balanced,
		  __entry->before_min, __entry->after_min,
		  __entry->before_max, __entry->after_max
	)
);

#endif /* _TRACE_SCHED_EXT_H */

/* This part must be outside protection */
+0 −1
Original line number Diff line number Diff line
@@ -736,7 +736,6 @@ void __put_task_struct(struct task_struct *tsk)
	WARN_ON(tsk == current);

	unwind_task_free(tsk);
	sched_ext_free(tsk);
	io_uring_free(tsk);
	cgroup_task_free(tsk);
	task_numa_free(tsk, true);
+6 −0
Original line number Diff line number Diff line
@@ -5143,6 +5143,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
		if (prev->sched_class->task_dead)
			prev->sched_class->task_dead(prev);

		/*
		 * sched_ext_dead() must come before cgroup_task_dead() to
		 * prevent cgroups from being removed while its member tasks are
		 * visible to SCX schedulers.
		 */
		sched_ext_dead(prev);
		cgroup_task_dead(prev);

		/* Task is done with its stack. */
+798 −269

File changed.

Preview size limit exceeded, changes collapsed.

Loading