Commit 7fb39e4e authored by Tejun Heo's avatar Tejun Heo
Browse files

sched_ext: Save and restore scx_locked_rq across SCX_CALL_OP



SCX_CALL_OP{,_RET}() unconditionally clears scx_locked_rq_state to NULL on
exit. Correct at the top level, but ops can recurse via
scx_bpf_sub_dispatch(): a parent's ops.dispatch calls the helper, which
invokes the child's ops.dispatch under another SCX_CALL_OP. When the inner
call returns, the NULL clobbers the outer's state. The parent's BPF then
calls kfuncs like scx_bpf_cpuperf_set() which read scx_locked_rq()==NULL and
re-acquire the already-held rq.

Snapshot scx_locked_rq_state on entry and restore on exit. Rename the rq
parameter to locked_rq across all SCX_CALL_OP* macros so the snapshot local
can be typed as 'struct rq *' without colliding with the parameter token in
the expansion. SCX_CALL_OP_TASK{,_RET}() and SCX_CALL_OP_2TASKS_RET() funnel
through the two base macros and inherit the fix.

Fixes: 4f8b1228 ("sched_ext: Add basic building blocks for nested sub-scheduler dispatching")
Reported-by: default avatarChris Mason <clm@meta.com>
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Reviewed-by: default avatarAndrea Righi <arighi@nvidia.com>
parent 2f2ea770
Loading
Loading
Loading
Loading
+30 −19
Original line number Diff line number Diff line
@@ -470,24 +470,35 @@ static inline void update_locked_rq(struct rq *rq)
	__this_cpu_write(scx_locked_rq_state, rq);
}

#define SCX_CALL_OP(sch, op, rq, args...)					\
/*
 * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
 * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit.
 */
#define SCX_CALL_OP(sch, op, locked_rq, args...)				\
do {										\
	if (rq)									\
		update_locked_rq(rq);						\
	struct rq *__prev_locked_rq;						\
										\
	if (locked_rq) {							\
		__prev_locked_rq = scx_locked_rq();				\
		update_locked_rq(locked_rq);					\
	}									\
	(sch)->ops.op(args);							\
	if (rq)									\
		update_locked_rq(NULL);						\
	if (locked_rq)								\
		update_locked_rq(__prev_locked_rq);				\
} while (0)

#define SCX_CALL_OP_RET(sch, op, rq, args...)					\
#define SCX_CALL_OP_RET(sch, op, locked_rq, args...)				\
({										\
	struct rq *__prev_locked_rq;						\
	__typeof__((sch)->ops.op(args)) __ret;					\
										\
	if (rq)									\
		update_locked_rq(rq);						\
	if (locked_rq) {							\
		__prev_locked_rq = scx_locked_rq();				\
		update_locked_rq(locked_rq);					\
	}									\
	__ret = (sch)->ops.op(args);						\
	if (rq)									\
		update_locked_rq(NULL);						\
	if (locked_rq)								\
		update_locked_rq(__prev_locked_rq);				\
	__ret;									\
})

@@ -499,39 +510,39 @@ do { \
 * those subject tasks.
 *
 * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
 * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock
 * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
 * kf_tasks[] is set, @p's scheduler-protected fields are stable.
 * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's
 * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu.
 * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
 *
 * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
 * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
 * while a previous one is still in progress.
 */
#define SCX_CALL_OP_TASK(sch, op, rq, task, args...)				\
#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...)			\
do {										\
	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
	current->scx.kf_tasks[0] = task;					\
	SCX_CALL_OP((sch), op, rq, task, ##args);				\
	SCX_CALL_OP((sch), op, locked_rq, task, ##args);			\
	current->scx.kf_tasks[0] = NULL;					\
} while (0)

#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...)			\
#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...)			\
({										\
	__typeof__((sch)->ops.op(task, ##args)) __ret;				\
	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
	current->scx.kf_tasks[0] = task;					\
	__ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args);			\
	__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args);		\
	current->scx.kf_tasks[0] = NULL;					\
	__ret;									\
})

#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...)		\
#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...)	\
({										\
	__typeof__((sch)->ops.op(task0, task1, ##args)) __ret;			\
	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
	current->scx.kf_tasks[0] = task0;					\
	current->scx.kf_tasks[1] = task1;					\
	__ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args);		\
	__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args);	\
	current->scx.kf_tasks[0] = NULL;					\
	current->scx.kf_tasks[1] = NULL;					\
	__ret;									\