Commit cde94c03 authored by Tejun Heo
Browse files

sched_ext: Make watchdog sub-sched aware



Currently, the watchdog checks all tasks as if they are all on scx_root.
Move scx_watchdog_timeout inside scx_sched and make check_rq_for_timeouts()
use the timeout from the scx_sched associated with each task.
refresh_watchdog() is added, which determines the timer interval as half of
the shortest watchdog timeouts of all scheds and arms or disarms it as
necessary. Every scx_sched instance has equivalent or better detection
latency while sharing the same timer.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
parent 34ecfb35
Loading
Loading
Loading
Loading
+49 −25
Original line number Diff line number Diff line
@@ -59,11 +59,10 @@ static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);

/*
 * The maximum amount of time in jiffies that a task may be runnable without
 * being scheduled on a CPU. If this timeout is exceeded, it will trigger
 * scx_error().
 * Watchdog interval. All scx_sched's share a single watchdog timer and the
 * interval is half of the shortest sch->watchdog_timeout.
 */
static unsigned long scx_watchdog_timeout;
static unsigned long scx_watchdog_interval;

/*
 * The last time the delayed work was run. This delayed work relies on
@@ -3038,10 +3037,11 @@ static bool check_rq_for_timeouts(struct rq *rq)
		goto out_unlock;

	list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
		struct scx_sched *sch = scx_task_sched(p);
		unsigned long last_runnable = p->scx.runnable_at;

		if (unlikely(time_after(jiffies,
					last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
					last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
			u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);

			scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -3058,6 +3058,7 @@ static bool check_rq_for_timeouts(struct rq *rq)

static void scx_watchdog_workfn(struct work_struct *work)
{
	unsigned long intv;
	int cpu;

	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
@@ -3068,28 +3069,31 @@ static void scx_watchdog_workfn(struct work_struct *work)

		cond_resched();
	}

	intv = READ_ONCE(scx_watchdog_interval);
	if (intv < ULONG_MAX)
		queue_delayed_work(system_unbound_wq, to_delayed_work(work),
			   READ_ONCE(scx_watchdog_timeout) / 2);
				   intv);
}

void scx_tick(struct rq *rq)
{
	struct scx_sched *sch;
	struct scx_sched *root;
	unsigned long last_check;

	if (!scx_enabled())
		return;

	sch = rcu_dereference_bh(scx_root);
	if (unlikely(!sch))
	root = rcu_dereference_bh(scx_root);
	if (unlikely(!root))
		return;

	last_check = READ_ONCE(scx_watchdog_timestamp);
	if (unlikely(time_after(jiffies,
				last_check + READ_ONCE(scx_watchdog_timeout)))) {
				last_check + READ_ONCE(root->watchdog_timeout)))) {
		u32 dur_ms = jiffies_to_msecs(jiffies - last_check);

		scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
		scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
			 "watchdog failed to check in for %u.%03us",
			 dur_ms / 1000, dur_ms % 1000);
	}
@@ -4760,6 +4764,26 @@ static void free_kick_syncs(void)
	}
}

/*
 * Recompute the shared watchdog interval from every scheduler currently on
 * scx_sched_all and arm or disarm scx_watchdog_work to match.
 *
 * The interval is half of the shortest sch->watchdog_timeout on the list,
 * clamped to at least one jiffy. If the list is empty, the interval stays at
 * ULONG_MAX, which is the "no watchdog needed" sentinel that
 * scx_watchdog_workfn() also checks before re-queueing itself.
 *
 * The callers visible in this change invoke this right after adding a
 * scheduler to, or removing one from, scx_sched_all.
 */
static void refresh_watchdog(void)
{
	struct scx_sched *sch;
	unsigned long intv = ULONG_MAX;

	/* take the shortest timeout and use its half for watchdog interval */
	rcu_read_lock();
	list_for_each_entry_rcu(sch, &scx_sched_all, all)
		/* the lower bound of 1 keeps a timeout below 2 jiffies from yielding 0 */
		intv = max(min(intv, sch->watchdog_timeout / 2), 1);
	rcu_read_unlock();

	/*
	 * Restart the check-in timestamp from now. scx_tick() compares this
	 * timestamp against the root scheduler's timeout to detect a stalled
	 * watchdog.
	 */
	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
	/* publish the interval; the work function reads it to decide on re-queueing */
	WRITE_ONCE(scx_watchdog_interval, intv);

	if (intv < ULONG_MAX)
		/* at least one scheduler is listed: (re)arm the timer with the new interval */
		mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
	else
		/* no scheduler is listed: stop the watchdog work */
		cancel_delayed_work_sync(&scx_watchdog_work);
}

#ifdef CONFIG_EXT_SUB_SCHED
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);

@@ -4798,6 +4822,8 @@ static void scx_sub_disable(struct scx_sched *sch)
	list_del_rcu(&sch->all);
	raw_spin_unlock_irq(&scx_sched_lock);

	refresh_watchdog();

	mutex_unlock(&scx_enable_mutex);

	/*
@@ -4932,12 +4958,12 @@ static void scx_root_disable(struct scx_sched *sch)
	if (sch->ops.exit)
		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);

	cancel_delayed_work_sync(&scx_watchdog_work);

	raw_spin_lock_irq(&scx_sched_lock);
	list_del_rcu(&sch->all);
	raw_spin_unlock_irq(&scx_sched_lock);

	refresh_watchdog();

	/*
	 * scx_root clearing must be inside cpus_read_lock(). See
	 * handle_hotplug().
@@ -5473,6 +5499,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
	sch->ancestors[level] = sch;
	sch->level = level;

	if (ops->timeout_ms)
		sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
	else
		sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;

	sch->slice_dfl = SCX_SLICE_DFL;
	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
	init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
@@ -5615,7 +5646,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
	struct scx_sched *sch;
	struct scx_task_iter sti;
	struct task_struct *p;
	unsigned long timeout;
	int i, cpu, ret;

	mutex_lock(&scx_enable_mutex);
@@ -5667,6 +5697,8 @@ static void scx_root_enable_workfn(struct kthread_work *work)
	list_add_tail_rcu(&sch->all, &scx_sched_all);
	raw_spin_unlock_irq(&scx_sched_lock);

	refresh_watchdog();

	scx_idle_enable(ops);

	if (sch->ops.init) {
@@ -5697,16 +5729,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
	if (ret)
		goto err_disable;

	if (ops->timeout_ms)
		timeout = msecs_to_jiffies(ops->timeout_ms);
	else
		timeout = SCX_WATCHDOG_MAX_TIMEOUT;

	WRITE_ONCE(scx_watchdog_timeout, timeout);
	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
			   READ_ONCE(scx_watchdog_timeout) / 2);

	/*
	 * Once __scx_enabled is set, %current can be switched to SCX anytime.
	 * This can lead to stalls as some BPF schedulers (e.g. userspace
@@ -5928,6 +5950,8 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
	list_add_tail_rcu(&sch->all, &scx_sched_all);
	raw_spin_unlock_irq(&scx_sched_lock);

	refresh_watchdog();

	if (sch->level >= SCX_SUB_MAX_DEPTH) {
		scx_error(sch, "max nesting depth %d violated",
			  SCX_SUB_MAX_DEPTH);
+7 −0
Original line number Diff line number Diff line
@@ -1019,6 +1019,13 @@ struct scx_sched {
	bool			sub_attached;
#endif	/* CONFIG_EXT_SUB_SCHED */

	/*
	 * The maximum amount of time in jiffies that a task may be runnable
	 * without being scheduled on a CPU. If this timeout is exceeded, it
	 * will trigger scx_error().
	 */
	unsigned long		watchdog_timeout;

	atomic_t		exit_kind;
	struct scx_exit_info	*exit_info;