Loading Documentation/scheduler/sched-ext.rst +1 −1 Original line number Diff line number Diff line Loading @@ -66,7 +66,7 @@ BPF scheduler and reverts all tasks back to CFS. .. code-block:: none # make -j16 -C tools/sched_ext # tools/sched_ext/scx_simple # tools/sched_ext/build/bin/scx_simple local=0 global=3 local=5 global=24 local=9 global=44 Loading kernel/sched/core.c +14 −7 Original line number Diff line number Diff line Loading @@ -3518,14 +3518,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline int select_task_rq(struct task_struct *p, int cpu, int wake_flags) int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); else if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); *wake_flags |= WF_RQ_SELECTED; } else { cpu = cpumask_any(p->cpus_ptr); } /* * In order not to call set_task_cpu() on a blocking task we need Loading Loading @@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->nr_uninterruptible--; #ifdef CONFIG_SMP if (wake_flags & WF_RQ_SELECTED) en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else Loading Loading @@ -4120,6 +4124,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) guard(preempt)(); int cpu, success = 0; wake_flags |= WF_TTWU; if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) Loading Loading @@ -4252,7 +4258,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); cpu = select_task_rq(p, p->wake_cpu, &wake_flags); if (task_cpu(p) != cpu) { if (p->in_iowait) { delayacct_blkio_end(p); Loading Loading @@ -4793,6 +4799,7 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; int wake_flags = WF_FORK; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); Loading @@ -4807,7 +4814,7 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); Loading @@ -4815,7 +4822,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, WF_FORK); wakeup_preempt(rq, p, wake_flags); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* Loading kernel/sched/ext.c +139 −110 Original line number Diff line number Diff line Loading @@ -9,7 +9,6 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4, SCX_DSP_DFL_MAX_BATCH = 32, SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, Loading @@ -19,6 +18,12 @@ enum scx_consts { SCX_EXIT_DUMP_DFL_LEN = 32768, SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, /* * Iterating all tasks may take a while. Periodically drop * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. */ SCX_OPS_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { Loading Loading @@ -630,6 +635,10 @@ struct sched_ext_ops { /** * exit - Clean up after the BPF scheduler * @info: Exit info * * ops.exit() is also called on ops.init() failure, which is a bit * unusual. This is to allow rich reporting through @info on how * ops.init() failed. */ void (*exit)(struct scx_exit_info *info); Loading Loading @@ -697,6 +706,7 @@ enum scx_enq_flags { /* expose select ENQUEUE_* flags as enums */ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, SCX_ENQ_HEAD = ENQUEUE_HEAD, SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, /* high 32bits are SCX specific */ Loading Loading @@ -857,7 +867,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); static int scx_ops_bypass_depth; static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock); static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); Loading Loading @@ -1279,86 +1290,104 @@ struct scx_task_iter { struct task_struct *locked; struct rq *rq; struct rq_flags rf; u32 cnt; }; /** * scx_task_iter_init - Initialize a task iterator * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init * * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, * @iter must eventually be exited with scx_task_iter_exit(). * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter * must eventually be stopped with scx_task_iter_stop(). * * scx_tasks_lock may be released between this and the first next() call or * between any two next() calls. If scx_tasks_lock is released between two * next() calls, the caller is responsible for ensuring that the task being * iterated remains accessible either through RCU read lock or obtaining a * reference count. * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() * between this and the first next() call or between any two next() calls. If * the locks are released between two next() calls, the caller is responsible * for ensuring that the task being iterated remains accessible either through * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be * visited as long as they still exist. */ static void scx_task_iter_init(struct scx_task_iter *iter) static void scx_task_iter_start(struct scx_task_iter *iter) { lockdep_assert_held(&scx_tasks_lock); BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); iter->locked = NULL; iter->cnt = 0; } static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) { if (iter->locked) { task_rq_unlock(iter->rq, iter->locked, &iter->rf); iter->locked = NULL; } } /** * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator * @iter: iterator to unlock rq for * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator * @iter: iterator to unlock * * If @iter is in the middle of a locked iteration, it may be locking the rq of * the task currently being visited. Unlock the rq if so. This function can be * safely called anytime during an iteration. * * Returns %true if the rq @iter was locking is unlocked. %false if @iter was * not locking an rq. * the task currently being visited in addition to scx_tasks_lock. Unlock both. * This function can be safely called anytime during an iteration. */ static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) static void scx_task_iter_unlock(struct scx_task_iter *iter) { if (iter->locked) { task_rq_unlock(iter->rq, iter->locked, &iter->rf); iter->locked = NULL; return true; } else { return false; __scx_task_iter_rq_unlock(iter); spin_unlock_irq(&scx_tasks_lock); } /** * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() * @iter: iterator to re-lock * * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it * doesn't re-lock the rq lock. Must be called before other iterator operations. */ static void scx_task_iter_relock(struct scx_task_iter *iter) { spin_lock_irq(&scx_tasks_lock); } /** * scx_task_iter_exit - Exit a task iterator * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. * If the iterator holds a task's rq lock, that rq lock is released. See * scx_task_iter_init() for details. * Exit a previously initialized @iter. Must be called with scx_tasks_lock held * which is released on return. If the iterator holds a task's rq lock, that rq * lock is also released. See scx_task_iter_start() for details. */ static void scx_task_iter_exit(struct scx_task_iter *iter) static void scx_task_iter_stop(struct scx_task_iter *iter) { lockdep_assert_held(&scx_tasks_lock); scx_task_iter_rq_unlock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); } /** * scx_task_iter_next - Next task * @iter: iterator to walk * * Visit the next task. See scx_task_iter_init() for details. * Visit the next task. See scx_task_iter_start() for details. Locks are dropped * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing * stalls by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; lockdep_assert_held(&scx_tasks_lock); if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); scx_task_iter_relock(iter); } list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) Loading @@ -1379,14 +1408,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify * whether they would like to filter out dead tasks. See scx_task_iter_init() * whether they would like to filter out dead tasks. See scx_task_iter_start() * for details. */ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) { struct task_struct *p; scx_task_iter_rq_unlock(iter); __scx_task_iter_rq_unlock(iter); while ((p = scx_task_iter_next(iter))) { /* Loading Loading @@ -1954,7 +1983,6 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { bool bypassing = scx_rq_bypassing(rq); struct task_struct **ddsp_taskp; unsigned long qseq; Loading @@ -1972,7 +2000,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; if (bypassing) if (scx_rq_bypassing(rq)) goto global; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) Loading Loading @@ -2027,7 +2055,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL; p->scx.slice = SCX_SLICE_DFL; dispatch_enqueue(find_global_dsq(p), p, enq_flags); } Loading Loading @@ -3030,8 +3058,8 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", p->comm, p->pid); printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; Loading Loading @@ -3274,11 +3302,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, *found = false; if (!static_branch_likely(&scx_builtin_idle_enabled)) { scx_ops_error("built-in idle tracking is disabled"); return prev_cpu; } /* * Determine the scheduling domain only if the task is allowed to run * on all CPUs. Loading Loading @@ -3435,7 +3458,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; if (SCX_HAS_OP(select_cpu)) { if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { s32 cpu; struct task_struct **ddsp_taskp; Loading Loading @@ -3500,7 +3523,7 @@ void __scx_update_idle(struct rq *rq, bool idle) { int cpu = cpu_of(rq); if (SCX_HAS_OP(update_idle)) { if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); if (!static_branch_unlikely(&scx_builtin_idle_enabled)) return; Loading Loading @@ -4358,7 +4381,6 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); WARN_ON_ONCE(!scx_cgroup_enabled); scx_cgroup_enabled = false; /* Loading Loading @@ -4427,6 +4449,7 @@ static int scx_cgroup_init(void) css->cgroup, &args); if (ret) { css_put(css); scx_ops_error("ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; Loading Loading @@ -4566,36 +4589,40 @@ bool task_should_scx(struct task_struct *p) * the DISABLING state and then cycling the queued tasks through dequeue/enqueue * to force global FIFO scheduling. * * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. * - ops.select_cpu() is ignored and the default select_cpu() is used. * * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. * %SCX_OPS_ENQ_LAST is also ignored. * * b. ops.dispatch() is ignored. * - ops.dispatch() is ignored. * * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice * can't be trusted. Whenever a tick triggers, the running task is rotated to * the tail of the queue with core_sched_at touched. * * d. pick_next_task() suppresses zero slice warning. * - pick_next_task() suppresses zero slice warning. * * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. * * f. scx_prio_less() reverts to the default core_sched_at order. * - scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { int depth, cpu; int cpu; unsigned long flags; raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags); if (bypass) { depth = atomic_inc_return(&scx_ops_bypass_depth); WARN_ON_ONCE(depth <= 0); if (depth != 1) return; scx_ops_bypass_depth++; WARN_ON_ONCE(scx_ops_bypass_depth <= 0); if (scx_ops_bypass_depth != 1) goto unlock; } else { depth = atomic_dec_return(&scx_ops_bypass_depth); WARN_ON_ONCE(depth < 0); if (depth != 0) return; scx_ops_bypass_depth--; WARN_ON_ONCE(scx_ops_bypass_depth < 0); if (scx_ops_bypass_depth != 0) goto unlock; } /* Loading @@ -4612,7 +4639,7 @@ static void scx_ops_bypass(bool bypass) struct rq_flags rf; struct task_struct *p, *n; rq_lock_irqsave(rq, &rf); rq_lock(rq, &rf); if (bypass) { WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); Loading Loading @@ -4648,11 +4675,13 @@ static void scx_ops_bypass(bool bypass) sched_enq_and_set_task(&ctx); } rq_unlock_irqrestore(rq, &rf); rq_unlock(rq, &rf); /* kick to restore ticks */ /* resched to restore ticks and idle state */ resched_cpu(cpu); } unlock: raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags); } static void free_exit_info(struct scx_exit_info *ei) Loading Loading @@ -4772,15 +4801,13 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_ops_init_task_enabled = false; spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); Loading @@ -4789,8 +4816,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) check_class_changed(task_rq(p), p, old_class, p->prio); scx_ops_exit_task(p); } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ Loading Loading @@ -5258,7 +5284,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation"); pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; } Loading Loading @@ -5351,6 +5377,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) { ret = ops_sanitize_err("init", ret); cpus_read_unlock(); scx_ops_error("ops.init() failed (%d)", ret); goto err_disable; } } Loading Loading @@ -5443,8 +5470,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) goto err_disable_unlock_all; spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and Loading @@ -5454,16 +5480,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!tryget_task_struct(p)) continue; scx_task_iter_rq_unlock(&sti); spin_unlock_irq(&scx_tasks_lock); scx_task_iter_unlock(&sti); ret = scx_ops_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); spin_lock_irq(&scx_tasks_lock); scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); scx_ops_error("ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); goto err_disable_unlock_all; } Loading @@ -5471,10 +5495,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); spin_lock_irq(&scx_tasks_lock); scx_task_iter_relock(&sti); } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); scx_task_iter_stop(&sti); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); Loading @@ -5491,14 +5514,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * scx_tasks_lock. */ percpu_down_write(&scx_fork_rwsem); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); p->scx.slice = SCX_SLICE_DFL; __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); Loading @@ -5506,20 +5529,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) check_class_changed(task_rq(p), p, old_class, p->prio); } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); scx_ops_bypass(false); /* * Returning an error code here would lose the recorded error * information. Exit indicating success so that the error is notified * through ops.exit() with all the details. */ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); ret = 0; goto err_disable; } Loading Loading @@ -5554,10 +5570,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_ops_bypass(false); err_disable: mutex_unlock(&scx_ops_enable_mutex); /* must be fully disabled before returning */ scx_ops_disable(SCX_EXIT_ERROR); /* * Returning an error code here would not pass all the error information * to userspace. Record errno using scx_ops_error() for cases * scx_ops_error() wasn't already invoked and exit indicating success so * that the error is notified through ops.exit() with all the details. * * Flush scx_ops_disable_work to ensure that error is reported before * init completion. */ scx_ops_error("scx_ops_enable() failed (%d)", ret); kthread_flush_work(&scx_ops_disable_work); return ret; return 0; } Loading Loading @@ -6108,16 +6132,21 @@ __bpf_kfunc_start_defs(); __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { *is_idle = false; return prev_cpu; if (!static_branch_likely(&scx_builtin_idle_enabled)) { scx_ops_error("built-in idle tracking is disabled"); goto prev_cpu; } if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) goto prev_cpu; #ifdef CONFIG_SMP return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); #else #endif prev_cpu: *is_idle = false; return prev_cpu; #endif } __bpf_kfunc_end_defs(); Loading kernel/sched/sched.h +3 −0 Original line number Diff line number Diff line Loading @@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); Loading Loading @@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ Loading @@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 #define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) Loading tools/sched_ext/include/scx/common.bpf.h +4 −4 Original line number Diff line number Diff line Loading @@ -41,8 +41,8 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; Loading Loading @@ -71,7 +71,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; /* * Use the following as @it__iter when calling Loading Loading @@ -320,7 +320,7 @@ u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; /* * Access a cpumask in read-only mode (typically to check bits). */ const struct cpumask *cast_mask(struct bpf_cpumask *mask) static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) { return (const struct cpumask *)mask; } Loading Loading
Documentation/scheduler/sched-ext.rst +1 −1 Original line number Diff line number Diff line Loading @@ -66,7 +66,7 @@ BPF scheduler and reverts all tasks back to CFS. .. code-block:: none # make -j16 -C tools/sched_ext # tools/sched_ext/scx_simple # tools/sched_ext/build/bin/scx_simple local=0 global=3 local=5 global=24 local=9 global=44 Loading
kernel/sched/core.c +14 −7 Original line number Diff line number Diff line Loading @@ -3518,14 +3518,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline int select_task_rq(struct task_struct *p, int cpu, int wake_flags) int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); else if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); *wake_flags |= WF_RQ_SELECTED; } else { cpu = cpumask_any(p->cpus_ptr); } /* * In order not to call set_task_cpu() on a blocking task we need Loading Loading @@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->nr_uninterruptible--; #ifdef CONFIG_SMP if (wake_flags & WF_RQ_SELECTED) en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else Loading Loading @@ -4120,6 +4124,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) guard(preempt)(); int cpu, success = 0; wake_flags |= WF_TTWU; if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) Loading Loading @@ -4252,7 +4258,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); cpu = select_task_rq(p, p->wake_cpu, &wake_flags); if (task_cpu(p) != cpu) { if (p->in_iowait) { delayacct_blkio_end(p); Loading Loading @@ -4793,6 +4799,7 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; int wake_flags = WF_FORK; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); Loading @@ -4807,7 +4814,7 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); Loading @@ -4815,7 +4822,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, WF_FORK); wakeup_preempt(rq, p, wake_flags); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* Loading
kernel/sched/ext.c +139 −110 Original line number Diff line number Diff line Loading @@ -9,7 +9,6 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4, SCX_DSP_DFL_MAX_BATCH = 32, SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, Loading @@ -19,6 +18,12 @@ enum scx_consts { SCX_EXIT_DUMP_DFL_LEN = 32768, SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, /* * Iterating all tasks may take a while. Periodically drop * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. */ SCX_OPS_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { Loading Loading @@ -630,6 +635,10 @@ struct sched_ext_ops { /** * exit - Clean up after the BPF scheduler * @info: Exit info * * ops.exit() is also called on ops.init() failure, which is a bit * unusual. This is to allow rich reporting through @info on how * ops.init() failed. */ void (*exit)(struct scx_exit_info *info); Loading Loading @@ -697,6 +706,7 @@ enum scx_enq_flags { /* expose select ENQUEUE_* flags as enums */ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, SCX_ENQ_HEAD = ENQUEUE_HEAD, SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, /* high 32bits are SCX specific */ Loading Loading @@ -857,7 +867,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); static int scx_ops_bypass_depth; static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock); static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); Loading Loading @@ -1279,86 +1290,104 @@ struct scx_task_iter { struct task_struct *locked; struct rq *rq; struct rq_flags rf; u32 cnt; }; /** * scx_task_iter_init - Initialize a task iterator * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init * * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, * @iter must eventually be exited with scx_task_iter_exit(). * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter * must eventually be stopped with scx_task_iter_stop(). * * scx_tasks_lock may be released between this and the first next() call or * between any two next() calls. If scx_tasks_lock is released between two * next() calls, the caller is responsible for ensuring that the task being * iterated remains accessible either through RCU read lock or obtaining a * reference count. * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() * between this and the first next() call or between any two next() calls. If * the locks are released between two next() calls, the caller is responsible * for ensuring that the task being iterated remains accessible either through * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be * visited as long as they still exist. */ static void scx_task_iter_init(struct scx_task_iter *iter) static void scx_task_iter_start(struct scx_task_iter *iter) { lockdep_assert_held(&scx_tasks_lock); BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); iter->locked = NULL; iter->cnt = 0; } static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) { if (iter->locked) { task_rq_unlock(iter->rq, iter->locked, &iter->rf); iter->locked = NULL; } } /** * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator * @iter: iterator to unlock rq for * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator * @iter: iterator to unlock * * If @iter is in the middle of a locked iteration, it may be locking the rq of * the task currently being visited. Unlock the rq if so. This function can be * safely called anytime during an iteration. * * Returns %true if the rq @iter was locking is unlocked. %false if @iter was * not locking an rq. * the task currently being visited in addition to scx_tasks_lock. Unlock both. * This function can be safely called anytime during an iteration. */ static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) static void scx_task_iter_unlock(struct scx_task_iter *iter) { if (iter->locked) { task_rq_unlock(iter->rq, iter->locked, &iter->rf); iter->locked = NULL; return true; } else { return false; __scx_task_iter_rq_unlock(iter); spin_unlock_irq(&scx_tasks_lock); } /** * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() * @iter: iterator to re-lock * * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it * doesn't re-lock the rq lock. Must be called before other iterator operations. */ static void scx_task_iter_relock(struct scx_task_iter *iter) { spin_lock_irq(&scx_tasks_lock); } /** * scx_task_iter_exit - Exit a task iterator * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. * If the iterator holds a task's rq lock, that rq lock is released. See * scx_task_iter_init() for details. * Exit a previously initialized @iter. Must be called with scx_tasks_lock held * which is released on return. If the iterator holds a task's rq lock, that rq * lock is also released. See scx_task_iter_start() for details. */ static void scx_task_iter_exit(struct scx_task_iter *iter) static void scx_task_iter_stop(struct scx_task_iter *iter) { lockdep_assert_held(&scx_tasks_lock); scx_task_iter_rq_unlock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); } /** * scx_task_iter_next - Next task * @iter: iterator to walk * * Visit the next task. See scx_task_iter_init() for details. * Visit the next task. See scx_task_iter_start() for details. Locks are dropped * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing * stalls by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; lockdep_assert_held(&scx_tasks_lock); if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); scx_task_iter_relock(iter); } list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) Loading @@ -1379,14 +1408,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify * whether they would like to filter out dead tasks. See scx_task_iter_init() * whether they would like to filter out dead tasks. See scx_task_iter_start() * for details. */ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) { struct task_struct *p; scx_task_iter_rq_unlock(iter); __scx_task_iter_rq_unlock(iter); while ((p = scx_task_iter_next(iter))) { /* Loading Loading @@ -1954,7 +1983,6 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { bool bypassing = scx_rq_bypassing(rq); struct task_struct **ddsp_taskp; unsigned long qseq; Loading @@ -1972,7 +2000,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; if (bypassing) if (scx_rq_bypassing(rq)) goto global; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) Loading Loading @@ -2027,7 +2055,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL; p->scx.slice = SCX_SLICE_DFL; dispatch_enqueue(find_global_dsq(p), p, enq_flags); } Loading Loading @@ -3030,8 +3058,8 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", p->comm, p->pid); printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; Loading Loading @@ -3274,11 +3302,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, *found = false; if (!static_branch_likely(&scx_builtin_idle_enabled)) { scx_ops_error("built-in idle tracking is disabled"); return prev_cpu; } /* * Determine the scheduling domain only if the task is allowed to run * on all CPUs. Loading Loading @@ -3435,7 +3458,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; if (SCX_HAS_OP(select_cpu)) { if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { s32 cpu; struct task_struct **ddsp_taskp; Loading Loading @@ -3500,7 +3523,7 @@ void __scx_update_idle(struct rq *rq, bool idle) { int cpu = cpu_of(rq); if (SCX_HAS_OP(update_idle)) { if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); if (!static_branch_unlikely(&scx_builtin_idle_enabled)) return; Loading Loading @@ -4358,7 +4381,6 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); WARN_ON_ONCE(!scx_cgroup_enabled); scx_cgroup_enabled = false; /* Loading Loading @@ -4427,6 +4449,7 @@ static int scx_cgroup_init(void) css->cgroup, &args); if (ret) { css_put(css); scx_ops_error("ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; Loading Loading @@ -4566,36 +4589,40 @@ bool task_should_scx(struct task_struct *p) * the DISABLING state and then cycling the queued tasks through dequeue/enqueue * to force global FIFO scheduling. * * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. * - ops.select_cpu() is ignored and the default select_cpu() is used. * * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. * %SCX_OPS_ENQ_LAST is also ignored. * * b. ops.dispatch() is ignored. * - ops.dispatch() is ignored. * * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice * can't be trusted. Whenever a tick triggers, the running task is rotated to * the tail of the queue with core_sched_at touched. * * d. pick_next_task() suppresses zero slice warning. * - pick_next_task() suppresses zero slice warning. * * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. * * f. scx_prio_less() reverts to the default core_sched_at order. * - scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { int depth, cpu; int cpu; unsigned long flags; raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags); if (bypass) { depth = atomic_inc_return(&scx_ops_bypass_depth); WARN_ON_ONCE(depth <= 0); if (depth != 1) return; scx_ops_bypass_depth++; WARN_ON_ONCE(scx_ops_bypass_depth <= 0); if (scx_ops_bypass_depth != 1) goto unlock; } else { depth = atomic_dec_return(&scx_ops_bypass_depth); WARN_ON_ONCE(depth < 0); if (depth != 0) return; scx_ops_bypass_depth--; WARN_ON_ONCE(scx_ops_bypass_depth < 0); if (scx_ops_bypass_depth != 0) goto unlock; } /* Loading @@ -4612,7 +4639,7 @@ static void scx_ops_bypass(bool bypass) struct rq_flags rf; struct task_struct *p, *n; rq_lock_irqsave(rq, &rf); rq_lock(rq, &rf); if (bypass) { WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); Loading Loading @@ -4648,11 +4675,13 @@ static void scx_ops_bypass(bool bypass) sched_enq_and_set_task(&ctx); } rq_unlock_irqrestore(rq, &rf); rq_unlock(rq, &rf); /* kick to restore ticks */ /* resched to restore ticks and idle state */ resched_cpu(cpu); } unlock: raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags); } static void free_exit_info(struct scx_exit_info *ei) Loading Loading @@ -4772,15 +4801,13 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_ops_init_task_enabled = false; spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); Loading @@ -4789,8 +4816,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) check_class_changed(task_rq(p), p, old_class, p->prio); scx_ops_exit_task(p); } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ Loading Loading @@ -5258,7 +5284,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation"); pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; } Loading Loading @@ -5351,6 +5377,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) { ret = ops_sanitize_err("init", ret); cpus_read_unlock(); scx_ops_error("ops.init() failed (%d)", ret); goto err_disable; } } Loading Loading @@ -5443,8 +5470,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) goto err_disable_unlock_all; spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and Loading @@ -5454,16 +5480,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!tryget_task_struct(p)) continue; scx_task_iter_rq_unlock(&sti); spin_unlock_irq(&scx_tasks_lock); scx_task_iter_unlock(&sti); ret = scx_ops_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); spin_lock_irq(&scx_tasks_lock); scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); scx_ops_error("ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); goto err_disable_unlock_all; } Loading @@ -5471,10 +5495,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); spin_lock_irq(&scx_tasks_lock); scx_task_iter_relock(&sti); } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); scx_task_iter_stop(&sti); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); Loading @@ -5491,14 +5514,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * scx_tasks_lock. */ percpu_down_write(&scx_fork_rwsem); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); p->scx.slice = SCX_SLICE_DFL; __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); Loading @@ -5506,20 +5529,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) check_class_changed(task_rq(p), p, old_class, p->prio); } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); scx_ops_bypass(false); /* * Returning an error code here would lose the recorded error * information. Exit indicating success so that the error is notified * through ops.exit() with all the details. */ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); ret = 0; goto err_disable; } Loading Loading @@ -5554,10 +5570,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_ops_bypass(false); err_disable: mutex_unlock(&scx_ops_enable_mutex); /* must be fully disabled before returning */ scx_ops_disable(SCX_EXIT_ERROR); /* * Returning an error code here would not pass all the error information * to userspace. Record errno using scx_ops_error() for cases * scx_ops_error() wasn't already invoked and exit indicating success so * that the error is notified through ops.exit() with all the details. * * Flush scx_ops_disable_work to ensure that error is reported before * init completion. */ scx_ops_error("scx_ops_enable() failed (%d)", ret); kthread_flush_work(&scx_ops_disable_work); return ret; return 0; } Loading Loading @@ -6108,16 +6132,21 @@ __bpf_kfunc_start_defs(); __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { *is_idle = false; return prev_cpu; if (!static_branch_likely(&scx_builtin_idle_enabled)) { scx_ops_error("built-in idle tracking is disabled"); goto prev_cpu; } if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) goto prev_cpu; #ifdef CONFIG_SMP return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); #else #endif prev_cpu: *is_idle = false; return prev_cpu; #endif } __bpf_kfunc_end_defs(); Loading
kernel/sched/sched.h +3 −0 Original line number Diff line number Diff line Loading @@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); Loading Loading @@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ Loading @@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 #define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) Loading
tools/sched_ext/include/scx/common.bpf.h +4 −4 Original line number Diff line number Diff line Loading @@ -41,8 +41,8 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; Loading Loading @@ -71,7 +71,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; /* * Use the following as @it__iter when calling Loading Loading @@ -320,7 +320,7 @@ u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; /* * Access a cpumask in read-only mode (typically to check bits). */ const struct cpumask *cast_mask(struct bpf_cpumask *mask) static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) { return (const struct cpumask *)mask; } Loading