Commit 664f0f6b authored by Linus Torvalds
Browse files

Merge tag 'sched_ext-for-7.1-rc1-fixes' of...

Merge tag 'sched_ext-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:
 "The merge window pulled in the cgroup sub-scheduler infrastructure,
  and new AI reviews are accelerating bug reporting and fixing - hence
  the larger-than-usual fixes batch:

   - Use-after-frees during scheduler load/unload:
       - The disable path could free the BPF scheduler while deferred
         irq_work / kthread work was still in flight
       - cgroup setter callbacks read the active scheduler outside the
         rwsem that synchronizes against teardown
     Fix both, and reuse the disable drain in the enable error paths so
     the BPF JIT page can't be freed under live callbacks.

   - Several BPF op invocations didn't tell the framework which runqueue
     was already locked, so helper kfuncs that re-acquire the runqueue
     by CPU could deadlock on the held lock.

     Fix the affected callsites, including recursive parent-into-child
     dispatch.

   - The hardlockup notifier ran from NMI but eventually took a
     non-NMI-safe lock. Bounce it through irq_work.

   - A handful of bugs in the new sub-scheduler hierarchy:
       - helper kfuncs hard-coded the root instead of resolving the
         caller's scheduler
       - the enable error path tried to disable per-task state that had
         never been initialized, and leaked cpus_read_lock on the way
         out
       - a sysfs object was leaked on every load/unload
       - the dispatch fast-path used the root scheduler instead of the
         task's
       - a couple of CONFIG #ifdef guards were misclassified

   - Verifier-time hardening: BPF programs of unrelated struct_ops types
     (e.g. tcp_congestion_ops) could call sched_ext kfuncs - a semantic
     bug and, once sub-sched was enabled, a KASAN out-of-bounds read.
     Now rejected at load. Plus a few NULL and cross-task argument
     checks on sched_ext kfuncs, and a selftest covering the new deny.

   - rhashtable (Herbert): restore the insecure_elasticity toggle and
     bounce the deferred-resize kick through irq_work to break a
     lock-order cycle observable from raw-spinlock callers. sched_ext's
     scheduler-instance hash is the first user of both.

   - The bypass-mode load balancer used file-scope cpumasks; with
     multiple scheduler instances now possible, those raced. Move to
     per-instance cpumasks, plus a follow-up to skip tasks whose
     recorded CPU is stale relative to the new owning runqueue.

   - Smaller fixes:
       - a dispatch queue's first-task tracking misbehaved when a parked
         iterator cursor sat in the list
       - the runqueue's next-class wasn't promoted on local-queue
         enqueue, leaving an SCX task behind RT in edge cases
       - the reference qmap scheduler stopped erroring on legitimate
         cross-scheduler task-storage misses"

* tag 'sched_ext-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (26 commits)
  sched_ext: Fix scx_flush_disable_work() UAF race
  sched_ext: Call wakeup_preempt() in local_dsq_post_enq()
  sched_ext: Release cpus_read_lock on scx_link_sched() failure in root enable
  sched_ext: Reject NULL-sch callers in scx_bpf_task_set_slice/dsq_vtime
  sched_ext: Refuse cross-task select_cpu_from_kfunc calls
  sched_ext: Align cgroup #ifdef guards with SUB_SCHED vs GROUP_SCHED
  sched_ext: Make bypass LB cpumasks per-scheduler
  sched_ext: Pass held rq to SCX_CALL_OP() for core_sched_before
  sched_ext: Pass held rq to SCX_CALL_OP() for dump_cpu/dump_task
  sched_ext: Save and restore scx_locked_rq across SCX_CALL_OP
  sched_ext: Use dsq->first_task instead of list_empty() in dispatch_enqueue() FIFO-tail
  sched_ext: Resolve caller's scheduler in scx_bpf_destroy_dsq() / scx_bpf_dsq_nr_queued()
  sched_ext: Read scx_root under scx_cgroup_ops_rwsem in cgroup setters
  sched_ext: Don't disable tasks in scx_sub_enable_workfn() abort path
  sched_ext: Skip tasks with stale task_rq in bypass_lb_cpu()
  sched_ext: Guard scx_dsq_move() against NULL kit->dsq after failed iter_new
  sched_ext: Unregister sub_kset on scheduler disable
  sched_ext: Defer scx_hardlockup() out of NMI
  sched_ext: sync disable_irq_work in bpf_scx_unreg()
  sched_ext: Fix local_dsq_post_enq() to use task's scheduler in sub-sched
  ...
parents dca922e0 d99f7a32
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
#include <linux/alloc_tag.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/irq_work_types.h>
#include <linux/mutex.h>
#include <linux/workqueue_types.h>

@@ -49,6 +50,7 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
 * @head_offset: Offset of rhash_head in struct to be hashed
 * @max_size: Maximum size while expanding
 * @min_size: Minimum size while shrinking
 * @insecure_elasticity: Set to true to disable chain length checks
 * @automatic_shrinking: Enable automatic shrinking of tables
 * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
 * @obj_hashfn: Function to hash object
@@ -61,6 +63,7 @@ struct rhashtable_params {
	u16			head_offset;
	unsigned int		max_size;
	u16			min_size;
	bool			insecure_elasticity;
	bool			automatic_shrinking;
	rht_hashfn_t		hashfn;
	rht_obj_hashfn_t	obj_hashfn;
@@ -75,6 +78,7 @@ struct rhashtable_params {
 * @p: Configuration parameters
 * @rhlist: True if this is an rhltable
 * @run_work: Deferred worker to expand/shrink asynchronously
 * @run_irq_work: Bounces the @run_work kick through hard IRQ context.
 * @mutex: Mutex to protect current/future table swapping
 * @lock: Spin lock to protect walker list
 * @nelems: Number of elements in table
@@ -86,6 +90,7 @@ struct rhashtable {
	struct rhashtable_params	p;
	bool				rhlist;
	struct work_struct		run_work;
	struct irq_work			run_irq_work;
	struct mutex                    mutex;
	spinlock_t			lock;
	atomic_t			nelems;
+5 −3
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/irq_work.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
#include <linux/workqueue.h>
@@ -821,14 +822,15 @@ static __always_inline void *__rhashtable_insert_fast(
		goto out;
	}

	if (elasticity <= 0)
	if (elasticity <= 0 && !params.insecure_elasticity)
		goto slow_path;

	data = ERR_PTR(-E2BIG);
	if (unlikely(rht_grow_above_max(ht, tbl)))
		goto out_unlock;

	if (unlikely(rht_grow_above_100(ht, tbl)))
	if (unlikely(rht_grow_above_100(ht, tbl)) &&
	    !params.insecure_elasticity)
		goto slow_path;

	/* Inserting at head of list makes unlocking free. */
@@ -846,7 +848,7 @@ static __always_inline void *__rhashtable_insert_fast(
	rht_assign_unlock(tbl, bkt, obj, flags);

	if (rht_grow_above_75(ht, tbl))
		schedule_work(&ht->run_work);
		irq_work_queue(&ht->run_irq_work);

	data = NULL;
out:
+276 −122

File changed.

Preview size limit exceeded, changes collapsed.

+18 −2
Original line number Diff line number Diff line
@@ -927,14 +927,24 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
	 * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq
	 * lock or @p's pi_lock. Three cases:
	 *
	 *  - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock.
	 *  - inside ops.select_cpu(): try_to_wake_up() holds the wake-up
	 *    task's pi_lock; the wake-up task is recorded in kf_tasks[0]
	 *    by SCX_CALL_OP_TASK_RET().
	 *  - other rq-locked SCX op: scx_locked_rq() points at the held rq.
	 *  - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops):
	 *    nothing held, take pi_lock ourselves.
	 *
	 * In the first two cases, BPF schedulers may pass an arbitrary task
	 * that the held lock doesn't cover. Refuse those.
	 */
	if (this_rq()->scx.in_select_cpu) {
		if (!scx_kf_arg_task_ok(sch, p))
			return -EINVAL;
		lockdep_assert_held(&p->pi_lock);
	} else if (!scx_locked_rq()) {
	} else if (scx_locked_rq()) {
		if (task_rq(p) != scx_locked_rq())
			goto cross_task;
	} else {
		raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
		we_locked = true;
	}
@@ -960,6 +970,11 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
		raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);

	return cpu;

cross_task:
	scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]",
		  p->comm, p->pid);
	return -EINVAL;
}

/**
@@ -1467,6 +1482,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_idle)
/*
 * kfunc ID set descriptor for the idle kfuncs collected in
 * scx_kfunc_ids_idle. Owned by this module, with
 * scx_kfunc_context_filter installed as the set's filter callback.
 *
 * NOTE(review): the filter is presumably what rejects callers from
 * unrelated struct_ops types at load time - confirm against the
 * definition of scx_kfunc_context_filter(), which is not visible here.
 */
static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
	.owner			= THIS_MODULE,
	.set			= &scx_kfunc_ids_idle,
	.filter			= scx_kfunc_context_filter,
};

/*
+1 −0
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@

struct sched_ext_ops;

extern struct btf_id_set8 scx_kfunc_ids_idle;
extern struct btf_id_set8 scx_kfunc_ids_select_cpu;

void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
Loading