sched_ext: Move event_stats_cpu into scx_sched (c201ea15) · Commits · git / linux-net

kernel/sched/ext.c

+58 −42

Original line number	Diff line number	Diff line
		@@ -838,6 +838,13 @@ struct scx_sched {
		struct rhashtable dsq_hash;
		struct scx_dispatch_q **global_dsqs;

		/*
		* The event counters are in a per-CPU variable to minimize the
		* accounting overhead. A system-wide view on the event counter is
		* constructed when requested by scx_bpf_events().
		*/
		struct scx_event_stats __percpu *event_stats_cpu;

		bool warned_zero_slice;

		atomic_t exit_kind;
		@@ -1599,34 +1606,29 @@ static struct task_struct scx_task_iter_next_locked(struct scx_task_iter iter)
		return p;
		}

		/*
		* The event counter is organized by a per-CPU variable to minimize the
		* accounting overhead without synchronization. A system-wide view on the
		* event counter is constructed when requested by scx_bpf_get_event_stat().
		*/
		static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);

		/**
		* scx_add_event - Increase an event counter for 'name' by 'cnt'
		* @sch: scx_sched to account events for
		* @name: an event name defined in struct scx_event_stats
		* @cnt: the number of times the event occurred
		*
		* This can be used when preemption is not disabled.
		*/
		#define scx_add_event(name, cnt) do { \
		this_cpu_add(event_stats_cpu.name, cnt); \
		trace_sched_ext_event(#name, cnt); \
		#define scx_add_event(sch, name, cnt) do { \
		this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
		trace_sched_ext_event(#name, (cnt)); \
		} while(0)

		/**
		* __scx_add_event - Increase an event counter for 'name' by 'cnt'
		* @sch: scx_sched to account events for
		* @name: an event name defined in struct scx_event_stats
		* @cnt: the number of times the event occurred
		*
		* This should be used only when preemption is disabled.
		*/
		#define __scx_add_event(name, cnt) do { \
		__this_cpu_add(event_stats_cpu.name, cnt); \
		#define __scx_add_event(sch, name, cnt) do { \
		__this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
		trace_sched_ext_event(#name, cnt); \
		} while(0)

		@@ -1651,7 +1653,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
		} while (0)


		static void scx_read_events(struct scx_event_stats *events);
		static void scx_read_events(struct scx_sched *sch,
		struct scx_event_stats *events);

		static enum scx_enable_state scx_enable_state(void)
		{
		@@ -1877,7 +1880,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
		static void refill_task_slice_dfl(struct task_struct *p)
		{
		p->scx.slice = SCX_SLICE_DFL;
		__scx_add_event(SCX_EV_REFILL_SLICE_DFL, 1);
		__scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
		}

		static void dispatch_enqueue(struct scx_dispatch_q dsq, struct task_struct p,
		@@ -2206,7 +2209,7 @@ static void do_enqueue_task(struct rq rq, struct task_struct p, u64 enq_flags,
		goto local;

		if (scx_rq_bypassing(rq)) {
		__scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
		goto global;
		}

		@@ -2216,14 +2219,14 @@ static void do_enqueue_task(struct rq rq, struct task_struct p, u64 enq_flags,
		/* see %SCX_OPS_ENQ_EXITING */
		if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) &&
		unlikely(p->flags & PF_EXITING)) {
		__scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
		__scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
		goto local;
		}

		/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
		if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
		is_migration_disabled(p)) {
		__scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
		__scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
		goto local;
		}

		@@ -2346,7 +2349,7 @@ static void enqueue_task_scx(struct rq rq, struct task_struct p, int enq_flags

		if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
		unlikely(cpu_of(rq) != p->scx.selected_cpu))
		__scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
		__scx_add_event(scx_root, SCX_EV_SELECT_CPU_FALLBACK, 1);
		}

		static void ops_dequeue(struct rq rq, struct task_struct p, u64 deq_flags)
		@@ -2574,7 +2577,8 @@ static bool task_can_run_on_remote_rq(struct task_struct p, struct rq rq,

		if (!scx_rq_online(rq)) {
		if (enforce)
		__scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
		__scx_add_event(scx_root,
		SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
		return false;
		}

		@@ -3096,7 +3100,7 @@ static int balance_one(struct rq rq, struct task_struct prev)
		if (prev_on_rq &&
		(!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) \|\| scx_rq_bypassing(rq))) {
		rq->scx.flags \|= SCX_RQ_BAL_KEEP;
		__scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
		__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
		goto has_tasks;
		}
		rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
		@@ -3427,6 +3431,7 @@ bool scx_prio_less(const struct task_struct a, const struct task_struct b,

		static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
		{
		struct scx_sched *sch = scx_root;
		bool rq_bypass;

		/*
		@@ -3443,7 +3448,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
		return prev_cpu;

		rq_bypass = scx_rq_bypassing(task_rq(p));
		if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) {
		if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
		s32 cpu;
		struct task_struct **ddsp_taskp;

		@@ -3472,7 +3477,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
		p->scx.selected_cpu = cpu;

		if (rq_bypass)
		__scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
		return cpu;
		}
		}
		@@ -4413,6 +4418,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
		struct scx_dispatch_q *dsq;
		int node;

		free_percpu(sch->event_stats_cpu);

		for_each_node_state(node, N_POSSIBLE)
		kfree(sch->global_dsqs[node]);
		kfree(sch->global_dsqs);
		@@ -4455,10 +4462,11 @@ SCX_ATTR(ops);
		static ssize_t scx_attr_events_show(struct kobject *kobj,
		struct kobj_attribute ka, char buf)
		{
		struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
		struct scx_event_stats events;
		int at = 0;

		scx_read_events(&events);
		scx_read_events(sch, &events);
		at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
		at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
		at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
		@@ -4591,24 +4599,28 @@ static void scx_bypass(bool bypass)
		{
		static DEFINE_RAW_SPINLOCK(bypass_lock);
		static unsigned long bypass_timestamp;

		int cpu;
		struct scx_sched *sch;
		unsigned long flags;
		int cpu;

		raw_spin_lock_irqsave(&bypass_lock, flags);
		sch = rcu_dereference_bh(scx_root);

		if (bypass) {
		scx_bypass_depth++;
		WARN_ON_ONCE(scx_bypass_depth <= 0);
		if (scx_bypass_depth != 1)
		goto unlock;
		bypass_timestamp = ktime_get_ns();
		scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
		if (sch)
		scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
		} else {
		scx_bypass_depth--;
		WARN_ON_ONCE(scx_bypass_depth < 0);
		if (scx_bypass_depth != 0)
		goto unlock;
		scx_add_event(SCX_EV_BYPASS_DURATION,
		if (sch)
		scx_add_event(sch, SCX_EV_BYPASS_DURATION,
		ktime_get_ns() - bypass_timestamp);
		}

		@@ -5182,7 +5194,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
		dump_line(&s, "Event counters");
		dump_line(&s, "--------------");

		scx_read_events(&events);
		scx_read_events(scx_root, &events);
		scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
		scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
		scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
		@@ -5290,6 +5302,10 @@ static struct scx_sched scx_alloc_and_add_sched(struct sched_ext_ops ops)
		sch->global_dsqs[node] = dsq;
		}

		sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
		if (!sch->event_stats_cpu)
		goto err_free_gdsqs;

		atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
		sch->ops = *ops;
		ops->priv = sch;
		@@ -5297,10 +5313,12 @@ static struct scx_sched scx_alloc_and_add_sched(struct sched_ext_ops ops)
		sch->kobj.kset = scx_kset;
		ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
		if (ret < 0)
		goto err_free_gdsqs;
		goto err_free_event_stats;

		return sch;

		err_free_event_stats:
		free_percpu(sch->event_stats_cpu);
		err_free_gdsqs:
		for_each_node_state(node, N_POSSIBLE)
		kfree(sch->global_dsqs[node]);
		@@ -5376,15 +5394,6 @@ static int scx_enable(struct sched_ext_ops ops, struct bpf_link link)

		mutex_lock(&scx_enable_mutex);

		/*
		* Clear event counters so a new scx scheduler gets
		* fresh event counter values.
		*/
		for_each_possible_cpu(cpu) {
		struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
		memset(e, 0, sizeof(*e));
		}

		if (!scx_helper) {
		WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
		if (!scx_helper) {
		@@ -7407,7 +7416,7 @@ __bpf_kfunc u64 scx_bpf_now(void)
		return clock;
		}

		static void scx_read_events(struct scx_event_stats *events)
		static void scx_read_events(struct scx_sched sch, struct scx_event_stats events)
		{
		struct scx_event_stats *e_cpu;
		int cpu;
		@@ -7415,7 +7424,7 @@ static void scx_read_events(struct scx_event_stats *events)
		/* Aggregate per-CPU event counters into @events. */
		memset(events, 0, sizeof(*events));
		for_each_possible_cpu(cpu) {
		e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
		e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
		scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
		@@ -7436,9 +7445,16 @@ static void scx_read_events(struct scx_event_stats *events)
		__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
		size_t events__sz)
		{
		struct scx_sched *sch;
		struct scx_event_stats e_sys;

		scx_read_events(&e_sys);
		rcu_read_lock();
		sch = rcu_dereference(scx_root);
		if (sch)
		scx_read_events(sch, &e_sys);
		else
		memset(&e_sys, 0, sizeof(e_sys));
		rcu_read_unlock();

		/*
		* We cannot entirely trust a BPF-provided size since a BPF program