tools/sched_ext: add scx_pair scheduler (f0262b10) · Commits · git / linux-nf

tools/sched_ext/Makefile

+1 −1

Original line number	Diff line number	Diff line
		@@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP

		SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h \| $(BINDIR)

		c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland
		c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair

		$(addprefix $(BINDIR)/,$(c-sched-targets)): \
		$(BINDIR)/%: \

tools/sched_ext/scx_pair.bpf.c

0 → 100644

+610 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0 */
		/*
		* A demo sched_ext core-scheduler which always makes every sibling CPU pair
		* execute from the same CPU cgroup.
		*
		* This scheduler is a minimal implementation and would need some form of
		* priority handling both inside each cgroup and across the cgroups to be
		* practically useful.
		*
		* Each CPU in the system is paired with exactly one other CPU, according to a
		* "stride" value that can be specified when the BPF scheduler program is first
		* loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee
		* that they will only ever schedule tasks that belong to the same CPU cgroup.
		*
		* Scheduler Initialization
		* ------------------------
		*
		* The scheduler BPF program is first initialized from user space, before it is
		* enabled. During this initialization process, each CPU on the system is
		* assigned several values that are constant throughout its runtime:
		*
		* 1. Pair CPU: The CPU that it synchronizes with when making scheduling
		* decisions. Paired CPUs always schedule tasks from the same
		* CPU cgroup, and synchronize with each other to guarantee
		* that this constraint is not violated.
		* 2. Pair ID: Each CPU pair is assigned a Pair ID, which is used to access
		* a struct pair_ctx object that is shared between the pair.
		* 3. In-pair-index: An index, 0 or 1, that is assigned to each core in the
		* pair. Each struct pair_ctx has an active_mask field,
		* which is a bitmap used to indicate whether each core
		* in the pair currently has an actively running task.
		* This index specifies which entry in the bitmap corresponds
		* to each CPU in the pair.
		*
		* During this initialization, the CPUs are paired according to a "stride" that
		* may be specified when invoking the user space program that initializes and
		* loads the scheduler. By default, the stride is 1/2 the total number of CPUs.
		*
		* Tasks and cgroups
		* -----------------
		*
		* Every cgroup in the system is registered with the scheduler using the
		* pair_cgroup_init() callback, and every task in the system is associated with
		* exactly one cgroup. At a high level, the idea with the pair scheduler is to
		* always schedule tasks from the same cgroup within a given CPU pair. When a
		* task is enqueued (i.e. passed to the pair_enqueue() callback function), its
		* cgroup ID is read from its task struct, and then a corresponding queue map
		* is used to FIFO-enqueue the task for that cgroup.
		*
		* If you look through the implementation of the scheduler, you'll notice that
		* there is quite a bit of complexity involved with looking up the per-cgroup
		* FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash
		* BPF hash map that is used to map a cgroup ID to a globally unique ID that's
		* allocated in the BPF program. This is done because we use separate maps to
		* store the FIFO queue of tasks, and the length of that map, per cgroup. This
		* complexity is only present because of current deficiencies in BPF that will
		* soon be addressed. The main point to keep in mind is that newly enqueued
		* tasks are added to their cgroup's FIFO queue.
		*
		* Dispatching tasks
		* -----------------
		*
		* This section will describe how enqueued tasks are dispatched and scheduled.
		* Tasks are dispatched in pair_dispatch(), and at a high level the workflow is
		* as follows:
		*
		* 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is
		* the structure that's used to synchronize amongst the two pair CPUs in their
		* scheduling decisions. After any of the following events have occurred:
		*
		* - The cgroup's slice run has expired, or
		* - The cgroup becomes empty, or
		* - Either CPU in the pair is preempted by a higher priority scheduling class
		*
		* The cgroup transitions to the draining state and stops executing new tasks
		* from the cgroup.
		*
		* 2. If the pair is still executing a task, mark the pair_ctx as draining, and
		* wait for the pair CPU to be preempted.
		*
		* 3. Otherwise, if the pair CPU is not running a task, we can move onto
		* scheduling new tasks. Pop the next cgroup id from the top_q queue.
		*
		* 4. Pop a task from that cgroup's FIFO task queue, and begin executing it.
		*
		* Note again that this scheduling behavior is simple, but the implementation
		* is complex mostly because it hits several BPF shortcomings and has to
		* work around them in often awkward ways. Most of the shortcomings are
		* expected to be resolved in the near future which should allow greatly
		* simplifying this scheduler.
		*
		* Dealing with preemption
		* -----------------------
		*
		* SCX is the lowest priority sched_class, and could be preempted by any
		* higher priority sched_class at any time. To address this, the scheduler
		* implements pair_cpu_release() and pair_cpu_acquire() callbacks which are
		* invoked by the core scheduler when the scheduler loses and gains control
		* of the CPU respectively.
		*
		* In pair_cpu_release(), we mark the pair_ctx as having been preempted, and
		* then invoke:
		*
		* scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT);
		*
		* This preempts the pair CPU, and waits until it has re-entered the scheduler
		* before returning. This is necessary to ensure that the higher priority
		* sched_class that preempted our scheduler does not schedule a task
		* concurrently with our pair CPU.
		*
		* When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption
		* in the pair_ctx, and send another resched IPI to the pair CPU to re-enable
		* pair scheduling.
		*
		* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
		* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
		* Copyright (c) 2022 David Vernet <dvernet@meta.com>
		*/
		#include <scx/common.bpf.h>
		#include "scx_pair.h"

		/* License string placed in the "license" section of the BPF object. */
		char _license[] SEC("license") = "GPL";

		/*
		 * Number of CPU IDs. Defaults to a non-zero value for veristat; the user
		 * space loader overwrites it before the program is loaded.
		 */
		const volatile u32 nr_cpu_ids = 1;

		/*
		 * A pair of CPUs stay on a cgroup for this duration, in nanoseconds. Set by
		 * the user space loader.
		 */
		const volatile u32 pair_batch_dur_ns;

		/* CPU ID -> pair CPU ID, resized and filled in by the user space loader */
		const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu);

		/* CPU ID -> pair_id, which is the key into the pair_ctx map */
		const volatile u32 RESIZABLE_ARRAY(rodata, pair_id);

		/*
		 * CPU ID -> CPU # in the pair (0 or 1). Used as the bit position of the CPU
		 * in pair_ctx->active_mask and pair_ctx->preempted_mask.
		 */
		const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx);

		/*
		 * State shared between the two CPUs of a pair. All fields other than @lock
		 * are read and written with @lock held.
		 */
		struct pair_ctx {
		/* protects every other field in this struct */
		struct bpf_spin_lock lock;

		/* the cgroup the pair is currently executing */
		u64 cgid;

		/* the time at which the pair started executing the current cgroup */
		u64 started_at;

		/* whether the current cgroup is draining */
		bool draining;

		/* the CPUs that are currently active on the cgroup */
		u32 active_mask;

		/*
		* the CPUs that are currently preempted and running tasks in a
		* different scheduler.
		*/
		u32 preempted_mask;
		};

		/*
		 * pair_id -> struct pair_ctx. max_entries is not set here; the user space
		 * loader sizes the map before loading.
		 */
		struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__type(key, u32);
		__type(value, struct pair_ctx);
		} pair_ctx SEC(".maps");

		/*
		 * Queue of cgroup IDs whose cgrp_q's possibly have tasks on them. Entries
		 * are pushed on enqueue when a cgroup's queue length goes 0 -> 1 and are
		 * popped (and possibly re-pushed) when a pair picks its next cgroup.
		 */
		struct {
		__uint(type, BPF_MAP_TYPE_QUEUE);
		/*
		* Because it's difficult to build strong synchronization encompassing
		* multiple non-trivial operations in BPF, this queue is managed in an
		* opportunistic way so that we guarantee that a cgroup w/ active tasks
		* is always on it but possibly multiple times. Once we have more robust
		* synchronization constructs and e.g. linked list, we should be able to
		* do this in a prettier way but for now just size it big enough.
		*/
		__uint(max_entries, 4 * MAX_CGRPS);
		__type(value, u64);
		} top_q SEC(".maps");

		/*
		 * Per-cgroup queue which FIFOs the tasks from the cgroup. Each entry is the
		 * pid of a queued task.
		 */
		struct cgrp_q {
		__uint(type, BPF_MAP_TYPE_QUEUE);
		__uint(max_entries, MAX_QUEUED);
		__type(value, u32);
		};

		/*
		* Ideally, we want to allocate cgrp_q and cgrp_q_len in the cgroup local
		* storage; however, a cgroup local storage can only be accessed from the BPF
		* progs attached to the cgroup. For now, work around by allocating array of
		* cgrp_q's and then allocating per-cgroup indices.
		*
		* Another caveat: It's difficult to populate a large array of maps statically
		* or from BPF. Initialize it from userland.
		*/
		struct {
		__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
		__uint(max_entries, MAX_CGRPS);
		__type(key, s32);
		__array(values, struct cgrp_q);
		} cgrp_q_arr SEC(".maps");

		/* cgrp_q idx -> number of tasks currently accounted to that cgrp_q */
		static u64 cgrp_q_len[MAX_CGRPS];

		/*
		* This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be
		* useful to have as a map type.
		*
		* cgrp_q_idx_cursor is the round-robin allocation cursor and
		* cgrp_q_idx_busy[idx] is non-zero while idx is allocated to a cgroup.
		*/
		static u32 cgrp_q_idx_cursor;
		static u64 cgrp_q_idx_busy[MAX_CGRPS];

		/*
		* All added up, the following is what we do:
		*
		* 1. When a cgroup is enabled, RR cgrp_q_idx_busy array doing cmpxchg looking
		* for a free ID. If not found, fail cgroup creation with -EBUSY.
		*
		* 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following
		* cgrp_q_idx_hash.
		*
		* 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from
		* cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr.
		*
		* This is sadly complicated for something pretty simple. Hopefully, we should
		* be able to simplify in the future.
		*/
		struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__uint(max_entries, MAX_CGRPS);
		__uint(key_size, sizeof(u64)); /* cgrp ID */
		__uint(value_size, sizeof(s32)); /* cgrp_q idx */
		} cgrp_q_idx_hash SEC(".maps");

		/*
		 * Statistics counters. They are bumped with atomic adds from the BPF
		 * callbacks and printed periodically by the user space loader.
		 */
		u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions;
		u64 nr_exps, nr_exp_waits, nr_exp_empty;
		u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty;

		/* exit info storage, filled in by pair_exit() through UEI_RECORD() */
		UEI_DEFINE(uei);

		/*
		 * ops.enqueue(): queue @p on the FIFO of the cgroup it belongs to.
		 *
		 * @p:         task being enqueued
		 * @enq_flags: enqueue flags (unused)
		 *
		 * The task's pid is pushed onto the cgroup's cgrp_q and the cgroup's queue
		 * length is bumped. If the length went 0 -> 1, the cgroup ID is also pushed
		 * onto top_q so that dispatch can find it. Any lookup failure or queue
		 * overflow aborts the scheduler through scx_bpf_error().
		 */
		void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags)
		{
			struct cgroup *cgrp;
			struct cgrp_q *cgq;
			s32 pid = p->pid;
			u64 cgid;
			u32 *q_idx;
			u64 *cgq_len;

			__sync_fetch_and_add(&nr_total, 1);

			/* only the ID is needed, drop the cgroup reference right away */
			cgrp = scx_bpf_task_cgroup(p);
			cgid = cgrp->kn->id;
			bpf_cgroup_release(cgrp);

			/* find the cgroup's q and push @p into it */
			q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
			if (!q_idx) {
				scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid);
				return;
			}

			cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx);
			if (!cgq) {
				scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]",
					      cgid, *q_idx);
				return;
			}

			if (bpf_map_push_elem(cgq, &pid, 0)) {
				scx_bpf_error("cgroup[%llu] queue overflow", cgid);
				return;
			}

			/* bump q len, if going 0 -> 1, queue cgroup into the top_q */
			cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]);
			if (!cgq_len) {
				scx_bpf_error("MEMBER_VPTR malfunction");
				return;
			}

			if (!__sync_fetch_and_add(cgq_len, 1) &&
			    bpf_map_push_elem(&top_q, &cgid, 0)) {
				scx_bpf_error("top_q overflow");
				return;
			}
		}

		/*
		 * Look up the shared pair_ctx and the in-pair bitmask for @cpu.
		 *
		 * @cpu:   CPU whose pair state is being looked up
		 * @pairc: out parameter, set to the pair_ctx map entry keyed by pair_id[@cpu]
		 * @mask:  out parameter, set to 1U << in_pair_idx[@cpu], i.e. the bit that
		 *         represents @cpu in pair_ctx->active_mask and ->preempted_mask
		 *
		 * Returns 0 on success and -EINVAL if either rodata array lookup or the
		 * pair_ctx map lookup fails. On failure @mask is left untouched.
		 */
		static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask)
		{
			u32 *vptr;

			vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids);
			if (!vptr)
				return -EINVAL;

			*pairc = bpf_map_lookup_elem(&pair_ctx, vptr);
			if (!(*pairc))
				return -EINVAL;

			vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids);
			if (!vptr)
				return -EINVAL;

			*mask = 1U << *vptr;

			return 0;
		}

		/*
		 * Make one attempt at dispatching a task on @cpu from the cgroup that its
		 * pair is currently executing, switching the pair to the next cgroup first
		 * if the current one has expired or is draining.
		 *
		 * @cpu: the CPU that is asking for work
		 *
		 * Returns:
		 *   0        - done for now: a task was dispatched, this CPU has to wait for
		 *              its pair, or there is no active cgroup
		 *   -EAGAIN  - the chosen cgroup turned out to be empty or the popped task
		 *              no longer exists; the caller should call again
		 *   -ENOENT  - a lookup failed; scx_bpf_error() has been called
		 */
		__attribute__((noinline))
		static int try_dispatch(s32 cpu)
		{
			struct pair_ctx *pairc;
			struct bpf_map *cgq_map;
			struct task_struct *p;
			u64 now = scx_bpf_now();
			bool kick_pair = false;
			bool expired, pair_preempted;
			u32 *vptr, in_pair_mask;
			s32 pid, q_idx;
			u64 cgid;
			int ret;

			ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
			if (ret) {
				scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]",
					      cpu);
				return -ENOENT;
			}

			/* this CPU is in dispatch, so it isn't running a task of the cgroup */
			bpf_spin_lock(&pairc->lock);
			pairc->active_mask &= ~in_pair_mask;

			expired = time_before(pairc->started_at + pair_batch_dur_ns, now);
			if (expired || pairc->draining) {
				u64 new_cgid = 0;

				__sync_fetch_and_add(&nr_exps, 1);

				/*
				 * We're done with the current cgid. An obvious optimization
				 * would be not draining if the next cgroup is the current one.
				 * For now, be dumb and always expire.
				 */
				pairc->draining = true;

				pair_preempted = pairc->preempted_mask;
				if (pairc->active_mask || pair_preempted) {
					/*
					 * The other CPU is still active, or is no longer under
					 * our control due to e.g. being preempted by a higher
					 * priority sched_class. We want to wait until this
					 * cgroup expires, or until control of our pair CPU has
					 * been returned to us.
					 *
					 * If the pair controls its CPU, and the time already
					 * expired, kick. When the other CPU arrives at
					 * dispatch and clears its active mask, it'll push the
					 * pair to the next cgroup and kick this CPU.
					 */
					__sync_fetch_and_add(&nr_exp_waits, 1);
					bpf_spin_unlock(&pairc->lock);
					if (expired && !pair_preempted)
						kick_pair = true;
					goto out_maybe_kick;
				}

				bpf_spin_unlock(&pairc->lock);

				/*
				 * Pick the next cgroup. It'd be easier / cleaner to not drop
				 * pairc->lock and use stronger synchronization here especially
				 * given that we'll be switching cgroups significantly less
				 * frequently than tasks. Unfortunately, bpf_spin_lock can't
				 * really protect anything non-trivial. Let's do opportunistic
				 * operations instead.
				 */
				bpf_repeat(BPF_MAX_LOOPS) {
					u32 *q_idx;
					u64 *cgq_len;

					if (bpf_map_pop_elem(&top_q, &new_cgid)) {
						/* no active cgroup, go idle */
						__sync_fetch_and_add(&nr_exp_empty, 1);
						return 0;
					}

					/* the cgroup may have exited since it was queued */
					q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid);
					if (!q_idx)
						continue;

					/*
					 * This is the only place where empty cgroups are taken
					 * off the top_q.
					 */
					cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]);
					if (!cgq_len || !*cgq_len)
						continue;

					/*
					 * If it has any tasks, requeue as we may race and not
					 * execute it.
					 */
					bpf_map_push_elem(&top_q, &new_cgid, 0);
					break;
				}

				bpf_spin_lock(&pairc->lock);

				/*
				 * The other CPU may already have started on a new cgroup while
				 * we dropped the lock. Make sure that we're still draining and
				 * start on the new cgroup.
				 */
				if (pairc->draining && !pairc->active_mask) {
					__sync_fetch_and_add(&nr_cgrp_next, 1);
					pairc->cgid = new_cgid;
					pairc->started_at = now;
					pairc->draining = false;
					kick_pair = true;
				} else {
					__sync_fetch_and_add(&nr_cgrp_coll, 1);
				}
			}

			/* the lock is held on every path that reaches this point */
			cgid = pairc->cgid;
			pairc->active_mask |= in_pair_mask;
			bpf_spin_unlock(&pairc->lock);

			/* again, it'd be better to do all these with the lock held, oh well */
			vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
			if (!vptr) {
				scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid);
				return -ENOENT;
			}
			q_idx = *vptr;

			/* claim one task from cgrp_q w/ q_idx */
			bpf_repeat(BPF_MAX_LOOPS) {
				u64 *cgq_len, len;

				cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]);
				if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) {
					/* the cgroup must be empty, expire and repeat */
					__sync_fetch_and_add(&nr_cgrp_empty, 1);
					bpf_spin_lock(&pairc->lock);
					pairc->draining = true;
					pairc->active_mask &= ~in_pair_mask;
					bpf_spin_unlock(&pairc->lock);
					return -EAGAIN;
				}

				/* lost the race against another claimer or enqueuer, retry */
				if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len)
					continue;

				break;
			}

			cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx);
			if (!cgq_map) {
				scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]",
					      cgid, q_idx);
				return -ENOENT;
			}

			if (bpf_map_pop_elem(cgq_map, &pid)) {
				scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]",
					      cgid, q_idx);
				return -ENOENT;
			}

			p = bpf_task_from_pid(pid);
			if (p) {
				__sync_fetch_and_add(&nr_dispatched, 1);
				scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
				bpf_task_release(p);
			} else {
				/* we don't handle dequeues, retry on lost tasks */
				__sync_fetch_and_add(&nr_missing, 1);
				return -EAGAIN;
			}

		out_maybe_kick:
			if (kick_pair) {
				s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);
				if (pair) {
					__sync_fetch_and_add(&nr_kicks, 1);
					scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);
				}
			}
			return 0;
		}

		/*
		 * ops.dispatch(): keep calling try_dispatch() for @cpu for as long as it
		 * asks to be retried with -EAGAIN, up to BPF_MAX_LOOPS attempts. @prev is
		 * unused.
		 */
		void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev)
		{
			int err;

			bpf_repeat(BPF_MAX_LOOPS) {
				err = try_dispatch(cpu);
				if (err == -EAGAIN)
					continue;
				break;
			}
		}

		/*
		 * ops.cpu_acquire(): @cpu is back under this scheduler's control.
		 *
		 * Clears @cpu's bit in the pair's preempted_mask. If no CPU of the pair is
		 * left marked as preempted, the pair CPU is kicked with SCX_KICK_PREEMPT.
		 * @args is unused. Silently returns if the pair state can't be looked up.
		 */
		void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args)
		{
			int ret;
			u32 in_pair_mask;
			struct pair_ctx *pairc;
			bool kick_pair;

			ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
			if (ret)
				return;

			bpf_spin_lock(&pairc->lock);
			pairc->preempted_mask &= ~in_pair_mask;
			/* Kick the pair CPU, unless it was also preempted. */
			kick_pair = !pairc->preempted_mask;
			bpf_spin_unlock(&pairc->lock);

			if (kick_pair) {
				s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);

				if (pair) {
					__sync_fetch_and_add(&nr_kicks, 1);
					scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);
				}
			}
		}

		/*
		 * ops.cpu_release(): @cpu has been taken away from this scheduler.
		 *
		 * Marks @cpu as preempted and inactive in the pair's pair_ctx and puts the
		 * pair into the draining state. If the pair CPU is still active, it is
		 * kicked with SCX_KICK_PREEMPT | SCX_KICK_WAIT. @args is unused. Silently
		 * returns if the pair state can't be looked up.
		 */
		void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
		{
			int ret;
			u32 in_pair_mask;
			struct pair_ctx *pairc;
			bool kick_pair;

			ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
			if (ret)
				return;

			bpf_spin_lock(&pairc->lock);
			pairc->preempted_mask |= in_pair_mask;
			pairc->active_mask &= ~in_pair_mask;
			/* Kick the pair CPU if it's still running. */
			kick_pair = pairc->active_mask;
			pairc->draining = true;
			bpf_spin_unlock(&pairc->lock);

			if (kick_pair) {
				s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);

				if (pair) {
					__sync_fetch_and_add(&nr_kicks, 1);
					scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT);
				}
			}
			__sync_fetch_and_add(&nr_preemptions, 1);
		}

		/*
		 * ops.cgroup_init(): allocate a cgrp_q index for @cgrp and record the
		 * cgroup ID -> index mapping in cgrp_q_idx_hash.
		 *
		 * Returns 0 on success and -EBUSY if no free index could be claimed within
		 * MAX_CGRPS attempts or if the hash update fails (in which case the claimed
		 * index is released again).
		 */
		s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp)
		{
			u64 cgid = cgrp->kn->id;
			bool found = false;
			s32 i, q_idx;

			/* round-robin over the busy array, claiming the first free slot */
			bpf_for(i, 0, MAX_CGRPS) {
				q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS;
				if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) {
					found = true;
					break;
				}
			}

			/*
			 * Track success explicitly instead of comparing @i against
			 * MAX_CGRPS: bpf_for() only ever assigns in-range values to @i, so
			 * @i does not equal the end bound after the loop runs out and an
			 * exhausted table would otherwise go unnoticed.
			 */
			if (!found)
				return -EBUSY;

			if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) {
				u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]);
				if (busy)
					*busy = 0;
				return -EBUSY;
			}

			return 0;
		}

		/*
		 * ops.cgroup_exit(): release the cgrp_q index that was allocated to @cgrp
		 * and drop its entry from cgrp_q_idx_hash. Does nothing if the cgroup has
		 * no entry in the hash.
		 */
		void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp)
		{
			u64 cgid = cgrp->kn->id;
			s32 *q_idx;

			q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
			if (q_idx) {
				/* mark the index free before the mapping goes away */
				u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]);
				if (busy)
					*busy = 0;
				bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid);
			}
		}

		/*
		 * ops.exit(): record the exit info @ei into uei so that the user space
		 * loader can inspect and report it.
		 */
		void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei)
		{
		UEI_RECORD(uei, ei);
		}

		/*
		 * The scheduler's operations table, named "pair". It wires up the enqueue,
		 * dispatch, CPU acquire/release, cgroup init/exit and exit callbacks
		 * defined in this file.
		 */
		SCX_OPS_DEFINE(pair_ops,
		.enqueue = (void *)pair_enqueue,
		.dispatch = (void *)pair_dispatch,
		.cpu_acquire = (void *)pair_cpu_acquire,
		.cpu_release = (void *)pair_cpu_release,
		.cgroup_init = (void *)pair_cgroup_init,
		.cgroup_exit = (void *)pair_cgroup_exit,
		.exit = (void *)pair_exit,
		.name = "pair");

tools/sched_ext/scx_pair.c

0 → 100644

+180 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0 */
		/*
		* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
		* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
		* Copyright (c) 2022 David Vernet <dvernet@meta.com>
		*/
		#include <stdio.h>
		#include <unistd.h>
		#include <inttypes.h>
		#include <signal.h>
		#include <assert.h>
		#include <libgen.h>
		#include <bpf/bpf.h>
		#include <scx/common.h>
		#include "scx_pair.h"
		#include "scx_pair.bpf.skel.h"

		/*
		 * Usage text printed for -h and for unknown options. The single %s is
		 * replaced with the program's base name.
		 */
		const char help_fmt[] =
		"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n"
		"execute from the same CPU cgroup.\n"
		"\n"
		"See the top-level comment in .bpf.c for more details.\n"
		"\n"
		"Usage: %s [-S STRIDE]\n"
		"\n"
		" -S STRIDE Override CPU pair stride (default: nr_cpu_ids / 2)\n"
		" -v Print libbpf debug messages\n"
		" -h Display this help and exit\n";

		/* set by -v; when false, libbpf debug messages are suppressed */
		static bool verbose;
		/*
		 * Set to 1 by the SIGINT/SIGTERM handler to request shutdown. It is written
		 * from a signal handler, hence volatile sig_atomic_t.
		 */
		static volatile sig_atomic_t exit_req;

		/*
		 * libbpf print callback. Debug-level messages are dropped (returning 0)
		 * unless verbose output was requested; everything else is formatted to
		 * stderr and vfprintf()'s return value is passed through.
		 */
		static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
		{
			bool suppress = !verbose && level == LIBBPF_DEBUG;

			return suppress ? 0 : vfprintf(stderr, format, args);
		}

		/*
		 * Signal handler installed for SIGINT and SIGTERM. It only raises exit_req,
		 * which main() polls; the signal number in @dummy is ignored.
		 */
		static void sigint_handler(int dummy)
		{
		exit_req = 1;
		}

		int main(int argc, char **argv)
		{
		struct scx_pair *skel;
		struct bpf_link *link;
		__u64 seq = 0, ecode;
		__s32 stride, i, opt, outer_fd;

		libbpf_set_print(libbpf_print_fn);
		signal(SIGINT, sigint_handler);
		signal(SIGTERM, sigint_handler);
		restart:
		skel = SCX_OPS_OPEN(pair_ops, scx_pair);

		skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
		assert(skel->rodata->nr_cpu_ids > 0);
		skel->rodata->pair_batch_dur_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");

		/* pair up the earlier half to the latter by default, override with -s */
		stride = skel->rodata->nr_cpu_ids / 2;

		while ((opt = getopt(argc, argv, "S:vh")) != -1) {
		switch (opt) {
		case 'S':
		stride = strtoul(optarg, NULL, 0);
		break;
		case 'v':
		verbose = true;
		break;
		default:
		fprintf(stderr, help_fmt, basename(argv[0]));
		return opt != 'h';
		}
		}

		bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2);

		/* Resize arrays so their element count is equal to cpu count. */
		RESIZE_ARRAY(skel, rodata, pair_cpu, skel->rodata->nr_cpu_ids);
		RESIZE_ARRAY(skel, rodata, pair_id, skel->rodata->nr_cpu_ids);
		RESIZE_ARRAY(skel, rodata, in_pair_idx, skel->rodata->nr_cpu_ids);

		for (i = 0; i < skel->rodata->nr_cpu_ids; i++)
		skel->rodata_pair_cpu->pair_cpu[i] = -1;

		printf("Pairs: ");
		for (i = 0; i < skel->rodata->nr_cpu_ids; i++) {
		int j = (i + stride) % skel->rodata->nr_cpu_ids;

		if (skel->rodata_pair_cpu->pair_cpu[i] >= 0)
		continue;

		SCX_BUG_ON(i == j,
		"Invalid stride %d - CPU%d wants to be its own pair",
		stride, i);

		SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0,
		"Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair",
		stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]);

		skel->rodata_pair_cpu->pair_cpu[i] = j;
		skel->rodata_pair_cpu->pair_cpu[j] = i;
		skel->rodata_pair_id->pair_id[i] = i;
		skel->rodata_pair_id->pair_id[j] = i;
		skel->rodata_in_pair_idx->in_pair_idx[i] = 0;
		skel->rodata_in_pair_idx->in_pair_idx[j] = 1;

		printf("[%d, %d] ", i, j);
		}
		printf("\n");

		SCX_OPS_LOAD(skel, pair_ops, scx_pair, uei);

		/*
		* Populate the cgrp_q_arr map which is an array containing per-cgroup
		* queues. It'd probably be better to do this from BPF but there are too
		* many to initialize statically and there's no way to dynamically
		* populate from BPF.
		*/
		outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr);
		SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd);

		printf("Initializing");
		for (i = 0; i < MAX_CGRPS; i++) {
		__s32 inner_fd;

		if (exit_req)
		break;

		inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0,
		sizeof(__u32), MAX_QUEUED, NULL);
		SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d",
		inner_fd);
		SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY),
		"Failed to set inner map");
		close(inner_fd);

		if (!(i % 10))
		printf(".");
		fflush(stdout);
		}
		printf("\n");

		/*
		* Fully initialized, attach and run.
		*/
		link = SCX_OPS_ATTACH(skel, pair_ops, scx_pair);

		while (!exit_req && !UEI_EXITED(skel, uei)) {
		printf("[SEQ %llu]\n", seq++);
		printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n",
		skel->bss->nr_total,
		skel->bss->nr_dispatched,
		skel->bss->nr_missing);
		printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n",
		skel->bss->nr_kicks,
		skel->bss->nr_preemptions);
		printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n",
		skel->bss->nr_exps,
		skel->bss->nr_exp_waits,
		skel->bss->nr_exp_empty);
		printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n",
		skel->bss->nr_cgrp_next,
		skel->bss->nr_cgrp_coll,
		skel->bss->nr_cgrp_empty);
		fflush(stdout);
		sleep(1);
		}

		bpf_link__destroy(link);
		ecode = UEI_REPORT(skel, uei);
		scx_pair__destroy(skel);

		if (UEI_ECODE_RESTART(ecode))
		goto restart;
		return 0;
		}

tools/sched_ext/scx_pair.h

0 → 100644

+9 −0

Original line number	Diff line number	Diff line
		/* Constants shared between the scx_pair BPF program and its user space loader. */
		#ifndef __SCX_EXAMPLE_PAIR_H
		#define __SCX_EXAMPLE_PAIR_H

		enum {
		/* capacity of each per-cgroup task queue */
		MAX_QUEUED = 4096,
		/* number of per-cgroup queues / cgrp_q indices that can be allocated */
		MAX_CGRPS = 4096,
		};

		#endif /* __SCX_EXAMPLE_PAIR_H */