sched_ext: Add a cgroup scheduler which uses flattened hierarchy (a4103eac) · Commits · git / linux-net

tools/sched_ext/Makefile

+1 −1

Original line number	Diff line number	Diff line
		@@ -176,7 +176,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP

		SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h \| $(BINDIR)

		c-sched-targets = scx_simple scx_qmap scx_central
		c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg

		$(addprefix $(BINDIR)/,$(c-sched-targets)): \
		$(BINDIR)/%: \

tools/sched_ext/README.md

+12 −0

Original line number	Diff line number	Diff line
		@@ -192,6 +192,18 @@ where this could be particularly useful is running VMs, where running with
		infinite slices and no timer ticks allows the VM to avoid unnecessary expensive
		vmexits.

		## scx_flatcg

		A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical
		weight-based cgroup CPU control by flattening the cgroup hierarchy into a single
		layer, by compounding the active weight share at each level. The effect of this
		is a much more performant CPU controller, which does not need to descend down
		cgroup trees in order to properly compute a cgroup's share.

		Similar to scx_simple, in limited scenarios, this scheduler can perform
		reasonably well on single socket-socket systems with a unified L3 cache and show
		significantly lowered hierarchical scheduling overhead.


		# Troubleshooting

tools/sched_ext/scx_flatcg.bpf.c

0 → 100644

+949 −0

File added.

Preview size limit exceeded, changes collapsed.

tools/sched_ext/scx_flatcg.c

0 → 100644

+233 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0 */
		/*
		* Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
		* Copyright (c) 2023 Tejun Heo <tj@kernel.org>
		* Copyright (c) 2023 David Vernet <dvernet@meta.com>
		*/
		#include <stdio.h>
		#include <signal.h>
		#include <unistd.h>
		#include <libgen.h>
		#include <limits.h>
		#include <inttypes.h>
		#include <fcntl.h>
		#include <time.h>
		#include <bpf/bpf.h>
		#include <scx/common.h>
		#include "scx_flatcg.h"
		#include "scx_flatcg.bpf.skel.h"

		#ifndef FILEID_KERNFS
		#define FILEID_KERNFS 0xfe
		#endif

		const char help_fmt[] =
		"A flattened cgroup hierarchy sched_ext scheduler.\n"
		"\n"
		"See the top-level comment in .bpf.c for more details.\n"
		"\n"
		"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n"
		"\n"
		" -s SLICE_US Override slice duration\n"
		" -i INTERVAL Report interval\n"
		" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
		" -v Print libbpf debug messages\n"
		" -h Display this help and exit\n";

		static bool verbose;
		static volatile int exit_req;

		static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
		{
		if (level == LIBBPF_DEBUG && !verbose)
		return 0;
		return vfprintf(stderr, format, args);
		}

		static void sigint_handler(int dummy)
		{
		exit_req = 1;
		}

		static float read_cpu_util(__u64 last_sum, __u64 last_idle)
		{
		FILE *fp;
		char buf[4096];
		char line, cur = NULL, *tok;
		__u64 sum = 0, idle = 0;
		__u64 delta_sum, delta_idle;
		int idx;

		fp = fopen("/proc/stat", "r");
		if (!fp) {
		perror("fopen(\"/proc/stat\")");
		return 0.0;
		}

		if (!fgets(buf, sizeof(buf), fp)) {
		perror("fgets(\"/proc/stat\")");
		fclose(fp);
		return 0.0;
		}
		fclose(fp);

		line = buf;
		for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) {
		char *endp = NULL;
		__u64 v;

		if (idx == 0) {
		line = NULL;
		continue;
		}
		v = strtoull(tok, &endp, 0);
		if (!endp \|\| *endp != '\0') {
		fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n",
		idx, tok);
		continue;
		}
		sum += v;
		if (idx == 4)
		idle = v;
		}

		delta_sum = sum - *last_sum;
		delta_idle = idle - *last_idle;
		*last_sum = sum;
		*last_idle = idle;

		return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0;
		}

		static void fcg_read_stats(struct scx_flatcg skel, __u64 stats)
		{
		__u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus];
		__u32 idx;

		memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);

		for (idx = 0; idx < FCG_NR_STATS; idx++) {
		int ret, cpu;

		ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
		&idx, cnts[idx]);
		if (ret < 0)
		continue;
		for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++)
		stats[idx] += cnts[idx][cpu];
		}
		}

		int main(int argc, char **argv)
		{
		struct scx_flatcg *skel;
		struct bpf_link *link;
		struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 };
		bool dump_cgrps = false;
		__u64 last_cpu_sum = 0, last_cpu_idle = 0;
		__u64 last_stats[FCG_NR_STATS] = {};
		unsigned long seq = 0;
		__s32 opt;
		__u64 ecode;

		libbpf_set_print(libbpf_print_fn);
		signal(SIGINT, sigint_handler);
		signal(SIGTERM, sigint_handler);
		restart:
		skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);

		skel->rodata->nr_cpus = libbpf_num_possible_cpus();

		while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
		double v;

		switch (opt) {
		case 's':
		v = strtod(optarg, NULL);
		skel->rodata->cgrp_slice_ns = v * 1000;
		break;
		case 'i':
		v = strtod(optarg, NULL);
		intv_ts.tv_sec = v;
		intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000;
		break;
		case 'd':
		dump_cgrps = true;
		break;
		case 'f':
		skel->rodata->fifo_sched = true;
		break;
		case 'v':
		verbose = true;
		break;
		case 'h':
		default:
		fprintf(stderr, help_fmt, basename(argv[0]));
		return opt != 'h';
		}
		}

		printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d",
		(double)skel->rodata->cgrp_slice_ns / 1000000.0,
		(double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0,
		dump_cgrps);

		SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei);
		link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg);

		while (!exit_req && !UEI_EXITED(skel, uei)) {
		__u64 acc_stats[FCG_NR_STATS];
		__u64 stats[FCG_NR_STATS];
		float cpu_util;
		int i;

		cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle);

		fcg_read_stats(skel, acc_stats);
		for (i = 0; i < FCG_NR_STATS; i++)
		stats[i] = acc_stats[i] - last_stats[i];

		memcpy(last_stats, acc_stats, sizeof(acc_stats));

		printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
		seq++, cpu_util * 100.0, skel->data->hweight_gen);
		printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n",
		stats[FCG_STAT_ACT],
		stats[FCG_STAT_DEACT],
		stats[FCG_STAT_GLOBAL],
		stats[FCG_STAT_LOCAL]);
		printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n",
		stats[FCG_STAT_HWT_CACHE],
		stats[FCG_STAT_HWT_UPDATES],
		stats[FCG_STAT_HWT_SKIP],
		stats[FCG_STAT_HWT_RACE]);
		printf("ENQ skip:%6llu race:%6llu\n",
		stats[FCG_STAT_ENQ_SKIP],
		stats[FCG_STAT_ENQ_RACE]);
		printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
		stats[FCG_STAT_CNS_KEEP],
		stats[FCG_STAT_CNS_EXPIRE],
		stats[FCG_STAT_CNS_EMPTY],
		stats[FCG_STAT_CNS_GONE]);
		printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n",
		stats[FCG_STAT_PNC_NEXT],
		stats[FCG_STAT_PNC_EMPTY],
		stats[FCG_STAT_PNC_NO_CGRP],
		stats[FCG_STAT_PNC_GONE],
		stats[FCG_STAT_PNC_RACE],
		stats[FCG_STAT_PNC_FAIL]);
		printf("BAD remove:%6llu\n",
		acc_stats[FCG_STAT_BAD_REMOVAL]);
		fflush(stdout);

		nanosleep(&intv_ts, NULL);
		}

		bpf_link__destroy(link);
		ecode = UEI_REPORT(skel, uei);
		scx_flatcg__destroy(skel);

		if (UEI_ECODE_RESTART(ecode))
		goto restart;
		return 0;
		}

tools/sched_ext/scx_flatcg.h

0 → 100644

+51 −0

Original line number	Diff line number	Diff line
		#ifndef __SCX_EXAMPLE_FLATCG_H
		#define __SCX_EXAMPLE_FLATCG_H

		enum {
		FCG_HWEIGHT_ONE = 1LLU << 16,
		};

		enum fcg_stat_idx {
		FCG_STAT_ACT,
		FCG_STAT_DEACT,
		FCG_STAT_LOCAL,
		FCG_STAT_GLOBAL,

		FCG_STAT_HWT_UPDATES,
		FCG_STAT_HWT_CACHE,
		FCG_STAT_HWT_SKIP,
		FCG_STAT_HWT_RACE,

		FCG_STAT_ENQ_SKIP,
		FCG_STAT_ENQ_RACE,

		FCG_STAT_CNS_KEEP,
		FCG_STAT_CNS_EXPIRE,
		FCG_STAT_CNS_EMPTY,
		FCG_STAT_CNS_GONE,

		FCG_STAT_PNC_NO_CGRP,
		FCG_STAT_PNC_NEXT,
		FCG_STAT_PNC_EMPTY,
		FCG_STAT_PNC_GONE,
		FCG_STAT_PNC_RACE,
		FCG_STAT_PNC_FAIL,

		FCG_STAT_BAD_REMOVAL,

		FCG_NR_STATS,
		};

		struct fcg_cgrp_ctx {
		u32 nr_active;
		u32 nr_runnable;
		u32 queued;
		u32 weight;
		u32 hweight;
		u64 child_weight_sum;
		u64 hweight_gen;
		s64 cvtime_delta;
		u64 tvtime_now;
		};

		#endif /* __SCX_EXAMPLE_FLATCG_H */