Commit ddceadce authored by Tejun Heo
Browse files

sched_ext: Add support for cgroup bandwidth control interface



From 077814f57f8acce13f91dc34bbd2b7e4911fbf25 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Jun 2025 15:06:47 -1000

- Add CONFIG_GROUP_SCHED_BANDWIDTH which is selected by both
  CONFIG_CFS_BANDWIDTH and CONFIG_EXT_GROUP_SCHED.

- Put bandwidth control interface files for both cgroup v1 and v2 under
  CONFIG_GROUP_SCHED_BANDWIDTH.

- Update tg_bandwidth() to fetch configuration parameters from fair if
  CONFIG_CFS_BANDWIDTH is enabled, and from SCX otherwise.

- Update tg_set_bandwidth() to update the parameters for both fair and SCX.

- Add bandwidth control parameters to struct scx_cgroup_init_args.

- Add sched_ext_ops.cgroup_set_bandwidth() which is invoked on bandwidth
  control parameter updates.

- Update scx_qmap and maximal selftest to test the new feature.

Signed-off-by: Tejun Heo <tj@kernel.org>
parent 6e6558a6
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -219,6 +219,9 @@ struct scx_task_group {
#ifdef CONFIG_EXT_GROUP_SCHED
	u32			flags;		/* SCX_TG_* */
	u32			weight;
	u64			bw_period_us;
	u64			bw_quota_us;
	u64			bw_burst_us;
#endif
};

+5 −0
Original line number Diff line number Diff line
@@ -1065,6 +1065,9 @@ if CGROUP_SCHED
config GROUP_SCHED_WEIGHT
	def_bool n

# Selected by CFS_BANDWIDTH and EXT_GROUP_SCHED; gates the cgroup bandwidth
# control interface files.
config GROUP_SCHED_BANDWIDTH
	def_bool n

config FAIR_GROUP_SCHED
	bool "Group scheduling for SCHED_OTHER"
	depends on CGROUP_SCHED
@@ -1074,6 +1077,7 @@ config FAIR_GROUP_SCHED
config CFS_BANDWIDTH
	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
	depends on FAIR_GROUP_SCHED
	select GROUP_SCHED_BANDWIDTH
	default n
	help
	  This option allows users to define CPU bandwidth rates (limits) for
@@ -1108,6 +1112,7 @@ config EXT_GROUP_SCHED
	bool
	depends on SCHED_CLASS_EXT && CGROUP_SCHED
	select GROUP_SCHED_WEIGHT
	select GROUP_SCHED_BANDWIDTH
	default y

endif #CGROUP_SCHED
+24 −5
Original line number Diff line number Diff line
@@ -9545,7 +9545,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)

	return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */

#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
/* More than 203 days if BW_SHIFT equals 20. */
@@ -9554,12 +9556,21 @@ static const u64 max_bw_runtime_us = MAX_BW;
/*
 * Read @tg's current bandwidth control parameters.
 *
 * Every output pointer is optional; a NULL pointer skips that value. With
 * CONFIG_CFS_BANDWIDTH the values come from the tg_get_cfs_*() getters,
 * otherwise the copies kept in tg->scx are reported.
 */
static void tg_bandwidth(struct task_group *tg,
			 u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
{
#ifndef CONFIG_CFS_BANDWIDTH
	/* fair bandwidth control is not built in, report the SCX copies */
	if (period_us_p)
		*period_us_p = tg->scx.bw_period_us;
	if (quota_us_p)
		*quota_us_p = tg->scx.bw_quota_us;
	if (burst_us_p)
		*burst_us_p = tg->scx.bw_burst_us;
#else /* CONFIG_CFS_BANDWIDTH */
	/* fair owns the configuration, ask it */
	if (period_us_p)
		*period_us_p = tg_get_cfs_period(tg);
	if (quota_us_p)
		*quota_us_p = tg_get_cfs_quota(tg);
	if (burst_us_p)
		*burst_us_p = tg_get_cfs_burst(tg);
#endif /* !CONFIG_CFS_BANDWIDTH */
}

static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
@@ -9575,6 +9586,7 @@ static int tg_set_bandwidth(struct task_group *tg,
			    u64 period_us, u64 quota_us, u64 burst_us)
{
	const u64 max_usec = U64_MAX / NSEC_PER_USEC;
	int ret = 0;

	if (tg == &root_task_group)
		return -EINVAL;
@@ -9612,7 +9624,12 @@ static int tg_set_bandwidth(struct task_group *tg,
					burst_us + quota_us > max_bw_runtime_us))
		return -EINVAL;

	return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
#ifdef CONFIG_CFS_BANDWIDTH
	ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
#endif /* CONFIG_CFS_BANDWIDTH */
	if (!ret)
		scx_group_set_bandwidth(tg, period_us, quota_us, burst_us);
	return ret;
}

static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
@@ -9665,7 +9682,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
	tg_bandwidth(tg, &period_us, &quota_us, NULL);
	return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */

#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9725,7 +9742,7 @@ static struct cftype cpu_legacy_files[] = {
		.write_s64 = cpu_idle_write_s64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
	{
		.name = "cfs_period_us",
		.read_u64 = cpu_period_read_u64,
@@ -9741,6 +9758,8 @@ static struct cftype cpu_legacy_files[] = {
		.read_u64 = cpu_burst_read_u64,
		.write_u64 = cpu_burst_write_u64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "stat",
		.seq_show = cpu_cfs_stat_show,
@@ -9954,7 +9973,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
	return 0;
}

#ifdef CONFIG_CFS_BANDWIDTH
#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
static int cpu_max_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
@@ -10001,7 +10020,7 @@ static struct cftype cpu_files[] = {
		.write_s64 = cpu_idle_write_s64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
+63 −3
Original line number Diff line number Diff line
@@ -203,6 +203,11 @@ struct scx_exit_task_args {
/*
 * Arguments handed to ops.cgroup_init(). The callers fill them from the
 * values recorded in the task group's scx state before invoking the
 * operation.
 */
struct scx_cgroup_init_args {
	/* the weight of the cgroup [1..10000] */
	u32			weight;

	/*
	 * bandwidth control parameters from cpu.max and cpu.max.burst; the
	 * same three values are later updated through
	 * ops.cgroup_set_bandwidth()
	 */
	u64			bw_period_us;
	u64			bw_quota_us;
	u64			bw_burst_us;
};

enum scx_cpu_preempt_reason {
@@ -664,9 +669,31 @@ struct sched_ext_ops {
	 * @cgrp: cgroup whose weight is being updated
	 * @weight: new weight [1..10000]
	 *
	 * Update @tg's weight to @weight.
	 * Update @cgrp's weight to @weight.
	 */
	void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);

	/**
	 * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
	 * @cgrp: cgroup whose bandwidth is being updated
	 * @period_us: bandwidth control period
	 * @quota_us: bandwidth control quota
	 * @burst_us: bandwidth control burst
	 *
	 * Update @cgrp's bandwidth control parameters. This is from the cpu.max
	 * and cpu.max.burst cgroup interfaces.
	 *
	 * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
	 * to. For example, if @period_us is 1_000_000 and @quota_us is
	 * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
	 * interpreted in the same fashion and specifies how much @cgrp can
	 * burst temporarily. The specific control mechanism and thus the
	 * interpretation of @period_us and burstiness is up to the BPF
	 * scheduler.
	 */
	void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
				     u64 period_us, u64 quota_us, u64 burst_us);

#endif	/* CONFIG_EXT_GROUP_SCHED */

	/*
@@ -4059,6 +4086,8 @@ static bool scx_cgroup_enabled;
/*
 * Set the initial SCX parameters of @tg: the default cgroup weight, the
 * default bandwidth period and RUNTIME_INF as the quota. bw_burst_us is not
 * assigned here.
 */
void scx_tg_init(struct task_group *tg)
{
	/* bandwidth control: default period, RUNTIME_INF quota */
	tg->scx.bw_quota_us = RUNTIME_INF;
	tg->scx.bw_period_us = default_bw_period_us();

	/* weight */
	tg->scx.weight = CGROUP_WEIGHT_DFL;
}

int scx_tg_online(struct task_group *tg)
@@ -4073,7 +4102,10 @@ int scx_tg_online(struct task_group *tg)
	if (scx_cgroup_enabled) {
		if (SCX_HAS_OP(sch, cgroup_init)) {
			struct scx_cgroup_init_args args =
				{ .weight = tg->scx.weight };
				{ .weight = tg->scx.weight,
				  .bw_period_us = tg->scx.bw_period_us,
				  .bw_quota_us = tg->scx.bw_quota_us,
				  .bw_burst_us = tg->scx.bw_burst_us };

			ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
					      NULL, tg->css.cgroup, &args);
@@ -4225,6 +4257,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
	/* TODO: Implement ops->cgroup_set_idle() */
}

/*
 * Record new bandwidth control parameters for @tg and, if any of them
 * differs from the recorded value, call ops.cgroup_set_bandwidth().
 *
 * The callback is skipped while scx_cgroup_enabled is false or when
 * SCX_HAS_OP() reports no cgroup_set_bandwidth operation. The new values are
 * stored in tg->scx in every case. The whole update runs with
 * scx_cgroup_rwsem held for read.
 */
void scx_group_set_bandwidth(struct task_group *tg,
			     u64 period_us, u64 quota_us, u64 burst_us)
{
	struct scx_sched *sch = scx_root;

	percpu_down_read(&scx_cgroup_rwsem);

	/* nobody to notify */
	if (!scx_cgroup_enabled || !SCX_HAS_OP(sch, cgroup_set_bandwidth))
		goto record;

	/* nothing changed, skip the callback */
	if (tg->scx.bw_period_us == period_us &&
	    tg->scx.bw_quota_us == quota_us &&
	    tg->scx.bw_burst_us == burst_us)
		goto record;

	SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
		    tg_cgrp(tg), period_us, quota_us, burst_us);

record:
	tg->scx.bw_period_us = period_us;
	tg->scx.bw_quota_us = quota_us;
	tg->scx.bw_burst_us = burst_us;

	percpu_up_read(&scx_cgroup_rwsem);
}

static void scx_cgroup_lock(void)
{
	percpu_down_write(&scx_cgroup_rwsem);
@@ -4400,7 +4453,12 @@ static int scx_cgroup_init(struct scx_sched *sch)
	rcu_read_lock();
	css_for_each_descendant_pre(css, &root_task_group.css) {
		struct task_group *tg = css_tg(css);
		struct scx_cgroup_init_args args = { .weight = tg->scx.weight };
		struct scx_cgroup_init_args args = {
			.weight = tg->scx.weight,
			.bw_period_us = tg->scx.bw_period_us,
			.bw_quota_us = tg->scx.bw_quota_us,
			.bw_burst_us = tg->scx.bw_burst_us,
		};

		if ((tg->scx.flags &
		     (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
@@ -5902,6 +5960,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup
/*
 * Empty cgroup callback stubs; the initializer of __bpf_ops_sched_ext_ops
 * assigns each one to the member of the same name.
 */
static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5939,6 +5998,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
	.cgroup_move		= sched_ext_ops__cgroup_move,
	.cgroup_cancel_move	= sched_ext_ops__cgroup_cancel_move,
	.cgroup_set_weight	= sched_ext_ops__cgroup_set_weight,
	.cgroup_set_bandwidth	= sched_ext_ops__cgroup_set_bandwidth,
#endif
	.cpu_online		= sched_ext_ops__cpu_online,
	.cpu_offline		= sched_ext_ops__cpu_offline,
+2 −0
Original line number Diff line number Diff line
@@ -104,6 +104,7 @@ void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us);
#else	/* CONFIG_EXT_GROUP_SCHED */
static inline void scx_tg_init(struct task_group *tg) {}
static inline int scx_tg_online(struct task_group *tg) { return 0; }
@@ -114,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {}
/* empty inline stand-ins for the !CONFIG_EXT_GROUP_SCHED build */
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif	/* CONFIG_EXT_GROUP_SCHED */
#endif	/* CONFIG_CGROUP_SCHED */
Loading