Commit d741f297 authored by Daniel Bristot de Oliveira, committed by Peter Zijlstra
Browse files

sched/fair: Fair server interface



Add an interface for fair server setup on debugfs.

Each CPU has two files under /debug/sched/fair_server/cpu{ID}:

 - runtime: set runtime in ns
 - period:  set period in ns

This then leaves /proc/sys/kernel/sched_rt_{period,runtime}_us to set
bounds on admission control.

The interface also adds the server to the dl bandwidth accounting.

Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/a9ef9fc69bcedb44bddc9bc34f2b313296052819.1716811044.git.bristot@kernel.org
parent a110a81c
Loading
Loading
Loading
Loading
+86 −17
Original line number Diff line number Diff line
@@ -320,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
		__sub_running_bw(dl_se->dl_bw, dl_rq);
}

static void dl_change_utilization(struct task_struct *p, u64 new_bw)
static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw)
{
	struct rq *rq;

	WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);

	if (task_on_rq_queued(p))
		return;
	if (dl_se->dl_non_contending) {
		sub_running_bw(dl_se, &rq->dl);
		dl_se->dl_non_contending = 0;

	rq = task_rq(p);
	if (p->dl.dl_non_contending) {
		sub_running_bw(&p->dl, &rq->dl);
		p->dl.dl_non_contending = 0;
		/*
		 * If the timer handler is currently running and the
		 * timer cannot be canceled, inactive_task_timer()
@@ -340,13 +333,25 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
		 * will not touch the rq's active utilization,
		 * so we are still safe.
		 */
		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
			put_task_struct(p);
		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
			if (!dl_server(dl_se))
				put_task_struct(dl_task_of(dl_se));
		}
	}
	__sub_rq_bw(p->dl.dl_bw, &rq->dl);
	__sub_rq_bw(dl_se->dl_bw, &rq->dl);
	__add_rq_bw(new_bw, &rq->dl);
}

/*
 * Task-level wrapper around dl_rq_change_utilization(): hand the task's
 * deadline entity and its runqueue to the helper together with @new_bw.
 *
 * Warns once if the task carries SCHED_FLAG_SUGOV, and does nothing while
 * the task is queued on a runqueue.
 */
static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
	struct sched_dl_entity *dl_se = &p->dl;

	WARN_ON_ONCE(dl_se->flags & SCHED_FLAG_SUGOV);

	if (!task_on_rq_queued(p))
		dl_rq_change_utilization(task_rq(p), dl_se, new_bw);
}

static void __dl_clear_params(struct sched_dl_entity *dl_se);

/*
@@ -1621,11 +1626,17 @@ void dl_server_start(struct sched_dl_entity *dl_se)
{
	struct rq *rq = dl_se->rq;

	/*
	 * XXX: applying the parameters does not work properly during the
	 * init phase of the fair server because things are not yet set up.
	 * This needs to be improved before the code can be made generic.
	 */
	if (!dl_server(dl_se)) {
		/* Disabled */
		dl_se->dl_runtime = 0;
		dl_se->dl_deadline = 1000 * NSEC_PER_MSEC;
		dl_se->dl_period = 1000 * NSEC_PER_MSEC;
		u64 runtime = 0;
		u64 period = 1000 * NSEC_PER_MSEC;

		dl_server_apply_params(dl_se, runtime, period, 1);

		dl_se->dl_server = 1;
		dl_se->dl_defer = 1;
@@ -1660,6 +1671,64 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
	dl_se->server_pick = pick;
}

/*
 * Add the server's current bandwidth (dl_se->dl_bw) to the dl_bw
 * accounting looked up for @rq's CPU, under that dl_bw's lock.
 *
 * Nothing is added when dl_bw_cpus() reports zero for that CPU.
 */
void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
{
	u64 server_bw = dl_se->dl_bw;
	int cpu = cpu_of(rq);
	struct dl_bw *dl_b = dl_bw_of(cpu);
	int nr_cpus;

	guard(raw_spinlock)(&dl_b->lock);

	nr_cpus = dl_bw_cpus(cpu);
	if (nr_cpus)
		__dl_add(dl_b, server_bw, nr_cpus);
}

/*
 * Install new @runtime/@period parameters on a deadline server.
 *
 * The requested bandwidth is first checked with __dl_overflow() against
 * the dl_bw accounting of the server's CPU; the bandwidth currently
 * accounted for the server is treated as zero when @init is set.
 *
 * On success the dl_bw and runqueue accounting are updated, the relative
 * deadline is set equal to the period, the current runtime/deadline are
 * reset to zero and dl_bw/dl_density are recomputed.
 *
 * Returns 0 on success or -EBUSY if the overflow check fails, in which
 * case nothing is modified.
 */
int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init)
{
	u64 new_bw = to_ratio(period, runtime);
	struct rq *rq = dl_se->rq;
	int cpu = cpu_of(rq);
	struct dl_bw *dl_b = dl_bw_of(cpu);
	unsigned long cap;
	u64 old_bw = 0;
	int cpus;

	/* A server being initialized has no bandwidth accounted yet. */
	if (!init)
		old_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);

	guard(raw_spinlock)(&dl_b->lock);

	cpus = dl_bw_cpus(cpu);
	cap = dl_bw_capacity(cpu);

	if (__dl_overflow(dl_b, cap, old_bw, new_bw))
		return -EBUSY;

	if (!init) {
		/* Swap the old bandwidth for the new one in both accountings. */
		__dl_sub(dl_b, dl_se->dl_bw, cpus);
		__dl_add(dl_b, new_bw, cpus);

		dl_rq_change_utilization(rq, dl_se, new_bw);
	} else {
		__add_rq_bw(new_bw, &rq->dl);
		__dl_add(dl_b, new_bw, cpus);
	}

	dl_se->dl_runtime = runtime;
	dl_se->dl_period = period;
	dl_se->dl_deadline = period;

	dl_se->runtime = 0;
	dl_se->deadline = 0;

	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
	dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);

	return 0;
}

/*
 * Update the current task's runtime statistics (provided it is still
 * a -deadline task and has not been removed from the dl_rq).
+159 −0
Original line number Diff line number Diff line
@@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = {
	.release	= seq_release,
};

/* Selects which fair server parameter a debugfs handler reads or writes. */
enum dl_param {
	DL_RUNTIME,	/* the server's dl_runtime */
	DL_PERIOD,	/* the server's dl_period */
};

/*
 * Bounds, in nanoseconds, that a fair server period written through
 * debugfs must respect.
 *
 * The shift is done on an unsigned long: with a plain int operand the
 * product (about 4.19e9) is computed in signed arithmetic and does not
 * fit in a 32-bit long.
 */
static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC;     /* 100 us */

/*
 * Common write handler for the per-CPU fair server debugfs files.
 *
 * Parses a base-10 u64 from @ubuf and uses it as the new runtime or period
 * (selected by @param) of the fair server of the CPU stored in the
 * seq_file's private pointer; the other parameter keeps its current value.
 *
 * Returns @cnt on success (and advances *@ppos by @cnt), the error from
 * kstrtoull_from_user() if parsing fails, -EINVAL if the resulting pair
 * has runtime > period or a period outside
 * [fair_server_period_min, fair_server_period_max], or the error returned
 * by dl_server_apply_params(). *@ppos is left untouched on any failure.
 */
static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
				       size_t cnt, loff_t *ppos, enum dl_param param)
{
	long cpu = (long) ((struct seq_file *) filp->private_data)->private;
	struct rq *rq = cpu_rq(cpu);
	u64 runtime, period;
	int retval;
	u64 value;

	retval = kstrtoull_from_user(ubuf, cnt, 10, &value);
	if (retval)
		return retval;

	scoped_guard (rq_lock_irqsave, rq) {
		runtime = rq->fair_server.dl_runtime;
		period = rq->fair_server.dl_period;

		switch (param) {
		case DL_RUNTIME:
			runtime = value;
			break;
		case DL_PERIOD:
			period = value;
			break;
		}

		if (runtime > period ||
		    period > fair_server_period_max ||
		    period < fair_server_period_min) {
			return -EINVAL;
		}

		/* Stop the server around the update if fair tasks are runnable. */
		if (rq->cfs.h_nr_running) {
			update_rq_clock(rq);
			dl_server_stop(&rq->fair_server);
		}

		retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);

		if (!runtime)
			printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
					cpu_of(rq));

		/* Restart the server even if applying the parameters failed. */
		if (rq->cfs.h_nr_running)
			dl_server_start(&rq->fair_server);

		/*
		 * Report the failure directly instead of folding the negative
		 * errno into the unsigned byte count, which would also corrupt
		 * the file position below.
		 */
		if (retval < 0)
			return retval;
	}

	*ppos += cnt;
	return cnt;
}

/*
 * Common seq_file show helper for the per-CPU fair server debugfs files.
 *
 * Prints, as an unsigned decimal followed by a newline, the dl_runtime or
 * dl_period (selected by @param) of the fair server of the CPU stored in
 * @m->private. Always returns 0.
 */
static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
{
	unsigned long cpu = (unsigned long) m->private;
	struct rq *rq = cpu_rq(cpu);
	/* Initialized so an unexpected @param never prints an indeterminate value. */
	u64 value = 0;

	switch (param) {
	case DL_RUNTIME:
		value = rq->fair_server.dl_runtime;
		break;
	case DL_PERIOD:
		value = rq->fair_server.dl_period;
		break;
	}

	seq_printf(m, "%llu\n", value);
	return 0;
}

/*
 * ->write handler of fair_server_runtime_fops: forwards to the common
 * write handler with DL_RUNTIME selected.
 */
static ssize_t
sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
				size_t cnt, loff_t *ppos)
{
	return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
}

/* seq_file show callback for the runtime file: prints the server's dl_runtime. */
static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
{
	return sched_fair_server_show(m, v, DL_RUNTIME);
}

/*
 * ->open handler of fair_server_runtime_fops: sets up a single-record
 * seq_file whose private data is the inode's i_private value.
 */
static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
}

/* File operations of the per-CPU fair server "runtime" debugfs file. */
static const struct file_operations fair_server_runtime_fops = {
	.open		= sched_fair_server_runtime_open,
	.write		= sched_fair_server_runtime_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/*
 * ->write handler of fair_server_period_fops: forwards to the common
 * write handler with DL_PERIOD selected.
 */
static ssize_t
sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
			       size_t cnt, loff_t *ppos)
{
	return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
}

/* seq_file show callback for the period file: prints the server's dl_period. */
static int sched_fair_server_period_show(struct seq_file *m, void *v)
{
	return sched_fair_server_show(m, v, DL_PERIOD);
}

/*
 * ->open handler of fair_server_period_fops: sets up a single-record
 * seq_file whose private data is the inode's i_private value.
 */
static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_fair_server_period_show, inode->i_private);
}

/* File operations of the per-CPU fair server "period" debugfs file. */
static const struct file_operations fair_server_period_fops = {
	.open		= sched_fair_server_period_open,
	.write		= sched_fair_server_period_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/* Parent dentry under which the scheduler debugfs entries below are created. */
static struct dentry *debugfs_sched;

/*
 * Create the fair server debugfs hierarchy under debugfs_sched:
 *
 *   fair_server/cpu<N>/runtime
 *   fair_server/cpu<N>/period
 *
 * with one cpu<N> directory per possible CPU. The CPU number is stored as
 * the file's private data so the handlers can find the runqueue.
 *
 * debugfs_create_dir() reports failure with an ERR_PTR rather than NULL,
 * and the debugfs creation helpers accept such a pointer as parent, so the
 * returned dentries are deliberately not checked.
 */
static void debugfs_fair_server_init(void)
{
	struct dentry *d_fair;
	unsigned long cpu;

	d_fair = debugfs_create_dir("fair_server", debugfs_sched);

	for_each_possible_cpu(cpu) {
		struct dentry *d_cpu;
		char buf[32];

		snprintf(buf, sizeof(buf), "cpu%lu", cpu);
		d_cpu = debugfs_create_dir(buf, d_fair);

		debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
		debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
	}
}

static __init int sched_init_debug(void)
{
	struct dentry __maybe_unused *numa;
@@ -374,6 +531,8 @@ static __init int sched_init_debug(void)

	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);

	debugfs_fair_server_init();

	return 0;
}
late_initcall(sched_init_debug);
+3 −0
Original line number Diff line number Diff line
@@ -366,6 +366,9 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
extern void dl_server_update_idle_time(struct rq *rq,
		    struct task_struct *p);
extern void fair_server_init(struct rq *rq);
/* Add the server's bandwidth to the dl_bw accounting of @rq's CPU. */
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
/*
 * Set the server's runtime and period (the relative deadline is set equal
 * to the period). Returns 0 on success or -EBUSY if the bandwidth overflow
 * check fails.
 */
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
		    u64 runtime, u64 period, bool init);

#ifdef CONFIG_CGROUP_SCHED

+8 −0
Original line number Diff line number Diff line
@@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	/*
	 * Because the rq is not a task, dl_add_task_root_domain() did not
	 * move the fair server bw to the rd if it already started.
	 * Add it now.
	 */
	if (rq->fair_server.dl_server)
		__dl_server_attach_root(&rq->fair_server, rq);

	rq_unlock_irqrestore(rq, &rf);

	if (old_rd)