Commit 36929ebd authored by Emil Tsalapatis's avatar Emil Tsalapatis Committed by Tejun Heo
Browse files

tools/sched_ext: add arena based scheduler



Add a scheduler that uses BPF arenas to manage task context data.

Signed-off-by: default avatarEmil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
parent f0262b10
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP

SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)

c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair
c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair scx_sdt

$(addprefix $(BINDIR)/,$(c-sched-targets)): \
	$(BINDIR)/%: \
+710 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Arena-based task data scheduler. This is a variation of scx_simple
 * that uses a combined allocator and indexing structure to organize
 * task data. Task context allocation is done when a task enters the
 * scheduler, while freeing is done when it exits. Task contexts are
 * retrieved from task-local storage, pointing to the allocated memory.
 *
 * The main purpose of this scheduler is to demostrate arena memory
 * management.
 *
 * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com>
 * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org>
 *
 */
#include <scx/common.bpf.h>
#include <scx/bpf_arena_common.bpf.h>

#include "scx_sdt.h"

char _license[] SEC("license") = "GPL";

UEI_DEFINE(uei);

struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
	__uint(max_entries, 1 << 16); /* number of pages */
        __ulong(map_extra, (1ull << 32)); /* start of mmap() region */
#else
	__uint(max_entries, 1 << 20); /* number of pages */
        __ulong(map_extra, (1ull << 44)); /* start of mmap() region */
#endif
} arena __weak SEC(".maps");

#define SHARED_DSQ 0

#define DEFINE_SDT_STAT(metric)				\
static inline void				\
stat_inc_##metric(struct scx_stats __arena *stats)	\
{							\
	cast_kern(stats);				\
	stats->metric += 1;				\
}							\
__u64 stat_##metric;					\

DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);

/*
 * Necessary for cond_break/can_loop's semantics. According to kernel commit
 * 011832b, the loop counter variable must be seen as imprecise and bounded
 * by the verifier. Initializing it from a constant (e.g., i = 0;), then,
 * makes it precise and prevents may_goto from helping with converging the
 * loop. For these loops we must initialize the loop counter from a variable
 * whose value the verifier cannot reason about when checking the program, so
 * that the loop counter's value is imprecise.
 */
static __u64 zero = 0;

/*
 * XXX Hack to get the verifier to find the arena for sdt_exit_task.
 * As of 6.12-rc5, The verifier associates arenas with programs by
 * checking LD.IMM instruction operands for an arena and populating
 * the program state with the first instance it finds. This requires
 * accessing our global arena variable, but scx methods do not necessarily
 * do so while still using pointers from that arena. Insert a bpf_printk
 * statement that triggers at most once to generate an LD.IMM instruction
 * to access the arena and help the verifier.
 */
static volatile bool scx_arena_verify_once;

__hidden void scx_arena_subprog_init(void)
{
	if (scx_arena_verify_once)
		return;

	bpf_printk("%s: arena pointer %p", __func__, &arena);
	scx_arena_verify_once = true;
}


private(LOCK) struct bpf_spin_lock alloc_lock;
private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;

/* allocation pools */
struct sdt_pool desc_pool;
struct sdt_pool chunk_pool;

/* Protected by alloc_lock. */
struct scx_alloc_stats alloc_stats;


/* Allocate element from the pool. Must be called with a then pool lock held. */
static
void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
{
	__u64 elem_size, max_elems;
	void __arena *slab;
	void __arena *ptr;

	elem_size = pool->elem_size;
	max_elems = pool->max_elems;

	/* If the chunk is spent, get a new one. */
	if (pool->idx >= max_elems) {
		slab = bpf_arena_alloc_pages(&arena, NULL,
			div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
		if (!slab)
			return NULL;

		pool->slab = slab;
		pool->idx = 0;
	}

	ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
	pool->idx += 1;

	return ptr;
}

/* Alloc desc and associated chunk. Called with the allocator spinlock held. */
static sdt_desc_t *scx_alloc_chunk(void)
{
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	sdt_desc_t *out;

	chunk = scx_alloc_from_pool(&chunk_pool);
	if (!chunk)
		return NULL;

	desc = scx_alloc_from_pool(&desc_pool);
	if (!desc) {
		/*
		 * Effectively frees the previous chunk allocation.
		 * Index cannot be 0, so decrementing is always
		 * valid.
		 */
		chunk_pool.idx -= 1;
		return NULL;
	}

	out = desc;

	desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
	desc->chunk = chunk;

	alloc_stats.chunk_allocs += 1;

	return out;
}

static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
{
	if (unlikely(data_size % 8))
		return -EINVAL;

	if (unlikely(nr_pages == 0))
		return -EINVAL;

	pool->elem_size = data_size;
	pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;
	/* Populate the pool slab on the first allocation. */
	pool->idx = pool->max_elems;

	return 0;
}

/* Initialize both the base pool allocators and the root chunk of the index. */
__hidden int
scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
{
	size_t min_chunk_size;
	int ret;

	_Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
		"chunk size must fit into a page");

	ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
	if (ret != 0)
		return ret;

	ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
	if (ret != 0)
		return ret;

	/* Wrap data into a descriptor and word align. */
	data_size += sizeof(struct sdt_data);
	data_size = round_up(data_size, 8);

	/*
	 * Ensure we allocate large enough chunks from the arena to avoid excessive
	 * internal fragmentation when turning chunks it into structs.
	 */
	min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
	ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
	if (ret != 0)
		return ret;

	bpf_spin_lock(&alloc_lock);
	alloc->root = scx_alloc_chunk();
	bpf_spin_unlock(&alloc_lock);
	if (!alloc->root)
		return -ENOMEM;

	return 0;
}

static
int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
{
	__u64 __arena *allocated = desc->allocated;
	__u64 bit;

	if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
		return -EINVAL;

	bit = (__u64)1 << (pos % 64);

	if (state)
		allocated[pos / 64] |= bit;
	else
		allocated[pos / 64] &= ~bit;

	return 0;
}

static __noinline
int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
{
	sdt_desc_t *desc;
	__u64 u, level;
	int ret;

	for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
		level = SDT_TASK_LEVELS - 1 - u;

		/* Only propagate upwards if we are the parent's only free chunk. */
		desc = lv_desc[level];

		ret = set_idx_state(desc, lv_pos[level], false);
		if (unlikely(ret != 0))
			return ret;

		desc->nr_free += 1;
		if (desc->nr_free > 1)
			return 0;
	}

	return 0;
}

/*
 * Free the allocated struct with the given index. Called with the
 * allocator lock taken.
 */
__hidden
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
{
	const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	struct sdt_data __arena *data;
	__u64 level, shift, pos;
	__u64 lv_pos[SDT_TASK_LEVELS];
	int ret;
	int i;

	if (!alloc)
		return 0;

	desc = alloc->root;
	if (unlikely(!desc))
		return -EINVAL;

	/* To appease the verifier. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		lv_desc[level] = NULL;
		lv_pos[level] = 0;
	}

	/* Find the leaf node containing the index. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
		pos = (idx >> shift) & mask;

		lv_desc[level] = desc;
		lv_pos[level] = pos;

		if (level == SDT_TASK_LEVELS - 1)
			break;

		chunk = desc->chunk;

		desc_children = (sdt_desc_t * __arena *)chunk->descs;
		desc = desc_children[pos];

		if (unlikely(!desc))
			return -EINVAL;
	}

	chunk = desc->chunk;

	pos = idx & mask;
	data = chunk->data[pos];
	if (likely(data)) {
		data[pos] = (struct sdt_data) {
			.tid.genn = data->tid.genn + 1,
		};

		/* Zero out one word at a time. */
		for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
			data->payload[i] = 0;
		}
	}

	ret = mark_nodes_avail(lv_desc, lv_pos);
	if (unlikely(ret != 0))
		return ret;

	alloc_stats.active_allocs -= 1;
	alloc_stats.free_ops += 1;

	return 0;
}

static inline
int ffs(__u64 word)
{
	unsigned int num = 0;

	if ((word & 0xffffffff) == 0) {
		num += 32;
		word >>= 32;
	}

	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}

	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}

	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}

	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}

	if ((word & 0x1) == 0) {
		num += 1;
		word >>= 1;
	}

	return num;
}


/* find the first empty slot */
__hidden
__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
{
	__u64 freeslots;
	__u64 i;

	for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
		freeslots = ~desc->allocated[i];
		if (freeslots == (__u64)0)
			continue;

		return (i * 64) + ffs(freeslots);
	}

	return SDT_TASK_ENTS_PER_CHUNK;
}

/*
 * Find and return an available idx on the allocator.
 * Called with the task spinlock held.
 */
static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
{
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *tmp;
	__u64 lv_pos[SDT_TASK_LEVELS];
	__u64 u, pos, level;
	__u64 idx = 0;
	int ret;

	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		pos = chunk_find_empty(desc);

		/* If we error out, something has gone very wrong. */
		if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
			return NULL;

		if (pos == SDT_TASK_ENTS_PER_CHUNK)
			return NULL;

		idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
		idx |= pos;

		/* Log the levels to complete allocation. */
		lv_desc[level] = desc;
		lv_pos[level] = pos;

		/* The rest of the loop is for internal node traversal. */
		if (level == SDT_TASK_LEVELS - 1)
			break;

		/* Allocate an internal node if necessary. */
		chunk = desc->chunk;
		desc_children = (sdt_desc_t * __arena *)chunk->descs;

		desc = desc_children[pos];
		if (!desc) {
			desc = scx_alloc_chunk();
			if (!desc)
				return NULL;

			desc_children[pos] = desc;
		}
	}

	/*
	 * Finding the descriptor along with any internal node
	 * allocations was successful. Update all levels with
	 * the new allocation.
	 */
	bpf_for(u, 0, SDT_TASK_LEVELS) {
		level = SDT_TASK_LEVELS - 1 - u;
		tmp = lv_desc[level];

		ret = set_idx_state(tmp, lv_pos[level], true);
		if (ret != 0)
			break;

		tmp->nr_free -= 1;
		if (tmp->nr_free > 0)
			break;
	}

	*idxp = idx;

	return desc;
}

__hidden
void __arena *scx_alloc(struct scx_allocator *alloc)
{
	struct sdt_data __arena *data = NULL;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	__u64 idx, pos;

	if (!alloc)
		return NULL;

	bpf_spin_lock(&alloc_lock);

	/* We unlock if we encounter an error in the function. */
	desc = desc_find_empty(alloc->root, &idx);
	if (unlikely(desc == NULL)) {
		bpf_spin_unlock(&alloc_lock);
		return NULL;
	}

	chunk = desc->chunk;

	/* Populate the leaf node if necessary. */
	pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
	data = chunk->data[pos];
	if (!data) {
		data = scx_alloc_from_pool(&alloc->pool);
		if (!data) {
			scx_alloc_free_idx(alloc, idx);
			bpf_spin_unlock(&alloc_lock);
			return NULL;
		}
	}

	chunk->data[pos] = data;

	/* The data counts as a chunk */
	alloc_stats.data_allocs += 1;
	alloc_stats.alloc_ops += 1;
	alloc_stats.active_allocs += 1;

	data->tid.idx = idx;

	bpf_spin_unlock(&alloc_lock);

	return data;
}

/*
 * Task BPF map entry recording the task's assigned ID and pointing to the data
 * area allocated in arena.
 */
struct scx_task_map_val {
	union sdt_id		tid;
	__u64			tptr;
	struct sdt_data __arena	*data;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct scx_task_map_val);
} scx_task_map SEC(".maps");

static struct scx_allocator scx_task_allocator;

__hidden
void __arena *scx_task_alloc(struct task_struct *p)
{
	struct sdt_data __arena *data = NULL;
	struct scx_task_map_val *mval;

	mval = bpf_task_storage_get(&scx_task_map, p, 0,
				    BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!mval)
		return NULL;

	data = scx_alloc(&scx_task_allocator);
	if (unlikely(!data))
		return NULL;

	mval->tid = data->tid;
	mval->tptr = (__u64) p;
	mval->data = data;

	return (void __arena *)data->payload;
}

__hidden
int scx_task_init(__u64 data_size)
{
	return scx_alloc_init(&scx_task_allocator, data_size);
}

__hidden
void __arena *scx_task_data(struct task_struct *p)
{
	struct sdt_data __arena *data;
	struct scx_task_map_val *mval;

	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return NULL;

	data = mval->data;

	return (void __arena *)data->payload;
}

__hidden
void scx_task_free(struct task_struct *p)
{
	struct scx_task_map_val *mval;

	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return;

	bpf_spin_lock(&alloc_lock);
	scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
	bpf_spin_unlock(&alloc_lock);

	bpf_task_storage_delete(&scx_task_map, p);
}

static inline void
scx_stat_global_update(struct scx_stats __arena *stats)
{
	cast_kern(stats);
	__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
	__sync_fetch_and_add(&stat_init, stats->init);
	__sync_fetch_and_add(&stat_exit, stats->exit);
	__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
	__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}

s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	struct scx_stats __arena *stats;
	bool is_idle = false;
	s32 cpu;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return 0;
	}

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		stat_inc_select_idle_cpu(stats);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	} else {
		stat_inc_select_busy_cpu(stats);
	}

	return cpu;
}

void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_enqueue(stats);

	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
			     struct scx_init_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_alloc(p);
	if (!stats) {
		scx_bpf_error("arena allocator out of memory");
		return -ENOMEM;
	}

	stats->pid = p->pid;

	stat_inc_init(stats);

	return 0;
}

void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
			      struct scx_exit_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_exit(stats);
	scx_stat_global_update(stats);

	scx_task_free(p);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
{
	int ret;

	ret = scx_task_init(sizeof(struct scx_stats));
	if (ret < 0) {
		scx_bpf_error("%s: failed with %d", __func__, ret);
		return ret;
	}

	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(sdt_ops,
	       .select_cpu		= (void *)sdt_select_cpu,
	       .enqueue			= (void *)sdt_enqueue,
	       .dispatch		= (void *)sdt_dispatch,
	       .init_task		= (void *)sdt_init_task,
	       .exit_task		= (void *)sdt_exit_task,
	       .init			= (void *)sdt_init,
	       .exit			= (void *)sdt_exit,
	       .name			= "sdt");
+101 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
 * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>

#include "scx_sdt.h"
#include "scx_sdt.bpf.skel.h"

const char help_fmt[] =
"A simple arena-based sched_ext scheduler.\n"
"\n"
"Modified version of scx_simple that demonstrates arena-based data structures.\n"
"\n"
"Usage: %s [-f] [-v]\n"
"\n"
"  -v            Print libbpf debug messages\n"
"  -h            Display this help and exit\n";

static bool verbose;
static volatile int exit_req;

static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (level == LIBBPF_DEBUG && !verbose)
		return 0;
	return vfprintf(stderr, format, args);
}

static void sigint_handler(int sig)
{
	exit_req = 1;
}

int main(int argc, char **argv)
{
	struct scx_sdt *skel;
	struct bpf_link *link;
	__u32 opt;
	__u64 ecode;

	libbpf_set_print(libbpf_print_fn);
	signal(SIGINT, sigint_handler);
	signal(SIGTERM, sigint_handler);
restart:
	skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);

	while ((opt = getopt(argc, argv, "fvh")) != -1) {
		switch (opt) {
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, help_fmt, basename(argv[0]));
			return opt != 'h';
		}
	}

	SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei);
	link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt);

	while (!exit_req && !UEI_EXITED(skel, uei)) {
		printf("====SCHEDULING STATS====\n");
		printf("enqueues=%llu\t", skel->bss->stat_enqueue);
		printf("inits=%llu\t", skel->bss->stat_init);
		printf("exits=%llu\t", skel->bss->stat_exit);
		printf("\n");

		printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu);
		printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu);
		printf("\n");

		printf("====ALLOCATION STATS====\n");
		printf("chunk allocs=%llu\t", skel->bss->alloc_stats.chunk_allocs);
		printf("data_allocs=%llu\n", skel->bss->alloc_stats.data_allocs);
		printf("alloc_ops=%llu\t", skel->bss->alloc_stats.alloc_ops);
		printf("free_ops=%llu\t", skel->bss->alloc_stats.free_ops);
		printf("active_allocs=%llu\t", skel->bss->alloc_stats.active_allocs);
		printf("arena_pages_used=%llu\t", skel->bss->alloc_stats.arena_pages_used);
		printf("\n\n");

		fflush(stdout);
		sleep(1);
	}

	bpf_link__destroy(link);
	ecode = UEI_REPORT(skel, uei);
	scx_sdt__destroy(skel);

	if (UEI_ECODE_RESTART(ecode))
		goto restart;
	return 0;
}
+113 −0
Original line number Diff line number Diff line
/*
 * SPDX-License-Identifier: GPL-2.0
 * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2025 Emil Tsalapatis <etsal@meta.com>
 */
#pragma once

#ifndef __BPF__
#define __arena
#endif /* __BPF__ */

struct scx_alloc_stats {
	__u64		chunk_allocs;
	__u64		data_allocs;
	__u64		alloc_ops;
	__u64		free_ops;
	__u64		active_allocs;
	__u64		arena_pages_used;
};

struct sdt_pool {
	void __arena	*slab;
	__u64		elem_size;
	__u64		max_elems;
	__u64		idx;
};

#ifndef div_round_up
#define div_round_up(a, b) (((a) + (b) - 1) / (b))
#endif

#ifndef round_up
#define round_up(a, b) (div_round_up((a), (b)) * (b))
#endif

typedef struct sdt_desc __arena sdt_desc_t;

enum sdt_consts {
	SDT_TASK_ENTS_PER_PAGE_SHIFT	= 9,
	SDT_TASK_LEVELS			= 3,
	SDT_TASK_ENTS_PER_CHUNK		= 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT,
	SDT_TASK_CHUNK_BITMAP_U64S	= div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64),
	SDT_TASK_MIN_ELEM_PER_ALLOC 	= 8,
};

union sdt_id {
	__s64				val;
	struct {
		__s32			idx;	/* index in the radix tree */
		__s32			genn;	/* ++'d on recycle so that it forms unique'ish 64bit ID */
	};
};

struct sdt_chunk;

/*
 * Each index page is described by the following descriptor which carries the
 * bitmap. This way the actual index can host power-of-two numbers of entries
 * which makes indexing cheaper.
 */
struct sdt_desc {
	__u64				allocated[SDT_TASK_CHUNK_BITMAP_U64S];
	__u64				nr_free;
	struct sdt_chunk __arena	*chunk;
};

/*
 * Leaf node containing per-task data.
 */
struct sdt_data {
	union sdt_id			tid;
	__u64				payload[];
};

/*
 * Intermediate node pointing to another intermediate node or leaf node.
 */
struct sdt_chunk {
	union {
		sdt_desc_t * descs[SDT_TASK_ENTS_PER_CHUNK];
		struct sdt_data __arena *data[SDT_TASK_ENTS_PER_CHUNK];
	};
};

struct scx_allocator {
	struct sdt_pool	pool;
	sdt_desc_t	*root;
};

struct scx_stats {
	int	seq;
	pid_t	pid;
	__u64	enqueue;
	__u64	exit;
	__u64	init;
	__u64	select_busy_cpu;
	__u64	select_idle_cpu;
};

#ifdef __BPF__

void __arena *scx_task_data(struct task_struct *p);
int scx_task_init(__u64 data_size);
void __arena *scx_task_alloc(struct task_struct *p);
void scx_task_free(struct task_struct *p);
void scx_arena_subprog_init(void);

int scx_alloc_init(struct scx_allocator *alloc, __u64 data_size);
u64 scx_alloc_internal(struct scx_allocator *alloc);
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx);

#endif /* __BPF__ */