Commit 406d17c6, authored by Benjamin Berg, committed by Johannes Berg
Browse files

um: Implement kernel side of SECCOMP based process handling



This adds the kernel side of the seccomp based process handling.

Co-authored-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-6-benjamin@sipsolutions.net


Signed-off-by: Johannes Berg <johannes.berg@intel.com>
parent 8420e08f
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -16,3 +16,5 @@ DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC);

DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES);

DEFINE(UM_SECCOMP_ARCH_NATIVE, SECCOMP_ARCH_NATIVE);
+1 −1
Original line number Diff line number Diff line
@@ -286,7 +286,7 @@ int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len);

/* skas/process.c */
extern int is_skas_winch(int pid, int fd, void *data);
extern int start_userspace(unsigned long stub_stack);
extern int start_userspace(struct mm_id *mm_id);
extern void userspace(struct uml_pt_regs *regs);
extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void));
extern void switch_threads(jmp_buf *me, jmp_buf *you);
+4 −1
Original line number Diff line number Diff line
@@ -17,6 +17,8 @@
#define FUTEX_IN_KERN 1

struct stub_init_data {
	int seccomp;

	unsigned long stub_start;

	int stub_code_fd;
@@ -24,7 +26,8 @@ struct stub_init_data {
	int stub_data_fd;
	unsigned long stub_data_offset;

	unsigned long segv_handler;
	unsigned long signal_handler;
	unsigned long signal_restorer;
};

#define STUB_NEXT_SYSCALL(s) \
+2 −4
Original line number Diff line number Diff line
@@ -40,11 +40,9 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
		list_add(&mm->context.list, &mm_list);
	}

	new_id->pid = start_userspace(stack);
	if (new_id->pid < 0) {
		ret = new_id->pid;
	ret = start_userspace(new_id);
	if (ret < 0)
		goto out_free;
	}

	/* Ensure the new MM is clean and nothing unwanted is mapped */
	unmap(new_id, 0, STUB_START);
+130 −11
Original line number Diff line number Diff line
@@ -3,6 +3,9 @@
#include <asm/unistd.h>
#include <sysdep/stub.h>
#include <stub-data.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <generated/asm-offsets.h>

void _start(void);

@@ -25,8 +28,6 @@ noinline static void real_init(void)
	} sa = {
		/* Need to set SA_RESTORER (but the handler never returns) */
		.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000,
		/* no need to mask any signals */
		.sa_mask = 0,
	};

	/* set a nice name */
@@ -35,6 +36,9 @@ noinline static void real_init(void)
	/* Make sure this process dies if the kernel dies */
	stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL);

	/* Needed in SECCOMP mode (and safe to do anyway) */
	stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	/* read information from STDIN and close it */
	res = stub_syscall3(__NR_read, 0,
			    (unsigned long)&init_data, sizeof(init_data));
@@ -63,18 +67,133 @@ noinline static void real_init(void)
	stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
	stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);

	/* register SIGSEGV handler */
	sa.sa_handler_ = (void *) init_data.segv_handler;
	res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0,
			    sizeof(sa.sa_mask));
	/* register signal handlers */
	sa.sa_handler_ = (void *) init_data.signal_handler;
	sa.sa_restorer = (void *) init_data.signal_restorer;
	if (!init_data.seccomp) {
		/* In ptrace mode, the SIGSEGV handler never returns */
		sa.sa_mask = 0;

		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
		if (res != 0)
			stub_syscall1(__NR_exit, 13);
	} else {
		/* SECCOMP mode uses rt_sigreturn, need to mask all signals */
		sa.sa_mask = ~0ULL;

		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
		if (res != 0)
			stub_syscall1(__NR_exit, 14);

		res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
		if (res != 0)
			stub_syscall1(__NR_exit, 15);

		res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
		if (res != 0)
			stub_syscall1(__NR_exit, 16);

		res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
		if (res != 0)
			stub_syscall1(__NR_exit, 17);

		res = stub_syscall4(__NR_rt_sigaction, SIGILL,
				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
		if (res != 0)
			stub_syscall1(__NR_exit, 18);

		res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
		if (res != 0)
			stub_syscall1(__NR_exit, 19);
	}

	/*
	 * If in seccomp mode, install the SECCOMP filter and trigger a syscall.
	 * Otherwise set PTRACE_TRACEME and do a SIGSTOP.
	 */
	if (init_data.seccomp) {
		struct sock_filter filter[] = {
#if __BITS_PER_LONG > 32
			/* [0] Load upper 32bit of instruction pointer from seccomp_data */
			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
				 (offsetof(struct seccomp_data, instruction_pointer) + 4)),

			/* [1] Jump forward 3 instructions if the upper address is not identical */
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3),
#endif
			/* [2] Load lower 32bit of instruction pointer from seccomp_data */
			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
				 (offsetof(struct seccomp_data, instruction_pointer))),

			/* [3] Mask out lower bits */
			BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000),

			/* [4] Jump to [6] if the lower bits are on the expected (stub) page, else fall through to the trap at [5] */
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0),

			/* [5] Trap call, allow */
			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),

			/* [6,7] Check architecture */
			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
				 offsetof(struct seccomp_data, arch)),
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
				 UM_SECCOMP_ARCH_NATIVE, 1, 0),

			/* [8] Kill (for architecture check) */
			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),

			/* [9] Load syscall number */
			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
				 offsetof(struct seccomp_data, nr)),

			/* [10-14] Check against permitted syscalls */
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
				 5, 0),
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
				 4, 0),
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
				 3, 0),
#ifdef __i386__
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area,
				 2, 0),
#else
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl,
				 2, 0),
#endif
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
				 1, 0),

			/* [15] Not one of the permitted syscalls */
			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),

			/* [16] Permitted call for the stub */
			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
		};
		struct sock_fprog prog = {
			.len = sizeof(filter) / sizeof(filter[0]),
			.filter = filter,
		};

		if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
				  SECCOMP_FILTER_FLAG_TSYNC,
				  (unsigned long)&prog) != 0)
			stub_syscall1(__NR_exit, 20);

		/* Fall through, the exit syscall will cause SIGSYS */
	} else {
		stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);

		stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
	}

	stub_syscall1(__NR_exit, 14);
	stub_syscall1(__NR_exit, 30);

	__builtin_unreachable();
}
Loading