Commit 32e8eaf2 authored by Benjamin Berg, committed by Johannes Berg
Browse files

um: use execveat to create userspace MMs



Using clone will not undo features that have been enabled by libc. An
example of this already happening is rseq, which could cause the kernel
to read/write memory of the userspace process. In the future the
standard library might also use mseal by default to protect itself,
which would also thwart our attempts at unmapping everything.

Solve all this by taking a step back and doing an execve into a tiny
static binary that sets up the minimal environment required for the
stub without using any standard library. That way we have a clean
execution environment that is fully under the control of UML.

Note that this changes things a bit, as the FDs are no longer shared
with the kernel. Instead, we explicitly share the FDs for the physical
memory and all existing iomem regions. Doing this is fine, as iomem
regions cannot be added at runtime.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20240919124511.282088-3-benjamin@sipsolutions.net


[use pipe() instead of pipe2(), remove unneeded close() calls]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
parent cbb8e65e
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -61,7 +61,8 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \
	$(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap	\
	-Dlongjmp=kernel_longjmp -Dsetjmp=kernel_setjmp \
	-Din6addr_loopback=kernel_in6addr_loopback \
	-Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr
	-Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr \
	-D__close_range=kernel__close_range

KBUILD_RUSTFLAGS += -Crelocation-model=pie

+11 −0
Original line number Diff line number Diff line
@@ -12,6 +12,17 @@
#include <as-layout.h>
#include <sysdep/tls.h>

/*
 * Start-up parameters for the stub executable.
 *
 * The stub executable reads exactly sizeof(struct stub_init_data) bytes
 * from file descriptor 0 and exits if it gets less, so the field order
 * and types are a binary contract between the writer and the stub.
 */
struct stub_init_data {
	/*
	 * Address at which the stub code page is mapped (MAP_FIXED); the
	 * stub data pages are mapped directly after it, at
	 * stub_start + UM_KERN_PAGE_SIZE.
	 */
	unsigned long stub_start;

	/* fd and file offset passed to mmap() for the stub code page */
	int stub_code_fd;
	unsigned long stub_code_offset;
	/* fd and file offset passed to mmap() for the stub data pages */
	int stub_data_fd;
	unsigned long stub_data_offset;

	/* Address installed as the SIGSEGV handler via rt_sigaction */
	unsigned long segv_handler;
};

#define STUB_NEXT_SYSCALL(s) \
	((struct stub_syscall *) (((unsigned long) s) + (s)->cmd_len))

+2 −0
Original line number Diff line number Diff line
stub_exe
stub_exe.dbg
+31 −2
Original line number Diff line number Diff line
@@ -3,14 +3,43 @@
# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
#

obj-y := stub.o mmu.o process.o syscall.o uaccess.o
obj-y := stub.o mmu.o process.o syscall.o uaccess.o \
	 stub_exe_embed.o

# Stub executable
#
# Build chain:
#   stub_exe.o -> stub_exe.dbg (linked) -> stub_exe (objcopy -S)
#              -> stub_exe_embed.o (listed in obj-y)

# Objects linked into the stub executable
stub_exe_objs-y := stub_exe.o

# Same list, prefixed with the output directory
stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F)

# Object file containing the ELF executable
# (must be rebuilt whenever the stub_exe binary changes)
$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe

# Link step; the command is cmd_stub_exe below
$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE
	$(call if_changed,stub_exe)

# stub_exe is stub_exe.dbg run through objcopy with -S
$(obj)/stub_exe: OBJCOPYFLAGS := -S
$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE
	$(call if_changed,objcopy)

# Link with the compiler driver and without the standard libraries; only
# the .o prerequisites are passed on (this drops FORCE).
quiet_cmd_stub_exe = STUB_EXE $@
      cmd_stub_exe = $(CC) -nostdlib -o $@ \
			   $(KBUILD_CFLAGS) $(STUB_EXE_LDFLAGS) \
			   $(filter %.o,$^)

# Defined after its use above; that works because cmd_stub_exe is a
# recursively expanded variable and is only evaluated when the rule runs.
STUB_EXE_LDFLAGS = -n -static

# Register the outputs with kbuild's if_changed tracking
targets += stub_exe.dbg stub_exe $(stub_exe_objs-y)

# end

# stub.o is in the stub, so it can't be built with profiling
# GCC hardened also auto-enables -fpic, but we need %ebx so it can't work ->
# disable it

CFLAGS_stub.o := $(CFLAGS_NO_HARDENING)
UNPROFILE_OBJS := stub.o
CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING)
UNPROFILE_OBJS := stub.o stub_exe.o
KCOV_INSTRUMENT := n

include $(srctree)/arch/um/scripts/Makefile.rules
+88 −0
Original line number Diff line number Diff line
#include <sys/ptrace.h>
#include <sys/prctl.h>
#include <asm/unistd.h>
#include <sysdep/stub.h>
#include <stub-data.h>

void _start(void);

noinline static void real_init(void)
{
	struct stub_init_data init_data;
	unsigned long res;
	struct {
		void  *ss_sp;
		int    ss_flags;
		size_t ss_size;
	} stack = {
		.ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
	};
	struct {
		void *sa_handler_;
		unsigned long sa_flags;
		void *sa_restorer;
		unsigned long long sa_mask;
	} sa = {
		/* Need to set SA_RESTORER (but the handler never returns) */
		.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000,
		/* no need to mask any signals */
		.sa_mask = 0,
	};

	/* set a nice name */
	stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace");

	/* read information from STDIN and close it */
	res = stub_syscall3(__NR_read, 0,
			    (unsigned long)&init_data, sizeof(init_data));
	if (res != sizeof(init_data))
		stub_syscall1(__NR_exit, 10);

	stub_syscall1(__NR_close, 0);

	/* map stub code + data */
	res = stub_syscall6(STUB_MMAP_NR,
			    init_data.stub_start, UM_KERN_PAGE_SIZE,
			    PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED,
			    init_data.stub_code_fd, init_data.stub_code_offset);
	if (res != init_data.stub_start)
		stub_syscall1(__NR_exit, 11);

	res = stub_syscall6(STUB_MMAP_NR,
			    init_data.stub_start + UM_KERN_PAGE_SIZE,
			    STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
			    PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED,
			    init_data.stub_data_fd, init_data.stub_data_offset);
	if (res != init_data.stub_start + UM_KERN_PAGE_SIZE)
		stub_syscall1(__NR_exit, 12);

	/* setup signal stack inside stub data */
	stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
	stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);

	/* register SIGSEGV handler */
	sa.sa_handler_ = (void *) init_data.segv_handler;
	res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0,
			    sizeof(sa.sa_mask));
	if (res != 0)
		stub_syscall1(__NR_exit, 13);

	stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);

	stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);

	stub_syscall1(__NR_exit, 14);

	__builtin_unreachable();
}

/*
 * Entry point of the stub executable.
 *
 * Reserves a block of stack with alloca() before handing over to
 * real_init(), which does not return.
 *
 * _start is entered directly by the kernel after exec, not through a
 * call instruction, so no return address has been pushed and the stack
 * pointer does not have the alignment the compiler assumes on entry to
 * an ordinary C function. Without correction, aligned SSE accesses that
 * the compiler may emit here or in real_init() can fault.
 * force_align_arg_pointer (an x86 function attribute) makes the prologue
 * realign the stack itself.
 */
__attribute__((force_align_arg_pointer))
void _start(void)
{
	char *alloc;

	/* Make enough space for the stub (including space for alignment) */
	alloc = __builtin_alloca((1 + 2 * STUB_DATA_PAGES - 1) * UM_KERN_PAGE_SIZE);
	/* Empty asm that "uses" alloc so the reservation is not optimized away */
	asm volatile("" : "+r,m"(alloc) : : "memory");

	real_init();
}
Loading