Commit 8be7258a authored by Jeff Xu's avatar Jeff Xu Committed by Andrew Morton
Browse files

mseal: add mseal syscall

The new mseal() is a syscall on 64-bit CPUs, with the following signature:

int mseal(void *addr, size_t len, unsigned long flags)
addr/len: memory range.
flags: reserved.

mseal() blocks the following operations for the given memory range.

1> Unmapping, moving to another location, and shrinking the size,
   via munmap() and mremap(). These can leave an empty space, which can
   then be replaced with a VMA with a new set of attributes.

2> Moving or expanding a different VMA into the current location,
   via mremap().

3> Modifying a VMA via mmap(MAP_FIXED).

4> Size expansion, via mremap(), does not appear to pose any specific
   risks to sealed VMAs. It is included anyway because the use case is
   unclear. In any case, users can rely on merging to expand a sealed VMA.

5> mprotect() and pkey_mprotect().

6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for anonymous
   memory, when users don't have write permission to the memory. Those
   behaviors can alter region contents by discarding pages, effectively a
   memset(0) for anonymous memory.

The following input received during the RFC has been incorporated into this patch:

Jann Horn: raising awareness and providing valuable insights on the
destructive madvise operations.
Linus Torvalds: assisting in defining system call signature and scope.
Liam R. Howlett: perf optimization.
Theo de Raadt: sharing the experiences and insight gained from
  implementing mimmutable() in OpenBSD.

Finally, the idea that inspired this patch comes from Stephen Röttger's
work in Chrome V8 CFI.

[jeffxu@chromium.org: add branch prediction hint, per Pedro]
  Link: https://lkml.kernel.org/r/20240423192825.1273679-2-jeffxu@chromium.org
Link: https://lkml.kernel.org/r/20240415163527.626541-3-jeffxu@chromium.org


Signed-off-by: Jeff Xu <jeffxu@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jeff Xu <jeffxu@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jorge Lucangeli Obes <jorgelo@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Stephen Röttger <sroettger@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Amer Al Shanawany <amer.shanawany@gmail.com>
Cc: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent ff388fe5
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -821,6 +821,7 @@ asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
			unsigned long prot, unsigned long pgoff,
			unsigned long flags);
asmlinkage long sys_mseal(unsigned long start, size_t len, unsigned long flags);
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
				unsigned long mode,
				const unsigned long __user *nmask,
+4 −0
Original line number Diff line number Diff line
@@ -43,6 +43,10 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH
mmu-$(CONFIG_MMU)	+= process_vm_access.o
endif

ifdef CONFIG_64BIT
mmu-$(CONFIG_MMU)	+= mseal.o
endif

obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
			   maccess.o page-writeback.o folio-compat.o \
			   readahead.o swap.o truncate.o vmscan.o shrinker.o \
+37 −0
Original line number Diff line number Diff line
@@ -1435,6 +1435,43 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority);

#ifdef CONFIG_64BIT
/* VM is sealed, in vm_flags */
#define VM_SEALED	_BITUL(63)
#endif

#ifdef CONFIG_64BIT
/*
 * Validate the flags argument passed to mseal().
 *
 * No flag bits are accepted: any non-zero value is rejected with
 * -EINVAL, and zero is accepted with a return value of 0.
 */
static inline int can_do_mseal(unsigned long flags)
{
	return flags ? -EINVAL : 0;
}

/*
 * Sealing checks for the address range [start, end) of @mm.
 *
 * Callers treat a false return as "the range must not be modified" and
 * fail the operation with -EPERM. The call sites note that both helpers
 * assume the lock on the MM has already been acquired.
 *
 * can_modify_mm_madv() additionally takes the madvise() @behavior being
 * requested.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start,
		unsigned long end);
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
		unsigned long end, int behavior);
#else
/*
 * Variant used when CONFIG_64BIT is not set: every request is refused
 * with -EPERM, whatever @flags contains.
 */
static inline int can_do_mseal(unsigned long flags)
{
	const int err = -EPERM;

	return err;
}

/*
 * Variant used when CONFIG_64BIT is not set: VM_SEALED is not defined
 * in that configuration, so every range is reported as modifiable.
 */
static inline bool can_modify_mm(struct mm_struct *mm,
				 unsigned long start, unsigned long end)
{
	const bool modifiable = true;

	return modifiable;
}

/*
 * Variant used when CONFIG_64BIT is not set: VM_SEALED is not defined
 * in that configuration, so every madvise() behavior is reported as
 * permitted on every range.
 */
static inline bool can_modify_mm_madv(struct mm_struct *mm,
				      unsigned long start, unsigned long end,
				      int behavior)
{
	const bool modifiable = true;

	return modifiable;
}
#endif

#ifdef CONFIG_SHRINKER_DEBUG
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
			struct shrinker *shrinker, const char *fmt, va_list ap)
+12 −0
Original line number Diff line number Diff line
@@ -1401,6 +1401,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 *  -EPERM  - memory is sealed.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
@@ -1444,6 +1445,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
	start = untagged_addr_remote(mm, start);
	end = start + len;

	/*
	 * Check if the address range is sealed for do_madvise().
	 * can_modify_mm_madv assumes we have acquired the lock on MM.
	 */
	if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
		error = -EPERM;
		goto out;
	}

	blk_start_plug(&plug);
	switch (behavior) {
	case MADV_POPULATE_READ:
@@ -1456,6 +1466,8 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
		break;
	}
	blk_finish_plug(&plug);

out:
	if (write)
		mmap_write_unlock(mm);
	else
+30 −1
Original line number Diff line number Diff line
@@ -1255,6 +1255,16 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	/*
	 * addr is returned from get_unmapped_area,
	 * There are two cases:
	 * 1> MAP_FIXED == false
	 *	unallocated memory, no need to check sealing.
	 * 2> MAP_FIXED == true
	 *	sealing is checked inside mmap_region when
	 *	do_vmi_munmap is called.
	 */

	if (prot == PROT_EXEC) {
		pkey = execute_only_pkey(mm);
		if (pkey < 0)
@@ -2727,6 +2737,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
	if (end == start)
		return -EINVAL;

	/*
	 * Check if memory is sealed before arch_unmap.
	 * Prevent unmapping a sealed VMA.
	 * can_modify_mm assumes we have acquired the lock on MM.
	 */
	if (unlikely(!can_modify_mm(mm, start, end)))
		return -EPERM;

	 /* arch_unmap() might do unmaps itself.  */
	arch_unmap(mm, start, end);

@@ -2789,7 +2807,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
	}

	/* Unmap any existing mapping in the area */
	if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
	error = do_vmi_munmap(&vmi, mm, addr, len, uf, false);
	if (error == -EPERM)
		return error;
	else if (error)
		return -ENOMEM;

	/*
@@ -3139,6 +3160,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * Check if memory is sealed before arch_unmap.
	 * Prevent unmapping a sealed VMA.
	 * can_modify_mm assumes we have acquired the lock on MM.
	 */
	if (unlikely(!can_modify_mm(mm, start, end)))
		return -EPERM;

	arch_unmap(mm, start, end);
	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
}
Loading