Commit dd7a6246 authored by Lorenzo Stoakes's avatar Lorenzo Stoakes Committed by Andrew Morton
Browse files

mm: abstract initial stack setup to mm subsystem

There are peculiarities within the kernel where what is very clearly mm
code is performed elsewhere arbitrarily.

This violates separation of concerns and makes it harder to refactor code
to make changes to how fundamental initialisation and operation of mm
logic is performed.

One such case is the creation of the VMA containing the initial stack upon
execve()'ing a new process.  This is currently performed in
__bprm_mm_init() in fs/exec.c.

Abstract this operation to create_init_stack_vma().  This allows us to
limit use of vma allocation and free code to fork and mm only.

We previously did the same for the step at which we relocate the initial
stack VMA downwards via relocate_vma_down(), now we move the initial VMA
establishment too.

Take the opportunity to also move insert_vm_struct() to mm/vma.c as it's
no longer needed anywhere outside of mm.

Link: https://lkml.kernel.org/r/118c950ef7a8dd19ab20a23a68c3603751acd30e.1745853549.git.lorenzo.stoakes@oracle.com


Signed-off-by: default avatarLorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: default avatarDavid Hildenbrand <david@redhat.com>
Reviewed-by: default avatarSuren Baghdasaryan <surenb@google.com>
Reviewed-by: default avatarLiam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: default avatarPedro Falcato <pfalcato@suse.de>
Reviewed-by: default avatarKees Cook <kees@kernel.org>
Reviewed-by: default avatarVlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent 6c36ac1e
Loading
Loading
Loading
Loading
+5 −61
Original line number Diff line number Diff line
@@ -245,60 +245,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}

static int __bprm_mm_init(struct linux_binprm *bprm)
{
	int err;
	struct vm_area_struct *vma = NULL;
	struct mm_struct *mm = bprm->mm;

	bprm->vma = vma = vm_area_alloc(mm);
	if (!vma)
		return -ENOMEM;
	vma_set_anonymous(vma);

	if (mmap_write_lock_killable(mm)) {
		err = -EINTR;
		goto err_free;
	}

	/*
	 * Need to be called with mmap write lock
	 * held, to avoid race with ksmd.
	 */
	err = ksm_execve(mm);
	if (err)
		goto err_ksm;

	/*
	 * Place the stack at the largest stack address the architecture
	 * supports. Later, we'll move this to an appropriate place. We don't
	 * use STACK_TOP because that can depend on attributes which aren't
	 * configured yet.
	 */
	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
	vma->vm_end = STACK_TOP_MAX;
	vma->vm_start = vma->vm_end - PAGE_SIZE;
	vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

	err = insert_vm_struct(mm, vma);
	if (err)
		goto err;

	mm->stack_vm = mm->total_vm = 1;
	mmap_write_unlock(mm);
	bprm->p = vma->vm_end - sizeof(void *);
	return 0;
err:
	ksm_exit(mm);
err_ksm:
	mmap_write_unlock(mm);
err_free:
	bprm->vma = NULL;
	vm_area_free(vma);
	return err;
}

static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return len <= MAX_ARG_STRLEN;
@@ -351,12 +297,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
{
}

static int __bprm_mm_init(struct linux_binprm *bprm)
{
	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
	return 0;
}

static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return len <= bprm->p;
@@ -385,9 +325,13 @@ static int bprm_mm_init(struct linux_binprm *bprm)
	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
	task_unlock(current->group_leader);

	err = __bprm_mm_init(bprm);
#ifndef CONFIG_MMU
	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
#else
	err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
	if (err)
		goto err;
#endif

	return 0;

+0 −42
Original line number Diff line number Diff line
@@ -1321,48 +1321,6 @@ void exit_mmap(struct mm_struct *mm)
	vm_unacct_memory(nr_accounted);
}

/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree.  If vm_file is non-NULL
 * then i_mmap_rwsem is taken here.
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
	unsigned long charged = vma_pages(vma);


	if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
		return -ENOMEM;

	if ((vma->vm_flags & VM_ACCOUNT) &&
	     security_vm_enough_memory_mm(mm, charged))
		return -ENOMEM;

	/*
	 * The vm_pgoff of a purely anonymous vma should be irrelevant
	 * until its first write fault, when page's anon_vma and index
	 * are set.  But now set the vm_pgoff it will almost certainly
	 * end up with (unless mremap moves it elsewhere before that
	 * first wfault), so /proc/pid/maps tells a consistent story.
	 *
	 * By setting it to reflect the virtual start address of the
	 * vma, merges and splits can happen in a seamless way, just
	 * using the existing file pgoff checks and manipulations.
	 * Similarly in do_mmap and in do_brk_flags.
	 */
	if (vma_is_anonymous(vma)) {
		BUG_ON(vma->anon_vma);
		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
	}

	if (vma_link(mm, vma)) {
		if (vma->vm_flags & VM_ACCOUNT)
			vm_unacct_memory(charged);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Return true if the calling process may expand its vm space by the passed
 * number of pages
+43 −0
Original line number Diff line number Diff line
@@ -3052,3 +3052,46 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock)
	userfaultfd_unmap_complete(mm, &uf);
	return ret;
}


/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree.  If vm_file is non-NULL
 * then i_mmap_rwsem is taken here.
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
	unsigned long charged = vma_pages(vma);


	if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
		return -ENOMEM;

	if ((vma->vm_flags & VM_ACCOUNT) &&
	     security_vm_enough_memory_mm(mm, charged))
		return -ENOMEM;

	/*
	 * The vm_pgoff of a purely anonymous vma should be irrelevant
	 * until its first write fault, when page's anon_vma and index
	 * are set.  But now set the vm_pgoff it will almost certainly
	 * end up with (unless mremap moves it elsewhere before that
	 * first wfault), so /proc/pid/maps tells a consistent story.
	 *
	 * By setting it to reflect the virtual start address of the
	 * vma, merges and splits can happen in a seamless way, just
	 * using the existing file pgoff checks and manipulations.
	 * Similarly in do_mmap and in do_brk_flags.
	 */
	if (vma_is_anonymous(vma)) {
		BUG_ON(vma->anon_vma);
		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
	}

	if (vma_link(mm, vma)) {
		if (vma->vm_flags & VM_ACCOUNT)
			vm_unacct_memory(charged);
		return -ENOMEM;
	}

	return 0;
}
+4 −0
Original line number Diff line number Diff line
@@ -548,8 +548,12 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);

int __vm_munmap(unsigned long start, size_t len, bool unlock);

int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);

/* vma_exec.c */
#ifdef CONFIG_MMU
int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
			  unsigned long *top_mem_p);
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
#endif

+69 −0
Original line number Diff line number Diff line
@@ -90,3 +90,72 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
	/* Shrink the vma to just the new range */
	return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
}

/*
 * Establish the stack VMA in an execve'd process, located temporarily at the
 * maximum stack address provided by the architecture.
 *
 * We later relocate this downwards in relocate_vma_down().
 *
 * This function is almost certainly NOT what you want for anything other than
 * early executable initialisation.
 *
 * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
 * maximum addressable location in the stack (that is capable of storing a
 * system word of data).
 */
int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
			  unsigned long *top_mem_p)
{
	int err;
	struct vm_area_struct *vma = vm_area_alloc(mm);

	if (!vma)
		return -ENOMEM;

	vma_set_anonymous(vma);

	if (mmap_write_lock_killable(mm)) {
		err = -EINTR;
		goto err_free;
	}

	/*
	 * Need to be called with mmap write lock
	 * held, to avoid race with ksmd.
	 */
	err = ksm_execve(mm);
	if (err)
		goto err_ksm;

	/*
	 * Place the stack at the largest stack address the architecture
	 * supports. Later, we'll move this to an appropriate place. We don't
	 * use STACK_TOP because that can depend on attributes which aren't
	 * configured yet.
	 */
	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
	vma->vm_end = STACK_TOP_MAX;
	vma->vm_start = vma->vm_end - PAGE_SIZE;
	vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

	err = insert_vm_struct(mm, vma);
	if (err)
		goto err;

	mm->stack_vm = mm->total_vm = 1;
	mmap_write_unlock(mm);
	*vmap = vma;
	*top_mem_p = vma->vm_end - sizeof(void *);
	return 0;

err:
	ksm_exit(mm);
err_ksm:
	mmap_write_unlock(mm);
err_free:
	*vmap = NULL;
	vm_area_free(vma);
	return err;
}
Loading