Commit 8e02b1b7 authored by Linus Walleij, committed by Andrew Morton
Browse files

fork: define a local GFP_VMAP_STACK

The current allocation of VMAP stack memory is using (THREADINFO_GFP &
~__GFP_ACCOUNT) which is a complicated way of saying (GFP_KERNEL |
__GFP_ZERO):

<linux/thread_info.h>:
#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
<linux/gfp_types.h>:
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)

This is an unfortunate side-effect of independent changes blurring the
picture:

commit 19809c2d changed (THREADINFO_GFP |
__GFP_HIGHMEM) to just THREADINFO_GFP since highmem became implicit.

commit 9b6f7e16 then added stack caching
and rewrote the allocation to (THREADINFO_GFP & ~__GFP_ACCOUNT) as cached
stacks need to be accounted separately.  However, that code, when it
eventually accounts the memory, does this:

  ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0)

so the memory is charged as a GFP_KERNEL allocation.

Define a unique GFP_VMAP_STACK to use
GFP_KERNEL | __GFP_ZERO and move the comment there.

Link: https://lkml.kernel.org/r/20250509-gfp-stack-v1-1-82f6f7efc210@linaro.org


Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reported-by: Mateusz Guzik <mjguzik@gmail.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent d82893c5
Loading
Loading
Loading
Loading
+45 −43
Original line number Diff line number Diff line
@@ -185,6 +185,12 @@ static inline void free_task_struct(struct task_struct *tsk)
	kmem_cache_free(task_struct_cachep, tsk);
}

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#  ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
@@ -198,14 +204,14 @@ struct vm_stack {
	struct vm_struct *stack_vm_area;
};

static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
{
	unsigned int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *tmp = NULL;

		if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
		if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm))
			return true;
	}
	return false;
@@ -214,12 +220,11 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
static void thread_stack_free_rcu(struct rcu_head *rh)
{
	struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
	struct vm_struct *vm_area = vm_stack->stack_vm_area;

	if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
		return;

	vfree(vm_area->addr);
	vfree(vm_stack);
}

static void thread_stack_delayed_free(struct task_struct *tsk)
@@ -232,32 +237,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk)

static int free_vm_stack_cache(unsigned int cpu)
{
	struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu);
	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *vm_area = cached_vm_stack_areas[i];
		struct vm_struct *vm_stack = cached_vm_stacks[i];

		if (!vm_area)
		if (!vm_stack)
			continue;

		vfree(vm_area->addr);
		cached_vm_stack_areas[i] = NULL;
		vfree(vm_stack->addr);
		cached_vm_stacks[i] = NULL;
	}

	return 0;
}

static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
	int i;
	int ret;
	int nr_charged = 0;

	BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);
	BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

	for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
		ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0);
		ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
		if (ret)
			goto err;
		nr_charged++;
@@ -265,35 +270,38 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
	return 0;
err:
	for (i = 0; i < nr_charged; i++)
		memcg_kmem_uncharge_page(vm_area->pages[i], 0);
		memcg_kmem_uncharge_page(vm->pages[i], 0);
	return ret;
}

static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
	struct vm_struct *vm_area;
	struct vm_struct *vm;
	void *stack;
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		vm_area = this_cpu_xchg(cached_stacks[i], NULL);
		if (!vm_area)
			continue;
		struct vm_struct *s;

		if (memcg_charge_kernel_stack(vm_area)) {
			vfree(vm_area->addr);
			return -ENOMEM;
		}
		s = this_cpu_xchg(cached_stacks[i], NULL);

		if (!s)
			continue;

		/* Reset stack metadata. */
		kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
		kasan_unpoison_range(s->addr, THREAD_SIZE);

		stack = kasan_reset_tag(vm_area->addr);
		stack = kasan_reset_tag(s->addr);

		/* Clear stale pointers from reused stack. */
		memset(stack, 0, THREAD_SIZE);

		tsk->stack_vm_area = vm_area;
		if (memcg_charge_kernel_stack(s)) {
			vfree(s->addr);
			return -ENOMEM;
		}

		tsk->stack_vm_area = s;
		tsk->stack = stack;
		return 0;
	}
@@ -309,8 +317,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
	if (!stack)
		return -ENOMEM;

	vm_area = find_vm_area(stack);
	if (memcg_charge_kernel_stack(vm_area)) {
	vm = find_vm_area(stack);
	if (memcg_charge_kernel_stack(vm)) {
		vfree(stack);
		return -ENOMEM;
	}
@@ -319,7 +327,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
	 * free_thread_stack() can be called in interrupt context,
	 * so cache the vm_struct.
	 */
	tsk->stack_vm_area = vm_area;
	tsk->stack_vm_area = vm;
	stack = kasan_reset_tag(stack);
	tsk->stack = stack;
	return 0;
@@ -336,12 +344,6 @@ static void free_thread_stack(struct task_struct *tsk)

#  else /* !CONFIG_VMAP_STACK */

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
#if THREAD_SIZE >= PAGE_SIZE

static void thread_stack_free_rcu(struct rcu_head *rh)
{
	__free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
@@ -372,7 +374,8 @@ static void free_thread_stack(struct task_struct *tsk)
	tsk->stack = NULL;
}

#else /* !(THREAD_SIZE >= PAGE_SIZE) */
#  endif /* CONFIG_VMAP_STACK */
# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */

static struct kmem_cache *thread_stack_cache;

@@ -411,8 +414,7 @@ void thread_stack_cache_init(void)
	BUG_ON(thread_stack_cache == NULL);
}

#endif /* THREAD_SIZE >= PAGE_SIZE */
#endif /* CONFIG_VMAP_STACK */
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -515,11 +517,11 @@ void vm_area_free(struct vm_area_struct *vma)
static void account_kernel_stack(struct task_struct *tsk, int account)
{
	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
		struct vm_struct *vm_area = task_stack_vm_area(tsk);
		struct vm_struct *vm = task_stack_vm_area(tsk);
		int i;

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
			mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
			mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
					      account * (PAGE_SIZE / 1024));
	} else {
		void *stack = task_stack_page(tsk);
@@ -535,12 +537,12 @@ void exit_task_stack_account(struct task_struct *tsk)
	account_kernel_stack(tsk, -1);

	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
		struct vm_struct *vm_area;
		struct vm_struct *vm;
		int i;

		vm_area = task_stack_vm_area(tsk);
		vm = task_stack_vm_area(tsk);
		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
			memcg_kmem_uncharge_page(vm_area->pages[i], 0);
			memcg_kmem_uncharge_page(vm->pages[i], 0);
	}
}