Commit 073d5f15 authored by Vlastimil Babka's avatar Vlastimil Babka
Browse files

slab: simplify kmalloc_nolock()



The kmalloc_nolock() implementation has several complications and
restrictions due to SLUB's cpu slab locking, lockless fastpath and
PREEMPT_RT differences. With cpu slab usage removed, we can simplify
things:

- relax the PREEMPT_RT context checks as they were before commit
  99a3e3a1 ("slab: fix kmalloc_nolock() context check for
  PREEMPT_RT") and also reference the explanation comment in the page
  allocator

- the local_lock_cpu_slab() macros became unused, remove them

- we no longer need to set up lockdep classes on PREEMPT_RT

- we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
  since there's no lockless cpu freelist manipulation anymore

- __slab_alloc_node() can be called from kmalloc_nolock_noprof()
  unconditionally. It can also no longer return EBUSY. But trylock
  failures can still happen so retry with the larger bucket if the
  allocation fails for any reason.

Note that we still need __CMPXCHG_DOUBLE, because while it was removed
we don't use cmpxchg16b on cpu freelist anymore, we still use it on
slab freelist, and the alternative is slab_lock() which can be
interrupted by a nmi. Clarify the comment to mention it specifically.

Acked-by: default avatarAlexei Starovoitov <ast@kernel.org>
Reviewed-by: default avatarHao Li <hao.li@linux.dev>
Reviewed-by: default avatarSuren Baghdasaryan <surenb@google.com>
Reviewed-by: default avatarHarry Yoo <harry.yoo@oracle.com>
Signed-off-by: default avatarVlastimil Babka <vbabka@suse.cz>
parent ab2f752a
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -190,7 +190,6 @@ struct kmem_cache_order_objects {
 */
struct kmem_cache {
	struct kmem_cache_cpu __percpu *cpu_slab;
	struct lock_class_key lock_key;
	struct slub_percpu_sheaves __percpu *cpu_sheaves;
	/* Used for retrieving partial slabs, etc. */
	slab_flags_t flags;
+29 −115
Original line number Diff line number Diff line
@@ -3690,29 +3690,12 @@ static inline unsigned int init_tid(int cpu)

static void init_kmem_cache_cpus(struct kmem_cache *s)
{
#ifdef CONFIG_PREEMPT_RT
	/*
	 * Register lockdep key for non-boot kmem caches to avoid
	 * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
	 */
	bool finegrain_lockdep = !init_section_contains(s, 1);
#else
	/*
	 * Don't bother with different lockdep classes for each
	 * kmem_cache, since we only use local_trylock_irqsave().
	 */
	bool finegrain_lockdep = false;
#endif
	int cpu;
	struct kmem_cache_cpu *c;

	if (finegrain_lockdep)
		lockdep_register_key(&s->lock_key);
	for_each_possible_cpu(cpu) {
		c = per_cpu_ptr(s->cpu_slab, cpu);
		local_trylock_init(&c->lock);
		if (finegrain_lockdep)
			lockdep_set_class(&c->lock, &s->lock_key);
		c->tid = init_tid(cpu);
	}
}
@@ -3799,47 +3782,6 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
	}
}

/*
 * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
 * can be acquired without a deadlock before invoking the function.
 *
 * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
 * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
 * and kmalloc() is not used in an unsupported context.
 *
 * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
 * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
 * lockdep_assert() will catch a bug in case:
 * #1
 * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
 * or
 * #2
 * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
 *
 * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
 * disabled context. The lock will always be acquired and if needed it
 * block and sleep until the lock is available.
 * #1 is possible in !PREEMPT_RT only.
 * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
 * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
 *    tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
 *
 * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
 */
#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
#define local_lock_cpu_slab(s, flags)	\
	local_lock_irqsave(&(s)->cpu_slab->lock, flags)
#else
#define local_lock_cpu_slab(s, flags)					       \
	do {								       \
		bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
		lockdep_assert(__l);					       \
	} while (0)
#endif

#define local_unlock_cpu_slab(s, flags)	\
	local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)

static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
	unsigned long flags;
@@ -4405,20 +4347,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
	return object;
}

/*
 * We disallow kprobes in ___slab_alloc() to prevent reentrance
 *
 * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
 * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
 * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
 * manipulating c->freelist without lock.
 *
 * This does not prevent kprobe in functions called from ___slab_alloc() such as
 * local_lock_irqsave() itself, and that is fine, we only need to protect the
 * c->freelist manipulation in ___slab_alloc() itself.
 */
NOKPROBE_SYMBOL(___slab_alloc);

static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
@@ -5259,13 +5187,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
	if (unlikely(!size))
		return ZERO_SIZE_PTR;

	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
	/*
		 * kmalloc_nolock() in PREEMPT_RT is not supported from
		 * non-preemptible context because local_lock becomes a
		 * sleeping lock on RT.
	 * See the comment for the same check in
	 * alloc_frozen_pages_nolock_noprof()
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
		return NULL;

retry:
	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
		return NULL;
@@ -5274,10 +5202,11 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
	if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
		/*
		 * kmalloc_nolock() is not supported on architectures that
		 * don't implement cmpxchg16b, but debug caches don't use
		 * per-cpu slab and per-cpu partial slabs. They rely on
		 * kmem_cache_node->list_lock, so kmalloc_nolock() can
		 * attempt to allocate from debug caches by
		 * don't implement cmpxchg16b and thus need slab_lock()
		 * which could be preempted by a nmi.
		 * But debug caches don't use that and only rely on
		 * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
		 * to allocate from debug caches by
		 * spin_trylock_irqsave(&n->list_lock, ...)
		 */
		return NULL;
@@ -5286,30 +5215,21 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
	if (ret)
		goto success;

	ret = ERR_PTR(-EBUSY);

	/*
	 * Do not call slab_alloc_node(), since trylock mode isn't
	 * compatible with slab_pre_alloc_hook/should_failslab and
	 * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
	 * and slab_post_alloc_hook() directly.
	 *
	 * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
	 * in irq saved region. It assumes that the same cpu will not
	 * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
	 * Therefore use in_nmi() to check whether particular bucket is in
	 * irq protected section.
	 *
	 * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
	 * this cpu was interrupted somewhere inside ___slab_alloc() after
	 * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
	 * In this case fast path with __update_cpu_freelist_fast() is not safe.
	 */
	if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
	ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);

	if (PTR_ERR(ret) == -EBUSY) {
		if (can_retry) {
	/*
	 * It's possible we failed due to trylock as we preempted someone with
	 * the sheaves locked, and the list_lock is also held by another cpu.
	 * But it should be rare that multiple kmalloc buckets would have
	 * sheaves locked, so try a larger one.
	 */
	if (!ret && can_retry) {
		/* pick the next kmalloc bucket */
		size = s->object_size + 1;
		/*
@@ -5321,8 +5241,6 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
		can_retry = false;
		goto retry;
	}
		ret = NULL;
	}

success:
	maybe_wipe_obj_freeptr(s, ret);
@@ -7374,10 +7292,6 @@ void __kmem_cache_release(struct kmem_cache *s)
{
	cache_random_seq_destroy(s);
	pcs_destroy(s);
#ifdef CONFIG_PREEMPT_RT
	if (s->cpu_slab)
		lockdep_unregister_key(&s->lock_key);
#endif
	free_percpu(s->cpu_slab);
	free_kmem_cache_nodes(s);
}