Commit aa918db7 authored by Linus Torvalds
Browse files
Pull bpf try_alloc_pages() support from Alexei Starovoitov:
 "The pull includes work from Sebastian, Vlastimil and myself with a lot
  of help from Michal and Shakeel.

  This is a first step towards making kmalloc reentrant to get rid of
  slab wrappers: bpf_mem_alloc, kretprobe's objpool, etc. These patches
  make page allocator safe from any context.

  Vlastimil kicked off this effort at LSFMM 2024:

    https://lwn.net/Articles/974138/

  and we continued at LSFMM 2025:

    https://lore.kernel.org/all/CAADnVQKfkGxudNUkcPJgwe3nTZ=xohnRshx9kLZBTmR_E1DFEg@mail.gmail.com/

  Why:

  SLAB wrappers bind memory to a particular subsystem making it
  unavailable to the rest of the kernel. Some BPF maps in production
  consume Gbytes of preallocated memory. Top 5 in Meta: 1.5G, 1.2G,
  1.1G, 300M, 200M. Once we have kmalloc that works in any context BPF
  map preallocation won't be necessary.

  How:

  Synchronous kmalloc/page alloc stack has multiple stages going from
  fast to slow: cmpxchg16 -> slab_alloc -> new_slab -> alloc_pages ->
  rmqueue_pcplist -> __rmqueue, where rmqueue_pcplist was already
  relying on trylock.

  This set changes rmqueue_bulk/rmqueue_buddy to attempt a trylock and
  return ENOMEM if alloc_flags & ALLOC_TRYLOCK. It then wraps this
  functionality into try_alloc_pages() helper. We make sure that the
  logic is sane in PREEMPT_RT.

  End result: try_alloc_pages()/free_pages_nolock() are safe to call
  from any context.

  try_kmalloc() for any context with similar trylock approach will
  follow. It will use try_alloc_pages() when slab needs a new page.
  Though such try_kmalloc/page_alloc() is an opportunistic allocator,
  this design ensures that the probability of successful allocation of
  small objects (up to one page in size) is high.

  Even before we have try_kmalloc(), we already use try_alloc_pages() in
  BPF arena implementation and it's going to be used more extensively in
  BPF"

* tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  mm: Fix the flipped condition in gfpflags_allow_spinning()
  bpf: Use try_alloc_pages() to allocate pages for bpf needs.
  mm, bpf: Use memcg in try_alloc_pages().
  memcg: Use trylock to access memcg stock_lock.
  mm, bpf: Introduce free_pages_nolock()
  mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation
  locking/local_lock: Introduce localtry_lock_t
parents 494e7fe5 f90b474a
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -2385,7 +2385,7 @@ int generic_map_delete_batch(struct bpf_map *map,
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);

int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
			unsigned long nr_pages, struct page **page_array);
#ifdef CONFIG_MEMCG
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+23 −0
Original line number Diff line number Diff line
@@ -39,6 +39,25 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
	return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}

/**
 * gfpflags_allow_spinning - may this allocation spin on locks?
 * @gfp_flags: allocation flags to test
 *
 * Return: true when at least one __GFP_RECLAIM bit is set in @gfp_flags,
 * false when none is.
 */
static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
{
	/*
	 * A cleared __GFP_DIRECT_RECLAIM means direct reclaim is not allowed.
	 * A cleared __GFP_KSWAPD_RECLAIM means it's not safe to wake up kswapd.
	 * Every GFP_* combination, GFP_NOWAIT included, sets one or both of
	 * them; try_alloc_pages() is the only API that sets neither.
	 *
	 * That mode is stricter than GFP_NOWAIT or GFP_ATOMIC: those are
	 * guaranteed to never block on a sleeping lock, whereas here the
	 * allocation must not ever spin on any lock (i.e. only trylocks).
	 * There is no high level GFP_$FOO flag for it because the regular
	 * page allocator doesn't fully support this allocation mode.
	 */
	return (gfp_flags & __GFP_RECLAIM) != 0;
}

#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
@@ -335,6 +354,9 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
}
#define alloc_page_vma(...)			alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))

/*
 * try_alloc_pages() takes only a node id and an order; there is no gfp
 * mask argument. Callers use the try_alloc_pages() macro, which routes the
 * call through alloc_hooks().
 * NOTE(review): presumably an opportunistic, any-context allocator whose
 * callers must handle failure - implementation not visible here, confirm.
 */
struct page *try_alloc_pages_noprof(int nid, unsigned int order);
#define try_alloc_pages(...)			alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))

extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...)			alloc_hooks(get_free_pages_noprof(__VA_ARGS__))

@@ -357,6 +379,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
	__get_free_pages((gfp_mask) | GFP_DMA, (order))

extern void __free_pages(struct page *page, unsigned int order);
/*
 * NOTE(review): presumably the freeing counterpart of try_alloc_pages()
 * that does not spin on locks - defined elsewhere, confirm before use.
 */
extern void free_pages_nolock(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);

#define __free_page(page) __free_pages((page), 0)
+70 −0
Original line number Diff line number Diff line
@@ -51,6 +51,76 @@
#define local_unlock_irqrestore(lock, flags)			\
	__local_unlock_irqrestore(lock, flags)

/**
 * localtry_lock_init - Runtime initialize a lock instance
 * @lock:	The lock variable
 */
#define localtry_lock_init(lock)		__localtry_lock_init(lock)

/**
 * localtry_lock - Acquire a per CPU local lock
 * @lock:	The lock variable
 *
 * This variant has no failure return. Release with localtry_unlock().
 */
#define localtry_lock(lock)		__localtry_lock(lock)

/**
 * localtry_lock_irq - Acquire a per CPU local lock and disable interrupts
 * @lock:	The lock variable
 *
 * This variant has no failure return. Release with localtry_unlock_irq().
 */
#define localtry_lock_irq(lock)		__localtry_lock_irq(lock)

/**
 * localtry_lock_irqsave - Acquire a per CPU local lock, save and disable
 *			 interrupts
 * @lock:	The lock variable
 * @flags:	Storage for interrupt flags
 *
 * This variant has no failure return. Release with
 * localtry_unlock_irqrestore(), passing the same @flags.
 */
#define localtry_lock_irqsave(lock, flags)				\
	__localtry_lock_irqsave(lock, flags)

/**
 * localtry_trylock - Try to acquire a per CPU local lock.
 * @lock:	The lock variable
 *
 * The function can be used in any context such as NMI or HARDIRQ. Due to
 * locking constraints it will _always_ fail to acquire the lock in NMI or
 * HARDIRQ context on PREEMPT_RT.
 *
 * Return: non-zero if the lock was acquired, zero otherwise. Only a
 * successful call is paired with localtry_unlock().
 */
#define localtry_trylock(lock)		__localtry_trylock(lock)

/**
 * localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
 *			      interrupts if acquired
 * @lock:	The lock variable
 * @flags:	Storage for interrupt flags
 *
 * The function can be used in any context such as NMI or HARDIRQ. Due to
 * locking constraints it will _always_ fail to acquire the lock in NMI or
 * HARDIRQ context on PREEMPT_RT.
 *
 * Return: non-zero if the lock was acquired, zero otherwise. Only a
 * successful call is paired with localtry_unlock_irqrestore().
 */
#define localtry_trylock_irqsave(lock, flags)				\
	__localtry_trylock_irqsave(lock, flags)

/**
 * localtry_unlock - Release a per CPU local lock
 * @lock:	The lock variable
 *
 * Counterpart of localtry_lock() and of a successful localtry_trylock().
 */
#define localtry_unlock(lock)		__localtry_unlock(lock)

/**
 * localtry_unlock_irq - Release a per CPU local lock and enable interrupts
 * @lock:	The lock variable
 *
 * Counterpart of localtry_lock_irq().
 */
#define localtry_unlock_irq(lock)		__localtry_unlock_irq(lock)

/**
 * localtry_unlock_irqrestore - Release a per CPU local lock and restore
 *			      interrupt flags
 * @lock:	The lock variable
 * @flags:	Interrupt flags to restore
 *
 * Counterpart of localtry_lock_irqsave() and of a successful
 * localtry_trylock_irqsave().
 */
#define localtry_unlock_irqrestore(lock, flags)			\
	__localtry_unlock_irqrestore(lock, flags)

/*
 * Guard definition for a per CPU local_lock_t pointer: local_lock() is
 * passed as the acquire expression and local_unlock() as the release
 * expression.
 * NOTE(review): the scoping behaviour itself comes from DEFINE_GUARD,
 * which is defined elsewhere - confirm there.
 */
DEFINE_GUARD(local_lock, local_lock_t __percpu*,
	     local_lock(_T),
	     local_unlock(_T))
+146 −0
Original line number Diff line number Diff line
@@ -15,6 +15,11 @@ typedef struct {
#endif
} local_lock_t;

/*
 * A local_lock_t extended with an explicit "held" flag. The lock and
 * trylock paths set @acquired to 1, the unlock paths reset it to 0, and
 * the trylock paths refuse to take the lock while it reads non-zero.
 */
typedef struct {
	local_lock_t	llock;		/* the wrapped local lock */
	unsigned int	acquired;	/* 1 while held, 0 otherwise */
} localtry_lock_t;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define LOCAL_LOCK_DEBUG_INIT(lockname)		\
	.dep_map = {					\
@@ -31,6 +36,13 @@ static inline void local_lock_acquire(local_lock_t *l)
	l->owner = current;
}

/*
 * Debug hook run after a successful trylock: reports the acquisition via
 * lock_map_acquire_try(), warns if the lock already has an owner, then
 * records current as the owner.
 */
static inline void local_trylock_acquire(local_lock_t *l)
{
	lock_map_acquire_try(&l->dep_map);
	DEBUG_LOCKS_WARN_ON(l->owner);
	l->owner = current;
}

static inline void local_lock_release(local_lock_t *l)
{
	DEBUG_LOCKS_WARN_ON(l->owner != current);
@@ -45,11 +57,13 @@ static inline void local_lock_debug_init(local_lock_t *l)
#else /* CONFIG_DEBUG_LOCK_ALLOC */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
/* Empty stand-ins for the debug hooks; each one compiles to nothing. */
static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_trylock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */

#define INIT_LOCAL_LOCK(lockname)	{ LOCAL_LOCK_DEBUG_INIT(lockname) }
/*
 * Static initializer for a localtry_lock_t. Only .llock is named in the
 * initializer, so .acquired is implicitly zero (lock not held).
 */
#define INIT_LOCALTRY_LOCK(lockname)	{ .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }}

#define __local_lock_init(lock)					\
do {								\
@@ -118,6 +132,104 @@ do { \
#define __local_unlock_nested_bh(lock)				\
	local_lock_release(this_cpu_ptr(lock))

/* localtry_lock_t variants */

/*
 * Runtime init: initialize the embedded local_lock_t, then mark the lock
 * as not acquired.
 */
#define __localtry_lock_init(lock)				\
do {								\
	__local_lock_init(&(lock)->llock);			\
	WRITE_ONCE((lock)->acquired, 0);			\
} while (0)

/*
 * Unconditional acquire: disable preemption, run the local_lock_acquire()
 * debug hook on this CPU's instance and set ->acquired, so that a trylock
 * attempted on this CPU while the lock is held fails.
 */
#define __localtry_lock(lock)					\
	do {							\
		localtry_lock_t *lt;				\
		preempt_disable();				\
		lt = this_cpu_ptr(lock);			\
		local_lock_acquire(&lt->llock);			\
		WRITE_ONCE(lt->acquired, 1);			\
	} while (0)

/*
 * Same as __localtry_lock(), except that interrupts are disabled with
 * local_irq_disable() instead of disabling preemption.
 */
#define __localtry_lock_irq(lock)				\
	do {							\
		localtry_lock_t *lt;				\
		local_irq_disable();				\
		lt = this_cpu_ptr(lock);			\
		local_lock_acquire(&lt->llock);			\
		WRITE_ONCE(lt->acquired, 1);			\
	} while (0)

/*
 * Same as __localtry_lock(), except that the interrupt state is saved into
 * @flags and interrupts are disabled via local_irq_save() instead of
 * disabling preemption.
 */
#define __localtry_lock_irqsave(lock, flags)			\
	do {							\
		localtry_lock_t *lt;				\
		local_irq_save(flags);				\
		lt = this_cpu_ptr(lock);			\
		local_lock_acquire(&lt->llock);			\
		WRITE_ONCE(lt->acquired, 1);			\
	} while (0)

/*
 * Try to take this CPU's lock without ever waiting. Preemption is
 * disabled first; the lock is taken only if ->acquired reads zero,
 * otherwise preemption is re-enabled and the attempt fails.
 * Evaluates to true on success, false on failure.
 */
#define __localtry_trylock(lock)				\
	({							\
		localtry_lock_t *__ltl;				\
		bool __got;					\
								\
		preempt_disable();				\
		__ltl = this_cpu_ptr(lock);			\
		__got = !READ_ONCE(__ltl->acquired);		\
		if (__got) {					\
			WRITE_ONCE(__ltl->acquired, 1);		\
			local_trylock_acquire(&__ltl->llock);	\
		} else {					\
			preempt_enable();			\
		}						\
		__got;						\
	})

/*
 * Try to take this CPU's lock without ever waiting. The interrupt state
 * is saved into @flags and interrupts are disabled first; the lock is
 * taken only if ->acquired reads zero, otherwise the saved state is
 * restored and the attempt fails.
 * Evaluates to true on success, false on failure.
 */
#define __localtry_trylock_irqsave(lock, flags)			\
	({							\
		localtry_lock_t *__ltl;				\
		bool __got;					\
								\
		local_irq_save(flags);				\
		__ltl = this_cpu_ptr(lock);			\
		__got = !READ_ONCE(__ltl->acquired);		\
		if (__got) {					\
			WRITE_ONCE(__ltl->acquired, 1);		\
			local_trylock_acquire(&__ltl->llock);	\
		} else {					\
			local_irq_restore(flags);		\
		}						\
		__got;						\
	})

/*
 * Release: clear ->acquired first, then run the local_lock_release()
 * debug hook and finally re-enable preemption.
 */
#define __localtry_unlock(lock)					\
	do {							\
		localtry_lock_t *lt;				\
		lt = this_cpu_ptr(lock);			\
		WRITE_ONCE(lt->acquired, 0);			\
		local_lock_release(&lt->llock);			\
		preempt_enable();				\
	} while (0)

/*
 * Release: clear ->acquired first, then run the local_lock_release()
 * debug hook and finally enable interrupts with local_irq_enable().
 */
#define __localtry_unlock_irq(lock)				\
	do {							\
		localtry_lock_t *lt;				\
		lt = this_cpu_ptr(lock);			\
		WRITE_ONCE(lt->acquired, 0);			\
		local_lock_release(&lt->llock);			\
		local_irq_enable();				\
	} while (0)

/*
 * Release: clear ->acquired first, then run the local_lock_release()
 * debug hook and finally restore the interrupt state from @flags.
 */
#define __localtry_unlock_irqrestore(lock, flags)		\
	do {							\
		localtry_lock_t *lt;				\
		lt = this_cpu_ptr(lock);			\
		WRITE_ONCE(lt->acquired, 0);			\
		local_lock_release(&lt->llock);			\
		local_irq_restore(flags);			\
	} while (0)

#else /* !CONFIG_PREEMPT_RT */

/*
@@ -125,8 +237,10 @@ do { \
 * critical section while staying preemptible.
 */
typedef spinlock_t local_lock_t;
/* Like local_lock_t, localtry_lock_t is a plain spinlock_t in this branch. */
typedef spinlock_t localtry_lock_t;

#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
/* Both lock types are spinlock_t here, so the initializer is shared. */
#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname)

#define __local_lock_init(l)					\
	do {							\
@@ -169,4 +283,36 @@ do { \
	spin_unlock(this_cpu_ptr((lock)));			\
} while (0)

/*
 * localtry_lock_t variants
 *
 * localtry_lock_t is a spinlock_t in this branch, so everything except the
 * trylock operations forwards to the matching __local_* helper. Note that
 * the _irq lock/unlock forms forward to plain __local_lock() and
 * __local_unlock().
 */

#define __localtry_lock_init(lock)			__local_lock_init(lock)
#define __localtry_lock(lock)				__local_lock(lock)
#define __localtry_lock_irq(lock)			__local_lock(lock)
#define __localtry_lock_irqsave(lock, flags)		__local_lock_irqsave(lock, flags)
#define __localtry_unlock(lock)				__local_unlock(lock)
#define __localtry_unlock_irq(lock)			__local_unlock(lock)
#define __localtry_unlock_irqrestore(lock, flags)	__local_unlock_irqrestore(lock, flags)

/*
 * Try to take this CPU's spinlock without waiting. The attempt is refused
 * outright in NMI or hardirq context. Otherwise migration is disabled
 * around spin_trylock() and re-enabled again if the lock was not obtained.
 * Evaluates to non-zero on success, zero on failure.
 */
#define __localtry_trylock(lock)				\
	({							\
		int __acquired = 0;				\
								\
		if (!(in_nmi() | in_hardirq())) {		\
			migrate_disable();			\
			__acquired = spin_trylock(this_cpu_ptr((lock)));	\
			if (!__acquired)			\
				migrate_enable();		\
		}						\
		__acquired;					\
	})

/*
 * @flags is only type-checked and set to 0 here; the interrupt state is
 * not touched and the attempt is delegated to __localtry_trylock().
 */
#define __localtry_trylock_irqsave(lock, flags)			\
	({							\
		typecheck(unsigned long, flags);		\
		flags = 0;					\
		__localtry_trylock(lock);			\
	})

#endif /* CONFIG_PREEMPT_RT */
+4 −0
Original line number Diff line number Diff line
@@ -99,6 +99,10 @@ struct page {
				/* Or, free page */
				struct list_head buddy_list;
				struct list_head pcp_list;
				struct {
					struct llist_node pcp_llist;
					unsigned int order;
				};
			};
			/* See page-flags.h for PAGE_MAPPING_FLAGS */
			struct address_space *mapping;
Loading