Commit 56180dd2 authored by Peter Zijlstra
Browse files

futex: Use RCU-based per-CPU reference counting instead of rcuref_t



The use of rcuref_t for reference counting introduces a performance bottleneck
when accessed concurrently by multiple threads during futex operations.

Replace rcuref_t with special crafted per-CPU reference counters. The
lifetime logic remains the same.

The newly allocated private hash starts in FR_PERCPU state. In this state, each
futex operation that requires the private hash uses a per-CPU counter (an
unsigned int) for incrementing or decrementing the reference count.

When the private hash is about to be replaced, the per-CPU counters are
migrated to an atomic_long_t counter, mm_struct::futex_atomic.
The migration process:
- Waiting for one RCU grace period to ensure all users observe the
  current private hash. This can be skipped if a grace period elapsed
  since the private hash was assigned.

- futex_private_hash::state is set to FR_ATOMIC, forcing all users to
  use mm_struct::futex_atomic for reference counting.

- After an RCU grace period, all users are guaranteed to be using the
  atomic counter. The per-CPU counters can now be summed up and added to
  the atomic_long_t counter. If the resulting count is zero, the hash can
  be safely replaced. Otherwise, active users still hold a valid reference.

- Once the atomic reference count drops to zero, the next futex
  operation will switch to the new private hash.

call_rcu_hurry() is used to speed up the transition, which otherwise might be
delayed with RCU_LAZY. There is nothing wrong with using call_rcu(). The
side effects would be that on auto scaling the new hash is used later
and the SET_SLOTS prctl() will block longer.

[bigeasy: commit description + mm get/put_async]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250710110011.384614-3-bigeasy@linutronix.de
parent a255b78d
Loading
Loading
Loading
Loading
+5 −11
Original line number Diff line number Diff line
@@ -85,18 +85,12 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
#ifdef CONFIG_FUTEX_PRIVATE_HASH
int futex_hash_allocate_default(void);
void futex_hash_free(struct mm_struct *mm);

static inline void futex_mm_init(struct mm_struct *mm)
{
	RCU_INIT_POINTER(mm->futex_phash, NULL);
	mm->futex_phash_new = NULL;
	mutex_init(&mm->futex_hash_lock);
}
int futex_mm_init(struct mm_struct *mm);

#else /* !CONFIG_FUTEX_PRIVATE_HASH */
static inline int futex_hash_allocate_default(void) { return 0; }
static inline void futex_hash_free(struct mm_struct *mm) { }
static inline void futex_mm_init(struct mm_struct *mm) { }
static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
#endif /* CONFIG_FUTEX_PRIVATE_HASH */

#else /* !CONFIG_FUTEX */
@@ -118,8 +112,8 @@ static inline int futex_hash_allocate_default(void)
{
	return 0;
}
static inline void futex_hash_free(struct mm_struct *mm) { }
static inline void futex_mm_init(struct mm_struct *mm) { }
static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
static inline int futex_mm_init(struct mm_struct *mm) { return 0; }

#endif

+5 −0
Original line number Diff line number Diff line
@@ -1070,6 +1070,11 @@ struct mm_struct {
		struct mutex			futex_hash_lock;
		struct futex_private_hash	__rcu *futex_phash;
		struct futex_private_hash	*futex_phash_new;
		/* futex-ref */
		unsigned long			futex_batches;
		struct rcu_head			futex_rcu;
		atomic_long_t			futex_atomic;
		unsigned int			__percpu *futex_ref;
#endif

		unsigned long hiwater_rss; /* High-watermark of RSS usage */
+1 −1
Original line number Diff line number Diff line
@@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm)

/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
#ifdef CONFIG_MMU
#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
/* same as above but performs the slow path from the async context. Can
 * be called from the atomic context as well
 */
+0 −4
Original line number Diff line number Diff line
@@ -1716,13 +1716,9 @@ config FUTEX_PI
	depends on FUTEX && RT_MUTEXES
	default y

#
# marked broken for performance reasons; gives us one more cycle to sort things out.
#
config FUTEX_PRIVATE_HASH
	bool
	depends on FUTEX && !BASE_SMALL && MMU
	depends on BROKEN
	default y

config FUTEX_MPOL
+6 −2
Original line number Diff line number Diff line
@@ -1046,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
	RCU_INIT_POINTER(mm->exe_file, NULL);
	mmu_notifier_subscriptions_init(mm);
	init_tlb_flush_pending(mm);
	futex_mm_init(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
	mm->pmd_huge_pte = NULL;
#endif
@@ -1061,6 +1060,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
		mm->def_flags = 0;
	}

	if (futex_mm_init(mm))
		goto fail_mm_init;

	if (mm_alloc_pgd(mm))
		goto fail_nopgd;

@@ -1090,6 +1092,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
fail_noid:
	mm_free_pgd(mm);
fail_nopgd:
	futex_hash_free(mm);
fail_mm_init:
	free_mm(mm);
	return NULL;
}
@@ -1145,7 +1149,7 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);

#ifdef CONFIG_MMU
#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
static void mmput_async_fn(struct work_struct *work)
{
	struct mm_struct *mm = container_of(work, struct mm_struct,
Loading