Commit 4afeb0ed authored by Rik van Riel's avatar Rik van Riel Committed by Ingo Molnar
Browse files

x86/mm: Enable broadcast TLB invalidation for multi-threaded processes

There is not enough room in the 12-bit ASID address space to hand out
broadcast ASIDs to every process. Only hand out broadcast ASIDs to processes
when they are observed to be simultaneously running on 4 or more CPUs.

This also allows single threaded process to continue using the cheaper, local
TLB invalidation instructions like INVLPGB.

Due to the structure of flush_tlb_mm_range(), the INVLPGB flushing is done in
a generically named broadcast_tlb_flush() function which can later also be
used for Intel RAR.

Combined with the removal of unnecessary lru_add_drain calls() (see
https://lore.kernel.org/r/20241219153253.3da9e8aa@fangorn) this results in
a nice performance boost for the will-it-scale tlb_flush2_threads test on an
AMD Milan system with 36 cores:

  - vanilla kernel:           527k loops/second
  - lru_add_drain removal:    731k loops/second
  - only INVLPGB:             527k loops/second
  - lru_add_drain + INVLPGB: 1157k loops/second

Profiling with only the INVLPGB changes showed while TLB invalidation went
down from 40% of the total CPU time to only around 4% of CPU time, the
contention simply moved to the LRU lock.

Fixing both at the same time about doubles the number of iterations per second
from this case.

Comparing will-it-scale tlb_flush2_threads with several different numbers of
threads on a 72 CPU AMD Milan shows similar results. The number represents the
total number of loops per second across all the threads:

  threads	tip		INVLPGB

  1		315k		304k
  2		423k		424k
  4		644k		1032k
  8		652k		1267k
  16		737k		1368k
  32		759k		1199k
  64		636k		1094k
  72		609k		993k

1 and 2 thread performance is similar with and without INVLPGB, because
INVLPGB is only used on processes using 4 or more CPUs simultaneously.

The number is the median across 5 runs.

Some numbers closer to real world performance can be found at Phoronix, thanks
to Michael:

https://www.phoronix.com/news/AMD-INVLPGB-Linux-Benefits

  [ bp:
   - Massage
   - :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
   - :%s/\<clear_asid_transition\>/mm_clear_asid_transition/cgi
   - Fold in a 0day bot fix: https://lore.kernel.org/oe-kbuild-all/202503040000.GtiWUsBm-lkp@intel.com


   ]

Signed-off-by: default avatarRik van Riel <riel@surriel.com>
Signed-off-by: default avatarBorislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
Reviewed-by: default avatarNadav Amit <nadav.amit@gmail.com>
Link: https://lore.kernel.org/r/20250226030129.530345-11-riel@surriel.com
parent c9826613
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -280,6 +280,11 @@ static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
	smp_store_release(&mm->context.global_asid, asid);
}

static inline void mm_clear_asid_transition(struct mm_struct *mm)
{
	WRITE_ONCE(mm->context.asid_transition, false);
}

static inline bool mm_in_asid_transition(struct mm_struct *mm)
{
	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
@@ -291,6 +296,7 @@ static inline bool mm_in_asid_transition(struct mm_struct *mm)
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
static inline void mm_init_global_asid(struct mm_struct *mm) { }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */

+103 −1
Original line number Diff line number Diff line
@@ -430,6 +430,105 @@ static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
	return false;
}

/*
 * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
 * systems have over 8k CPUs. Because of this potential ASID shortage,
 * global ASIDs are handed out to processes that have frequent TLB
 * flushes and are active on 4 or more CPUs simultaneously.
 */
static void consider_global_asid(struct mm_struct *mm)
{
	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
		return;

	/* Check every once in a while. */
	if ((current->pid & 0x1f) != (jiffies & 0x1f))
		return;

	/*
	 * Assign a global ASID if the process is active on
	 * 4 or more CPUs simultaneously.
	 */
	if (mm_active_cpus_exceeds(mm, 3))
		use_global_asid(mm);
}

static void finish_asid_transition(struct flush_tlb_info *info)
{
	struct mm_struct *mm = info->mm;
	int bc_asid = mm_global_asid(mm);
	int cpu;

	if (!mm_in_asid_transition(mm))
		return;

	for_each_cpu(cpu, mm_cpumask(mm)) {
		/*
		 * The remote CPU is context switching. Wait for that to
		 * finish, to catch the unlikely case of it switching to
		 * the target mm with an out of date ASID.
		 */
		while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
			cpu_relax();

		if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
			continue;

		/*
		 * If at least one CPU is not using the global ASID yet,
		 * send a TLB flush IPI. The IPI should cause stragglers
		 * to transition soon.
		 *
		 * This can race with the CPU switching to another task;
		 * that results in a (harmless) extra IPI.
		 */
		if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
			flush_tlb_multi(mm_cpumask(info->mm), info);
			return;
		}
	}

	/* All the CPUs running this process are using the global ASID. */
	mm_clear_asid_transition(mm);
}

static void broadcast_tlb_flush(struct flush_tlb_info *info)
{
	bool pmd = info->stride_shift == PMD_SHIFT;
	unsigned long asid = mm_global_asid(info->mm);
	unsigned long addr = info->start;

	/*
	 * TLB flushes with INVLPGB are kicked off asynchronously.
	 * The inc_mm_tlb_gen() guarantees page table updates are done
	 * before these TLB flushes happen.
	 */
	if (info->end == TLB_FLUSH_ALL) {
		invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
		/* Do any CPUs supporting INVLPGB need PTI? */
		if (cpu_feature_enabled(X86_FEATURE_PTI))
			invlpgb_flush_single_pcid_nosync(user_pcid(asid));
	} else do {
		unsigned long nr = 1;

		if (info->stride_shift <= PMD_SHIFT) {
			nr = (info->end - addr) >> info->stride_shift;
			nr = clamp_val(nr, 1, invlpgb_count_max);
		}

		invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
		if (cpu_feature_enabled(X86_FEATURE_PTI))
			invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);

		addr += nr << info->stride_shift;
	} while (addr < info->end);

	finish_asid_transition(info);

	/* Wait for the INVLPGBs kicked off above to finish. */
	__tlbsync();
}

/*
 * Given an ASID, flush the corresponding user ASID.  We can delay this
 * until the next time we switch to it.
@@ -1260,9 +1359,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
	 * a local TLB flush is needed. Optimize this use-case by calling
	 * flush_tlb_func_local() directly in this case.
	 */
	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
	if (mm_global_asid(mm)) {
		broadcast_tlb_flush(info);
	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
		info->trim_cpumask = should_trim_cpumask(mm);
		flush_tlb_multi(mm_cpumask(mm), info);
		consider_global_asid(mm);
	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		lockdep_assert_irqs_enabled();
		local_irq_disable();