Merge tag 'kvm-x86-pir-6.16' of https://github.com/kvm-x86/linux into HEAD (db44dcbd) · Commits · git / linux-nf

arch/x86/include/asm/posted_intr.h

+71 −7

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0 */
		#ifndef _X86_POSTED_INTR_H
		#define _X86_POSTED_INTR_H

		#include <asm/cmpxchg.h>
		#include <asm/rwonce.h>
		#include <asm/irq_vectors.h>

		#include <linux/bitmap.h>

		#define POSTED_INTR_ON 0
		#define POSTED_INTR_SN 1

		#define PID_TABLE_ENTRY_VALID 1

		#define NR_PIR_VECTORS 256
		#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG)

		/* Posted-Interrupt Descriptor */
		struct pi_desc {
		union {
		u32 pir[8]; /* Posted interrupt requested */
		u64 pir64[4];
		};
		unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */
		union {
		struct {
		u16 notifications; /* Suppress and outstanding bits */
		@@ -26,6 +31,65 @@ struct pi_desc {
		u32 rsvd[6];
		} __aligned(64);

		/*
		* De-multiplexing posted interrupts is on the performance path, the code
		* below is written to optimize the cache performance based on the following
		* considerations:
		* 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
		* accessed by both CPU and IOMMU.
		* 2.During software processing of posted interrupts, the CPU needs to do
		* natural width read and xchg for checking and clearing posted interrupt
		* request (PIR), a 256 bit field within the PID.
		* 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
		* line when posting interrupts and setting control bits.
		* 4.The CPU can access the cache line an order of magnitude faster than the
		* IOMMU.
		* 5.Each time the IOMMU posts an interrupt to the PIR, it evicts the PID
		* cache line. The cache line states after each operation are as follows,
		* assuming a 64-bit kernel:
		* CPU            IOMMU               PID Cache line state
		* ---------------------------------------------------------------
		* read64                             exclusive
		* lock xchg64                        modified
		*                post/atomic swap    invalid
		* ---------------------------------------------------------------
		*
		* To reduce L1 data cache misses, it is important to avoid contention with
		* IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
		* when processing posted interrupts in software, e.g. to dispatch interrupt
		* handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
		* in KVM.
		*
		* In addition, the code tries to keep the cache line state as consistent
		* as possible, e.g. when making a copy of and clearing the PIR
		* (assuming non-zero PIR bits are present in the entire PIR), it does:
		* read, read, read, read, xchg, xchg, xchg, xchg
		* instead of:
		* read, xchg, read, xchg, read, xchg, read, xchg
		*/
		/*
		 * pi_harvest_pir - snapshot the PIR and clear the words that had bits set.
		 * @pir:      live PIR, NR_PIR_WORDS words; every word that was non-zero on
		 *            the initial read is atomically exchanged with 0.
		 * @pir_vals: caller-provided array of NR_PIR_WORDS words that receives the
		 *            harvested values.
		 *
		 * Returns false, without writing to @pir, if every word read as zero.
		 * Returns true otherwise; @pir_vals then holds the value returned by the
		 * exchange for each harvested word and 0 for words that read as zero.
		 */
		static __always_inline bool pi_harvest_pir(unsigned long *pir,
							   unsigned long *pir_vals)
		{
			unsigned long pending = 0;
			int i;

			/* Pass 1: reads only, so nothing is written if the PIR is empty. */
			for (i = 0; i < NR_PIR_WORDS; i++) {
				pir_vals[i] = READ_ONCE(pir[i]);
				pending |= pir_vals[i];
			}

			if (!pending)
				return false;

			/*
			 * Pass 2: exchange only the words seen as non-zero in pass 1.  The
			 * value returned by the exchange replaces the pass-1 snapshot, so
			 * whatever the word holds at exchange time is what gets reported.
			 */
			for (i = 0; i < NR_PIR_WORDS; i++) {
				if (!pir_vals[i])
					continue;

				pir_vals[i] = arch_xchg(&pir[i], 0);
			}

			return true;
		}

		static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
		{
		return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
		@@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)

		static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
		{
		return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
		return test_and_set_bit(vector, pi_desc->pir);
		}

		static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
		{
		return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
		return bitmap_empty(pi_desc->pir, NR_VECTORS);
		}

		static inline void pi_set_sn(struct pi_desc *pi_desc)
		@@ -110,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector)
		if (WARN_ON_ONCE(vector > NR_VECTORS \|\| vector < FIRST_EXTERNAL_VECTOR))
		return false;

		return test_bit(vector, (unsigned long *)pid->pir);
		return test_bit(vector, pid->pir);
		}

		extern void intel_posted_msi_init(void);

arch/x86/kernel/irq.c

+10 −53

Original line number	Diff line number	Diff line
		@@ -380,61 +380,18 @@ void intel_posted_msi_init(void)
		this_cpu_write(posted_msi_pi_desc.ndst, destination);
		}

		/*
		* De-multiplexing posted interrupts is on the performance path, the code
		* below is written to optimize the cache performance based on the following
		* considerations:
		* 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
		* accessed by both CPU and IOMMU.
		* 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
		* for checking and clearing posted interrupt request (PIR), a 256 bit field
		* within the PID.
		* 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
		* line when posting interrupts and setting control bits.
		* 4.The CPU can access the cache line a magnitude faster than the IOMMU.
		* 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
		* cache line. The cache line states after each operation are as follows:
		* CPU IOMMU PID Cache line state
		* ---------------------------------------------------------------
		*...read64 exclusive
		*...lock xchg64 modified
		*... post/atomic swap invalid
		*...-------------------------------------------------------------
		*
		* To reduce L1 data cache miss, it is important to avoid contention with
		* IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
		* to dispatch interrupt handlers.
		*
		* In addition, the code is trying to keep the cache line state consistent
		* as much as possible. e.g. when making a copy and clearing the PIR
		* (assuming non-zero PIR bits are present in the entire PIR), it does:
		* read, read, read, read, xchg, xchg, xchg, xchg
		* instead of:
		* read, xchg, read, xchg, read, xchg, read, xchg
		*/
		static __always_inline bool handle_pending_pir(u64 pir, struct pt_regs regs)
		static __always_inline bool handle_pending_pir(unsigned long pir, struct pt_regs regs)
		{
		int i, vec = FIRST_EXTERNAL_VECTOR;
		unsigned long pir_copy[4];
		bool handled = false;
		unsigned long pir_copy[NR_PIR_WORDS];
		int vec = FIRST_EXTERNAL_VECTOR;

		for (i = 0; i < 4; i++)
		pir_copy[i] = pir[i];
		if (!pi_harvest_pir(pir, pir_copy))
		return false;

		for (i = 0; i < 4; i++) {
		if (!pir_copy[i])
		continue;

		pir_copy[i] = arch_xchg(&pir[i], 0);
		handled = true;
		}

		if (handled) {
		for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
		call_irq_handler(vec, regs);
		}

		return handled;
		return true;
		}

		/*
		@@ -464,7 +421,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
		* MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
		*/
		while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
		if (!handle_pending_pir(pid->pir64, regs))
		if (!handle_pending_pir(pid->pir, regs))
		break;
		}

		@@ -479,7 +436,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
		* process PIR bits one last time such that handling the new interrupts
		* are not delayed until the next IRQ.
		*/
		handle_pending_pir(pid->pir64, regs);
		handle_pending_pir(pid->pir, regs);

		apic_eoi();
		irq_exit();

arch/x86/kvm/lapic.c

+11 −9

Original line number	Diff line number	Diff line
		@@ -655,27 +655,29 @@ static u8 count_vectors(void *bitmap)
		return count;
		}

		bool __kvm_apic_update_irr(u32 pir, void regs, int *max_irr)
		bool __kvm_apic_update_irr(unsigned long pir, void regs, int *max_irr)
		{
		unsigned long pir_vals[NR_PIR_WORDS];
		u32 __pir = (void )pir_vals;
		u32 i, vec;
		u32 pir_val, irr_val, prev_irr_val;
		u32 irr_val, prev_irr_val;
		int max_updated_irr;

		max_updated_irr = -1;
		*max_irr = -1;

		if (!pi_harvest_pir(pir, pir_vals))
		return false;

		for (i = vec = 0; i <= 7; i++, vec += 32) {
		u32 p_irr = (u32 )(regs + APIC_IRR + i * 0x10);

		irr_val = *p_irr;
		pir_val = READ_ONCE(pir[i]);

		if (pir_val) {
		pir_val = xchg(&pir[i], 0);
		irr_val = READ_ONCE(*p_irr);

		if (__pir[i]) {
		prev_irr_val = irr_val;
		do {
		irr_val = prev_irr_val \| pir_val;
		irr_val = prev_irr_val \| __pir[i];
		} while (prev_irr_val != irr_val &&
		!try_cmpxchg(p_irr, &prev_irr_val, irr_val));

		@@ -691,7 +693,7 @@ bool __kvm_apic_update_irr(u32 pir, void regs, int *max_irr)
		}
		EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);

		bool kvm_apic_update_irr(struct kvm_vcpu vcpu, u32 pir, int *max_irr)
		bool kvm_apic_update_irr(struct kvm_vcpu vcpu, unsigned long pir, int *max_irr)
		{
		struct kvm_lapic *apic = vcpu->arch.apic;
		bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);

arch/x86/kvm/lapic.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -103,8 +103,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu vcpu, struct kvm_lapic source,
		int shorthand, unsigned int dest, int dest_mode);
		int kvm_apic_compare_prio(struct kvm_vcpu vcpu1, struct kvm_vcpu vcpu2);
		void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
		bool __kvm_apic_update_irr(u32 pir, void regs, int *max_irr);
		bool kvm_apic_update_irr(struct kvm_vcpu vcpu, u32 pir, int *max_irr);
		bool __kvm_apic_update_irr(unsigned long pir, void regs, int *max_irr);
		bool kvm_apic_update_irr(struct kvm_vcpu vcpu, unsigned long pir, int *max_irr);
		void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
		int kvm_apic_set_irq(struct kvm_vcpu vcpu, struct kvm_lapic_irq irq,
		struct dest_map *dest_map);

arch/x86/kvm/vmx/posted_intr.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -20,7 +20,7 @@ static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
		{
		int vec;

		vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
		vec = find_last_bit(pi_desc->pir, 256);
		return vec < 256 ? vec : -1;
		}