Loading arch/x86/include/asm/posted_intr.h +71 −7 Original line number Diff line number Diff line /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_POSTED_INTR_H #define _X86_POSTED_INTR_H #include <asm/cmpxchg.h> #include <asm/rwonce.h> #include <asm/irq_vectors.h> #include <linux/bitmap.h> #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 #define PID_TABLE_ENTRY_VALID 1 #define NR_PIR_VECTORS 256 #define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG) /* Posted-Interrupt Descriptor */ struct pi_desc { union { u32 pir[8]; /* Posted interrupt requested */ u64 pir64[4]; }; unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */ union { struct { u16 notifications; /* Suppress and outstanding bits */ Loading @@ -26,6 +31,65 @@ struct pi_desc { u32 rsvd[6]; } __aligned(64); /* * De-multiplexing posted interrupts is on the performance path, the code * below is written to optimize the cache performance based on the following * considerations: * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently * accessed by both CPU and IOMMU. * 2.During software processing of posted interrupts, the CPU needs to do * natural width read and xchg for checking and clearing posted interrupt * request (PIR), a 256 bit field within the PID. * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache * line when posting interrupts and setting control bits. * 4.The CPU can access the cache line a magnitude faster than the IOMMU. * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID * cache line. The cache line states after each operation are as follows, * assuming a 64-bit kernel: * CPU IOMMU PID Cache line state * --------------------------------------------------------------- *...read64 exclusive *...lock xchg64 modified *... post/atomic swap invalid *...------------------------------------------------------------- * * To reduce L1 data cache miss, it is important to avoid contention with * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used * when processing posted interrupts in software, e.g. to dispatch interrupt * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR * in KVM. * * In addition, the code is trying to keep the cache line state consistent * as much as possible. e.g. when making a copy and clearing the PIR * (assuming non-zero PIR bits are present in the entire PIR), it does: * read, read, read, read, xchg, xchg, xchg, xchg * instead of: * read, xchg, read, xchg, read, xchg, read, xchg */ static __always_inline bool pi_harvest_pir(unsigned long *pir, unsigned long *pir_vals) { unsigned long pending = 0; int i; for (i = 0; i < NR_PIR_WORDS; i++) { pir_vals[i] = READ_ONCE(pir[i]); pending |= pir_vals[i]; } if (!pending) return false; for (i = 0; i < NR_PIR_WORDS; i++) { if (!pir_vals[i]) continue; pir_vals[i] = arch_xchg(&pir[i], 0); } return true; } static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) { return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); Loading @@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); return test_and_set_bit(vector, pi_desc->pir); } static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) { return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); return bitmap_empty(pi_desc->pir, NR_VECTORS); } static inline void pi_set_sn(struct pi_desc *pi_desc) Loading Loading @@ -110,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector) if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) return false; return test_bit(vector, (unsigned long *)pid->pir); return test_bit(vector, pid->pir); } extern void intel_posted_msi_init(void); Loading arch/x86/kernel/irq.c +10 −53 Original line number Diff line number Diff line Loading @@ -380,61 +380,18 @@ void intel_posted_msi_init(void) this_cpu_write(posted_msi_pi_desc.ndst, destination); } /* * De-multiplexing posted interrupts is on the performance path, the code * below is written to optimize the cache performance based on the following * considerations: * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently * accessed by both CPU and IOMMU. * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg * for checking and clearing posted interrupt request (PIR), a 256 bit field * within the PID. * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache * line when posting interrupts and setting control bits. * 4.The CPU can access the cache line a magnitude faster than the IOMMU. * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID * cache line. The cache line states after each operation are as follows: * CPU IOMMU PID Cache line state * --------------------------------------------------------------- *...read64 exclusive *...lock xchg64 modified *... post/atomic swap invalid *...------------------------------------------------------------- * * To reduce L1 data cache miss, it is important to avoid contention with * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used * to dispatch interrupt handlers. * * In addition, the code is trying to keep the cache line state consistent * as much as possible. e.g. when making a copy and clearing the PIR * (assuming non-zero PIR bits are present in the entire PIR), it does: * read, read, read, read, xchg, xchg, xchg, xchg * instead of: * read, xchg, read, xchg, read, xchg, read, xchg */ static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) { int i, vec = FIRST_EXTERNAL_VECTOR; unsigned long pir_copy[4]; bool handled = false; unsigned long pir_copy[NR_PIR_WORDS]; int vec = FIRST_EXTERNAL_VECTOR; for (i = 0; i < 4; i++) pir_copy[i] = pir[i]; if (!pi_harvest_pir(pir, pir_copy)) return false; for (i = 0; i < 4; i++) { if (!pir_copy[i]) continue; pir_copy[i] = arch_xchg(&pir[i], 0); handled = true; } if (handled) { for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) call_irq_handler(vec, regs); } return handled; return true; } /* Loading Loading @@ -464,7 +421,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. */ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { if (!handle_pending_pir(pid->pir64, regs)) if (!handle_pending_pir(pid->pir, regs)) break; } Loading @@ -479,7 +436,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * process PIR bits one last time such that handling the new interrupts * are not delayed until the next IRQ. */ handle_pending_pir(pid->pir64, regs); handle_pending_pir(pid->pir, regs); apic_eoi(); irq_exit(); Loading arch/x86/kvm/lapic.c +11 −9 Original line number Diff line number Diff line Loading @@ -655,27 +655,29 @@ static u8 count_vectors(void *bitmap) return count; } bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { unsigned long pir_vals[NR_PIR_WORDS]; u32 *__pir = (void *)pir_vals; u32 i, vec; u32 pir_val, irr_val, prev_irr_val; u32 irr_val, prev_irr_val; int max_updated_irr; max_updated_irr = -1; *max_irr = -1; if (!pi_harvest_pir(pir, pir_vals)) return false; for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); irr_val = *p_irr; pir_val = READ_ONCE(pir[i]); if (pir_val) { pir_val = xchg(&pir[i], 0); irr_val = READ_ONCE(*p_irr); if (__pir[i]) { prev_irr_val = irr_val; do { irr_val = prev_irr_val | pir_val; irr_val = prev_irr_val | __pir[i]; } while (prev_irr_val != irr_val && !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); Loading @@ -691,7 +693,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); Loading arch/x86/kvm/lapic.h +2 −2 Original line number Diff line number Diff line Loading @@ -103,8 +103,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); Loading arch/x86/kvm/vmx/posted_intr.h +1 −1 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ static inline int pi_find_highest_vector(struct pi_desc *pi_desc) { int vec; vec = find_last_bit((unsigned long *)pi_desc->pir, 256); vec = find_last_bit(pi_desc->pir, 256); return vec < 256 ? vec : -1; } Loading Loading
arch/x86/include/asm/posted_intr.h +71 −7 Original line number Diff line number Diff line /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_POSTED_INTR_H #define _X86_POSTED_INTR_H #include <asm/cmpxchg.h> #include <asm/rwonce.h> #include <asm/irq_vectors.h> #include <linux/bitmap.h> #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 #define PID_TABLE_ENTRY_VALID 1 #define NR_PIR_VECTORS 256 #define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG) /* Posted-Interrupt Descriptor */ struct pi_desc { union { u32 pir[8]; /* Posted interrupt requested */ u64 pir64[4]; }; unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */ union { struct { u16 notifications; /* Suppress and outstanding bits */ Loading @@ -26,6 +31,65 @@ struct pi_desc { u32 rsvd[6]; } __aligned(64); /* * De-multiplexing posted interrupts is on the performance path, the code * below is written to optimize the cache performance based on the following * considerations: * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently * accessed by both CPU and IOMMU. * 2.During software processing of posted interrupts, the CPU needs to do * natural width read and xchg for checking and clearing posted interrupt * request (PIR), a 256 bit field within the PID. * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache * line when posting interrupts and setting control bits. * 4.The CPU can access the cache line a magnitude faster than the IOMMU. * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID * cache line. The cache line states after each operation are as follows, * assuming a 64-bit kernel: * CPU IOMMU PID Cache line state * --------------------------------------------------------------- *...read64 exclusive *...lock xchg64 modified *... post/atomic swap invalid *...------------------------------------------------------------- * * To reduce L1 data cache miss, it is important to avoid contention with * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used * when processing posted interrupts in software, e.g. to dispatch interrupt * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR * in KVM. * * In addition, the code is trying to keep the cache line state consistent * as much as possible. e.g. when making a copy and clearing the PIR * (assuming non-zero PIR bits are present in the entire PIR), it does: * read, read, read, read, xchg, xchg, xchg, xchg * instead of: * read, xchg, read, xchg, read, xchg, read, xchg */ static __always_inline bool pi_harvest_pir(unsigned long *pir, unsigned long *pir_vals) { unsigned long pending = 0; int i; for (i = 0; i < NR_PIR_WORDS; i++) { pir_vals[i] = READ_ONCE(pir[i]); pending |= pir_vals[i]; } if (!pending) return false; for (i = 0; i < NR_PIR_WORDS; i++) { if (!pir_vals[i]) continue; pir_vals[i] = arch_xchg(&pir[i], 0); } return true; } static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) { return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); Loading @@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); return test_and_set_bit(vector, pi_desc->pir); } static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) { return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); return bitmap_empty(pi_desc->pir, NR_VECTORS); } static inline void pi_set_sn(struct pi_desc *pi_desc) Loading Loading @@ -110,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector) if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) return false; return test_bit(vector, (unsigned long *)pid->pir); return test_bit(vector, pid->pir); } extern void intel_posted_msi_init(void); Loading
arch/x86/kernel/irq.c +10 −53 Original line number Diff line number Diff line Loading @@ -380,61 +380,18 @@ void intel_posted_msi_init(void) this_cpu_write(posted_msi_pi_desc.ndst, destination); } /* * De-multiplexing posted interrupts is on the performance path, the code * below is written to optimize the cache performance based on the following * considerations: * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently * accessed by both CPU and IOMMU. * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg * for checking and clearing posted interrupt request (PIR), a 256 bit field * within the PID. * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache * line when posting interrupts and setting control bits. * 4.The CPU can access the cache line a magnitude faster than the IOMMU. * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID * cache line. The cache line states after each operation are as follows: * CPU IOMMU PID Cache line state * --------------------------------------------------------------- *...read64 exclusive *...lock xchg64 modified *... post/atomic swap invalid *...------------------------------------------------------------- * * To reduce L1 data cache miss, it is important to avoid contention with * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used * to dispatch interrupt handlers. * * In addition, the code is trying to keep the cache line state consistent * as much as possible. e.g. when making a copy and clearing the PIR * (assuming non-zero PIR bits are present in the entire PIR), it does: * read, read, read, read, xchg, xchg, xchg, xchg * instead of: * read, xchg, read, xchg, read, xchg, read, xchg */ static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) { int i, vec = FIRST_EXTERNAL_VECTOR; unsigned long pir_copy[4]; bool handled = false; unsigned long pir_copy[NR_PIR_WORDS]; int vec = FIRST_EXTERNAL_VECTOR; for (i = 0; i < 4; i++) pir_copy[i] = pir[i]; if (!pi_harvest_pir(pir, pir_copy)) return false; for (i = 0; i < 4; i++) { if (!pir_copy[i]) continue; pir_copy[i] = arch_xchg(&pir[i], 0); handled = true; } if (handled) { for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) call_irq_handler(vec, regs); } return handled; return true; } /* Loading Loading @@ -464,7 +421,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. */ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { if (!handle_pending_pir(pid->pir64, regs)) if (!handle_pending_pir(pid->pir, regs)) break; } Loading @@ -479,7 +436,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * process PIR bits one last time such that handling the new interrupts * are not delayed until the next IRQ. */ handle_pending_pir(pid->pir64, regs); handle_pending_pir(pid->pir, regs); apic_eoi(); irq_exit(); Loading
arch/x86/kvm/lapic.c +11 −9 Original line number Diff line number Diff line Loading @@ -655,27 +655,29 @@ static u8 count_vectors(void *bitmap) return count; } bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { unsigned long pir_vals[NR_PIR_WORDS]; u32 *__pir = (void *)pir_vals; u32 i, vec; u32 pir_val, irr_val, prev_irr_val; u32 irr_val, prev_irr_val; int max_updated_irr; max_updated_irr = -1; *max_irr = -1; if (!pi_harvest_pir(pir, pir_vals)) return false; for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); irr_val = *p_irr; pir_val = READ_ONCE(pir[i]); if (pir_val) { pir_val = xchg(&pir[i], 0); irr_val = READ_ONCE(*p_irr); if (__pir[i]) { prev_irr_val = irr_val; do { irr_val = prev_irr_val | pir_val; irr_val = prev_irr_val | __pir[i]; } while (prev_irr_val != irr_val && !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); Loading @@ -691,7 +693,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); Loading
arch/x86/kvm/lapic.h +2 −2 Original line number Diff line number Diff line Loading @@ -103,8 +103,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); Loading
arch/x86/kvm/vmx/posted_intr.h +1 −1 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ static inline int pi_find_highest_vector(struct pi_desc *pi_desc) { int vec; vec = find_last_bit((unsigned long *)pi_desc->pir, 256); vec = find_last_bit(pi_desc->pir, 256); return vec < 256 ? vec : -1; } Loading