From 6777885605e1538bd8126360c62669c4615690c1 Mon Sep 17 00:00:00 2001
From: Vipin Sharma
Date: Mon, 7 Jul 2025 22:47:14 +0000
Subject: [PATCH 001/142] KVM: x86/mmu: Track possible NX huge pages separately
 for TDP vs. Shadow MMU

Track possible NX huge pages for the TDP MMU separately from Shadow MMUs
in anticipation of doing recovery for the TDP MMU while holding mmu_lock
for read instead of write.

Use a small structure to hold the list of pages along with the number of
pages/entries in the list, as relying on kvm->stat.nx_lpage_splits to
calculate the number of pages to recover would result in over-zapping
when both TDP and Shadow MMUs are active.

Suggested-by: Sean Christopherson
Suggested-by: David Matlack
Signed-off-by: Vipin Sharma
Co-developed-by: James Houghton
Signed-off-by: James Houghton
Link: https://lore.kernel.org/r/20250707224720.4016504-2-jthoughton@google.com
[sean: rewrite changelog, use #ifdef instead of dummy KVM_TDP_MMU #define]
Signed-off-by: Sean Christopherson
---
 arch/x86/include/asm/kvm_host.h | 39 ++++++++++++++---------
 arch/x86/kvm/mmu/mmu.c          | 59 ++++++++++++++++++++++-----------
 arch/x86/kvm/mmu/mmu_internal.h |  6 ++--
 arch/x86/kvm/mmu/tdp_mmu.c      |  4 +--
 4 files changed, 72 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f19a76d3ca0e..c038d7cd187d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1348,6 +1348,30 @@ enum kvm_apicv_inhibit {
 	__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED),	\
 	__APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
 
+struct kvm_possible_nx_huge_pages {
+	/*
+	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
+	 * replaced by an NX huge page.  A shadow page is on this list if its
+	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
+	 * and there are no other conditions that prevent a huge page, e.g.
+	 * the backing host page is huge, dirtly logging is not enabled for its
+	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
+	 * guarantee an NX huge page will be created in its stead, e.g. if the
+	 * guest attempts to execute from the region then KVM obviously can't
+	 * create an NX huge page (without hanging the guest).
+	 */
+	struct list_head pages;
+	u64 nr_pages;
+};
+
+enum kvm_mmu_type {
+	KVM_SHADOW_MMU,
+#ifdef CONFIG_X86_64
+	KVM_TDP_MMU,
+#endif
+	KVM_NR_MMU_TYPES,
+};
+
 struct kvm_arch {
 	unsigned long n_used_mmu_pages;
 	unsigned long n_requested_mmu_pages;
@@ -1360,18 +1384,7 @@ struct kvm_arch {
 	bool pre_fault_allowed;
 	struct hlist_head *mmu_page_hash;
 	struct list_head active_mmu_pages;
-	/*
-	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
-	 * replaced by an NX huge page.  A shadow page is on this list if its
-	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
-	 * and there are no other conditions that prevent a huge page, e.g.
-	 * the backing host page is huge, dirtly logging is not enabled for its
-	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
-	 * guarantee an NX huge page will be created in its stead, e.g. if the
-	 * guest attempts to execute from the region then KVM obviously can't
-	 * create an NX huge page (without hanging the guest).
- */ - struct list_head possible_nx_huge_pages; + struct kvm_possible_nx_huge_pages possible_nx_huge_pages[KVM_NR_MMU_TYPES]; #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; #endif @@ -1526,7 +1539,7 @@ struct kvm_arch { * is held in read mode: * - tdp_mmu_roots (above) * - the link field of kvm_mmu_page structs used by the TDP MMU - * - possible_nx_huge_pages; + * - possible_nx_huge_pages[KVM_TDP_MMU]; * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU * Because the lock is only taken within the MMU lock, strictly diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6e838cb6c9e1..e0d6579db531 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -776,7 +776,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K); } -void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + enum kvm_mmu_type mmu_type) { /* * If it's possible to replace the shadow page with an NX huge page, @@ -790,8 +791,9 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) return; ++kvm->stat.nx_lpage_splits; + ++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages; list_add_tail(&sp->possible_nx_huge_page_link, - &kvm->arch.possible_nx_huge_pages); + &kvm->arch.possible_nx_huge_pages[mmu_type].pages); } static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, @@ -800,7 +802,7 @@ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, sp->nx_huge_page_disallowed = true; if (nx_huge_page_possible) - track_possible_nx_huge_page(kvm, sp); + track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -819,12 +821,14 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + enum kvm_mmu_type mmu_type) { if (list_empty(&sp->possible_nx_huge_page_link)) return; --kvm->stat.nx_lpage_splits; + --kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages; list_del_init(&sp->possible_nx_huge_page_link); } @@ -832,7 +836,7 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { sp->nx_huge_page_disallowed = false; - untrack_possible_nx_huge_page(kvm, sp); + untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU); } static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, @@ -6737,11 +6741,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) int kvm_mmu_init_vm(struct kvm *kvm) { - int r; + int r, i; kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); + for (i = 0; i < KVM_NR_MMU_TYPES; ++i) + INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); if (tdp_mmu_enabled) { @@ -7582,16 +7587,32 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel return err; } -static void kvm_recover_nx_huge_pages(struct kvm *kvm) +static unsigned long nx_huge_pages_to_zap(struct kvm *kvm, + enum kvm_mmu_type mmu_type) { - unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; + unsigned long pages = 
READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages); + unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + + return ratio ? DIV_ROUND_UP(pages, ratio) : 0; +} + +static void kvm_recover_nx_huge_pages(struct kvm *kvm, + enum kvm_mmu_type mmu_type) +{ +#ifdef CONFIG_X86_64 + const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU; +#else + const bool is_tdp_mmu = false; +#endif + unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type); + struct list_head *nx_huge_pages; struct kvm_memory_slot *slot; - int rcu_idx; struct kvm_mmu_page *sp; - unsigned int ratio; LIST_HEAD(invalid_list); bool flush = false; - ulong to_zap; + int rcu_idx; + + nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages; rcu_idx = srcu_read_lock(&kvm->srcu); write_lock(&kvm->mmu_lock); @@ -7603,10 +7624,8 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) */ rcu_read_lock(); - ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { - if (list_empty(&kvm->arch.possible_nx_huge_pages)) + if (list_empty(nx_huge_pages)) break; /* @@ -7616,7 +7635,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) * the total number of shadow pages. And because the TDP MMU * doesn't use active_mmu_pages. */ - sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, + sp = list_first_entry(nx_huge_pages, struct kvm_mmu_page, possible_nx_huge_page_link); WARN_ON_ONCE(!sp->nx_huge_page_disallowed); @@ -7653,7 +7672,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) if (slot && kvm_slot_dirty_track_enabled(slot)) unaccount_nx_huge_page(kvm, sp); - else if (is_tdp_mmu_page(sp)) + else if (is_tdp_mmu) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); @@ -7684,9 +7703,10 @@ static void kvm_nx_huge_page_recovery_worker_kill(void *data) static bool kvm_nx_huge_page_recovery_worker(void *data) { struct kvm *kvm = data; + long remaining_time; bool enabled; uint period; - long remaining_time; + int i; enabled = calc_nx_huge_pages_recovery_period(&period); if (!enabled) @@ -7701,7 +7721,8 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) } __set_current_state(TASK_RUNNING); - kvm_recover_nx_huge_pages(kvm); + for (i = 0; i < KVM_NR_MMU_TYPES; ++i) + kvm_recover_nx_huge_pages(kvm, i); kvm->arch.nx_huge_page_last = get_jiffies_64(); return true; } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 65f3c89d7c5d..0b772c758f37 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -416,7 +416,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); -void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); -void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + enum kvm_mmu_type mmu_type); +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + enum kvm_mmu_type mmu_type); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7f3d7229b2c1..48b070f9f4e1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -355,7 +355,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 
 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 	sp->nx_huge_page_disallowed = false;
-	untrack_possible_nx_huge_page(kvm, sp);
+	untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
 
 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
@@ -1303,7 +1303,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		    fault->req_level >= iter.level) {
 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 			if (sp->nx_huge_page_disallowed)
-				track_possible_nx_huge_page(kvm, sp);
+				track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 		}
 	}

From 62105564226e5711cc117769ff7874970b126efe Mon Sep 17 00:00:00 2001
From: Vipin Sharma
Date: Mon, 7 Jul 2025 22:47:15 +0000
Subject: [PATCH 002/142] KVM: x86/mmu: Rename kvm_tdp_mmu_zap_sp() to better
 indicate its purpose

kvm_tdp_mmu_zap_sp() is only used for NX huge page recovery, so rename it
to kvm_tdp_mmu_zap_possible_nx_huge_page().  In a future commit, this
function will be changed to include logic specific to NX huge page
recovery.

Signed-off-by: Vipin Sharma
Signed-off-by: James Houghton
Link: https://lore.kernel.org/r/20250707224720.4016504-3-jthoughton@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/mmu.c     | 2 +-
 arch/x86/kvm/mmu/tdp_mmu.c | 3 ++-
 arch/x86/kvm/mmu/tdp_mmu.h | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e0d6579db531..a9aafa88de2b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7673,7 +7673,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
 		if (slot && kvm_slot_dirty_track_enabled(slot))
 			unaccount_nx_huge_page(kvm, sp);
 		else if (is_tdp_mmu)
-			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
+			flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
 		else
 			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 48b070f9f4e1..19907eb04a9c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -925,7 +925,8 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
 	rcu_read_unlock();
 }
 
-bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
+					   struct kvm_mmu_page *sp)
 {
 	u64 old_spte;
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 52acf99d40a0..bd62977c9199 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -64,7 +64,8 @@ static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
 }
 
 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
-bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
+bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
+					   struct kvm_mmu_page *sp);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
 				  enum kvm_tdp_mmu_root_types root_types);

From a57750909580a4e3f7278ea6c13336677ea46af6 Mon Sep 17 00:00:00 2001
From: Vipin Sharma
Date: Mon, 7 Jul 2025 22:47:16 +0000
Subject: [PATCH 003/142] KVM: x86/mmu: Recover TDP MMU NX huge pages using MMU
 read lock

Use MMU read lock to recover TDP MMU NX huge pages.  To prevent
concurrent modification of the list of potential huge pages, iterate over
the list under tdp_mmu_pages_lock protection and unaccount the page
before dropping the lock.
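Concretely, the TDP MMU recovery loop ends up shaped roughly as below.
This is a condensed sketch of the diff that follows, not the verbatim
code; the recovery ratio bookkeeping, dirty-logging check, flush batching
and resched points are omitted:

  read_lock(&kvm->mmu_lock);
  for ( ; to_zap; --to_zap) {
  	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
  	if (list_empty(nx_huge_pages)) {
  		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
  		break;
  	}
  	sp = list_first_entry(nx_huge_pages, struct kvm_mmu_page,
  			      possible_nx_huge_page_link);

  	/* Unaccount before dropping the spinlock, per the above. */
  	unaccount_nx_huge_page(kvm, sp);
  	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

  	flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
  }
  read_unlock(&kvm->mmu_lock);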
Zapping under MMU read lock unblocks vCPUs that are waiting for the MMU
read lock, which solves a guest jitter issue on Windows VMs that were
observing an increase in network latency.

Do not zap an SPTE if:
 - The SPTE is a root page.
 - The SPTE does not point at the SP's page table.

If the SPTE does not point at the SP's page table, then something else
has changed the SPTE, so KVM cannot safely zap it.

Warn if zapping an SPTE fails while the current SPTE is still pointing at
the same page table, as it should be impossible for the CMPXCHG to fail
due to all other write scenarios being mutually exclusive.

There is always a race between dirty logging, vCPU faults, and NX huge
page recovery for backing a gfn by an NX huge page or an executable small
page.  Unaccounting sooner during the list traversal increases the window
of that race, but functionally, it is okay.  Accounting doesn't protect
against the iTLB multi-hit bug, it is there purely to prevent KVM from
bouncing a gfn between two page sizes.  The only downside is that a vCPU
will end up doing more work in tearing down all the child SPTEs.  This
should be a very rare race.

Suggested-by: Sean Christopherson
Signed-off-by: Vipin Sharma
Co-developed-by: James Houghton
Signed-off-by: James Houghton
Link: https://lore.kernel.org/r/20250707224720.4016504-4-jthoughton@google.com
[sean: clean up kvm_mmu_sp_dirty_logging_enabled() and the changelog]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/mmu.c     | 104 +++++++++++++++++++++++--------------
 arch/x86/kvm/mmu/tdp_mmu.c |  42 ++++++++++++---
 2 files changed, 100 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a9aafa88de2b..92ff15969a36 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7596,17 +7596,43 @@ static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
 	return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
 }
 
+static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm,
+					     struct kvm_mmu_page *sp)
+{
+	struct kvm_memory_slot *slot;
+
+	/*
+	 * Skip the memslot lookup if dirty tracking can't possibly be enabled,
+	 * as memslot lookups are relatively expensive.
+	 *
+	 * If a memslot update is in progress, reading an incorrect value of
+	 * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
+	 * zero, KVM will do an unnecessary memslot lookup; if it is becoming
+	 * nonzero, the page will be zapped unnecessarily.  Either way, this
+	 * only affects efficiency in racy situations, and not correctness.
+ */ + if (!atomic_read(&kvm->nr_memslots_dirty_logging)) + return false; + + slot = __gfn_to_memslot(kvm_memslots_for_spte_role(kvm, sp->role), sp->gfn); + if (WARN_ON_ONCE(!slot)) + return false; + + return kvm_slot_dirty_track_enabled(slot); +} + static void kvm_recover_nx_huge_pages(struct kvm *kvm, - enum kvm_mmu_type mmu_type) + const enum kvm_mmu_type mmu_type) { #ifdef CONFIG_X86_64 const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU; + spinlock_t *tdp_mmu_pages_lock = &kvm->arch.tdp_mmu_pages_lock; #else const bool is_tdp_mmu = false; + spinlock_t *tdp_mmu_pages_lock = NULL; #endif unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type); struct list_head *nx_huge_pages; - struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); bool flush = false; @@ -7615,7 +7641,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm, nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages; rcu_idx = srcu_read_lock(&kvm->srcu); - write_lock(&kvm->mmu_lock); + if (is_tdp_mmu) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); /* * Zapping TDP MMU shadow pages, including the remote TLB flush, must @@ -7625,8 +7654,14 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm, rcu_read_lock(); for ( ; to_zap; --to_zap) { - if (list_empty(nx_huge_pages)) + if (is_tdp_mmu) + spin_lock(tdp_mmu_pages_lock); + + if (list_empty(nx_huge_pages)) { + if (is_tdp_mmu) + spin_unlock(tdp_mmu_pages_lock); break; + } /* * We use a separate list instead of just using active_mmu_pages @@ -7641,50 +7676,38 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm, WARN_ON_ONCE(!sp->nx_huge_page_disallowed); WARN_ON_ONCE(!sp->role.direct); - /* - * Unaccount and do not attempt to recover any NX Huge Pages - * that are being dirty tracked, as they would just be faulted - * back in as 4KiB pages. The NX Huge Pages in this slot will be - * recovered, along with all the other huge pages in the slot, - * when dirty logging is disabled. - * - * Since gfn_to_memslot() is relatively expensive, it helps to - * skip it if it the test cannot possibly return true. On the - * other hand, if any memslot has logging enabled, chances are - * good that all of them do, in which case unaccount_nx_huge_page() - * is much cheaper than zapping the page. - * - * If a memslot update is in progress, reading an incorrect value - * of kvm->nr_memslots_dirty_logging is not a problem: if it is - * becoming zero, gfn_to_memslot() will be done unnecessarily; if - * it is becoming nonzero, the page will be zapped unnecessarily. - * Either way, this only affects efficiency in racy situations, - * and not correctness. - */ - slot = NULL; - if (atomic_read(&kvm->nr_memslots_dirty_logging)) { - struct kvm_memslots *slots; + unaccount_nx_huge_page(kvm, sp); + + if (is_tdp_mmu) + spin_unlock(tdp_mmu_pages_lock); + + /* + * Do not attempt to recover any NX Huge Pages that are being + * dirty tracked, as they would just be faulted back in as 4KiB + * pages. The NX Huge Pages in this slot will be recovered, + * along with all the other huge pages in the slot, when dirty + * logging is disabled. 
+ */ + if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) { + if (is_tdp_mmu) + flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp); + else + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, sp->gfn); - WARN_ON_ONCE(!slot); } - if (slot && kvm_slot_dirty_track_enabled(slot)) - unaccount_nx_huge_page(kvm, sp); - else if (is_tdp_mmu) - flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp); - else - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); rcu_read_unlock(); - cond_resched_rwlock_write(&kvm->mmu_lock); - flush = false; + if (is_tdp_mmu) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); + flush = false; rcu_read_lock(); } } @@ -7692,7 +7715,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm, rcu_read_unlock(); - write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 19907eb04a9c..31d921705dee 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -928,21 +928,49 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - u64 old_spte; + struct tdp_iter iter = { + .old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0, + .sptep = sp->ptep, + .level = sp->role.level + 1, + .gfn = sp->gfn, + .as_id = kvm_mmu_page_as_id(sp), + }; + + lockdep_assert_held_read(&kvm->mmu_lock); + + if (WARN_ON_ONCE(!is_tdp_mmu_page(sp))) + return false; /* - * This helper intentionally doesn't allow zapping a root shadow page, - * which doesn't have a parent page table and thus no associated entry. + * Root shadow pages don't have a parent page table and thus no + * associated entry, but they can never be possible NX huge pages. */ if (WARN_ON_ONCE(!sp->ptep)) return false; - old_spte = kvm_tdp_mmu_read_spte(sp->ptep); - if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) + /* + * Since mmu_lock is held in read mode, it's possible another task has + * already modified the SPTE. Zap the SPTE if and only if the SPTE + * points at the SP's page table, as checking shadow-present isn't + * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even + * another SP. Note, spte_to_child_pt() also checks that the SPTE is + * shadow-present, i.e. guards against zapping a frozen SPTE. + */ + if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level)) return false; - tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, - SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1); + /* + * If a different task modified the SPTE, then it should be impossible + * for the SPTE to still be used for the to-be-zapped SP. Non-leaf + * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when + * creating non-leaf SPTEs, and all other bits are immutable for non- + * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are + * zapping and replacement. 
+	 */
+	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
+		WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
+		return false;
+	}
 
 	return true;
 }

From e750f85391286a4c8100275516973324b621a269 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 15 Jul 2025 12:06:38 -0700
Subject: [PATCH 004/142] KVM: x86: Don't (re)check L1 intercepts when
 completing userspace I/O

When completing emulation of an instruction that generated a userspace
exit for I/O, don't recheck L1 intercepts as KVM has already finished
that phase of instruction execution, i.e. has already committed to
allowing L2 to perform I/O.  If L1 (or host userspace) modifies the I/O
permission bitmaps during the exit to userspace, KVM will treat the
access as being intercepted despite already having emulated the I/O
access.

Pivot on EMULTYPE_NO_DECODE to detect that KVM is completing emulation.
Of the three users of EMULTYPE_NO_DECODE, only complete_emulated_io()
(the intended "recipient") can reach the code in question.
gp_interception()'s use is mutually exclusive with is_guest_mode(), and
complete_emulated_insn_gp() unconditionally pairs EMULTYPE_NO_DECODE
with EMULTYPE_SKIP.

The bad behavior was detected by a syzkaller program that toggles port
I/O interception during the userspace I/O exit, ultimately resulting in
a WARN on vcpu->arch.pio.count being non-zero due to KVM not completing
emulation of the I/O instruction.

  WARNING: CPU: 23 PID: 1083 at arch/x86/kvm/x86.c:8039 emulator_pio_in_out+0x154/0x170 [kvm]
  Modules linked in: kvm_intel kvm irqbypass
  CPU: 23 UID: 1000 PID: 1083 Comm: repro Not tainted 6.16.0-rc5-c1610d2d66b1-next-vm #74 NONE
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:emulator_pio_in_out+0x154/0x170 [kvm]
  PKRU: 55555554
  Call Trace:
   kvm_fast_pio+0xd6/0x1d0 [kvm]
   vmx_handle_exit+0x149/0x610 [kvm_intel]
   kvm_arch_vcpu_ioctl_run+0xda8/0x1ac0 [kvm]
   kvm_vcpu_ioctl+0x244/0x8c0 [kvm]
   __x64_sys_ioctl+0x8a/0xd0
   do_syscall_64+0x5d/0xc60
   entry_SYSCALL_64_after_hwframe+0x4b/0x53

Reported-by: syzbot+cc2032ba16cc2018ca25@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/68790db4.a00a0220.3af5df.0020.GAE@google.com
Fixes: 8a76d7f25f8f ("KVM: x86: Add x86 callback for intercept check")
Cc: stable@vger.kernel.org
Cc: Jim Mattson
Link: https://lore.kernel.org/r/20250715190638.1899116-1-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/emulate.c     |  9 ++++-----
 arch/x86/kvm/kvm_emulate.h |  3 +--
 arch/x86/kvm/x86.c         | 15 ++++++++-------
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1349e278cd2a..542d3664afa3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5107,12 +5107,11 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt)
 	ctxt->mem_read.end = 0;
 }
 
-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts)
 {
 	const struct x86_emulate_ops *ops = ctxt->ops;
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = ctxt->dst.type;
-	bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt);
 
 	ctxt->mem_read.pos = 0;
 
@@ -5160,7 +5159,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		fetch_possible_mmx_operand(&ctxt->dst);
 	}
 
-	if (unlikely(is_guest_mode) && ctxt->intercept) {
+	if (unlikely(check_intercepts) && ctxt->intercept) {
 		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_PRE_EXCEPT);
 		if (rc != X86EMUL_CONTINUE)
@@ -5189,7 +5188,7 @@ int
x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5243,7 +5242,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index c1df5acfacaf..7b5ddb787a25 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -235,7 +235,6 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); bool (*is_smm)(struct x86_emulate_ctxt *ctxt); - bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); @@ -521,7 +520,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_RESTART 1 #define EMULATION_INTERCEPTED 2 void init_decode_cache(struct x86_emulate_ctxt *ctxt); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1c49bc681c4..79057622fa76 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8470,11 +8470,6 @@ static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) return is_smm(emul_to_vcpu(ctxt)); } -static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt) -{ - return is_guest_mode(emul_to_vcpu(ctxt)); -} - #ifndef CONFIG_KVM_SMM static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) { @@ -8558,7 +8553,6 @@ static const struct x86_emulate_ops emulate_ops = { .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible, .set_nmi_mask = emulator_set_nmi_mask, .is_smm = emulator_is_smm, - .is_guest_mode = emulator_is_guest_mode, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, @@ -9143,7 +9137,14 @@ restart: ctxt->exception.address = 0; } - r = x86_emulate_insn(ctxt); + /* + * Check L1's instruction intercepts when emulating instructions for + * L2, unless KVM is re-emulating a previously decoded instruction, + * e.g. to complete userspace I/O, in which case KVM has already + * checked the intercepts. + */ + r = x86_emulate_insn(ctxt, is_guest_mode(vcpu) && + !(emulation_type & EMULTYPE_NO_DECODE)); if (r == EMULATION_INTERCEPTED) return 1; From a1f2418c3eea05a2a11ec76a1913e9e8e77039ff Mon Sep 17 00:00:00 2001 From: Xin Li Date: Mon, 14 Jul 2025 18:25:17 -0700 Subject: [PATCH 005/142] KVM: VMX: Fix an indentation Fix an indentation by replacing 8 spaces with a tab. While at it, add empty lines before and after for better readability. 
Signed-off-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250715012517.694429-1-xin@zytor.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..92fe8d200335 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8525,7 +8525,9 @@ __init int vmx_hardware_setup(void) */ if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; - kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + return r; } From 65391feb042b2b2cfc4263d64d41a66ef4a7e03e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 Jun 2025 10:16:01 -0700 Subject: [PATCH 006/142] KVM: VMX: Add host MSR read/write helpers to consolidate preemption handling Add host MSR read/write helpers to consolidate preemption handling to prepare for adding FRED RSP0 access functions without duplicating the preemption handling code. Signed-off-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250626171601.2293914-1-xin@zytor.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 92fe8d200335..6ed6c9e97577 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1344,22 +1344,35 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) } #ifdef CONFIG_X86_64 -static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) +static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache) { preempt_disable(); if (vmx->vt.guest_state_loaded) - rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + *cache = read_msr(msr); preempt_enable(); - return vmx->msr_guest_kernel_gs_base; + return *cache; +} + +static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data, + u64 *cache) +{ + preempt_disable(); + if (vmx->vt.guest_state_loaded) + wrmsrns(msr, data); + preempt_enable(); + *cache = data; +} + +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) +{ + return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, + &vmx->msr_guest_kernel_gs_base); } static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { - preempt_disable(); - if (vmx->vt.guest_state_loaded) - wrmsrq(MSR_KERNEL_GS_BASE, data); - preempt_enable(); - vmx->msr_guest_kernel_gs_base = data; + vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data, + &vmx->msr_guest_kernel_gs_base); } #endif From 7cbb14d361bd562fd6e2b3a96c13d5567fd3067a Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Fri, 8 Aug 2025 15:45:32 +0800 Subject: [PATCH 007/142] KVM: TDX: Remove redundant __GFP_ZERO Remove the redundant __GFP_ZERO flag from kcalloc() since kcalloc() inherently zeroes memory. Signed-off-by: Qianfeng Rong Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20250808074532.215098-1-rongqianfeng@vivo.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 66744f5768c8..6784aaaced87 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2480,7 +2480,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. 
*/ kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL); if (!tdcs_pages) goto free_tdr; From 68e61f6fd65610e73b17882f86fedfd784d99229 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 11 Jul 2025 10:27:46 -0700 Subject: [PATCH 008/142] KVM: SVM: Emulate PERF_CNTR_GLOBAL_STATUS_SET for PerfMonV2 Emulate PERF_CNTR_GLOBAL_STATUS_SET when PerfMonV2 is enumerated to the guest, as the MSR is supposed to exist in all AMD v2 PMUs. Fixes: 4a2771895ca6 ("KVM: x86/svm/pmu: Add AMD PerfMonV2 support") Cc: stable@vger.kernel.org Cc: Sandipan Das Link: https://lore.kernel.org/r/20250711172746.1579423-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kvm/pmu.c | 5 +++++ arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/x86.c | 2 ++ 4 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..20fa4a79df13 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -733,6 +733,7 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET 0xc0000303 /* AMD Hardware Feedback Support MSRs */ #define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500 diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 75e9cfc689f8..a84fb3d28885 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -650,6 +650,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = pmu->global_ctrl; break; case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; break; @@ -711,6 +712,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) pmu->global_status &= ~data; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: + if (!msr_info->host_initiated) + pmu->global_status |= data & ~pmu->global_status_rsvd; + break; default: kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); return kvm_pmu_call(set_msr)(vcpu, msr_info); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 288f7f2a46f2..aa4379e46e96 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -113,6 +113,7 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: return pmu->version > 1; default: if (msr > MSR_F15H_PERF_CTR5 && diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 79057622fa76..5dc32f2fe391 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -367,6 +367,7 @@ static const u32 msrs_to_save_pmu[] = { MSR_AMD64_PERF_CNTR_GLOBAL_CTL, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + @@ -7353,6 +7354,7 @@ static void kvm_probe_msr_to_save(u32 msr_index) case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) return; break; From 
0910dd7c9ad45a2605c45fd2bf3d1bcac087687c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:09 -0700 Subject: [PATCH 009/142] KVM: SVM: Skip fastpath emulation on VM-Exit if next RIP isn't valid Skip the WRMSR and HLT fastpaths in SVM's VM-Exit handler if the next RIP isn't valid, e.g. because KVM is running with nrips=false. SVM must decode and emulate to skip the instruction if the CPU doesn't provide the next RIP, and getting the instruction bytes to decode requires reading guest memory. Reading guest memory through the emulator can fault, i.e. can sleep, which is disallowed since the fastpath handlers run with IRQs disabled. BUG: sleeping function called from invalid context at ./include/linux/uaccess.h:106 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 32611, name: qemu preempt_count: 1, expected: 0 INFO: lockdep is turned off. irq event stamp: 30580 hardirqs last enabled at (30579): [] vcpu_run+0x1787/0x1db0 [kvm] hardirqs last disabled at (30580): [] __schedule+0x1e2/0xed0 softirqs last enabled at (30570): [] fpu_swap_kvm_fpstate+0x44/0x210 softirqs last disabled at (30568): [] fpu_swap_kvm_fpstate+0x44/0x210 CPU: 298 UID: 0 PID: 32611 Comm: qemu Tainted: G U 6.16.0-smp--e6c618b51cfe-sleep #782 NONE Tainted: [U]=USER Hardware name: Google Astoria-Turin/astoria, BIOS 0.20241223.2-0 01/17/2025 Call Trace: dump_stack_lvl+0x7d/0xb0 __might_resched+0x271/0x290 __might_fault+0x28/0x80 kvm_vcpu_read_guest_page+0x8d/0xc0 [kvm] kvm_fetch_guest_virt+0x92/0xc0 [kvm] __do_insn_fetch_bytes+0xf3/0x1e0 [kvm] x86_decode_insn+0xd1/0x1010 [kvm] x86_emulate_instruction+0x105/0x810 [kvm] __svm_skip_emulated_instruction+0xc4/0x140 [kvm_amd] handle_fastpath_invd+0xc4/0x1a0 [kvm] vcpu_run+0x11a1/0x1db0 [kvm] kvm_arch_vcpu_ioctl_run+0x5cc/0x730 [kvm] kvm_vcpu_ioctl+0x578/0x6a0 [kvm] __se_sys_ioctl+0x6d/0xb0 do_syscall_64+0x8a/0x2c0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f479d57a94b Note, this is essentially a reapply of commit 5c30e8101e8d ("KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid"), but with different justification (KVM now grabs SRCU when skipping the instruction for other reasons). Fixes: b439eb8ab578 ("Revert "KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid"") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250805190526.1453366-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9931c6c4bc6..829d9d46718d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4181,13 +4181,21 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + /* + * Next RIP must be provided as IRQs are disabled, and accessing guest + * memory to decode the instruction might fault, i.e. might sleep. 
+ */ + if (!nrips || !control->next_rip) + return EXIT_FASTPATH_NONE; if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - switch (svm->vmcb->control.exit_code) { + switch (control->exit_code) { case SVM_EXIT_MSR: - if (!svm->vmcb->control.exit_info_1) + if (!control->exit_info_1) break; return handle_fastpath_set_msr_irqoff(vcpu); case SVM_EXIT_HLT: From 49be82d4ad2ea3329c77b9e6010507baf1c66134 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 28 Jul 2025 17:28:43 +0200 Subject: [PATCH 010/142] arch/x86/kvm/ioapic: Remove license boilerplate with bad FSF address The Free Software Foundation does not reside in "59 Temple Place" anymore, so we should not mention that address in the source code here. But instead of updating the address to their current location, let's rather drop the license boilerplate text here and use a proper SPDX license identifier instead. The text talks about the "GNU *Lesser* General Public License" and "any later version", so LGPL-2.1+ is the right choice here. Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20250728152843.310260-1-thuth@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2b5d389bca5f..2c2783296aed 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later /* * Copyright (C) 2001 MandrakeSoft S.A. * Copyright 2010 Red Hat, Inc. and/or its affiliates. @@ -8,20 +9,6 @@ * http://www.linux-mandrake.com/ * http://www.mandrakesoft.com/ * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Yunhong Jiang * Yaozu (Eddie) Dong * Based on Xen 3.1 code. From 1f0654dc75b8b4945988f802e4e532ff722ecce7 Mon Sep 17 00:00:00 2001 From: Ewan Hai Date: Mon, 18 Aug 2025 04:30:34 -0400 Subject: [PATCH 011/142] KVM: x86: allow CPUID 0xC000_0000 to proceed on Zhaoxin CPUs Bypass the Centaur-only filter for the CPUID signature leaf so that processing continues when the CPU vendor is Zhaoxin. 
Signed-off-by: Ewan Hai Link: https://lore.kernel.org/r/20250818083034.93935-1-ewanhai-oc@zhaoxin.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e2836a255b16..bee8c869259f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1820,7 +1820,8 @@ static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, int r; if (func == CENTAUR_CPUID_SIGNATURE && - boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 0; r = do_cpuid_func(array, func, type); From cc63f918a215a24a72992295566cab5f8f89b08e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 19 Jul 2025 21:58:45 -0400 Subject: [PATCH 012/142] kvm: x86: simplify kvm_vector_to_index() Use find_nth_bit() and make the function almost a one-liner. Signed-off-by: Yury Norov Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8172c2042dd6..5bfa6ec324af 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1064,16 +1064,9 @@ EXPORT_SYMBOL_GPL(kvm_apic_match_dest); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, const unsigned long *bitmap, u32 bitmap_size) { - u32 mod; - int i, idx = -1; - - mod = vector % dest_vcpus; - - for (i = 0; i <= mod; i++) { - idx = find_next_bit(bitmap, bitmap_size, idx + 1); - BUG_ON(idx == bitmap_size); - } + int idx = find_nth_bit(bitmap, bitmap_size, vector % dest_vcpus); + BUG_ON(idx >= bitmap_size); return idx; } From 15daa58e78cef70945324db162d1afbf3f493fb6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:10 -0700 Subject: [PATCH 013/142] KVM: x86: Add kvm_icr_to_lapic_irq() helper to allow for fastpath IPIs Extract the code for converting an ICR message into a kvm_lapic_irq structure into a local helper so that a fast-only IPI path can share the conversion logic. No functional change intended. Link: https://lore.kernel.org/r/20250805190526.1453366-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5bfa6ec324af..68fda104195c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1474,24 +1474,30 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) } EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); +static void kvm_icr_to_lapic_irq(struct kvm_lapic *apic, u32 icr_low, + u32 icr_high, struct kvm_lapic_irq *irq) +{ + /* KVM has no delay and should always clear the BUSY/PENDING flag. */ + WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); + + irq->vector = icr_low & APIC_VECTOR_MASK; + irq->delivery_mode = icr_low & APIC_MODE_MASK; + irq->dest_mode = icr_low & APIC_DEST_MASK; + irq->level = (icr_low & APIC_INT_ASSERT) != 0; + irq->trig_mode = icr_low & APIC_INT_LEVELTRIG; + irq->shorthand = icr_low & APIC_SHORT_MASK; + irq->msi_redir_hint = false; + if (apic_x2apic_mode(apic)) + irq->dest_id = icr_high; + else + irq->dest_id = GET_XAPIC_DEST_FIELD(icr_high); +} + void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) { struct kvm_lapic_irq irq; - /* KVM has no delay and should always clear the BUSY/PENDING flag. 
*/
-	WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
-
-	irq.vector = icr_low & APIC_VECTOR_MASK;
-	irq.delivery_mode = icr_low & APIC_MODE_MASK;
-	irq.dest_mode = icr_low & APIC_DEST_MASK;
-	irq.level = (icr_low & APIC_INT_ASSERT) != 0;
-	irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
-	irq.shorthand = icr_low & APIC_SHORT_MASK;
-	irq.msi_redir_hint = false;
-	if (apic_x2apic_mode(apic))
-		irq.dest_id = icr_high;
-	else
-		irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high);
+	kvm_icr_to_lapic_irq(apic, icr_low, icr_high, &irq);
 
 	trace_kvm_apic_ipi(icr_low, irq.dest_id);

From 777414340085711cafd3807a72c531107c0ff7f6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 5 Aug 2025 12:05:11 -0700
Subject: [PATCH 014/142] KVM: x86: Only allow "fast" IPIs in fastpath
 WRMSR(X2APIC_ICR) handler

Explicitly restrict fastpath ICR writes to IPIs that are "fast", i.e. can
be delivered without having to walk all vCPUs, and that target at most 16
vCPUs.  Artificially restricting ICR writes to physical mode guarantees
at most one vCPU will receive an IPI (because x2APIC IDs are read-only),
but that delivery might not be "fast".  E.g. even if the vCPU exists, KVM
might have to iterate over 4096 vCPUs to find the right one.

Limiting delivery to fast IPIs aligns the WRMSR fastpath with
kvm_arch_set_irq_inatomic() (which also runs with IRQs disabled), and
will allow dropping the semi-arbitrary restrictions on delivery mode and
type.

Link: https://lore.kernel.org/r/20250805190526.1453366-4-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/lapic.c | 27 +++++++++++++++++++++++++--
 arch/x86/kvm/lapic.h |  2 +-
 arch/x86/kvm/x86.c   |  2 +-
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 68fda104195c..252e77c88ae2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2432,7 +2432,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
 
 #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
 
-int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+static int __kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data, bool fast)
 {
 	if (data & X2APIC_ICR_RESERVED_BITS)
 		return 1;
@@ -2447,7 +2447,20 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
 	 */
 	data &= ~APIC_ICR_BUSY;
 
-	kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+	if (fast) {
+		struct kvm_lapic_irq irq;
+		int ignored;
+
+		kvm_icr_to_lapic_irq(apic, (u32)data, (u32)(data >> 32), &irq);
+
+		if (!kvm_irq_delivery_to_apic_fast(apic->vcpu->kvm, apic, &irq,
+						   &ignored, NULL))
+			return -EWOULDBLOCK;
+
+		trace_kvm_apic_ipi((u32)data, irq.dest_id);
+	} else {
+		kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+	}
 
 	if (kvm_x86_ops.x2apic_icr_is_split) {
 		kvm_lapic_set_reg(apic, APIC_ICR, data);
 		kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
@@ -2458,6 +2471,16 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
 	return 0;
 }
 
+static int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+{
+	return __kvm_x2apic_icr_write(apic, data, false);
+}
+
+int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data)
+{
+	return __kvm_x2apic_icr_write(apic, data, true);
+}
+
 static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
 {
 	if (kvm_x86_ops.x2apic_icr_is_split)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 72de14527698..1b2d408816aa 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -137,7 +137,7 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct
kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
 
-int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data);
+int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data);
 int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5dc32f2fe391..1b64c71458a2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2150,7 +2150,7 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 	    ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
 	    ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
 	    ((u32)(data >> 32) != X2APIC_BROADCAST))
-		return kvm_x2apic_icr_write(vcpu->arch.apic, data);
+		return kvm_x2apic_icr_write_fast(vcpu->arch.apic, data);
 
 	return 1;
 }

From aeeb4c7fff525e0fd71ec28162b713b8cb1ec943 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 5 Aug 2025 12:05:12 -0700
Subject: [PATCH 015/142] KVM: x86: Drop semi-arbitrary restrictions on IPI
 type in fastpath

Drop the restrictions on fastpath IPIs only working for fixed IRQs with a
physical destination now that the fastpath is explicitly limited to
"fast" delivery.  Limiting delivery to a single physical APIC ID
guarantees only one vCPU will receive the event, but that isn't
necessarily "fast", e.g. if the targeted vCPU is the last of 4096 vCPUs.
And logical destination mode or shorthand (to self) can also be fast,
e.g. if only a few vCPUs are being targeted.  Lastly, there's nothing
inherently slow about delivering an NMI, INIT, SIPI, SMI, etc., i.e.
there's no reason to artificially limit fastpath delivery to fixed vector
IRQs.

Link: https://lore.kernel.org/r/20250805190526.1453366-5-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b64c71458a2..6d93547526e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2146,13 +2146,7 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
 		return 1;
 
-	if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
-	    ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
-	    ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
-	    ((u32)(data >> 32) != X2APIC_BROADCAST))
-		return kvm_x2apic_icr_write_fast(vcpu->arch.apic, data);
-
-	return 1;
+	return kvm_x2apic_icr_write_fast(vcpu->arch.apic, data);
 }
 
 static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)

From 0a94b2042419f7896f5d362465731506e43bc319 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 5 Aug 2025 12:05:13 -0700
Subject: [PATCH 016/142] KVM: x86: Unconditionally handle MSR_IA32_TSC_DEADLINE
 in fastpath exits

Drop the fastpath VM-Exit requirement that KVM can use the hypervisor
timer to emulate the APIC timer in TSC deadline mode.  I.e.
unconditionally handle MSR_IA32_TSC_DEADLINE WRMSRs in the fastpath.

Restricting the fastpath to *maybe* using the VMX preemption timer is
ineffective and unnecessary.  If the requested deadline can't be
programmed into the VMX preemption timer, KVM will fall back to hrtimers,
i.e. the restriction is ineffective as far as preventing any kind of
worst case scenario.  But guarding against a worst case scenario is
completely unnecessary as the "slow" path, start_sw_tscdeadline() =>
hrtimer_start(), explicitly disables IRQs.
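As a rough sketch (hedged; this mirrors the start_hv_timer()/
start_sw_timer() split in lapic.c but is not the verbatim code), arming
the timer already tries the hypervisor timer first and transparently
falls back to an hrtimer, so skipping the fastpath avoids nothing:

  /* Illustrative only: hv timer first, hrtimer as the fallback. */
  static void restart_apic_timer_sketch(struct kvm_lapic *apic)
  {
  	if (!start_hv_timer(apic))	/* e.g. the VMX preemption timer */
  		start_sw_timer(apic);	/* hrtimer-based software timer */
  }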
In fact, the worst case scenario is when KVM thinks it can use the VMX
preemption timer, as KVM will eat the overhead of calling into
vmx_set_hv_timer() and falling back to hrtimers.

Opportunistically limit kvm_can_use_hv_timer() to lapic.c as the fastpath
code was the only external user.

Stating the obvious, this allows handling MSR_IA32_TSC_DEADLINE writes in
the fastpath on AMD CPUs.

Link: https://lore.kernel.org/r/20250805190526.1453366-6-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/lapic.c | 2 +-
 arch/x86/kvm/lapic.h | 1 -
 arch/x86/kvm/x86.c   | 3 ---
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 252e77c88ae2..0c10b1e003c7 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,7 +130,7 @@ static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
 	       (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
 }
 
-bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
+static bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
 {
 	return kvm_x86_ops.set_hv_timer &&
 	       !(kvm_mwait_in_guest(vcpu->kvm) ||
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1b2d408816aa..8b00e29741de 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -249,7 +249,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
-bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu);
 
 static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6d93547526e6..366c8c7f2e43 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2151,9 +2151,6 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 
 static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
 {
-	if (!kvm_can_use_hv_timer(vcpu))
-		return 1;
-
 	kvm_set_lapic_tscdeadline_msr(vcpu, data);
 	return 0;
 }

From aebcbb60977323c21c8d89eb4298e454f8e89299 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 5 Aug 2025 12:05:14 -0700
Subject: [PATCH 017/142] KVM: x86: Acquire SRCU in WRMSR fastpath iff
 instruction needs to be skipped

Acquire SRCU in the WRMSR fastpath if and only if an instruction needs to
be skipped, i.e. only if the fastpath succeeds.  The reasoning in commit
3f2739bd1e0b ("KVM: x86: Acquire SRCU read lock when handling fastpath
MSR writes") about "avoid having to play whack-a-mole" seems sound, but
in hindsight unconditionally acquiring SRCU does more harm than good.

While acquiring/releasing SRCU isn't slow per se, the things that are
_protected_ by kvm->srcu are generally safe to access only in the "slow"
VM-Exit path.  E.g. accessing memslots in generic helpers is never safe,
because accessing guest memory with IRQs disabled is inherently unsafe
(except when kvm_vcpu_read_guest_atomic() is used, but that API should
never be used in emulation helpers).

In other words, playing whack-a-mole is actually desirable in this case,
because every access to an asset protected by kvm->srcu warrants further
scrutiny.
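For reference, the canonical pattern for touching an asset protected by
kvm->srcu, e.g. memslots, is sketched below (illustrative, not taken from
this patch); anything that needs this pattern generally has no business
running in the IRQs-off fastpath:

  int idx;
  struct kvm_memory_slot *slot;

  idx = srcu_read_lock(&kvm->srcu);
  slot = gfn_to_memslot(kvm, gfn);	/* dereferences kvm->memslots */
  /* ... use the memslot ... */
  srcu_read_unlock(&kvm->srcu, idx);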
Link: https://lore.kernel.org/r/20250805190526.1453366-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 366c8c7f2e43..a5d7ab23d432 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2159,10 +2159,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); u64 data; - fastpath_t ret; bool handled; - - kvm_vcpu_srcu_read_lock(vcpu); + int r; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): @@ -2178,19 +2176,16 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) break; } - if (handled) { - if (!kvm_skip_emulated_instruction(vcpu)) - ret = EXIT_FASTPATH_EXIT_USERSPACE; - else - ret = EXIT_FASTPATH_REENTER_GUEST; - trace_kvm_msr_write(msr, data); - } else { - ret = EXIT_FASTPATH_NONE; - } + if (!handled) + return EXIT_FASTPATH_NONE; + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_skip_emulated_instruction(vcpu); kvm_vcpu_srcu_read_unlock(vcpu); - return ret; + trace_kvm_msr_write(msr, data); + + return r ? EXIT_FASTPATH_REENTER_GUEST : EXIT_FASTPATH_EXIT_USERSPACE; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); From aa2e4f029341c0b56645d49cd5959946cdab31b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:15 -0700 Subject: [PATCH 018/142] KVM: x86: Unconditionally grab data from EDX:EAX in WRMSR fastpath Always grab EDX:EAX in the WRMSR fastpath to deduplicate and simplify the case statements, and to prepare for handling immediate variants of WRMSRNS in the fastpath (the data register is explicitly provided in that case). There's no harm in reading the registers, as their values are always available, i.e. don't require VMREADs (or similarly slow operations). No real functional change intended. Cc: Xin Li Link: https://lore.kernel.org/r/20250805190526.1453366-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a5d7ab23d432..343bb38840dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2157,18 +2157,16 @@ static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { + u64 data = kvm_read_edx_eax(vcpu); u32 msr = kvm_rcx_read(vcpu); - u64 data; bool handled; int r; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): - data = kvm_read_edx_eax(vcpu); handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); break; case MSR_IA32_TSC_DEADLINE: - data = kvm_read_edx_eax(vcpu); handled = !handle_fastpath_set_tscdeadline(vcpu, data); break; default: From d618fb4e43a0287a54551aa01be58c75ac668671 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:16 -0700 Subject: [PATCH 019/142] KVM: x86: Fold WRMSR fastpath helpers into the main handler Fold the per-MSR WRMSR fastpath helpers into the main handler now that the IPI path in particular is relatively tiny. In addition to eliminating a decent amount of boilerplate, this removes the ugly -errno/1/0 => bool conversion (which is "necessitated" by kvm_x2apic_icr_write_fast()). Opportunistically drop the comment about IPIs, as the purpose of the fastpath is hopefully self-evident, and _if_ it needs more documentation, the documentation (and rules!) should be placed in a more central location. No functional change intended. 
Link: https://lore.kernel.org/r/20250805190526.1453366-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 343bb38840dc..f5e933f0e21a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2134,48 +2134,24 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -/* - * The fast path for frequent and performance sensitive wrmsr emulation, - * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces - * the latency of virtual IPI by avoiding the expensive bits of transitioning - * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the - * other cases which must be called after interrupts are enabled on the host. - */ -static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) -{ - if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) - return 1; - - return kvm_x2apic_icr_write_fast(vcpu->arch.apic, data); -} - -static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) -{ - kvm_set_lapic_tscdeadline_msr(vcpu, data); - return 0; -} - fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u64 data = kvm_read_edx_eax(vcpu); u32 msr = kvm_rcx_read(vcpu); - bool handled; int r; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): - handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || + kvm_x2apic_icr_write_fast(vcpu->arch.apic, data)) + return EXIT_FASTPATH_NONE; break; case MSR_IA32_TSC_DEADLINE: - handled = !handle_fastpath_set_tscdeadline(vcpu, data); + kvm_set_lapic_tscdeadline_msr(vcpu, data); break; default: - handled = false; - break; - } - - if (!handled) return EXIT_FASTPATH_NONE; + } kvm_vcpu_srcu_read_lock(vcpu); r = kvm_skip_emulated_instruction(vcpu); From a3e80bf73ee1f89ad4ccd53dd4f2654e23e72529 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:17 -0700 Subject: [PATCH 020/142] KVM: x86/pmu: Move kvm_init_pmu_capability() to pmu.c Move kvm_init_pmu_capability() to pmu.c so that future changes can access variables that have no business being visible outside of pmu.c. kvm_init_pmu_capability() is called once per module load, there's zero reason it needs to be inlined. No functional change intended. Cc: Dapeng Mi Cc: Sandipan Das Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250805190526.1453366-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/pmu.h | 47 +--------------------------------------------- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a84fb3d28885..adb0fb8f6bb7 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -96,6 +96,53 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } +void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) +{ + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; + + /* + * Hybrid PMUs don't play nice with virtualization without careful + * configuration by userspace, and KVM's APIs for reporting supported + * vPMU features do not account for hybrid PMUs.
Disable vPMU support + * for hybrid PMUs until KVM gains a way to let userspace opt-in. + */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + enable_pmu = false; + + if (enable_pmu) { + perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. + */ + if (!kvm_pmu_cap.num_counters_gp || + WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_pmu_cap.version) + enable_pmu = false; + } + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_MAX_NR_FIXED_COUNTERS); + + kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); + kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); +} + static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ad89d0bd6005..13477066eb40 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -180,52 +180,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel; -static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) -{ - bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; - int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; - - /* - * Hybrid PMUs don't play nice with virtualization without careful - * configuration by userspace, and KVM's APIs for reporting supported - * vPMU features do not account for hybrid PMUs. Disable vPMU support - * for hybrid PMUs until KVM gains a way to let userspace opt-in. - */ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - enable_pmu = false; - - if (enable_pmu) { - perf_get_x86_pmu_capability(&kvm_pmu_cap); - - /* - * WARN if perf did NOT disable hardware PMU if the number of - * architecturally required GP counters aren't present, i.e. if - * there are a non-zero number of counters, but fewer than what - * is architecturally required. 
- */ - if (!kvm_pmu_cap.num_counters_gp || - WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) - enable_pmu = false; - else if (is_intel && !kvm_pmu_cap.version) - enable_pmu = false; - } - - if (!enable_pmu) { - memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); - return; - } - - kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); - kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, - pmu_ops->MAX_NR_GP_COUNTERS); - kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, - KVM_MAX_NR_FIXED_COUNTERS); - - kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = - perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); - kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = - perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); -} +void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { From 43f5bea2639ccca59541f33d62342543b4461937 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:18 -0700 Subject: [PATCH 021/142] KVM: x86/pmu: Add wrappers for counting emulated instructions/branches Add wrappers for triggering instruction retired and branch retired PMU events in anticipation of reworking the internal mechanisms to track which PMCs need to be evaluated, e.g. to avoid having to walk and check every PMC. Opportunistically bury "struct kvm_pmu_emulated_event_selectors" in pmu.c. No functional change intended. Link: https://lore.kernel.org/r/20250805190526.1453366-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 22 ++++++++++++++++++---- arch/x86/kvm/pmu.h | 9 ++------- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index adb0fb8f6bb7..9040bf20e851 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -29,8 +29,11 @@ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); -struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; -EXPORT_SYMBOL_GPL(kvm_pmu_eventsel); +struct kvm_pmu_emulated_event_selectors { + u64 INSTRUCTIONS_RETIRED; + u64 BRANCH_INSTRUCTIONS_RETIRED; +}; +static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; /* Precise Distribution of Instructions Retired (PDIR) */ static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { @@ -912,7 +915,7 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user; } -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) +static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -949,7 +952,18 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) kvm_pmu_incr_counter(pmc); } } -EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); + +void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); +} +EXPORT_SYMBOL_GPL(kvm_pmu_instruction_retired); + +void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); +} +EXPORT_SYMBOL_GPL(kvm_pmu_branch_retired); static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 13477066eb40..740af816af37 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -23,11 +23,6 @@ #define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED 
-struct kvm_pmu_emulated_event_selectors { - u64 INSTRUCTIONS_RETIRED; - u64 BRANCH_INSTRUCTIONS_RETIRED; -}; - struct kvm_pmu_ops { struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); @@ -178,7 +173,6 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) } extern struct x86_pmu_capability kvm_pmu_cap; -extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel; void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); @@ -227,7 +221,8 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel); +void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); +void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b8ea1969113d..db2fd4eedc90 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3690,7 +3690,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_branch_retired(vcpu); if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f5e933f0e21a..c44d3d64270b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8820,7 +8820,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) if (unlikely(!r)) return 0; - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_instruction_retired(vcpu); /* * rflags is the old, "raw" value of the flags. The new value has @@ -9161,9 +9161,9 @@ writeback: */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_instruction_retired(vcpu); if (ctxt->is_branch) - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_branch_retired(vcpu); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); From 5dfd498bad5f07b7419b5f834d95f7b61e767048 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:19 -0700 Subject: [PATCH 022/142] KVM: x86/pmu: Calculate set of to-be-emulated PMCs at time of WRMSRs Calculate and track PMCs that are counting instructions/branches retired when the PMC's event selector (or fixed counter control) is modified instead of evaluating the event selector on-demand. Immediately recalc a PMC's configuration on writes to avoid false negatives/positives when KVM skips an emulated WRMSR, which is guaranteed to occur before the main run loop processes KVM_REQ_PMU. Out of an abundance of caution, and because it's relatively cheap, recalc reprogrammed PMCs in kvm_pmu_handle_event() as well. Recalculating in response to KVM_REQ_PMU _should_ be unnecessary, but for now be paranoid to avoid introducing easily-avoidable bugs in edge cases. The code can be removed in the future if necessary, e.g. in the unlikely event that the overhead of recalculating to-be-emulated PMCs is noticeable. Note! Deliberately don't check the PMU event filters, as doing so could result in KVM consuming stale information.
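To illustrate the shape of the write-time recalculation, a minimal user-space sketch (not KVM code; the mask and event values below are illustrative stand-ins for AMD64_RAW_EVENT_MASK_NB and kvm_pmu_eventsel):

    #include <stdint.h>
    #include <stdio.h>

    #define EVENT_MASK            0xffULL /* stand-in for AMD64_RAW_EVENT_MASK_NB */
    #define INSTRUCTIONS_RETIRED  0xc0ULL /* architectural "instructions retired" select */

    static uint64_t eventsel[8];
    static uint64_t pmcs_counting_instructions; /* one bit per PMC */

    /* Invoked on writes to a PMC's event selector, not at emulation time. */
    static void recalc_pmc(int idx)
    {
            pmcs_counting_instructions &= ~(1ULL << idx);

            if (!((eventsel[idx] ^ INSTRUCTIONS_RETIRED) & EVENT_MASK))
                    pmcs_counting_instructions |= 1ULL << idx;
    }

    int main(void)
    {
            eventsel[3] = INSTRUCTIONS_RETIRED;
            recalc_pmc(3);

            /* Emulation time is reduced to a single bitmap test. */
            printf("PMC3 counts instructions: %d\n",
                   !!(pmcs_counting_instructions & (1ULL << 3)));
            return 0;
    }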
Tracking which PMCs are counting branches/instructions will allow grabbing SRCU in the fastpath VM-Exit handlers if and only if a PMC event might be triggered (to consult the event filters), and will also allow the upcoming mediated PMU to do the right thing with respect to counting instructions (the mediated PMU won't be able to update PMCs in the VM-Exit fastpath). Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250805190526.1453366-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/pmu.c | 75 ++++++++++++++++++++++++--------- arch/x86/kvm/pmu.h | 4 ++ 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f19a76d3ca0e..d7680612ba1e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -579,6 +579,9 @@ struct kvm_pmu { DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + DECLARE_BITMAP(pmc_counting_instructions, X86_PMC_IDX_MAX); + DECLARE_BITMAP(pmc_counting_branches, X86_PMC_IDX_MAX); + u64 ds_area; u64 pebs_enable; u64 pebs_enable_rsvd; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 9040bf20e851..8070097c5c4a 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -542,6 +542,47 @@ static int reprogram_counter(struct kvm_pmc *pmc) eventsel & ARCH_PERFMON_EVENTSEL_INT); } +static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel) +{ + /* + * Ignore checks for edge detect (all events currently emulated by KVM + * are always rising edges), pin control (unsupported by modern CPUs), + * and counter mask and its invert flag (KVM doesn't emulate multiple + * events in a single clock cycle). + * + * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit + * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34). + * Checking the "in HLE/RTM transaction" flags is correct as the vCPU + * can't be in a transaction if KVM is emulating an instruction. + * + * Checking the reserved bits might be wrong if they are defined in the + * future, but so could ignoring them, so do the simple thing for now. + */ + return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB); +} + +void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc) +{ + bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1); + bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1); + + /* + * Do NOT consult the PMU event filters, as the filters must be checked + * at the time of emulation to ensure KVM uses fresh information, e.g. + * omitting a PMC from a bitmap could result in a missed event if the + * filter is changed to allow counting the event. 
+ */ + if (!pmc_speculative_in_use(pmc)) + return; + + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1); + + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1); +} +EXPORT_SYMBOL_GPL(kvm_pmu_recalc_pmc_emulation); + void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); @@ -577,6 +618,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) */ if (unlikely(pmu->need_cleanup)) kvm_pmu_cleanup(vcpu); + + kvm_for_each_pmc(pmu, pmc, bit, bitmap) + kvm_pmu_recalc_pmc_emulation(pmu, pmc); } int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) @@ -915,7 +959,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user; } -static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) +static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, + const unsigned long *event_pmcs) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -924,29 +969,17 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX); + if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX)) + return; + if (!kvm_pmu_has_perf_global_ctrl(pmu)) - bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); - else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx, + bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX); + else if (!bitmap_and(bitmap, event_pmcs, (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX)) return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - /* - * Ignore checks for edge detect (all events currently emulated - * but KVM are always rising edges), pin control (unsupported - * by modern CPUs), and counter mask and its invert flag (KVM - * doesn't emulate multiple events in a single clock cycle). - * - * Note, the uppermost nibble of AMD's mask overlaps Intel's - * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved - * bits (bits 35:34). Checking the "in HLE/RTM transaction" - * flags is correct as the vCPU can't be in a transaction if - * KVM is emulating an instruction. Checking the reserved bits - * might be wrong if they are defined in the future, but so - * could ignoring them, so do the simple thing for now. 
- */ - if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) || - !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) + if (!pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); @@ -955,13 +988,13 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) { - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions); } EXPORT_SYMBOL_GPL(kvm_pmu_instruction_retired); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu) { - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches); } EXPORT_SYMBOL_GPL(kvm_pmu_branch_retired); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 740af816af37..cb93a936a177 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -176,8 +176,12 @@ extern struct x86_pmu_capability kvm_pmu_cap; void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); +void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); + static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { + kvm_pmu_recalc_pmc_emulation(pmc_to_pmu(pmc), pmc); + set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } From 6b6f1adc43320494197ddbefbe933faea21e2d10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:20 -0700 Subject: [PATCH 023/142] KVM: x86/pmu: Rename pmc_speculative_in_use() to pmc_is_locally_enabled() Rename pmc_speculative_in_use() to pmc_is_locally_enabled() to better capture what it actually tracks, and to show its relationship to pmc_is_globally_enabled(). While neither AMD nor Intel refer to event selectors or the fixed counter control MSR as "local", it's the obvious name to pair with "global". As for "speculative", there's absolutely nothing speculative about the checks. E.g. for PMUs without PERF_GLOBAL_CTRL, from the guest's perspective, the counters are "in use" without any qualifications. No functional change intended. Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250805190526.1453366-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 6 +++--- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 8070097c5c4a..55b995c3ed08 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -493,7 +493,7 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) static bool pmc_event_is_allowed(struct kvm_pmc *pmc) { - return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && + return pmc_is_globally_enabled(pmc) && pmc_is_locally_enabled(pmc) && check_pmu_event_filter(pmc); } @@ -572,7 +572,7 @@ void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc) * omitting a PMC from a bitmap could result in a missed event if the * filter is changed to allow counting the event. 
*/ - if (!pmc_speculative_in_use(pmc)) + if (!pmc_is_locally_enabled(pmc)) return; if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED)) @@ -912,7 +912,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); kvm_for_each_pmc(pmu, pmc, i, bitmask) { - if (pmc->perf_event && !pmc_speculative_in_use(pmc)) + if (pmc->perf_event && !pmc_is_locally_enabled(pmc)) pmc_stop_counter(pmc); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index cb93a936a177..08ae644db00e 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -160,7 +160,7 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) return NULL; } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 0b173602821b..07baff96300f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -762,7 +762,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) int bit, hw_idx; kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) { - if (!pmc_speculative_in_use(pmc) || + if (!pmc_is_locally_enabled(pmc) || !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; From e630bb52d27f4191324d3443ecb2b115d7fb14b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:21 -0700 Subject: [PATCH 024/142] KVM: x86/pmu: Open code pmc_event_is_allowed() in its callers Open code pmc_event_is_allowed() in its callers, as kvm_pmu_trigger_event() only needs to check the event filter (both global and local enables are consulted outside of the loop). No functional change intended. Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250805190526.1453366-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 55b995c3ed08..fa3464559698 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -491,12 +491,6 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return is_fixed_event_allowed(filter, pmc->idx); } -static bool pmc_event_is_allowed(struct kvm_pmc *pmc) -{ - return pmc_is_globally_enabled(pmc) && pmc_is_locally_enabled(pmc) && - check_pmu_event_filter(pmc); -} - static int reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -507,7 +501,8 @@ static int reprogram_counter(struct kvm_pmc *pmc) emulate_overflow = pmc_pause_counter(pmc); - if (!pmc_event_is_allowed(pmc)) + if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || + !check_pmu_event_filter(pmc)) return 0; if (emulate_overflow) @@ -979,7 +974,8 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - if (!pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) + if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || + !check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); From 58baa649ea09e7b719d0ef51aba894d63c2bb64f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:22 -0700 Subject: [PATCH 025/142] KVM: x86/pmu: Drop redundant check on PMC being globally enabled for emulation When triggering PMC events in response to emulation, drop the redundant checks on a PMC being globally and locally enabled, as the passed in bitmap contains only PMCs that are 
locally enabled (and counting the right event), and the local copy of the bitmap has already been masked with global_ctrl. No true functional change intended. Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250805190526.1453366-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index fa3464559698..744717d379cb 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -974,7 +974,7 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || + if (!pmc_is_locally_enabled(pmc) || !check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) continue; From 8709656514c1106933befcdefdcc5fd9bc013ed2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:23 -0700 Subject: [PATCH 026/142] KVM: x86/pmu: Drop redundant check on PMC being locally enabled for emulation Drop the check on a PMC being locally enabled when triggering emulated events, as the bitmap of passed-in PMCs only contains locally enabled PMCs. Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250805190526.1453366-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 744717d379cb..b7514082e1ac 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -974,8 +974,7 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - if (!pmc_is_locally_enabled(pmc) || - !check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) + if (!check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); From 3eced8b07bb984a3bd2959f0644c14929c848c3b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:24 -0700 Subject: [PATCH 027/142] KVM: x86/pmu: Rename check_pmu_event_filter() to pmc_is_event_allowed() Rename check_pmu_event_filter() to make its polarity more obvious, and to connect the dots to is_gp_event_allowed() and is_fixed_event_allowed(). No functional change intended. 
Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250805190526.1453366-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b7514082e1ac..a1b8223a11f8 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -476,7 +476,7 @@ static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, return true; } -static bool check_pmu_event_filter(struct kvm_pmc *pmc) +static bool pmc_is_event_allowed(struct kvm_pmc *pmc) { struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; @@ -502,7 +502,7 @@ static int reprogram_counter(struct kvm_pmc *pmc) emulate_overflow = pmc_pause_counter(pmc); if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || - !check_pmu_event_filter(pmc)) + !pmc_is_event_allowed(pmc)) return 0; if (emulate_overflow) @@ -974,7 +974,7 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - if (!check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) + if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); From 8bb8b60c95c55c13f9924f3f090232e14d035d43 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:25 -0700 Subject: [PATCH 028/142] KVM: x86: Push acquisition of SRCU in fastpath into kvm_pmu_trigger_event() Acquire SRCU in the VM-Exit fastpath if and only if KVM needs to check the PMU event filter, to further trim the amount of code that is executed with SRCU protection in the fastpath. Counter-intuitively, holding SRCU can do more harm than good due to masking potential bugs, and introducing a new SRCU-protected asset to code reachable via kvm_skip_emulated_instruction() would be quite notable, i.e. definitely worth auditing. E.g. the primary user of kvm->srcu is KVM's memslots, accessing memslots all but guarantees guest memory may be accessed, accessing guest memory can fault, and page faults might sleep, which isn't allowed while IRQs are disabled. Not acquiring SRCU means the (hypothetical) illegal sleep would be flagged when running with PROVE_RCU=y, even if DEBUG_ATOMIC_SLEEP=n. Note, performance is NOT a motivating factor, as SRCU lock/unlock only adds ~15 cycles of latency to fastpath VM-Exits. I.e. overhead isn't a concern _if_ SRCU protection needs to be extended beyond PMU events, e.g. to honor userspace MSR filters. 
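The resulting scoping boils down to "take the reader lock only if there is filter work to do"; a rough single-threaded sketch with a pthread rwlock standing in for SRCU (the names and the early-out condition are illustrative, not KVM's):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t filter_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int nr_armed_pmcs; /* stand-in for the precomputed PMC bitmap */

    static void trigger_event(void)
    {
            /* Lock-free early out: no armed PMCs, no filter access. */
            if (!nr_armed_pmcs)
                    return;

            /* Reader lock held only while the filter is consulted. */
            pthread_rwlock_rdlock(&filter_lock);
            printf("consulting event filter for %d PMCs\n", nr_armed_pmcs);
            pthread_rwlock_unlock(&filter_lock);
    }

    int main(void)
    {
            trigger_event(); /* nothing armed => lock never taken */
            nr_armed_pmcs = 2;
            trigger_event();
            return 0;
    }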
Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250805190526.1453366-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 4 +++- arch/x86/kvm/x86.c | 18 +++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a1b8223a11f8..40b076a7a5f6 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -960,7 +960,7 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; - int i; + int i, idx; BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX); @@ -973,12 +973,14 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX)) return; + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_for_each_pmc(pmu, pmc, i, bitmap) { if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); } void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c44d3d64270b..093bfc8d00b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2138,7 +2138,6 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u64 data = kvm_read_edx_eax(vcpu); u32 msr = kvm_rcx_read(vcpu); - int r; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): @@ -2153,13 +2152,12 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } - kvm_vcpu_srcu_read_lock(vcpu); - r = kvm_skip_emulated_instruction(vcpu); - kvm_vcpu_srcu_read_unlock(vcpu); - trace_kvm_msr_write(msr, data); - return r ? EXIT_FASTPATH_REENTER_GUEST : EXIT_FASTPATH_EXIT_USERSPACE; + if (!kvm_skip_emulated_instruction(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); @@ -11254,13 +11252,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_halt); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) { - int ret; - - kvm_vcpu_srcu_read_lock(vcpu); - ret = kvm_emulate_halt(vcpu); - kvm_vcpu_srcu_read_unlock(vcpu); - - if (!ret) + if (!kvm_emulate_halt(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; if (kvm_vcpu_running(vcpu)) From 6c3d4b917995a17f515943ccd39ba11b81753b0d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 12:05:26 -0700 Subject: [PATCH 029/142] KVM: x86: Add a fastpath handler for INVD Add a fastpath handler for INVD so that the common fastpath logic can be trivially tested on both Intel and AMD. Under KVM, INVD is always: (a) intercepted, (b) available to the guest, and (c) emulated as a nop, with no side effects. Combined with INVD not having any inputs or outputs, i.e. no register constraints, INVD is the perfect instruction for exercising KVM's fastpath as it can be inserted into practically any guest-side code stream. 
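The contract of the new handler is small enough to sketch in full (user-space model; skip_instruction() is a hypothetical stand-in for kvm_skip_emulated_instruction()):

    #include <stdio.h>

    enum fastpath { FASTPATH_NONE, FASTPATH_REENTER_GUEST, FASTPATH_EXIT_USERSPACE };

    /* Hypothetical: returns 0 if the skip must bounce out to userspace. */
    static int skip_instruction(void)
    {
            return 1;
    }

    /* INVD emulates as a nop; only the instruction skip can "fail". */
    static enum fastpath handle_fastpath_invd(void)
    {
            if (!skip_instruction())
                    return FASTPATH_EXIT_USERSPACE;

            return FASTPATH_REENTER_GUEST;
    }

    int main(void)
    {
            printf("fastpath result: %d\n", handle_fastpath_invd());
            return 0;
    }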
Link: https://lore.kernel.org/r/20250805190526.1453366-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 9 +++++++++ arch/x86/kvm/x86.h | 1 + 4 files changed, 14 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 829d9d46718d..f7e1e665a826 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4200,6 +4200,8 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return handle_fastpath_set_msr_irqoff(vcpu); case SVM_EXIT_HLT: return handle_fastpath_hlt(vcpu); + case SVM_EXIT_INVD: + return handle_fastpath_invd(vcpu); default: break; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..95765db52992 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7175,6 +7175,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); case EXIT_REASON_HLT: return handle_fastpath_hlt(vcpu); + case EXIT_REASON_INVD: + return handle_fastpath_invd(vcpu); default: return EXIT_FASTPATH_NONE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 093bfc8d00b3..6e56d5cff44d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2087,6 +2087,15 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) +{ + if (!kvm_emulate_invd(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; +} +EXPORT_SYMBOL_GPL(handle_fastpath_invd); + int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index bcfd9b719ada..46220b04cdf2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -439,6 +439,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; From 3c7cb84145336721eddc86982532df84bbc80853 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Aug 2025 13:22:19 -0700 Subject: [PATCH 030/142] x86/cpufeatures: Add a CPU feature bit for MSR immediate form instructions The immediate form of MSR access instructions is primarily motivated by performance, not code size: by having the MSR number in an immediate, it is available *much* earlier in the pipeline, which allows the hardware much more leeway about how a particular MSR is handled. Use a scattered CPU feature bit for MSR immediate form instructions.
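The enumeration added here can be probed from user space with a plain CPUID query; a minimal sketch using the compiler-provided <cpuid.h> helper:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID.(EAX=7,ECX=1):ECX[5] enumerates the immediate MSR forms. */
            if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
                    return 1;

            printf("MSR immediate forms %ssupported\n",
                   (ecx & (1u << 5)) ? "" : "not ");
            return 0;
    }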
Suggested-by: Borislav Petkov (AMD) Signed-off-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250805202224.1475590-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 06fc0479a23f..eb859299d514 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -495,6 +495,7 @@ #define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */ #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ +#define X86_FEATURE_MSR_IMM (21*32+14) /* MSR immediate form instructions */ /* * BUG word(s) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 6b868afb26c3..cf4ae822bcc0 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_MSR_IMM, CPUID_ECX, 5, 0x00000007, 1 }, { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, From ec400f6c2f2703cb6c698dd00b28cfdb8ee5cdcc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Aug 2025 13:22:20 -0700 Subject: [PATCH 031/142] KVM: x86: Rename local "ecx" variables to "msr" and "pmc" as appropriate Rename "ecx" variables in {RD,WR}MSR and RDPMC helpers to "msr" and "pmc" respectively, in anticipation of adding support for the immediate variants of RDMSR and WRMSRNS, and to better document what the variables hold (versus where the data originated). No functional change intended. Link: https://lore.kernel.org/r/20250805202224.1475590-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e56d5cff44d..f7c5db3d2652 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1573,10 +1573,10 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); + u32 pmc = kvm_rcx_read(vcpu); u64 data; - if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { + if (kvm_pmu_rdpmc(vcpu, pmc, &data)) { kvm_inject_gp(vcpu, 0); return 1; } @@ -2027,23 +2027,23 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); + u32 msr = kvm_rcx_read(vcpu); u64 data; int r; - r = kvm_get_msr_with_filter(vcpu, ecx, &data); + r = kvm_get_msr_with_filter(vcpu, msr, &data); if (!r) { - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(msr, data); kvm_rax_write(vcpu, data & -1u); kvm_rdx_write(vcpu, (data >> 32) & -1u); } else { /* MSR read failed? 
See if we should ask user space */ - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0, + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0, complete_fast_rdmsr, r)) return 0; - trace_kvm_msr_read_ex(ecx); + trace_kvm_msr_read_ex(msr); } return kvm_x86_call(complete_emulated_msr)(vcpu, r); @@ -2052,23 +2052,23 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); + u32 msr = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); int r; - r = kvm_set_msr_with_filter(vcpu, ecx, data); + r = kvm_set_msr_with_filter(vcpu, msr, data); if (!r) { - trace_kvm_msr_write(ecx, data); + trace_kvm_msr_write(msr, data); } else { /* MSR write failed? See if we should ask user space */ - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data, + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data, complete_fast_msr_access, r)) return 0; /* Signal all other negative errors to userspace */ if (r < 0) return r; - trace_kvm_msr_write_ex(ecx, data); + trace_kvm_msr_write_ex(msr, data); } return kvm_x86_call(complete_emulated_msr)(vcpu, r); From 87a877de367d835b527d1086f75727123ef85fc4 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Aug 2025 13:22:21 -0700 Subject: [PATCH 032/142] KVM: x86: Rename handle_fastpath_set_msr_irqoff() to handle_fastpath_wrmsr() Rename the WRMSR fastpath API to drop "irqoff", as that information is redundant (the fastpath always runs with IRQs disabled), and to prepare for adding a fastpath for the immediate variant of WRMSRNS. No functional change intended. Signed-off-by: Xin Li (Intel) [sean: split to separate patch, write changelog] Link: https://lore.kernel.org/r/20250805202224.1475590-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- arch/x86/kvm/x86.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f7e1e665a826..ca550c4fa174 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4197,7 +4197,7 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) case SVM_EXIT_MSR: if (!control->exit_info_1) break; - return handle_fastpath_set_msr_irqoff(vcpu); + return handle_fastpath_wrmsr(vcpu); case SVM_EXIT_HLT: return handle_fastpath_hlt(vcpu); case SVM_EXIT_INVD: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 95765db52992..ae2c8c10e5d2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7170,7 +7170,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: - return handle_fastpath_set_msr_irqoff(vcpu); + return handle_fastpath_wrmsr(vcpu); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); case EXIT_REASON_HLT: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f7c5db3d2652..85e40d61d18b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2143,7 +2143,7 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) { u64 data = kvm_read_edx_eax(vcpu); u32 msr = kvm_rcx_read(vcpu); @@ -2168,7 +2168,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_REENTER_GUEST; } 
-EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); +EXPORT_SYMBOL_GPL(handle_fastpath_wrmsr); /* * Adapt set_msr() to msr_io()'s calling convention diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 46220b04cdf2..2dab9c9d6199 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -437,7 +437,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, void *insn, int insn_len); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); -fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); From 885df2d2109a60f84d84639ce6d95a91045f6c45 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Aug 2025 13:22:22 -0700 Subject: [PATCH 033/142] KVM: x86: Add support for RDMSR/WRMSRNS w/ immediate on Intel Add support for the immediate forms of RDMSR and WRMSRNS (currently Intel-only). The immediate variants are only valid in 64-bit mode, and use a single general purpose register for the data (the register is also encoded in the instruction, i.e. not implicit like regular RDMSR/WRMSR). The immediate variants are primarily motivated by performance, not code size: by having the MSR index in an immediate, it is available *much* earlier in the CPU pipeline, which allows hardware much more leeway about how a particular MSR is handled. Intel VMX support for the immediate forms of MSR accesses communicates exit information to the host as follows: 1) The immediate form of RDMSR uses VM-Exit Reason 84. 2) The immediate form of WRMSRNS uses VM-Exit Reason 85. 3) For both VM-Exit reasons 84 and 85, the Exit Qualification field is set to the MSR index that triggered the VM-Exit. 4) Bits 3 ~ 6 of the VM-Exit Instruction Information field are set to the register encoding used by the immediate form of the instruction, i.e. the destination register for RDMSR, and the source for WRMSRNS. 5) The VM-Exit Instruction Length field records the size of the immediate form of the MSR instruction. To deal with userspace RDMSR exits, stash the destination register in a new kvm_vcpu_arch field, similar to cui_linear_rip, pio, etc. Alternatively, the register could be saved in kvm_run.msr or re-retrieved from the VMCS, but the former would require sanitizing the value to ensure userspace doesn't clobber the value to an out-of-bounds index, and the latter would require a new one-off kvm_x86_ops hook. Don't bother adding support for the instructions in KVM's emulator, as the only way for RDMSR/WRMSR to be encountered is if KVM is emulating large swaths of code due to invalid guest state, and a vCPU cannot have invalid guest state while in 64-bit mode. 
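For reference, extracting the GPR encoding from bits 3:6 of the VM-Exit Instruction Information field is a single shift-and-mask, per the vmx_get_instr_info_reg() helper added below; a user-space sketch with a made-up field value:

    #include <stdint.h>
    #include <stdio.h>

    /* Bits 3:6 of the instruction info hold the GPR encoding (0-15). */
    static int instr_info_reg(uint32_t instr_info)
    {
            return (instr_info >> 3) & 0xf;
    }

    int main(void)
    {
            uint32_t instr_info = 0x48; /* hypothetical: encodes register 9 */

            printf("reg = %d\n", instr_info_reg(instr_info));
            return 0;
    }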
Signed-off-by: Xin Li (Intel) [sean: minor tweaks, massage and expand changelog] Link: https://lore.kernel.org/r/20250805202224.1475590-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/include/uapi/asm/vmx.h | 6 +++- arch/x86/kvm/vmx/nested.c | 13 ++++++-- arch/x86/kvm/vmx/vmx.c | 21 +++++++++++++ arch/x86/kvm/vmx/vmx.h | 5 +++ arch/x86/kvm/x86.c | 55 +++++++++++++++++++++++++++------ 6 files changed, 90 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d7680612ba1e..dbdec6025fde 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -929,6 +929,7 @@ struct kvm_vcpu_arch { bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); unsigned long cui_linear_rip; + int cui_rdmsr_imm_reg; gpa_t time; s8 pvclock_tsc_shift; @@ -2158,7 +2159,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiat int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); int kvm_emulate_as_nop(struct kvm_vcpu *vcpu); int kvm_emulate_invd(struct kvm_vcpu *vcpu); int kvm_emulate_mwait(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index f0f4a4cf84a7..9792e329343e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -94,6 +94,8 @@ #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 #define EXIT_REASON_TDCALL 77 +#define EXIT_REASON_MSR_READ_IMM 84 +#define EXIT_REASON_MSR_WRITE_IMM 85 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -158,7 +160,9 @@ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ { EXIT_REASON_NOTIFY, "NOTIFY" }, \ - { EXIT_REASON_TDCALL, "TDCALL" } + { EXIT_REASON_TDCALL, "TDCALL" }, \ + { EXIT_REASON_MSR_READ_IMM, "MSR_READ_IMM" }, \ + { EXIT_REASON_MSR_WRITE_IMM, "MSR_WRITE_IMM" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index db2fd4eedc90..798776dddd43 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6216,19 +6216,26 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, union vmx_exit_reason exit_reason) { - u32 msr_index = kvm_rcx_read(vcpu); + u32 msr_index; gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return true; + if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) + msr_index = vmx_get_exit_qual(vcpu); + else + msr_index = kvm_rcx_read(vcpu); + /* * The MSR_BITMAP page is divided into four 1024-byte bitmaps, * for the four combinations of read/write and low/high MSR numbers. 
* First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; - if (exit_reason.basic == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE || + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; @@ -6527,6 +6534,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: + case EXIT_REASON_MSR_READ_IMM: + case EXIT_REASON_MSR_WRITE_IMM: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); case EXIT_REASON_INVALID_STATE: return true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ae2c8c10e5d2..44423d5f0e27 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6003,6 +6003,23 @@ static int handle_notify(struct kvm_vcpu *vcpu) return 1; } +static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu) +{ + return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO)); +} + +static int handle_rdmsr_imm(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu), + vmx_get_msr_imm_reg(vcpu)); +} + +static int handle_wrmsr_imm(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), + vmx_get_msr_imm_reg(vcpu)); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6061,6 +6078,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, [EXIT_REASON_NOTIFY] = handle_notify, + [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, + [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, }; static const int kvm_vmx_max_exit_handlers = @@ -6495,6 +6514,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_reason.basic == EXIT_REASON_MSR_WRITE) return kvm_emulate_wrmsr(vcpu); + else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) + return handle_wrmsr_imm(vcpu); else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) return handle_preemption_timer(vcpu); else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d3389baf3ab3..24d65dac5e89 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -706,6 +706,11 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) void dump_vmcs(struct kvm_vcpu *vcpu); +static inline int vmx_get_instr_info_reg(u32 vmx_instr_info) +{ + return (vmx_instr_info >> 3) & 0xf; +} + static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) { return (vmx_instr_info >> 28) & 0xf; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 85e40d61d18b..efd45b2e8f45 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1991,6 +1991,15 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) return complete_fast_msr_access(vcpu); } +static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu) +{ + if (!vcpu->run->msr.error) + kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, + vcpu->run->msr.data); + + return complete_fast_msr_access(vcpu); +} + static u64 kvm_msr_reason(int r) { switch (r) { @@ -2025,39 +2034,53 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, return 1; } -int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +static int __kvm_emulate_rdmsr(struct kvm_vcpu 
*vcpu, u32 msr, int reg, + int (*complete_rdmsr)(struct kvm_vcpu *)) { - u32 msr = kvm_rcx_read(vcpu); u64 data; int r; r = kvm_get_msr_with_filter(vcpu, msr, &data); - if (!r) { trace_kvm_msr_read(msr, data); - kvm_rax_write(vcpu, data & -1u); - kvm_rdx_write(vcpu, (data >> 32) & -1u); + if (reg < 0) { + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + } else { + kvm_register_write(vcpu, reg, data); + } } else { /* MSR read failed? See if we should ask user space */ if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0, - complete_fast_rdmsr, r)) + complete_rdmsr, r)) return 0; trace_kvm_msr_read_ex(msr); } return kvm_x86_call(complete_emulated_msr)(vcpu, r); } + +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1, + complete_fast_rdmsr); +} EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); -int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + vcpu->arch.cui_rdmsr_imm_reg = reg; + + return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm); +} +EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr_imm); + +static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - u32 msr = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); int r; r = kvm_set_msr_with_filter(vcpu, msr, data); - if (!r) { trace_kvm_msr_write(msr, data); } else { @@ -2073,8 +2096,20 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) return kvm_x86_call(complete_emulated_msr)(vcpu, r); } + +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); +} +EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr_imm); + int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { return kvm_skip_emulated_instruction(vcpu); From ec93675a325191c95bc322a4471c2f194f7211fc Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Aug 2025 13:22:23 -0700 Subject: [PATCH 034/142] KVM: VMX: Support the immediate form of WRMSRNS in the VM-Exit fastpath Add support for handling "WRMSRNS with an immediate" VM-Exits in KVM's fastpath. On Intel, all writes to the x2APIC ICR and to the TSC Deadline MSR are non-serializing, i.e. it's highly likely guest kernels will switch to using WRMSRNS when possible. And in general, any MSR written via WRMSRNS is probably worth handling in the fastpath, as the entire point of WRMSRNS is to shave cycles in hot paths. 
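Reduced to its essence, the immediate-form fastpath differs from the legacy one only in where the MSR index and data come from; a user-space sketch (the register file and values are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t regs[16]; /* stand-in for the guest GPR file */

    static void handle_wrmsr_imm(uint32_t exit_qual, int reg)
    {
            uint32_t msr = exit_qual;  /* MSR index lives in the exit qualification */
            uint64_t data = regs[reg]; /* data comes from the encoded GPR */

            printf("WRMSRNS imm: msr=%#x data=%#llx\n",
                   msr, (unsigned long long)data);
    }

    int main(void)
    {
            regs[9] = 0x123;
            handle_wrmsr_imm(0x6e0 /* MSR_IA32_TSC_DEADLINE */, 9);
            return 0;
    }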
Signed-off-by: Xin Li (Intel) [sean: rewrite changelog, split rename to separate patch] Link: https://lore.kernel.org/r/20250805202224.1475590-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 3 +++ arch/x86/kvm/x86.c | 17 +++++++++++++---- arch/x86/kvm/x86.h | 1 + 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 44423d5f0e27..a3f0d458be9d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7192,6 +7192,9 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_wrmsr(vcpu); + case EXIT_REASON_MSR_WRITE_IMM: + return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), + vmx_get_msr_imm_reg(vcpu)); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); case EXIT_REASON_HLT: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index efd45b2e8f45..b47a6a4ced15 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2178,11 +2178,8 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) +static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - u64 data = kvm_read_edx_eax(vcpu); - u32 msr = kvm_rcx_read(vcpu); - switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || @@ -2203,8 +2200,20 @@ fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_REENTER_GUEST; } + +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) +{ + return __handle_fastpath_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} EXPORT_SYMBOL_GPL(handle_fastpath_wrmsr); +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); +} +EXPORT_SYMBOL_GPL(handle_fastpath_wrmsr_imm); + /* * Adapt set_msr() to msr_io()'s calling convention */ diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2dab9c9d6199..eb3088684e8a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -438,6 +438,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); From d90ebf5a06ecc6d4d7f72d12c5029519af8f838d Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Aug 2025 13:22:24 -0700 Subject: [PATCH 035/142] KVM: x86: Advertise support for the immediate form of MSR instructions Advertise support for the immediate form of MSR instructions to userspace if the instructions are supported by the underlying CPU, and KVM is using VMX, i.e. is running on an Intel-compatible CPU. For SVM, explicitly clear X86_FEATURE_MSR_IMM to ensure KVM doesn't over- report support if AMD-compatible CPUs ever implement the immediate forms, as SVM will likely require explicit enablement in KVM. 
Signed-off-by: Xin Li (Intel) [sean: massage changelog] Link: https://lore.kernel.org/r/20250805202224.1475590-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 6 +++++- arch/x86/kvm/reverse_cpuid.h | 5 +++++ arch/x86/kvm/svm/svm.c | 6 +++++- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dbdec6025fde..735b5d1e62dd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -774,6 +774,7 @@ enum kvm_only_cpuid_leafs { CPUID_7_2_EDX, CPUID_24_0_EBX, CPUID_8000_0021_ECX, + CPUID_7_1_ECX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bee8c869259f..2705545f67fe 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -985,6 +985,10 @@ void kvm_set_cpu_caps(void) F(LAM), ); + kvm_cpu_cap_init(CPUID_7_1_ECX, + SCATTERED_F(MSR_IMM), + ); + kvm_cpu_cap_init(CPUID_7_1_EDX, F(AVX_VNNI_INT8), F(AVX_NE_CONVERT), @@ -1411,9 +1415,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; cpuid_entry_override(entry, CPUID_7_1_EAX); + cpuid_entry_override(entry, CPUID_7_1_ECX); cpuid_entry_override(entry, CPUID_7_1_EDX); entry->ebx = 0; - entry->ecx = 0; } if (max_idx >= 2) { entry = do_host_cpuid(array, function, 2); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index c53b92379e6e..743ab25ba787 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -25,6 +25,9 @@ #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) #define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11) +/* Intel-defined sub-features, CPUID level 0x00000007:1 (ECX) */ +#define KVM_X86_FEATURE_MSR_IMM KVM_X86_FEATURE(CPUID_7_1_ECX, 5) + /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) @@ -87,6 +90,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, + [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, }; /* @@ -128,6 +132,7 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO); KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO); + KVM_X86_TRANSLATE_FEATURE(MSR_IMM); default: return x86_feature; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ca550c4fa174..7e7821ee8ee1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5311,8 +5311,12 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); - /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ + /* + * Clear capabilities that are automatically configured by common code, + * but that require explicit SVM support (that isn't yet implemented). 
+ */ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); + kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); } static __init int svm_hardware_setup(void) From d2dcf25a4cf2d9058a866c2237884287209b8d19 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Mon, 11 Aug 2025 19:55:09 -0700 Subject: [PATCH 036/142] KVM: x86: Rename kvm_{g,s}et_msr()* to show that they emulate guest accesses Rename kvm_{g,s}et_msr_with_filter() and kvm_{g,s}et_msr() to kvm_emulate_msr_{read,write}() and __kvm_emulate_msr_{read,write}() respectively, to make it more obvious that KVM uses these helpers to emulate guest behaviors, i.e., host_initiated == false in these helpers. Suggested-by: Sean Christopherson Suggested-by: Chao Gao Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Tested-by: Mathias Krause Tested-by: John Allen Signed-off-by: Chao Gao Tested-by: Rick Edgecombe Link: https://lore.kernel.org/r/20250812025606.74625-2-chao.gao@intel.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 8 ++++---- arch/x86/kvm/smm.c | 4 ++-- arch/x86/kvm/vmx/nested.c | 14 +++++++------- arch/x86/kvm/x86.c | 27 ++++++++++++++------------- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 735b5d1e62dd..3f59554df6b1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2154,11 +2154,11 @@ void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); -int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data); -int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data); +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); +int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 9864c057187d..5dd8a1646800 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -529,7 +529,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, vcpu->arch.smbase = smstate->smbase; - if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) + if (__kvm_emulate_msr_write(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) return X86EMUL_UNHANDLEABLE; rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR); @@ -620,7 +620,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) /* And finally go back to 32-bit mode. 
*/ efer = 0; - kvm_set_msr(vcpu, MSR_EFER, efer); + __kvm_emulate_msr_write(vcpu, MSR_EFER, efer); } #endif diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 798776dddd43..2156c9a854f4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -997,7 +997,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { + if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -1033,7 +1033,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, } } - if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { + if (kvm_emulate_msr_read(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; @@ -2770,8 +2770,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl))) { + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -4758,8 +4758,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl)); + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl)); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ @@ -4937,7 +4937,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { + if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b47a6a4ced15..e1eff02f37c7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1933,33 +1933,33 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, __kvm_get_msr); } -int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) return KVM_MSR_RET_FILTERED; return kvm_get_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); +EXPORT_SYMBOL_GPL(kvm_emulate_msr_read); -int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) return KVM_MSR_RET_FILTERED; return kvm_set_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); +EXPORT_SYMBOL_GPL(kvm_emulate_msr_write); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { return kvm_get_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_get_msr); +EXPORT_SYMBOL_GPL(__kvm_emulate_msr_read); -int 
kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { return kvm_set_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_set_msr); +EXPORT_SYMBOL_GPL(__kvm_emulate_msr_write); static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) { @@ -2040,7 +2040,8 @@ static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg, u64 data; int r; - r = kvm_get_msr_with_filter(vcpu, msr, &data); + r = kvm_emulate_msr_read(vcpu, msr, &data); + if (!r) { trace_kvm_msr_read(msr, data); @@ -2080,7 +2081,7 @@ static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int r; - r = kvm_set_msr_with_filter(vcpu, msr, data); + r = kvm_emulate_msr_write(vcpu, msr, data); if (!r) { trace_kvm_msr_write(msr, data); } else { @@ -8366,7 +8367,7 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); + r = kvm_emulate_msr_read(vcpu, msr_index, pdata); if (r < 0) return X86EMUL_UNHANDLEABLE; @@ -8389,7 +8390,7 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_set_msr_with_filter(vcpu, msr_index, data); + r = kvm_emulate_msr_write(vcpu, msr_index, data); if (r < 0) return X86EMUL_UNHANDLEABLE; @@ -8409,7 +8410,7 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); } static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc) From db07f3d0eb19663d8fb61b40c19b26703c9a1b1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Aug 2025 19:55:10 -0700 Subject: [PATCH 037/142] KVM: x86: Use double-underscore read/write MSR helpers as appropriate Use the double-underscore helpers for emulating MSR reads and writes in the no-underscore versions to better capture the relationship between the two sets of APIs (the double-underscore versions don't honor userspace MSR filters). No functional change intended. 
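For illustration only, a toy standalone model of the naming convention (these are not the real KVM helpers): the no-underscore variant applies the userspace MSR filter and defers to the double-underscore variant for the raw access:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  static bool msr_allowed(uint32_t index)
  {
      return index != 0xdead;    /* toy stand-in for the userspace filter */
  }

  /* Double underscore: bypasses filtering, mirroring the kernel convention. */
  static int __msr_read(uint32_t index, uint64_t *data)
  {
      *data = 0x42;    /* pretend to read the MSR */
      return 0;
  }

  /* No underscore: consult the filter, then defer to the inner helper. */
  static int msr_read(uint32_t index, uint64_t *data)
  {
      if (!msr_allowed(index))
          return 1;    /* KVM_MSR_RET_FILTERED in the real code */

      return __msr_read(index, data);
  }

  int main(void)
  {
      uint64_t data;

      printf("allowed: %d, filtered: %d\n",
             msr_read(0x10, &data), msr_read(0xdead, &data));
      return 0;
  }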
Signed-off-by: Chao Gao Tested-by: Rick Edgecombe Link: https://lore.kernel.org/r/20250812025606.74625-3-chao.gao@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e1eff02f37c7..52ad2d2f41cb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1933,22 +1933,6 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, __kvm_get_msr); } -int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) -{ - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return KVM_MSR_RET_FILTERED; - return kvm_get_msr_ignored_check(vcpu, index, data, false); -} -EXPORT_SYMBOL_GPL(kvm_emulate_msr_read); - -int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) -{ - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return KVM_MSR_RET_FILTERED; - return kvm_set_msr_ignored_check(vcpu, index, data, false); -} -EXPORT_SYMBOL_GPL(kvm_emulate_msr_write); - int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { return kvm_get_msr_ignored_check(vcpu, index, data, false); @@ -1961,6 +1945,25 @@ int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(__kvm_emulate_msr_write); +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_read(vcpu, index, data); +} +EXPORT_SYMBOL_GPL(kvm_emulate_msr_read); + +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_write(vcpu, index, data); +} +EXPORT_SYMBOL_GPL(kvm_emulate_msr_write); + + static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) { if (!vcpu->run->msr.error) { From c2aa58b226abf5ac6d355fb1f3b7c4284a7b5cab Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Mon, 11 Aug 2025 19:55:11 -0700 Subject: [PATCH 038/142] KVM: x86: Add kvm_msr_{read,write}() helpers Wrap __kvm_{get,set}_msr() into two new helpers for KVM usage and use the helpers to replace existing usage of the raw functions. kvm_msr_{read,write}() are KVM-internal helpers, i.e. used when KVM needs to get/set a MSR value for emulating CPU behavior, i.e., host_initiated == %true in the helpers. 
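A standalone sketch of the host_initiated split (the types and the visibility check are illustrative stand-ins, not KVM's):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Toy model: host-initiated accesses skip guest-visibility checks. */
  static int get_msr(uint32_t index, uint64_t *data, bool host_initiated)
  {
      /* e.g. an MSR the guest's CPUID does not expose */
      if (!host_initiated && index == 0x10a)
          return 1;    /* would be #GP to the guest */

      *data = 0x42;
      return 0;
  }

  /* kvm_msr_read() analog: KVM-internal, host_initiated == true. */
  static int msr_read(uint32_t index, uint64_t *data)
  {
      return get_msr(index, data, true);
  }

  int main(void)
  {
      uint64_t v;

      printf("host read: %d, guest read: %d\n",
             msr_read(0x10a, &v), get_msr(0x10a, &v, false));
      return 0;
  }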
Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Tested-by: Mathias Krause Tested-by: John Allen Signed-off-by: Chao Gao Tested-by: Rick Edgecombe Link: https://lore.kernel.org/r/20250812025606.74625-4-chao.gao@intel.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/x86.c | 16 +++++++++++++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3f59554df6b1..b1f87b5750f8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2156,9 +2156,10 @@ void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); -int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); +int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2705545f67fe..ad6cadf09930 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -2006,7 +2006,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, if (function == 7 && index == 0) { u64 data; if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) && - !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + !kvm_msr_read(vcpu, MSR_IA32_TSX_CTRL, &data) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(feature_bit(RTM) | feature_bit(HLE)); } else if (function == 0x80000007) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 52ad2d2f41cb..59a431dce015 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1899,8 +1899,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) +static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; int ret; @@ -1926,6 +1926,16 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return ret; } +int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return __kvm_set_msr(vcpu, index, data, true); +} + +int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + return __kvm_get_msr(vcpu, index, data, true); +} + static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -12472,7 +12482,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); - __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); + kvm_msr_write(vcpu, MSR_IA32_XSS, 0); } /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. 
*/ From 41f6710f99f4337924e3929e8e7a51c74f800b91 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Aug 2025 19:55:12 -0700 Subject: [PATCH 039/142] KVM: x86: Manually clear MPX state only on INIT Don't manually clear/zero MPX state on RESET, as the guest FPU state is zero allocated and KVM only does RESET during vCPU creation, i.e. the relevant state is guaranteed to be all zeroes. Opportunistically move the relevant code into a helper in anticipation of adding support for CET shadow stacks, which also has state that is zeroed on INIT. Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Signed-off-by: Chao Gao Tested-by: Rick Edgecombe Link: https://lore.kernel.org/r/20250812025606.74625-5-chao.gao@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59a431dce015..d398ee84c8f3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12398,6 +12398,35 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvfree(vcpu->arch.cpuid_entries); } +static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; + + /* + * Guest FPU state is zero allocated and so doesn't need to be manually + * cleared on RESET, i.e. during vCPU creation. + */ + if (!init_event || !fpstate) + return; + + /* + * On INIT, only select XSTATE components are zeroed, most components + * are unchanged. Currently, the only components that are zeroed and + * supported by KVM are MPX related. + */ + if (!kvm_mpx_supported()) + return; + + /* + * All paths that lead to INIT are required to load the guest's FPU + * state (because most paths are buried in KVM_RUN). + */ + kvm_put_guest_fpu(vcpu); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + kvm_load_guest_fpu(vcpu); +} + void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_cpuid_entry2 *cpuid_0x1; @@ -12455,22 +12484,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { - struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; - - /* - * All paths that lead to INIT are required to load the guest's - * FPU state (because most paths are buried in KVM_RUN). - */ - if (init_event) - kvm_put_guest_fpu(vcpu); - - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); - - if (init_event) - kvm_load_guest_fpu(vcpu); - } + kvm_xstate_reset(vcpu, init_event); if (!init_event) { vcpu->arch.smbase = 0x30000; From c26675447faff8c4ddc1dc5d2cd28326b8181aaf Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Mon, 11 Aug 2025 19:55:13 -0700 Subject: [PATCH 040/142] KVM: x86: Zero XSTATE components on INIT by iterating over supported features Tweak the code a bit to facilitate resetting more xstate components in the future, e.g., CET's xstate-managed MSRs. No functional change intended. 
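The change below leans on for_each_set_bit(); for illustration, the same pattern as a standalone program, with a compiler builtin standing in for the kernel helper:

  #include <stdint.h>
  #include <stdio.h>

  #define XFEATURE_MASK_BNDREGS (1ULL << 3)
  #define XFEATURE_MASK_BNDCSR  (1ULL << 4)

  int main(void)
  {
      uint64_t mask = XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR;

      while (mask) {
          int i = __builtin_ctzll(mask);    /* index of lowest set bit */

          printf("zeroing xstate component %d\n", i);
          mask &= mask - 1;    /* clear the lowest set bit */
      }
      return 0;
  }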
Suggested-by: Sean Christopherson Tested-by: Mathias Krause Tested-by: John Allen Signed-off-by: Chao Gao Tested-by: Rick Edgecombe Link: https://lore.kernel.org/r/20250812025606.74625-6-chao.gao@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d398ee84c8f3..8bfba7d8f750 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12401,6 +12401,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) { struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; + u64 xfeatures_mask; + int i; /* * Guest FPU state is zero allocated and so doesn't need to be manually @@ -12414,16 +12416,20 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) * are unchanged. Currently, the only components that are zeroed and * supported by KVM are MPX related. */ - if (!kvm_mpx_supported()) + xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) & + (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + if (!xfeatures_mask) return; + BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX); + /* * All paths that lead to INIT are required to load the guest's FPU * state (because most paths are buried in KVM_RUN). */ kvm_put_guest_fpu(vcpu); - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX) + fpstate_clear_xstate_component(fpstate, i); kvm_load_guest_fpu(vcpu); } From c78af20374a1c9c230cc535857d2af3de5d4442c Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 19 Aug 2025 16:48:26 -0700 Subject: [PATCH 041/142] KVM: SEV: Drop GHCB_VERSION_DEFAULT and open code it Remove the GHCB_VERSION_DEFAULT macro and open code it with '2'. The macro is used conditionally and is not a true default. The KVM ABI does not advertise/enumerate the default GHCB version. Any future change to this macro would silently alter the ABI and potentially break existing deployments that rely on the current behavior. Additionally, move the GHCB version assignment earlier in the code flow and update the comment to clarify that KVM_SEV_INIT2 defaults to version 2, while KVM_SEV_INIT forces version 1. No functional change intended. Cc: Thomas Lendacky Cc: Michael Roth Suggested-by: Sean Christopherson Signed-off-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 2fbdebf79fbb..212f790eedd4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -37,7 +37,6 @@ #include "trace.h" #define GHCB_VERSION_MAX 2ULL -#define GHCB_VERSION_DEFAULT 2ULL #define GHCB_VERSION_MIN 1ULL #define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) @@ -421,6 +420,14 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) return -EINVAL; + /* + * KVM supports the full range of mandatory features defined by version + * 2 of the GHCB protocol, so default to that for SEV-ES guests created + * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1). 
+ */ + if (es_active && !data->ghcb_version) + data->ghcb_version = 2; + if (unlikely(sev->active)) return -EINVAL; @@ -429,14 +436,6 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, sev->vmsa_features = data->vmsa_features; sev->ghcb_version = data->ghcb_version; - /* - * Currently KVM supports the full range of mandatory features defined - * by version 2 of the GHCB protocol, so default to that for SEV-ES - * guests created via KVM_SEV_INIT2. - */ - if (sev->es_active && !sev->ghcb_version) - sev->ghcb_version = GHCB_VERSION_DEFAULT; - if (vm_type == KVM_X86_SNP_VM) sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; From 00f0b959ffb094ea677ca24a0bd14d300a3013a0 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 19 Aug 2025 16:48:27 -0700 Subject: [PATCH 042/142] KVM: SEV: Enforce minimum GHCB version requirement for SEV-SNP guests Require a minimum GHCB version of 2 when starting SEV-SNP guests through KVM_SEV_INIT2. When a VMM attempts to start an SEV-SNP guest with an incompatible GHCB version (less than 2), reject the request early rather than allowing the guest kernel to start with an incorrect protocol version and fail later with GHCB_SNP_UNSUPPORTED guest termination. Not enforcing the minimum version typically causes the guest to request termination with GHCB_SNP_UNSUPPORTED error code: kvm_amd: SEV-ES guest requested termination: 0x0:0x2 Fixes: 4af663c2f64a ("KVM: SEV: Allow per-guest configuration of GHCB protocol version") Cc: Thomas Lendacky Cc: Sean Christopherson Cc: Michael Roth Signed-off-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 212f790eedd4..e88dce598785 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -405,6 +405,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_platform_init_args init_args = {0}; bool es_active = vm_type != KVM_X86_SEV_VM; + bool snp_active = vm_type == KVM_X86_SNP_VM; u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; int ret; @@ -428,6 +429,9 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (es_active && !data->ghcb_version) data->ghcb_version = 2; + if (snp_active && data->ghcb_version < 2) + return -EINVAL; + if (unlikely(sev->active)) return -EINVAL; @@ -436,7 +440,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, sev->vmsa_features = data->vmsa_features; sev->ghcb_version = data->ghcb_version; - if (vm_type == KVM_X86_SNP_VM) + if (snp_active) sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; ret = sev_asid_new(sev); @@ -454,7 +458,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, } /* This needs to happen after SEV/SNP firmware initialization. */ - if (vm_type == KVM_X86_SNP_VM) { + if (snp_active) { ret = snp_guest_req_init(kvm); if (ret) goto e_free; From 7b59c73fd611eae87e60e8bb0ab83a1475d8a3a7 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 19 Aug 2025 16:48:28 -0700 Subject: [PATCH 043/142] x86/cpufeatures: Add SNP Secure TSC The Secure TSC feature for SEV-SNP allows guests to securely use the RDTSC and RDTSCP instructions, ensuring that the parameters used cannot be altered by the hypervisor once the guest is launched. 
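The feature can be probed from userspace; a small sketch assuming GCC's <cpuid.h>, with word 19 corresponding to CPUID leaf 0x8000001F EAX, i.e. Secure TSC is bit 8 there per the definition below:

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
      unsigned int eax, ebx, ecx, edx;

      if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx))
          return 1;

      printf("SNP Secure TSC %ssupported\n",
             (eax & (1u << 8)) ? "" : "not ");
      return 0;
  }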
For more details, refer to the AMD64 APM Vol 2, Section "Secure TSC". Acked-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Vaishali Thakkar Signed-off-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 06fc0479a23f..f53d4943ea63 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -444,6 +444,7 @@ #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ #define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */ #define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_SNP_SECURE_TSC (19*32+ 8) /* SEV-SNP Secure TSC */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ From 34bd82aab15b8b9e8e9d923267283c84aa8c2789 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Aug 2025 16:48:29 -0700 Subject: [PATCH 044/142] KVM: SVM: Move SEV-ES VMSA allocation to a dedicated sev_vcpu_create() helper Add a dedicated sev_vcpu_create() helper to allocate the VMSA page for SEV-ES+ vCPUs, and to allow for consolidating a variety of related SEV+ code in the near future. No functional change intended. Reviewed-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 20 ++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 25 +++++++------------------ arch/x86/kvm/svm/svm.h | 2 ++ 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e88dce598785..c17cc4eb0fe1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4561,6 +4561,26 @@ void sev_init_vmcb(struct vcpu_svm *svm) sev_es_init_vmcb(svm); } +int sev_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct page *vmsa_page; + + if (!sev_es_guest(vcpu->kvm)) + return 0; + + /* + * SEV-ES guests require a separate (from the VMCB) VMSA page used to + * contain the encrypted register state of the guest. + */ + vmsa_page = snp_safe_alloc_page(); + if (!vmsa_page) + return -ENOMEM; + + svm->sev_es.vmsa = page_address(vmsa_page); + return 0; +} + void sev_es_vcpu_reset(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9931c6c4bc6..3d4c14e0244f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1275,7 +1275,6 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb01_page; - struct page *vmsa_page = NULL; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); @@ -1286,24 +1285,18 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) if (!vmcb01_page) goto out; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests require a separate VMSA page used to contain - * the encrypted register state of the guest. 
- */ - vmsa_page = snp_safe_alloc_page(); - if (!vmsa_page) - goto error_free_vmcb_page; - } + err = sev_vcpu_create(vcpu); + if (err) + goto error_free_vmcb_page; err = avic_init_vcpu(svm); if (err) - goto error_free_vmsa_page; + goto error_free_sev; svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; - goto error_free_vmsa_page; + goto error_free_sev; } svm->x2avic_msrs_intercepted = true; @@ -1312,16 +1305,12 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); - if (vmsa_page) - svm->sev_es.vmsa = page_address(vmsa_page); - svm->guest_state_loaded = false; return 0; -error_free_vmsa_page: - if (vmsa_page) - __free_page(vmsa_page); +error_free_sev: + sev_free_vcpu(vcpu); error_free_vmcb_page: __free_page(vmcb01_page); out: diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 58b9d168e0c8..cf2569b5451a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -854,6 +854,7 @@ static inline struct page *snp_safe_alloc_page(void) return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); } +int sev_vcpu_create(struct kvm_vcpu *vcpu); void sev_free_vcpu(struct kvm_vcpu *vcpu); void sev_vm_destroy(struct kvm *kvm); void __init sev_set_cpu_caps(void); @@ -880,6 +881,7 @@ static inline struct page *snp_safe_alloc_page(void) return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); } +static inline int sev_vcpu_create(struct kvm_vcpu *vcpu) { return 0; } static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {} static inline void sev_vm_destroy(struct kvm *kvm) {} static inline void __init sev_set_cpu_caps(void) {} From 3d4e882e3439593b406fa226bcdd48c92d2222a6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Aug 2025 16:48:30 -0700 Subject: [PATCH 045/142] KVM: SEV: Move init of SNP guest state into sev_init_vmcb() Move the initialization of SNP guest state from svm_vcpu_reset() into sev_init_vmcb() to reduce the number of paths that deal with INIT/RESET for SEV+ vCPUs from 4+ to 1. Plumb in @init_event as necessary. Opportunistically check for an SNP guest outside of sev_snp_init_protected_guest_state() so that sev_init_vmcb() is consistent with respect to checking for SEV-ES+ and SNP+ guests. No functional change intended. Reviewed-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 16 +++++++++------- arch/x86/kvm/svm/svm.c | 9 +++------ arch/x86/kvm/svm/svm.h | 4 +--- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c17cc4eb0fe1..c5726b091680 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1975,7 +1975,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) { dst_svm = to_svm(dst_vcpu); - sev_init_vmcb(dst_svm); + sev_init_vmcb(dst_svm, false); if (!dst->es_active) continue; @@ -3887,7 +3887,7 @@ next_range: /* * Invoked as part of svm_vcpu_reset() processing of an init event. 
*/ -void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_memory_slot *slot; @@ -3895,9 +3895,6 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) kvm_pfn_t pfn; gfn_t gfn; - if (!sev_snp_guest(vcpu->kvm)) - return; - guard(mutex)(&svm->sev_es.snp_vmsa_mutex); if (!svm->sev_es.snp_ap_waiting_for_reset) @@ -4546,8 +4543,10 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_XSETBV); } -void sev_init_vmcb(struct vcpu_svm *svm) +void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) { + struct kvm_vcpu *vcpu = &svm->vcpu; + svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); @@ -4557,7 +4556,10 @@ void sev_init_vmcb(struct vcpu_svm *svm) */ clr_exception_intercept(svm, GP_VECTOR); - if (sev_es_guest(svm->vcpu.kvm)) + if (init_event && sev_snp_guest(vcpu->kvm)) + sev_snp_init_protected_guest_state(vcpu); + + if (sev_es_guest(vcpu->kvm)) sev_es_init_vmcb(svm); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3d4c14e0244f..8ed135dbd649 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1083,7 +1083,7 @@ static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) svm_recalc_msr_intercepts(vcpu); } -static void init_vmcb(struct kvm_vcpu *vcpu) +static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; @@ -1221,7 +1221,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_BUSLOCK); if (sev_guest(vcpu->kvm)) - sev_init_vmcb(svm); + sev_init_vmcb(svm, init_event); svm_hv_init_vmcb(vmcb); @@ -1256,10 +1256,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (init_event) - sev_snp_init_protected_guest_state(vcpu); - - init_vmcb(vcpu); + init_vmcb(vcpu, init_event); if (!init_event) __svm_vcpu_reset(vcpu); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index cf2569b5451a..321480ebe62f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -826,7 +826,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ int pre_sev_run(struct vcpu_svm *svm, int cpu); -void sev_init_vmcb(struct vcpu_svm *svm); +void sev_init_vmcb(struct vcpu_svm *svm, bool init_event); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_vcpu_reset(struct vcpu_svm *svm); @@ -864,7 +864,6 @@ int sev_cpu_init(struct svm_cpu_data *sd); int sev_dev_get_attr(u32 group, u64 attr, u64 *val); extern unsigned int max_sev_asid; void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); -void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); @@ -891,7 +890,6 @@ static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } #define max_sev_asid 0 static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} -static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} static inline int 
sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) { return 0; From baf6ed177290db5873f318626970fa7fd4060579 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Aug 2025 16:48:31 -0700 Subject: [PATCH 046/142] KVM: SEV: Set RESET GHCB MSR value during sev_es_init_vmcb() Set the RESET value for the GHCB "MSR" during sev_es_init_vmcb() instead of sev_es_vcpu_reset() to allow for dropping sev_es_vcpu_reset() entirely. Note, the call to sev_init_vmcb() from sev_migrate_from() also kinda sorta emulates a RESET, but sev_migrate_from() immediately overwrites ghcb_gpa with the source's current value, so whether or not stuffing the GHCB version is correct/desirable is moot. No functional change intended. Reviewed-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c5726b091680..ee7a05843548 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4480,7 +4480,7 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } -static void sev_es_init_vmcb(struct vcpu_svm *svm) +static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event) { struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; @@ -4541,6 +4541,15 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); + + /* + * Set the GHCB MSR value as per the GHCB specification when emulating + * vCPU RESET for an SEV-ES guest. + */ + if (!init_event) + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, + GHCB_VERSION_MIN, + sev_enc_bit)); } void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) @@ -4560,7 +4569,7 @@ void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) sev_snp_init_protected_guest_state(vcpu); if (sev_es_guest(vcpu->kvm)) - sev_es_init_vmcb(svm); + sev_es_init_vmcb(svm, init_event); } int sev_vcpu_create(struct kvm_vcpu *vcpu) @@ -4585,17 +4594,6 @@ int sev_vcpu_create(struct kvm_vcpu *vcpu) void sev_es_vcpu_reset(struct vcpu_svm *svm) { - struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); - - /* - * Set the GHCB MSR value as per the GHCB specification when emulating - * vCPU RESET for an SEV-ES guest. - */ - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, - GHCB_VERSION_MIN, - sev_enc_bit)); - mutex_init(&svm->sev_es.snp_vmsa_mutex); } From f7b1f0c1620db689e085569035a659340a3209c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Aug 2025 16:48:32 -0700 Subject: [PATCH 047/142] KVM: SEV: Fold sev_es_vcpu_reset() into sev_vcpu_create() Fold the remaining line of sev_es_vcpu_reset() into sev_vcpu_create() as there's no need for a dedicated RESET hook just to init a mutex, and the mutex should be initialized as early as possible anyways. No functional change intended. 
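For illustration only, a toy pthread analogy of the design choice, i.e. why the mutex belongs in the create path rather than in a reset hook:

  #include <pthread.h>
  #include <stdio.h>

  struct toy_vcpu {
      pthread_mutex_t vmsa_mutex;
  };

  /* Create: the one and only place the mutex is initialized. */
  static int toy_vcpu_create(struct toy_vcpu *v)
  {
      return pthread_mutex_init(&v->vmsa_mutex, NULL);
  }

  /*
   * Reset: deliberately leaves the mutex alone; re-initializing a mutex
   * that another thread may hold or be waiting on is undefined behavior.
   */
  static void toy_vcpu_reset(struct toy_vcpu *v)
  {
      (void)v;
  }

  int main(void)
  {
      struct toy_vcpu v;

      if (toy_vcpu_create(&v))
          return 1;
      toy_vcpu_reset(&v);
      puts("ok");
      return 0;
  }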
Reviewed-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 7 ++----- arch/x86/kvm/svm/svm.c | 3 --- arch/x86/kvm/svm/svm.h | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ee7a05843548..7d1d34e45310 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4577,6 +4577,8 @@ int sev_vcpu_create(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct page *vmsa_page; + mutex_init(&svm->sev_es.snp_vmsa_mutex); + if (!sev_es_guest(vcpu->kvm)) return 0; @@ -4592,11 +4594,6 @@ int sev_vcpu_create(struct kvm_vcpu *vcpu) return 0; } -void sev_es_vcpu_reset(struct vcpu_svm *svm) -{ - mutex_init(&svm->sev_es.snp_vmsa_mutex); -} - void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { struct kvm *kvm = svm->vcpu.kvm; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8ed135dbd649..b237b4081c91 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1244,9 +1244,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm->nmi_masked = false; svm->awaiting_iret_completion = false; - - if (sev_es_guest(vcpu->kvm)) - sev_es_vcpu_reset(svm); } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 321480ebe62f..3c7f208b7935 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -829,7 +829,6 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu); void sev_init_vmcb(struct vcpu_svm *svm, bool init_event); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); -void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa); From a311fce2b694599cede76c5d7a0905336bc09803 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 19 Aug 2025 16:48:33 -0700 Subject: [PATCH 048/142] KVM: SVM: Enable Secure TSC for SNP guests Add support for Secure TSC, allowing userspace to configure the Secure TSC feature for SNP guests. Use the SNP specification's desired TSC frequency parameter during the SNP_LAUNCH_START command to set the mean TSC frequency in KHz for Secure TSC enabled guests. Always use kvm->arch.default_tsc_khz as the TSC frequency that is passed to SNP guests in the SNP_LAUNCH_START command. The default value is the host TSC frequency. Userspace can optionally change the TSC frequency via the KVM_SET_TSC_KHZ ioctl before calling the SNP_LAUNCH_START ioctl. Introduce the read-only MSR GUEST_TSC_FREQ (0xc0010134) that returns the guest's effective frequency in MHz when Secure TSC is enabled for SNP guests. Disable interception of this MSR when Secure TSC is enabled. Note that GUEST_TSC_FREQ MSR is accessible only to the guest and not from the hypervisor context. 
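A hedged userspace sketch of the ordering requirement; it assumes a VM-scoped KVM_SET_TSC_KHZ (i.e. a kernel with KVM_CAP_VM_TSC_CONTROL) and omits the rest of the SNP launch plumbing:

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /*
   * Must be issued before KVM_SEV_SNP_LAUNCH_START so that the value is
   * the one copied into start.desired_tsc_khz; returns 0 on success.
   */
  static int set_guest_tsc_khz(int vm_fd, unsigned long tsc_khz)
  {
      return ioctl(vm_fd, KVM_SET_TSC_KHZ, tsc_khz);
  }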
Co-developed-by: Ketan Chaturvedi Signed-off-by: Ketan Chaturvedi Reviewed-by: Kai Huang Reviewed-by: Tom Lendacky Signed-off-by: Nikunj A Dadhania [sean: contain Secure TSC to sev.c] Link: https://lore.kernel.org/r/20250819234833.3080255-9-seanjc@google.com [sean: return -EINVAL if TSC frequency is '0'] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 1 + arch/x86/kvm/svm/sev.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index ffc27f676243..17f6c3fedeee 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -299,6 +299,7 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) #define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define SVM_SEV_FEAT_SECURE_TSC BIT(9) #define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7d1d34e45310..86fd270a1d6e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -146,6 +146,14 @@ static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } +static bool snp_is_secure_tsc_enabled(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) && + !WARN_ON_ONCE(!sev_snp_guest(kvm)); +} + /* Must be called with the sev_bitmap_lock held */ static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) { @@ -415,6 +423,9 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (data->flags) return -EINVAL; + if (!snp_active) + valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC; + if (data->vmsa_features & ~valid_vmsa_features) return -EINVAL; @@ -2187,6 +2198,13 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO)) return -EINVAL; + if (snp_is_secure_tsc_enabled(kvm)) { + if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz)) + return -EINVAL; + + start.desired_tsc_khz = kvm->arch.default_tsc_khz; + } + sev->policy = params.policy; sev->snp_context = snp_context_create(kvm, argp); @@ -2195,6 +2213,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) start.gctx_paddr = __psp_pa(sev->snp_context); start.policy = params.policy; + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); if (rc) { @@ -3085,6 +3104,9 @@ out: sev_supported_vmsa_features = 0; if (sev_es_debug_swap_enabled) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; + + if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC)) + sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC; } void sev_hardware_unsetup(void) @@ -4452,6 +4474,9 @@ void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu) !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R, + !snp_is_secure_tsc_enabled(vcpu->kvm)); + /* * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if * the host/guest supports its use. 
@@ -4591,6 +4616,9 @@ int sev_vcpu_create(struct kvm_vcpu *vcpu) return -ENOMEM; svm->sev_es.vmsa = page_address(vmsa_page); + + vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm); + return 0; } From 2f5f8fb9de095e9b255a89269827f1761c714690 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 21 Aug 2025 14:38:41 -0700 Subject: [PATCH 049/142] KVM: SEV: Save the SEV policy if and only if LAUNCH_START succeeds Wait until LAUNCH_START fully succeeds to set a VM's SEV/SNP policy so that KVM doesn't keep a potentially stale policy. In practice, the issue is benign as the policy is only used to detect if the VMSA can be decrypted, and the VMSA only needs to be decrypted if LAUNCH_UPDATE and thus LAUNCH_START succeeded. Fixes: 962e2b6152ef ("KVM: SVM: Decrypt SEV VMSA in dump_vmcb() if debugging is enabled") Cc: Tom Lendacky Cc: Kim Phillips Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250821213841.3462339-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 86fd270a1d6e..a95b862afa23 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -583,8 +583,6 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; - sev->policy = params.policy; - memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -632,6 +630,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_session; } + sev->policy = params.policy; sev->handle = start.handle; sev->fd = argp->sev_fd; @@ -2205,8 +2204,6 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) start.desired_tsc_khz = kvm->arch.default_tsc_khz; } - sev->policy = params.policy; - sev->snp_context = snp_context_create(kvm, argp); if (!sev->snp_context) return -ENOTTY; @@ -2222,6 +2219,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_context; } + sev->policy = params.policy; sev->fd = argp->sev_fd; rc = snp_bind_asid(kvm, &argp->error); if (rc) { From cbf5d9457462c83c1bf851c0b8b04296b037a9dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 21 Aug 2025 14:42:07 -0700 Subject: [PATCH 050/142] KVM: x86: Move kvm_irq_delivery_to_apic() from irq.c to lapic.c Move kvm_irq_delivery_to_apic() to lapic.c as it is specific to local APIC emulation. This will allow burying more local APIC code in lapic.c, e.g. the various "lowest priority" helpers. No functional change intended. 
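For illustration only, a toy standalone model of the lowest-priority arbitration performed by the function being moved (kvm_apic_compare_prio() is a simple subtraction of apic_arb_prio values):

  #include <stdio.h>

  int main(void)
  {
      int arb_prio[] = { 7, 3, 9, 3 };    /* per-vCPU apic_arb_prio */
      int lowest = -1;

      for (int i = 0; i < 4; i++) {
          /* Keep the first vCPU among ties, as the real loop does. */
          if (lowest < 0 || arb_prio[i] - arb_prio[lowest] < 0)
              lowest = i;
      }

      printf("deliver to vCPU %d\n", lowest);    /* prints 1 */
      return 0;
  }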
Link: https://lore.kernel.org/r/20250821214209.3463350-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 57 -------------------------------------------- arch/x86/kvm/irq.h | 4 ---- arch/x86/kvm/lapic.c | 57 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.h | 3 +++ 4 files changed, 60 insertions(+), 61 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 16da89259011..a6b122f732be 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -195,63 +195,6 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, struct dest_map *dest_map) -{ - int r = -1; - struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; - unsigned int dest_vcpus = 0; - - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - - if (irq->dest_mode == APIC_DEST_PHYSICAL && - irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { - pr_info("apic: phys broadcast and lowest prio\n"); - irq->delivery_mode = APIC_DM_FIXED; - } - - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_apic_present(vcpu)) - continue; - - if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, - irq->dest_id, irq->dest_mode)) - continue; - - if (!kvm_lowest_prio_delivery(irq)) { - if (r < 0) - r = 0; - r += kvm_apic_set_irq(vcpu, irq, dest_map); - } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { - if (!kvm_vector_hashing_enabled()) { - if (!lowest) - lowest = vcpu; - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) - lowest = vcpu; - } else { - __set_bit(i, dest_vcpu_bitmap); - dest_vcpus++; - } - } - } - - if (dest_vcpus != 0) { - int idx = kvm_vector_to_index(irq->vector, dest_vcpus, - dest_vcpu_bitmap, KVM_MAX_VCPUS); - - lowest = kvm_get_vcpu(kvm, idx); - } - - if (lowest) - r = kvm_apic_set_irq(lowest, irq, dest_map); - - return r; -} - static void kvm_msi_to_lapic_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 5e62c1f79ce6..34f4a78a7a01 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -121,8 +121,4 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, - struct dest_map *dest_map); - #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0c10b1e003c7..66de69cbd57c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1249,6 +1249,63 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, return ret; } +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, struct dest_map *dest_map) +{ + int r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned int dest_vcpus = 0; + + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + + if (irq->dest_mode == APIC_DEST_PHYSICAL && + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { + pr_info("apic: phys broadcast and lowest prio\n"); + irq->delivery_mode = APIC_DM_FIXED; + } + + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if 
(!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_lowest_prio_delivery(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq, dest_map); + } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { + if (!kvm_vector_hashing_enabled()) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } else { + __set_bit(i, dest_vcpu_bitmap); + dest_vcpus++; + } + } + } + + if (dest_vcpus != 0) { + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, + dest_vcpu_bitmap, KVM_MAX_VCPUS); + + lowest = kvm_get_vcpu(kvm, idx); + } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq, dest_map); + + return r; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 8b00e29741de..edfed763cf89 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -119,6 +119,9 @@ void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); From 73473f31a4bf9e19adc6926bba1bdde1c798175d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 21 Aug 2025 14:42:08 -0700 Subject: [PATCH 051/142] KVM: x86: Make "lowest priority" helpers local to lapic.c Make various helpers for resolving lowest priority IRQs local to lapic.c now that kvm_irq_delivery_to_apic() lives in lapic.c as well. No functional change intended. Link: https://lore.kernel.org/r/20250821214209.3463350-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 19 ++++++++++++------- arch/x86/kvm/lapic.h | 9 --------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 66de69cbd57c..958ab7ff66b2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1061,8 +1061,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, } EXPORT_SYMBOL_GPL(kvm_apic_match_dest); -int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size) +static int kvm_vector_to_index(u32 vector, u32 dest_vcpus, + const unsigned long *bitmap, u32 bitmap_size) { int idx = find_nth_bit(bitmap, bitmap_size, vector % dest_vcpus); @@ -1097,6 +1097,16 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, return false; } +static bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) +{ + return (irq->delivery_mode == APIC_DM_LOWEST || irq->msi_redir_hint); +} + +static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) +{ + return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; +} + /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. 
@@ -1449,11 +1459,6 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, rcu_read_unlock(); } -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) -{ - return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; -} - static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) { return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index edfed763cf89..50123fe7f58f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -105,7 +105,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); @@ -225,12 +224,6 @@ static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu) !kvm_x86_call(apic_init_signal_blocked)(vcpu); } -static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) -{ - return (irq->delivery_mode == APIC_DM_LOWEST || - irq->msi_redir_hint); -} - static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) { return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); @@ -245,8 +238,6 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); -int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size); void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); From aac057dd623132a1776be37b471e30b4589fdf76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 21 Aug 2025 14:42:09 -0700 Subject: [PATCH 052/142] KVM: x86: Move vector_hashing into lapic.c Move the vector_hashing module param into lapic.c now that all usage is contained within the local APIC emulation code. Opportunistically drop the accessor and append "_enabled" to the variable to help capture that it's a boolean module param. No functional change intended. 
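For context, when vector_hashing is enabled, lowest-priority interrupts are spread deterministically by hashing the vector across the matched destinations, mirroring kvm_vector_to_index() above; a worked example with hypothetical values:

	/*
	 * e.g. vector 0x31 (49) with 4 destination vCPUs set in the bitmap:
	 * 49 % 4 == 1, so find_nth_bit() picks the second (0-based n == 1)
	 * set bit, and a given vector always lands on the same vCPU for a
	 * stable set of destinations.
	 */
	idx = find_nth_bit(dest_vcpu_bitmap, KVM_MAX_VCPUS, 49 % 4);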
Link: https://lore.kernel.org/r/20250821214209.3463350-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 8 ++++++-- arch/x86/kvm/x86.c | 8 -------- arch/x86/kvm/x86.h | 1 - 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 958ab7ff66b2..ab48e057dbdc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -74,6 +74,10 @@ module_param(lapic_timer_advance, bool, 0444); #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 + +static bool __read_mostly vector_hashing_enabled = true; +module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444); + static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); @@ -1150,7 +1154,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, if (!kvm_lowest_prio_delivery(irq)) return true; - if (!kvm_vector_hashing_enabled()) { + if (!vector_hashing_enabled) { lowest = -1; for_each_set_bit(i, bitmap, 16) { if (!(*dst)[i]) @@ -1291,7 +1295,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { - if (!kvm_vector_hashing_enabled()) { + if (!vector_hashing_enabled) { if (!lowest) lowest = vcpu; else if (kvm_apic_compare_prio(vcpu, lowest) < 0) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8bfba7d8f750..bf386f2ebba3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -164,9 +164,6 @@ module_param(kvmclock_periodic_sync, bool, 0444); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); -static bool __read_mostly vector_hashing = true; -module_param(vector_hashing, bool, 0444); - bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, 0444); EXPORT_SYMBOL_GPL(enable_vmware_backdoor); @@ -13549,11 +13546,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -bool kvm_vector_hashing_enabled(void) -{ - return vector_hashing; -} - bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { return (vcpu->arch.msr_kvm_poll_control & 1) == 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index eb3088684e8a..786e36fcd0fb 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -431,7 +431,6 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -bool kvm_vector_hashing_enabled(void); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, void *insn, int insn_len); From 3ccbf6f47098f5d5e247d1b7739d0fd90802187b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 22 Aug 2025 15:03:47 +0800 Subject: [PATCH 053/142] KVM: x86/mmu: Return -EAGAIN if userspace deletes/moves memslot during prefault Return -EAGAIN if userspace attempts to delete or move a memslot while also prefaulting memory for that same memslot, i.e. force userspace to retry instead of trying to handle the scenario entirely within KVM. 
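From the userspace side the new contract is simply "retry on EAGAIN"; a minimal sketch (vcpu_fd/gpa/size are caller-provided, and partial-progress handling is elided):

	struct kvm_pre_fault_memory range = {
		.gpa  = gpa,
		.size = size,
	};
	int r;

	do {
		r = ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range);
	} while (r < 0 && (errno == EAGAIN || errno == EINTR));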
Unlike KVM_RUN, which needs to handle the scenario entirely within KVM because userspace has come to depend on such behavior, KVM_PRE_FAULT_MEMORY can return -EAGAIN without breaking userspace as this scenario can't have ever worked (and there's no sane use case for prefaulting to a memslot that's being deleted/moved). And also unlike KVM_RUN, the prefault path doesn't naturally guarantee forward progress. E.g. to handle such a scenario, KVM would need to drop and reacquire SRCU to break the deadlock between the memslot update (synchronizes SRCU) and the prefault (waits for the memslot update to complete). However, dropping SRCU creates more problems, as completing the memslot update will bump the memslot generation, which in turn will invalidate the MMU root. To handle that, prefaulting would need to handle pending KVM_REQ_MMU_FREE_OBSOLETE_ROOTS requests and do kvm_mmu_reload() prior to mapping each individual page. I.e. to fully handle this scenario, prefaulting would eventually need to look a lot like vcpu_enter_guest(). Given that there's no reasonable use case and practically zero risk of breaking userspace, punt the problem to userspace and avoid adding unnecessary complexity to the prefault path. Note, TDX's guest_memfd post-populate path is unaffected as slots_lock is held for the entire duration of populate(), i.e. any memslot modifications will be fully serialized against TDX's flavor of prefaulting. Reported-by: Reinette Chatre Closes: https://lore.kernel.org/all/20250519023737.30360-1-yan.y.zhao@intel.com Debugged-by: Yan Zhao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250822070347.26451-1-yan.y.zhao@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 92ff15969a36..53be6c2a5b0c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4653,10 +4653,16 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, /* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will - * be zapped before KVM inserts a new MMIO SPTE for the gfn. + * be zapped before KVM inserts a new MMIO SPTE for the gfn. Punt the + * error to userspace if this is a prefault, as KVM's prefaulting ABI + * doesn't provide the same forward progress guarantees as KVM_RUN. */ - if (slot->flags & KVM_MEMSLOT_INVALID) + if (slot->flags & KVM_MEMSLOT_INVALID) { + if (fault->prefetch) + return -EAGAIN; + + return RET_PF_RETRY; + } if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { /* From 2bc2694fe20bf06eb73524426e3f4581d7b28923 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 22 Aug 2025 15:05:23 +0800 Subject: [PATCH 054/142] KVM: TDX: Do not retry locally when the retry is caused by invalid memslot Avoid local retries within the TDX EPT violation handler if a retry is triggered by faulting in an invalid memslot, indicating that the memslot is undergoing a removal process. Faulting in a GPA from an invalid memslot will never succeed, and holding SRCU prevents memslot deletion from succeeding, i.e. retrying when the memslot is being actively deleted will lead to (breakable) deadlock. Opportunistically export kvm_vcpu_gfn_to_memslot() to allow for a per-vCPU lookup (which, strictly speaking, is unnecessary since TDX doesn't support SMM, but aligns the TDX code with the MMU code).
Fixes: b0327bb2e7e0 ("KVM: TDX: Retry locally in TDX EPT violation handler on RET_PF_RETRY") Reported-by: Reinette Chatre Closes: https://lore.kernel.org/all/20250519023737.30360-1-yan.y.zhao@intel.com [Yan: Wrote patch log, comment, fixed a minor error, function export] Signed-off-by: Yan Zhao Link: https://lore.kernel.org/r/20250822070523.26495-1-yan.y.zhao@intel.com [sean: massage changelog, relocate and tweak comment] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 11 +++++++++++ virt/kvm/kvm_main.c | 1 + 2 files changed, 12 insertions(+) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 66744f5768c8..bef2cd880041 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2002,6 +2002,8 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) * handle retries locally in their EPT violation handlers. */ while (1) { + struct kvm_memory_slot *slot; + ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); if (ret != RET_PF_RETRY || !local_retry) @@ -2015,6 +2017,15 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) break; } + /* + * Bail if the memslot is invalid, i.e. is being deleted, as + * faulting in will never succeed and this task needs to drop + * SRCU in order to let memslot deletion complete. + */ + slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa)); + if (slot && slot->flags & KVM_MEMSLOT_INVALID) + break; + cond_resched(); } return ret; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c07dd423458..f769d1dccc21 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2661,6 +2661,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn return NULL; } +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { From fc55b4cda00aff08ea6dfe86411efa13bdb728c5 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 3 Sep 2025 02:29:50 +0200 Subject: [PATCH 055/142] KVM: nSVM: Replace kzalloc() + copy_from_user() with memdup_user() Replace kzalloc() followed by copy_from_user() with memdup_user() to improve and simplify svm_set_nested_state(). Return early if an error occurs instead of trying to allocate memory for 'save' when memory allocation for 'ctl' already failed. 
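The memdup_user() idiom folds the allocate/copy/check triple into a single call that returns an ERR_PTR on either failure; the general shape (with a hypothetical 'blob' struct, not the nested-state code itself):

	struct blob *b = memdup_user(user_ptr, sizeof(*b));

	if (IS_ERR(b))
		return PTR_ERR(b);	/* -ENOMEM or -EFAULT */

	/* ... use b ... */
	kfree(b);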
Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20250903002951.118912-1-thorsten.blum@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b7fd2e869998..826473f2d7c7 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1798,17 +1798,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) return -EINVAL; - ret = -ENOMEM; - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); - save = kzalloc(sizeof(*save), GFP_KERNEL); - if (!ctl || !save) - goto out_free; + ctl = memdup_user(&user_vmcb->control, sizeof(*ctl)); + if (IS_ERR(ctl)) + return PTR_ERR(ctl); - ret = -EFAULT; - if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl))) - goto out_free; - if (copy_from_user(save, &user_vmcb->save, sizeof(*save))) - goto out_free; + save = memdup_user(&user_vmcb->save, sizeof(*save)); + if (IS_ERR(save)) { + kfree(ctl); + return PTR_ERR(save); + } ret = -EINVAL; __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); From d7fc7d9833f6b60805f67e3f913b7a646ce2a47e Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Wed, 20 Aug 2025 20:50:01 +0000 Subject: [PATCH 056/142] KVM: SEV: Introduce new min,max sev_es and sev_snp asid variables Introduce new min, max sev_es_asid and sev_snp_asid variables. The new {min,max}_{sev_es,snp}_asid variables, along with the existing {min,max}_sev_asid variables, simplify partitioning of the SEV and SEV-ES+ ASID space. Suggested-by: Sean Christopherson Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Link: https://lore.kernel.org/r/1db48277e8e96a633d734786ea69bf830f014857.1755721927.git.ashish.kalra@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 2fbdebf79fbb..b5f4e69ff579 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -85,6 +85,10 @@ static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); unsigned int max_sev_asid; static unsigned int min_sev_asid; +static unsigned int max_sev_es_asid; +static unsigned int min_sev_es_asid; +static unsigned int max_snp_asid; +static unsigned int min_snp_asid; static unsigned long sev_me_mask; static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; @@ -173,20 +177,31 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) misc_cg_uncharge(type, sev->misc_cg, 1); } -static int sev_asid_new(struct kvm_sev_info *sev) +static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type) { /* * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. - * Note: min ASID can end up larger than the max if basic SEV support is - * effectively disabled by disallowing use of ASIDs for SEV guests. */ - unsigned int min_asid = sev->es_active ? 1 : min_sev_asid; - unsigned int max_asid = sev->es_active ?
min_sev_asid - 1 : max_sev_asid; - unsigned int asid; + unsigned int min_asid, max_asid, asid; bool retry = true; int ret; + if (vm_type == KVM_X86_SNP_VM) { + min_asid = min_snp_asid; + max_asid = max_snp_asid; + } else if (sev->es_active) { + min_asid = min_sev_es_asid; + max_asid = max_sev_es_asid; + } else { + min_asid = min_sev_asid; + max_asid = max_sev_asid; + } + + /* + * The min ASID can end up larger than the max if basic SEV support is + * effectively disabled by disallowing use of ASIDs for SEV guests. + */ if (min_asid > max_asid) return -ENOTTY; @@ -440,7 +455,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (vm_type == KVM_X86_SNP_VM) sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; - ret = sev_asid_new(sev); + ret = sev_asid_new(sev, vm_type); if (ret) goto e_no_asid; @@ -3042,6 +3057,9 @@ void __init sev_hardware_setup(void) if (min_sev_asid == 1) goto out; + min_sev_es_asid = min_snp_asid = 1; + max_sev_es_asid = max_snp_asid = min_sev_asid - 1; + sev_es_asid_count = min_sev_asid - 1; WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; @@ -3065,11 +3083,11 @@ out: if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", str_enabled_disabled(sev_es_supported), - min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + min_sev_es_asid, max_sev_es_asid); if (boot_cpu_has(X86_FEATURE_SEV_SNP)) pr_info("SEV-SNP %s (ASIDs %u - %u)\n", str_enabled_disabled(sev_snp_supported), - min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + min_snp_asid, max_snp_asid); sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; From 6c7c620585c6537dd5dcc75f972b875caf00f773 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Wed, 20 Aug 2025 20:50:25 +0000 Subject: [PATCH 057/142] KVM: SEV: Add SEV-SNP CipherTextHiding support Ciphertext hiding prevents host accesses from reading the ciphertext of SNP guest private memory. Instead of the ciphertext, host reads will see constant default values (0xff). The SEV ASID space is split into SEV and SEV-ES/SEV-SNP ASID ranges. Enabling ciphertext hiding further splits the SEV-ES/SEV-SNP ASID space into separate ASID ranges for SEV-ES and SEV-SNP guests. Add a new off-by-default kvm-amd module parameter to enable ciphertext hiding and allow the admin to configure the SEV-ES and SEV-SNP ASID ranges. Simply cap the maximum SEV-SNP ASID as appropriate, i.e. don't reject loading KVM or disable ciphertext hiding for a too-big value, as KVM's general approach for module params is to sanitize inputs based on hardware/kernel support, not burn the world down. This also allows the admin to use -1u to assign all SEV-ES/SNP ASIDs to SNP without needing dedicated handling in KVM. Suggested-by: Sean Christopherson Signed-off-by: Ashish Kalra Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/95abc49edfde36d4fb791570ea2a4be6ad95fd0d.1755721927.git.ashish.kalra@amd.com Signed-off-by: Sean Christopherson --- .../admin-guide/kernel-parameters.txt | 21 ++++++++++++ arch/x86/kvm/svm/sev.c | 32 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf494..69d640efb661 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2957,6 +2957,27 @@ (enabled). Disable by KVM if hardware lacks support for NPT.
+ kvm-amd.ciphertext_hiding_asids= + [KVM,AMD] Ciphertext hiding prevents disallowed accesses + to SNP private memory from reading ciphertext. Instead, + reads will see constant default values (0xff). + + If ciphertext hiding is enabled, the joint SEV-ES and + SEV-SNP ASID space is partitioned into separate SEV-ES + and SEV-SNP ASID ranges, with the SEV-SNP range being + [1..max_snp_asid] and the SEV-ES range being + (max_snp_asid..min_sev_asid), where min_sev_asid is + enumerated by CPUID.0x8000_001F[EDX]. + + A non-zero value enables SEV-SNP ciphertext hiding and + adjusts the ASID ranges for SEV-ES and SEV-SNP guests. + KVM caps the number of SEV-SNP ASIDs at the maximum + possible value, e.g. specifying -1u will assign all + joint SEV-ES and SEV-SNP ASIDs to SEV-SNP. Note, + assigning all joint ASIDs to SEV-SNP, i.e. configuring + max_snp_asid == min_sev_asid-1, will effectively make + SEV-ES unusable. + kvm-arm.mode= [KVM,ARM,EARLY] Select one of KVM/arm64's modes of operation. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b5f4e69ff579..01345b73f879 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -59,6 +59,9 @@ static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); static u64 sev_supported_vmsa_features; +static unsigned int nr_ciphertext_hiding_asids; +module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444); + #define AP_RESET_HOLD_NONE 0 #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 @@ -201,6 +204,9 @@ static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type) /* * The min ASID can end up larger than the max if basic SEV support is * effectively disabled by disallowing use of ASIDs for SEV guests. + * Similarly for SEV-ES guests the min ASID can end up larger than the + * max when ciphertext hiding is enabled, effectively disabling SEV-ES + * support. */ if (min_asid > max_asid) return -ENOTTY; @@ -3068,10 +3074,32 @@ void __init sev_hardware_setup(void) out: if (sev_enabled) { init_args.probe = true; + + if (sev_is_snp_ciphertext_hiding_supported()) + init_args.max_snp_asid = min(nr_ciphertext_hiding_asids, + min_sev_asid - 1); + if (sev_platform_init(&init_args)) sev_supported = sev_es_supported = sev_snp_supported = false; else if (sev_snp_supported) sev_snp_supported = is_sev_snp_initialized(); + + if (sev_snp_supported) + nr_ciphertext_hiding_asids = init_args.max_snp_asid; + + /* + * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP + * ASID range is partitioned into separate SEV-ES and SEV-SNP + * ASID ranges, with the SEV-SNP range being [1..max_snp_asid] + * and the SEV-ES range being (max_snp_asid..max_sev_es_asid]. + * Note, SEV-ES may effectively be disabled if all ASIDs from + * the joint range are assigned to SEV-SNP. + */ + if (nr_ciphertext_hiding_asids) { + max_snp_asid = nr_ciphertext_hiding_asids; + min_sev_es_asid = max_snp_asid + 1; + pr_info("SEV-SNP ciphertext hiding enabled\n"); + } } if (boot_cpu_has(X86_FEATURE_SEV)) @@ -3082,7 +3110,9 @@ out: min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", - str_enabled_disabled(sev_es_supported), + sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ?
"enabled" : + "unusable" : + "disabled", min_sev_es_asid, max_sev_es_asid); if (boot_cpu_has(X86_FEATURE_SEV_SNP)) pr_info("SEV-SNP %s (ASIDs %u - %u)\n", From b3a37bff8daf50cdd6fa9ebe4a503d4261d99796 Mon Sep 17 00:00:00 2001 From: Sagi Shahar Date: Tue, 26 Aug 2025 18:17:26 -0700 Subject: [PATCH 058/142] KVM: TDX: Reject fully in-kernel irqchip if EOIs are protected, i.e. for TDX VMs Reject KVM_CREATE_IRQCHIP if the VM type has protected EOIs, i.e. if KVM can't intercept EOI and thus can't faithfully emulate level-triggered interrupts that are routed through the I/O APIC. For TDX VMs, the TDX-Module owns the VMX EOI-bitmap and configures all IRQ vectors to have the CPU accelerate EOIs, i.e. doesn't allow KVM to intercept any EOIs. KVM already requires a split irqchip[1], but does so during vCPU creation, which is both too late to allow userspace to fallback to a split irqchip and a less-than-stellar experience for userspace since an -EINVAL on KVM_VCPU_CREATE is far harder to debug/triage than failure exactly on KVM_CREATE_IRQCHIP. And of course, allowing an action that ultimately fails is arguably a bug regardless of the impact on userspace. Link: https://lore.kernel.org/lkml/20250222014757.897978-11-binbin.wu@linux.intel.com [1] Link: https://lore.kernel.org/lkml/aK3vZ5HuKKeFuuM4@google.com Suggested-by: Sean Christopherson Signed-off-by: Sagi Shahar Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250827011726.2451115-1-sagis@google.com [sean: massage shortlog+changelog, relocate setting has_protected_eoi] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/tdx.c | 5 +++++ arch/x86/kvm/x86.c | 9 +++++++++ 3 files changed, 15 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b1f87b5750f8..990fbbb96ebf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1362,6 +1362,7 @@ struct kvm_arch { u8 vm_type; bool has_private_mem; bool has_protected_state; + bool has_protected_eoi; bool pre_fault_allowed; struct hlist_head *mmu_page_hash; struct list_head active_mmu_pages; diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 66744f5768c8..17559f3ffbd5 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -629,6 +629,11 @@ int tdx_vm_init(struct kvm *kvm) struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); kvm->arch.has_protected_state = true; + /* + * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap, + * i.e. all EOIs are accelerated and never trigger exits. + */ + kvm->arch.has_protected_eoi = true; kvm->arch.has_private_mem = true; kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf386f2ebba3..99f2a150ca78 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6989,6 +6989,15 @@ set_identity_unlock: if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + /* + * Disallow an in-kernel I/O APIC if the VM has protected EOIs, + * i.e. if KVM can't intercept EOIs and thus can't properly + * emulate level-triggered interrupts. + */ + r = -ENOTTY; + if (kvm->arch.has_protected_eoi) + goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; From 06dc910f5e07a3b85ea1a6891596689f492ecc72 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 18 Jul 2025 08:19:01 +0800 Subject: [PATCH 059/142] KVM: x86/pmu: Correct typo "_COUTNERS" to "_COUNTERS" Fix typos. "_COUTNERS" -> "_COUNTERS". 
Signed-off-by: Dapeng Mi Tested-by: Yi Lai Link: https://lore.kernel.org/r/20250718001905.196989-2-dapeng1.mi@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 8 ++++---- arch/x86/kvm/vmx/pmu_intel.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 990fbbb96ebf..464a636b2dc6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -545,10 +545,10 @@ struct kvm_pmc { #define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \ KVM_MAX_NR_AMD_GP_COUNTERS) -#define KVM_MAX_NR_INTEL_FIXED_COUTNERS 3 -#define KVM_MAX_NR_AMD_FIXED_COUTNERS 0 -#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUTNERS, \ - KVM_MAX_NR_AMD_FIXED_COUTNERS) +#define KVM_MAX_NR_INTEL_FIXED_COUNTERS 3 +#define KVM_MAX_NR_AMD_FIXED_COUNTERS 0 +#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUNTERS, \ + KVM_MAX_NR_AMD_FIXED_COUNTERS) struct kvm_pmu { u8 version; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 07baff96300f..b2a2c4ebf448 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -478,8 +478,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) }; u64 eventsel; - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS); - BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS); + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUNTERS); + BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUNTERS); /* * Yell if perf reports support for a fixed counter but perf doesn't @@ -625,7 +625,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].current_config = 0; } - for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) { + for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUNTERS; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; From 4319fa120f0f42be167d56d84ece94340e11d61e Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Mon, 1 Sep 2025 21:18:21 +0800 Subject: [PATCH 060/142] KVM: x86: Use guard() instead of mutex_lock() to simplify code Use guard(mutex) instead of mutex_lock/mutex_unlock pair to simplify the error handling when allocating the APIC access page. No functional change intended. 
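For context, guard() from <linux/cleanup.h> ties mutex_unlock() to scope exit, so every return path implicitly drops the lock; the general shape (nothing_to_do/do_work are placeholders, not KVM code):

	guard(mutex)(&kvm->slots_lock);

	if (nothing_to_do)
		return 0;		/* mutex_unlock() runs automatically */

	return do_work(kvm);		/* ...and on this path too */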
Signed-off-by: Liao Yuanhong Link: https://lore.kernel.org/r/20250901131822.647802-1-liaoyuanhong@vivo.com [sean: add blank link to isolate guard(), tweak changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ab48e057dbdc..c7486f213432 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2747,24 +2747,21 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) int kvm_alloc_apic_access_page(struct kvm *kvm) { void __user *hva; - int ret = 0; - mutex_lock(&kvm->slots_lock); + guard(mutex)(&kvm->slots_lock); + if (kvm->arch.apic_access_memslot_enabled || kvm->arch.apic_access_memslot_inhibited) - goto out; + return 0; hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (IS_ERR(hva)) { - ret = PTR_ERR(hva); - goto out; - } + if (IS_ERR(hva)) + return PTR_ERR(hva); kvm->arch.apic_access_memslot_enabled = true; -out: - mutex_unlock(&kvm->slots_lock); - return ret; + + return 0; } EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page); From cbd860293d139fae44b76d821d8112aab89d8e17 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Mon, 1 Sep 2025 21:16:04 +0800 Subject: [PATCH 061/142] KVM: x86: hyper-v: Use guard() instead of mutex_lock() to simplify code Use guard(mutex) instead of mutex_lock/mutex_unlock pair to simplify the error handling when setting up the TSC page for a Hyper-V guest. No functional change intended. Signed-off-by: Liao Yuanhong Link: https://lore.kernel.org/r/20250901131604.646415-1-liaoyuanhong@vivo.com [sean: tweak changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 72b19a88a776..a471900c7325 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1168,15 +1168,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); - mutex_lock(&hv->hv_lock); + guard(mutex)(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_SET || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) - goto out_unlock; + return; if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - goto out_unlock; + return; gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* @@ -1192,7 +1192,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; - goto out_unlock; + return; } /* @@ -1228,12 +1228,10 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; - goto out_unlock; + return; out_err: hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; -out_unlock: - mutex_unlock(&hv->hv_lock); } void kvm_hv_request_tsc_page_update(struct kvm *kvm) From 5b5133e6a55bf6ca18de1b33c2ffe17d1162ec8e Mon Sep 17 00:00:00 2001 From: Jiaming Zhang Date: Sat, 6 Sep 2025 01:47:36 +0800 Subject: [PATCH 062/142] Documentation: KVM: Call out that KVM strictly follows the 8254 PIT spec Explicitly document that the behavior of KVM_SET_PIT2 strictly conforms to the Intel 8254 PIT hardware specification, specifically that a write of '0' adheres to the spec's definition that a programmed count of '0' is converted to the maximum possible value (2^16). E.g. 
an unaware userspace might attempt to validate that KVM_GET_PIT2 returns the exact state set via KVM_SET_PIT2, and be surprised when the returned count is 65536, not 0. Add a reference to the Intel 8254 PIT datasheet that will hopefully stay fresh for some time (the internet isn't exactly brimming with copies of the 8254 datasheet). Link: https://lore.kernel.org/all/CANypQFbEySjKOFLqtFFf2vrEe=NBr7XJfbkjQhqXuZGg7Rpoxw@mail.gmail.com Signed-off-by: Jiaming Zhang Link: https://lore.kernel.org/r/20250905174736.260694-1-r772577952@gmail.com [sean: add context Link, drop local APIC change, massage changelog accordingly] Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6aa40ee05a4a..b3f2395a62d1 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3075,6 +3075,12 @@ This IOCTL replaces the obsolete KVM_GET_PIT. Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. See KVM_GET_PIT2 for details on struct kvm_pit_state2. +.. Tip:: + ``KVM_SET_PIT2`` strictly adheres to the spec of Intel 8254 PIT. For example, + a ``count`` value of 0 in ``struct kvm_pit_channel_state`` is interpreted as + 65536, which is the maximum count value. Refer to `Intel 8254 programmable + interval timer `_. + This IOCTL replaces the obsolete KVM_SET_PIT. From 4687a2c4e6a61b247ad14ffb2ef5ca56e44fa4f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Aug 2025 12:56:36 -0700 Subject: [PATCH 063/142] KVM: VMX: Setup canonical VMCS config prior to kvm_x86_vendor_init() Setup the golden VMCS config during vmx_init(), before the call to kvm_x86_vendor_init(), instead of waiting until the callback to do hardware setup. setup_vmcs_config() only touches VMX state, i.e. doesn't poke anything in kvm.ko, and has no runtime dependencies beyond hv_init_evmcs(). Setting the VMCS config early on will allow referencing VMCS and VMX capabilities at any point during setup, e.g. to check for PERF_GLOBAL_CTRL save/load support during mediated PMU initialization. Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a3f0d458be9d..99be305555a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8359,8 +8359,6 @@ __init int vmx_hardware_setup(void) vmx_setup_user_return_msrs(); - if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) - return -EIO; if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -8584,11 +8582,18 @@ int __init vmx_init(void) return -EOPNOTSUPP; /* - * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing - * to unwind if a later step fails. + * Note, VMCS and eVMCS configuration only touch VMX knobs/variables, + * i.e. there's nothing to unwind if a later step fails. */ hv_init_evmcs(); + /* + * Parse the VMCS config and VMX capabilities before anything else, so
+ */ + if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) + return -EIO; + r = kvm_x86_vendor_init(&vt_init_ops); if (r) return r; From e3d1f2826da68c763cba1f29dba5cc81d3fdaaee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Aug 2025 12:56:37 -0700 Subject: [PATCH 064/142] KVM: SVM: Check pmu->version, not enable_pmu, when getting PMC MSRs Gate access to PMC MSRs based on pmu->version, not on kvm->arch.enable_pmu, to more accurately reflect KVM's behavior. This is a glorified nop, as pmu->version and pmu->nr_arch_gp_counters can only be non-zero if amd_pmu_refresh() is reached, kvm_pmu_refresh() invokes amd_pmu_refresh() if and only if kvm->arch.enable_pmu is true, and amd_pmu_refresh() forces pmu->version to be 1 or 2. I.e. the following holds true: !pmu->nr_arch_gp_counters || kvm->arch.enable_pmu == (pmu->version > 0) and so the only way for amd_pmu_get_pmc() to return a non-NULL value is if both kvm->arch.enable_pmu and pmu->version evaluate to true. No real functional change intended. Reviewed-by: Sandipan Das Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index aa4379e46e96..f3163237286f 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -41,7 +41,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); unsigned int idx; - if (!vcpu->kvm->arch.enable_pmu) + if (!pmu->version) return NULL; switch (msr) { From 51f34b1e650fc5843530266cea4341750bd1ae37 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Aug 2025 12:56:39 -0700 Subject: [PATCH 065/142] KVM: x86/pmu: Snapshot host (i.e. perf's) reported PMU capabilities Take a snapshot of the unadulterated PMU capabilities provided by perf so that KVM can compare guest vPMU capabilities against hardware capabilities when determining whether or not to intercept PMU MSRs (and RDPMC). Reviewed-by: Sandipan Das Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 40b076a7a5f6..5205f0d9ced9 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -26,6 +26,10 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +/* Unadulterated PMU capabilities of the host, i.e. of hardware. */ +static struct x86_pmu_capability __read_mostly kvm_host_pmu; + +/* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support.
*/ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); @@ -104,6 +108,8 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; + perf_get_x86_pmu_capability(&kvm_host_pmu); + /* * Hybrid PMUs don't play nice with virtualization without careful * configuration by userspace, and KVM's APIs for reporting supported @@ -114,18 +120,16 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) enable_pmu = false; if (enable_pmu) { - perf_get_x86_pmu_capability(&kvm_pmu_cap); - /* * WARN if perf did NOT disable hardware PMU if the number of * architecturally required GP counters aren't present, i.e. if * there are a non-zero number of counters, but fewer than what * is architecturally required. */ - if (!kvm_pmu_cap.num_counters_gp || - WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) + if (!kvm_host_pmu.num_counters_gp || + WARN_ON_ONCE(kvm_host_pmu.num_counters_gp < min_nr_gp_ctrs)) enable_pmu = false; - else if (is_intel && !kvm_pmu_cap.version) + else if (is_intel && !kvm_host_pmu.version) enable_pmu = false; } @@ -134,6 +138,7 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) return; } + memcpy(&kvm_pmu_cap, &kvm_host_pmu, sizeof(kvm_host_pmu)); kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, pmu_ops->MAX_NR_GP_COUNTERS); From 1e24bece26812547608dd02eb1fc138359d0f813 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 6 Aug 2025 12:56:44 -0700 Subject: [PATCH 066/142] KVM: x86: Rename vmx_vmentry/vmexit_ctrl() helpers Rename the two helpers vmx_vmentry/vmexit_ctrl() to vmx_get_initial_vmentry/vmexit_ctrl() to represent their real meaning. No functional change intended. 
Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 99be305555a7..75cd5768e8ad 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4304,7 +4304,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) return pin_based_exec_ctrl; } -static u32 vmx_vmentry_ctrl(void) +static u32 vmx_get_initial_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; @@ -4321,7 +4321,7 @@ static u32 vmx_vmentry_ctrl(void) return vmentry_ctrl; } -static u32 vmx_vmexit_ctrl(void) +static u32 vmx_get_initial_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; @@ -4686,10 +4686,10 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); + vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ - vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); + vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl()); vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); From cdfed9370b96aabaa2d20f65ca4e9c9b009fe8fa Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 6 Aug 2025 12:56:45 -0700 Subject: [PATCH 067/142] KVM: x86/pmu: Move PMU_CAP_{FW_WRITES,LBR_FMT} into msr-index.h header Move PMU_CAP_{FW_WRITES,LBR_FMT} into msr-index.h and rename them with PERF_CAP prefix to keep consistent with other perf capabilities macros. No functional change intended. 
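With all of the capability bits defined in one place, decoding a raw MSR_IA32_PERF_CAPABILITIES value is mechanical; e.g. for a made-up value of 0x20c5:

	u64 caps = 0x20c5;				/* example value only */
	u8 lbr_fmt = caps & PERF_CAP_LBR_FMT;		/* 0x05 */
	bool fw_writes = caps & PERF_CAP_FW_WRITES;	/* true, bit 13 */
	bool pebs_trap = caps & PERF_CAP_PEBS_TRAP;	/* true, bit 6 */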
Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-24-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 15 +++++++++------ arch/x86/kvm/vmx/capabilities.h | 3 --- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 12 ++++++------ 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 20fa4a79df13..717baeba6db3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -315,12 +315,15 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) + +#define PERF_CAP_LBR_FMT 0x3f +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_FW_WRITES BIT_ULL(13) +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 5316c27f6099..f614428dbeda 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -20,9 +20,6 @@ extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 -#define PMU_CAP_FW_WRITES (1ULL << 13) -#define PMU_CAP_LBR_FMT 0x3f - struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. 
We diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b2a2c4ebf448..343de013eacd 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -138,7 +138,7 @@ static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) { - return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0; + return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; } static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) @@ -588,7 +588,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) perf_capabilities = vcpu_get_perf_capabilities(vcpu); if (intel_pmu_lbr_is_compatible(vcpu) && - (perf_capabilities & PMU_CAP_LBR_FMT)) + (perf_capabilities & PERF_CAP_LBR_FMT)) memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else lbr_desc->records.nr = 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 75cd5768e8ad..5d3a50547c3e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2127,7 +2127,7 @@ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; - if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && + if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) && (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; @@ -2412,9 +2412,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->pt_desc.guest.addr_a[index / 2] = data; break; case MSR_IA32_PERF_CAPABILITIES: - if (data & PMU_CAP_LBR_FMT) { - if ((data & PMU_CAP_LBR_FMT) != - (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) + if (data & PERF_CAP_LBR_FMT) { + if ((data & PERF_CAP_LBR_FMT) != + (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; @@ -7810,7 +7810,7 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) static __init u64 vmx_get_perf_capabilities(void) { - u64 perf_cap = PMU_CAP_FW_WRITES; + u64 perf_cap = PERF_CAP_FW_WRITES; u64 host_perf_cap = 0; if (!enable_pmu) @@ -7830,7 +7830,7 @@ static __init u64 vmx_get_perf_capabilities(void) if (!vmx_lbr_caps.has_callstack) memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); else if (vmx_lbr_caps.nr) - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT; } if (vmx_pebs_supported()) { From 6057497336bbfabd3a2f632bba2cd2bfbcb7b304 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Aug 2025 12:56:46 -0700 Subject: [PATCH 068/142] KVM: x86: Rework KVM_REQ_MSR_FILTER_CHANGED into a generic RECALC_INTERCEPTS Rework the MSR_FILTER_CHANGED request into a more generic RECALC_INTERCEPTS request, and expand the responsibilities of vendor code to recalculate all intercepts that vary based on userspace input, e.g. instruction intercepts that are tied to guest CPUID. Providing a generic recalc request will allow the upcoming mediated PMU support to trigger a recalc when PMU features, e.g. PERF_CAPABILITIES, are set by userspace, without having to make multiple calls to/from PMU code. As a bonus, using a request will effectively coalesce recalcs, e.g. will reduce the number of recalcs for normal usage from 3+ to 1 (vCPU create, set CPUID, set PERF_CAPABILITIES (Intel only), set filter). 
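The coalescing falls out of the request machinery itself, as a request is a single bit per vCPU; roughly:

	/* any number of senders... */
	kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);
	kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);	/* bit already set, no-op */

	/* ...one test-and-clear consumer in vcpu_enter_guest() */
	if (kvm_check_request(KVM_REQ_RECALC_INTERCEPTS, vcpu))
		kvm_x86_call(recalc_intercepts)(vcpu);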
The downside is that MSR filter changes that are done in isolation will do a small amount of unnecessary work, but that's already a relatively slow path, and the cost of recalculating instruction intercepts is negligible. Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-25-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/svm/svm.c | 8 ++++---- arch/x86/kvm/vmx/main.c | 14 +++++++------- arch/x86/kvm/vmx/vmx.c | 9 +++++++-- arch/x86/kvm/vmx/x86_ops.h | 2 +- arch/x86/kvm/x86.c | 15 +++++++-------- 7 files changed, 29 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 18a5c3119e1a..7c240e23bd52 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -138,7 +138,7 @@ KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) -KVM_X86_OP(recalc_msr_intercepts) +KVM_X86_OP(recalc_intercepts) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 464a636b2dc6..7e1e41f04752 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -120,7 +120,7 @@ #define KVM_REQ_TLB_FLUSH_GUEST \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) -#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) +#define KVM_REQ_RECALC_INTERCEPTS KVM_ARCH_REQ(29) #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ @@ -1914,7 +1914,7 @@ struct kvm_x86_ops { int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); - void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu); + void (*recalc_intercepts)(struct kvm_vcpu *vcpu); int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7e7821ee8ee1..711f160113d1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1077,7 +1077,7 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) } } -static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) +static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) { svm_recalc_instruction_intercepts(vcpu); svm_recalc_msr_intercepts(vcpu); @@ -1225,7 +1225,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_hv_init_vmcb(vmcb); - svm_recalc_intercepts_after_set_cpuid(vcpu); + svm_recalc_intercepts(vcpu); vmcb_mark_all_dirty(vmcb); @@ -4479,7 +4479,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); - svm_recalc_intercepts_after_set_cpuid(vcpu); + svm_recalc_intercepts(vcpu); } static bool svm_has_wbinvd_exit(void) @@ -5181,7 +5181,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .apic_init_signal_blocked = svm_apic_init_signal_blocked, - .recalc_msr_intercepts = svm_recalc_msr_intercepts, + .recalc_intercepts = svm_recalc_intercepts, .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, diff --git a/arch/x86/kvm/vmx/main.c 
b/arch/x86/kvm/vmx/main.c index dbab1c15b0cd..68dcafd177a8 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -188,18 +188,18 @@ static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return vmx_get_msr(vcpu, msr_info); } -static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu) +static void vt_recalc_intercepts(struct kvm_vcpu *vcpu) { /* - * TDX doesn't allow VMM to configure interception of MSR accesses. - * TDX guest requests MSR accesses by calling TDVMCALL. The MSR - * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR - * if the userspace has set any. + * TDX doesn't allow VMM to configure interception of instructions or + * MSR accesses. TDX guest requests MSR accesses by calling TDVMCALL. + * The MSR filters will be applied when handling the TDVMCALL for + * RDMSR/WRMSR if the userspace has set any. */ if (is_td_vcpu(vcpu)) return; - vmx_recalc_msr_intercepts(vcpu); + vmx_recalc_intercepts(vcpu); } static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) @@ -995,7 +995,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .apic_init_signal_blocked = vt_op(apic_init_signal_blocked), .migrate_timers = vmx_migrate_timers, - .recalc_msr_intercepts = vt_op(recalc_msr_intercepts), + .recalc_intercepts = vt_op(recalc_intercepts), .complete_emulated_msr = vt_op(complete_emulated_msr), .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5d3a50547c3e..68bec421f3fc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4068,7 +4068,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } -void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) +static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { if (!cpu_has_vmx_msr_bitmap()) return; @@ -4121,6 +4121,11 @@ void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) */ } +void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) +{ + vmx_recalc_msr_intercepts(vcpu); +} + static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { @@ -7802,7 +7807,7 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~FEAT_CTL_SGX_LC_ENABLED; /* Recalc MSR interception to account for feature changes. */ - vmx_recalc_msr_intercepts(vcpu); + vmx_recalc_intercepts(vcpu); /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 2b3424f638db..2c590ff44ced 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -52,7 +52,7 @@ void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); -void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu); +void vmx_recalc_intercepts(struct kvm_vcpu *vcpu); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); int vmx_get_feature_msr(u32 msr, u64 *data); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 99f2a150ca78..64e08148909b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6794,7 +6794,11 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, kvm_free_msr_filter(old_filter); - kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + /* + * Recalc MSR intercepts as userspace may want to intercept accesses to + * MSRs that KVM would otherwise pass through to the guest. 
+ */ + kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS); return 0; } @@ -10827,13 +10831,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); - /* - * Recalc MSR intercepts as userspace may want to intercept - * accesses to MSRs that KVM would otherwise pass through to - * the guest. - */ - if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - kvm_x86_call(recalc_msr_intercepts)(vcpu); + if (kvm_check_request(KVM_REQ_RECALC_INTERCEPTS, vcpu)) + kvm_x86_call(recalc_intercepts)(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) kvm_x86_call(update_cpu_dirty_logging)(vcpu); From 5a1a726e68ff256f564cf51ad21a96bceb4dd954 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Aug 2025 12:56:47 -0700 Subject: [PATCH 069/142] KVM: x86: Use KVM_REQ_RECALC_INTERCEPTS to react to CPUID updates Defer recalculating MSR and instruction intercepts after a CPUID update via RECALC_INTERCEPTS to converge on RECALC_INTERCEPTS as the "official" mechanism for triggering recalcs. As a bonus, because KVM does a "recalc" during vCPU creation, and every functional VMM sets CPUID at least once, for all intents and purposes this saves at least one recalc. Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-26-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/svm/svm.c | 4 +--- arch/x86/kvm/vmx/vmx.c | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ad6cadf09930..efee08fad72e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -448,6 +448,8 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * adjustments to the reserved GPA bits. */ kvm_mmu_after_set_cpuid(vcpu); + + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); } int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 711f160113d1..22c5e4ffa132 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1225,7 +1225,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_hv_init_vmcb(vmcb); - svm_recalc_intercepts(vcpu); + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); vmcb_mark_all_dirty(vmcb); @@ -4478,8 +4478,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); - - svm_recalc_intercepts(vcpu); } static bool svm_has_wbinvd_exit(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 68bec421f3fc..6b9523ca082c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7806,9 +7806,6 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_LC_ENABLED; - /* Recalc MSR interception to account for feature changes. */ - vmx_recalc_intercepts(vcpu); - /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); } From 2bff2edf69ed80b21d35470a3580f25979be2c4b Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 6 Aug 2025 12:56:48 -0700 Subject: [PATCH 070/142] KVM: VMX: Add helpers to toggle/change a bit in VMCS execution controls Expand the VMCS controls builder macros to generate helpers to change a bit to the desired value, and use the new helpers when toggling APICv related controls. No functional change intended. 
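Concretely, for the 32-bit secondary execution controls the builder now also emits a helper of this shape (a sketch of the macro expansion):

	static __always_inline void
	secondary_exec_controls_changebit(struct vcpu_vmx *vmx, u32 val, bool set)
	{
		if (set)
			secondary_exec_controls_setbit(vmx, val);
		else
			secondary_exec_controls_clearbit(vmx, val);
	}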
Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang [sean: rewrite changelog] Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-27-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 20 +++++++------------- arch/x86/kvm/vmx/vmx.h | 8 ++++++++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6b9523ca082c..d2fa64bddb57 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4356,19 +4356,13 @@ void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (kvm_vcpu_apicv_active(vcpu)) { - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - if (enable_ipiv) - tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); - } else { - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - if (enable_ipiv) - tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); - } + secondary_exec_controls_changebit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY, + kvm_vcpu_apicv_active(vcpu)); + if (enable_ipiv) + tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT, + kvm_vcpu_apicv_active(vcpu)); vmx_update_msr_bitmap_x2apic(vcpu); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 24d65dac5e89..23d6e89b96f2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -608,6 +608,14 @@ static __always_inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##b { \ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +} \ +static __always_inline void lname##_controls_changebit(struct vcpu_vmx *vmx, u##bits val, \ + bool set) \ +{ \ + if (set) \ + lname##_controls_setbit(vmx, val); \ + else \ + lname##_controls_clearbit(vmx, val); \ } BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) From 30c0267f15812114d7ab96a7dd45874742d736fe Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 6 Aug 2025 12:56:51 -0700 Subject: [PATCH 071/142] KVM: x86/pmu: Use BIT_ULL() instead of open coded equivalents Replace a variety of "1ull << N" and "(u64)1 << N" snippets with BIT_ULL() in the PMU code. No functional change intended. 
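Beyond consistency, BIT_ULL() also sidesteps the classic width bug: a plain "1 << 48" shifts a 32-bit int and is undefined, whereas BIT_ULL(48) expands to an unsigned 64-bit shift, e.g.:

	u64 mask = BIT_ULL(48) - 1;	/* 0x0000ffffffffffff, a 48-bit counter mask */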
Signed-off-by: Dapeng Mi [sean: split to separate patch, write changelog] Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-30-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/pmu.c | 4 ++-- arch/x86/kvm/vmx/pmu_intel.c | 15 ++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index f3163237286f..25ccd6a99838 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -200,11 +200,11 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); if (pmu->version > 1) { - pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_ctrl_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1); pmu->global_status_rsvd = pmu->global_ctrl_rsvd; } - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; + pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; /* not applicable to AMD; but clean them to prevent any fall out */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 343de013eacd..096f091980f0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -536,11 +536,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); eax.split.bit_width = min_t(int, eax.split.bit_width, kvm_pmu_cap.bit_width_gp); - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(eax.split.bit_width) - 1; eax.split.mask_length = min_t(int, eax.split.mask_length, kvm_pmu_cap.events_mask_len); - pmu->available_event_types = ~entry->ebx & - ((1ull << eax.split.mask_length) - 1); + pmu->available_event_types = ~entry->ebx & (BIT_ULL(eax.split.mask_length) - 1); if (pmu->version == 1) { pmu->nr_arch_fixed_counters = 0; @@ -549,16 +548,15 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_fixed); edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, kvm_pmu_cap.bit_width_fixed); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << edx.split.bit_width_fixed) - 1; + pmu->counter_bitmask[KVM_PMC_FIXED] = BIT_ULL(edx.split.bit_width_fixed) - 1; } intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | INTEL_FIXED_0_ENABLE_PMI); - counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); + counter_rsvd = ~((BIT_ULL(pmu->nr_arch_gp_counters) - 1) | + ((BIT_ULL(pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); pmu->global_ctrl_rsvd = counter_rsvd; /* @@ -603,8 +601,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_data_cfg_rsvd = ~0xff00000full; intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE); } else { - pmu->pebs_enable_rsvd = - ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->pebs_enable_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1); } } } From 9bae7a086394128bd829c0c04f9f75600fdc1bc1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Aug 2025 12:56:52 -0700 Subject: [PATCH 072/142] KVM: x86/pmu: Move initialization of valid PMCs bitmask to common x86 Move all initialization of all_valid_pmc_idx to common code, as the logic is 100% common to Intel and AMD, and KVM heavily relies on Intel and AMD having the same semantics. E.g. 
the fact that AMD doesn't support fixed counters doesn't allow KVM to use all_valid_pmc_idx[63:32] for other purposes. Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-31-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 4 ++++ arch/x86/kvm/svm/pmu.c | 1 - arch/x86/kvm/vmx/pmu_intel.c | 5 ----- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 5205f0d9ced9..b7dc5bd981ba 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -888,6 +888,10 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) */ if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); + + bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); + bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, + pmu->nr_arch_fixed_counters); } void kvm_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 25ccd6a99838..bc062285fbf5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -210,7 +210,6 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; - bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } static void amd_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 096f091980f0..edf0db8e53ad 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -579,11 +579,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); } - bitmap_set(pmu->all_valid_pmc_idx, - 0, pmu->nr_arch_gp_counters); - bitmap_set(pmu->all_valid_pmc_idx, - INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - perf_capabilities = vcpu_get_perf_capabilities(vcpu); if (intel_pmu_lbr_is_compatible(vcpu) && (perf_capabilities & PERF_CAP_LBR_FMT)) From c49aa98376864ddf82175a7ea48977c637398653 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Aug 2025 12:56:53 -0700 Subject: [PATCH 073/142] KVM: x86/pmu: Restrict GLOBAL_{CTRL,STATUS}, fixed PMCs, and PEBS to PMU v2+ Restrict support for GLOBAL_CTRL, GLOBAL_STATUS, fixed PMCs, and PEBS to v2 or later vPMUs. The SDM explicitly states that GLOBAL_{CTRL,STATUS} and fixed counters were introduced with PMU v2, and PEBS has hard dependencies on fixed counters and the bitmap MSR layouts established by PMU v2. 
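The resulting control flow, sketched with the details elided (pmu_sketch is a hypothetical stand-in for struct kvm_pmu, not the actual refresh code):

struct pmu_sketch { int version; };

static void pmu_refresh_sketch(struct pmu_sketch *pmu)
{
	/* ... PMU v1 state: GP counters, available events, LBRs ... */

	if (pmu->version == 1)
		return;

	/* ... v2+ only: fixed counters, GLOBAL_{CTRL,STATUS}, PEBS ... */
}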
Reported-by: Dapeng Mi Tested-by: Xudong Hao Link: https://lore.kernel.org/r/20250806195706.1650976-32-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/pmu_intel.c | 51 ++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index edf0db8e53ad..de1d9785c01f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -541,16 +541,33 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.events_mask_len); pmu->available_event_types = ~entry->ebx & (BIT_ULL(eax.split.mask_length) - 1); - if (pmu->version == 1) { - pmu->nr_arch_fixed_counters = 0; - } else { - pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, - kvm_pmu_cap.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, - kvm_pmu_cap.bit_width_fixed); - pmu->counter_bitmask[KVM_PMC_FIXED] = BIT_ULL(edx.split.bit_width_fixed) - 1; + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { + pmu->reserved_bits ^= HSW_IN_TX; + pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); } + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (intel_pmu_lbr_is_compatible(vcpu) && + (perf_capabilities & PERF_CAP_LBR_FMT)) + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); + else + lbr_desc->records.nr = 0; + + if (lbr_desc->records.nr) + bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + if (pmu->version == 1) + return; + + pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, + kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); + pmu->counter_bitmask[KVM_PMC_FIXED] = BIT_ULL(edx.split.bit_width_fixed) - 1; + intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | INTEL_FIXED_0_ENABLE_PMI); @@ -571,24 +588,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_status_rsvd &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); - if (entry && - (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { - pmu->reserved_bits ^= HSW_IN_TX; - pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); - } - - perf_capabilities = vcpu_get_perf_capabilities(vcpu); - if (intel_pmu_lbr_is_compatible(vcpu) && - (perf_capabilities & PERF_CAP_LBR_FMT)) - memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); - else - lbr_desc->records.nr = 0; - - if (lbr_desc->records.nr) - bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); - if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { pmu->pebs_enable_rsvd = counter_rsvd; From 510c47f165f0c1f0b57329a30a9a797795519831 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 18 Sep 2025 08:32:25 +0300 Subject: [PATCH 074/142] KVM: TDX: Fix uninitialized error code for __tdx_bringup() Fix a Smatch static checker warning reported by Dan: arch/x86/kvm/vmx/tdx.c:3464 __tdx_bringup() warn: missing error code 'r' Initialize r to -EINVAL before tdx_get_sysinfo() to simplify the code and to prevent similar issues from sneaking in later on as suggested by Kai. 
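The underlying pattern, as a hedged sketch (check_a()/check_b() are hypothetical stand-ins, not TDX code): assign the common error code once, before the run of checks, so that no individual goto site can forget to set it:

#include <errno.h>

static int check_a(void) { return 1; }	/* hypothetical stand-ins */
static int check_b(void) { return 1; }

static int bringup_sketch(void)
{
	int r = -EINVAL;	/* set up front, covers every goto below */

	if (!check_a())
		goto err;
	if (!check_b())
		goto err;

	return 0;
err:
	return r;	/* always initialized, so Smatch stays quiet */
}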
Cc: stable@vger.kernel.org Reported-by: Dan Carpenter Fixes: 61bb28279623 ("KVM: TDX: Get system-wide info about TDX module on initialization") Suggested-by: Kai Huang Reviewed-by: Kai Huang Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20250918053226.802204-1-tony.lindgren@linux.intel.com [sean: tag for stable] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 6784aaaced87..36c15035ecb4 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3457,12 +3457,11 @@ static int __init __tdx_bringup(void) if (r) goto tdx_bringup_err; + r = -EINVAL; /* Get TDX global information for later use */ tdx_sysinfo = tdx_get_sysinfo(); - if (WARN_ON_ONCE(!tdx_sysinfo)) { - r = -EINVAL; + if (WARN_ON_ONCE(!tdx_sysinfo)) goto get_sysinfo_err; - } /* Check TDX module and KVM capabilities */ if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || @@ -3505,14 +3504,11 @@ static int __init __tdx_bringup(void) if (td_conf->max_vcpus_per_td < num_present_cpus()) { pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", td_conf->max_vcpus_per_td, num_present_cpus()); - r = -EINVAL; goto get_sysinfo_err; } - if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { - r = -EINVAL; + if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) goto get_sysinfo_err; - } /* * Leave hardware virtualization enabled after TDX is enabled From e8f85d7884e0890ea43aa36396bcaf8a2659c2ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:46:39 -0700 Subject: [PATCH 075/142] KVM: x86: Don't treat ENTER and LEAVE as branches, because they aren't Remove the IsBranch flag from ENTER and LEAVE in KVM's emulator, as ENTER and LEAVE are stack operations, not branches. Add forced emulation of said instructions to the PMU counters test to prove that KVM diverges from hardware, and to guard against regressions. Opportunistically add a missing "1 MOV" to the selftest comment regarding the number of instructions per loop, which commit 7803339fa929 ("KVM: selftests: Use data load to trigger LLC references/misses in Intel PMU") forgot to add. 
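Spelling out the selftest's per-loop arithmetic after the change (a sketch of the counting logic only; NUM_LOOPS is an illustrative value here, and the real test also accounts for a handful of post-loop instructions):

#define NUM_LOOPS		10	/* illustrative value */
#define NUM_INSNS_PER_LOOP	6	/* ENTER, CLFLUSH/CLFLUSHOPT/NOP, MFENCE, MOV, LEAVE, LOOP */

/* Only LOOP is a branch; ENTER and LEAVE must not bump branch counters. */
#define EXPECTED_BRANCHES_RETIRED	NUM_LOOPS
#define EXPECTED_INSNS_RETIRED		(NUM_LOOPS * NUM_INSNS_PER_LOOP)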
Fixes: 018d70ffcfec ("KVM: x86: Update vPMCs when retiring branch instructions") Cc: Jim Mattson Reviewed-by: Jim Mattson Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919004639.1360453-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 4 ++-- tools/testing/selftests/kvm/x86/pmu_counters_test.c | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 542d3664afa3..23929151a5b8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4330,8 +4330,8 @@ static const struct opcode opcode_table[256] = { I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter), - I(Stack | IsBranch, em_leave), + I(Stack | SrcImmU16 | Src2ImmByte, em_enter), + I(Stack, em_leave), I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm), I(ImplicitOps | IsBranch, em_ret_far), D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn), diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c index 8aaaf25b6111..881cce1c81b8 100644 --- a/tools/testing/selftests/kvm/x86/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c @@ -14,10 +14,10 @@ #define NUM_BRANCH_INSNS_RETIRED (NUM_LOOPS) /* - * Number of instructions in each loop. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, - * 1 LOOP. + * Number of instructions in each loop. 1 ENTER, 1 CLFLUSH/CLFLUSHOPT/NOP, + * 1 MFENCE, 1 MOV, 1 LEAVE, 1 LOOP. */ -#define NUM_INSNS_PER_LOOP 4 +#define NUM_INSNS_PER_LOOP 6 /* * Number of "extra" instructions that will be counted, i.e. the number of @@ -210,9 +210,11 @@ do { \ __asm__ __volatile__("wrmsr\n\t" \ " mov $" __stringify(NUM_LOOPS) ", %%ecx\n\t" \ "1:\n\t" \ + FEP "enter $0, $0\n\t" \ clflush "\n\t" \ "mfence\n\t" \ "mov %[m], %%eax\n\t" \ + FEP "leave\n\t" \ FEP "loop 1b\n\t" \ FEP "mov %%edi, %%ecx\n\t" \ FEP "xor %%eax, %%eax\n\t" \ From 86bcd23df9cec9c2df520ae0982033e301d3c184 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Tue, 9 Sep 2025 07:39:52 +0700 Subject: [PATCH 076/142] KVM: x86: Fix hypercalls docs section number order Commit 4180bf1b655a79 ("KVM: X86: Implement "send IPI" hypercall") documents the KVM_HC_SEND_IPI hypercall, yet its section number duplicates that of KVM_HC_CLOCK_PAIRING (both are numbered 6). Fix the numbering so that the former becomes 7. Fixes: 4180bf1b655a ("KVM: X86: Implement "send IPI" hypercall") Signed-off-by: Bagas Sanjaya Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20250909003952.10314-1-bagasdotme@gmail.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/x86/hypercalls.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/x86/hypercalls.rst b/Documentation/virt/kvm/x86/hypercalls.rst index 10db7924720f..521ecf9a8a36 100644 --- a/Documentation/virt/kvm/x86/hypercalls.rst +++ b/Documentation/virt/kvm/x86/hypercalls.rst @@ -137,7 +137,7 @@ compute the CLOCK_REALTIME for its clock, at the same instant. Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. -6. KVM_HC_SEND_IPI +7. KVM_HC_SEND_IPI ------------------ :Architecture: x86 @@ -158,7 +158,7 @@ corresponds to the APIC ID a2+1, and so on. Returns the number of CPUs to which the IPIs were delivered successfully. -7. KVM_HC_SCHED_YIELD +8.
KVM_HC_SCHED_YIELD --------------------- :Architecture: x86 @@ -170,7 +170,7 @@ a0: destination APIC ID :Usage example: When sending a call-function IPI-many to vCPUs, yield if any of the IPI target vCPUs was preempted. -8. KVM_HC_MAP_GPA_RANGE +9. KVM_HC_MAP_GPA_RANGE ------------------------- :Architecture: x86 :Status: active From e0ff302b79c54567eb6b8f609e76c633978ff06d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:08 -0700 Subject: [PATCH 077/142] KVM: SEV: Rename kvm_ghcb_get_sw_exit_code() to kvm_get_cached_sw_exit_code() Rename kvm_ghcb_get_sw_exit_code() to kvm_get_cached_sw_exit_code() to make it clear that KVM is getting the cached value, not reading directly from the guest-controlled GHCB. More importantly, vacating kvm_ghcb_get_sw_exit_code() will allow adding a KVM-specific macro-built kvm_ghcb_get_##field() helper to read values from the GHCB. No functional change intended. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250919223258.1604852-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a95b862afa23..5456a71f9504 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3220,7 +3220,7 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control) { return (((u64)control->exit_code_hi) << 32) | control->exit_code; } @@ -3246,7 +3246,7 @@ static void dump_ghcb(struct vcpu_svm *svm) */ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); + kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", @@ -3335,7 +3335,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. */ - exit_code = kvm_ghcb_get_sw_exit_code(control); + exit_code = kvm_get_cached_sw_exit_code(control); /* Only GHCB Usage code 0 is supported */ if (svm->sev_es.ghcb->ghcb_usage) { @@ -4340,7 +4340,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm_vmgexit_success(svm, 0); - exit_code = kvm_ghcb_get_sw_exit_code(control); + exit_code = kvm_get_cached_sw_exit_code(control); switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); From bd5f500d23170e5bde59ce97da523048b66a8183 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:09 -0700 Subject: [PATCH 078/142] KVM: SEV: Read save fields from GHCB exactly once Wrap all reads of GHCB save fields with READ_ONCE() via a KVM-specific GHCB get() utility to help guard against TOCTOU bugs. Using READ_ONCE() doesn't completely prevent such bugs, e.g. doesn't prevent KVM from redoing get() after checking the initial value, but at least addresses all potential TOCTOU issues in the current KVM code base. To prevent unintentional use of the generic helpers, take only @svm for the kvm_ghcb_get_xxx() helpers and retrieve the ghcb instead of explicitly passing it in. 
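To make the TOCTOU hazard concrete, a hedged sketch of the bug class that snapshotting with READ_ONCE() mitigates (the READ_ONCE() definition is simplified from the kernel's rwonce.h, and guest_field is a hypothetical stand-in for a guest-writable GHCB field):

/* Force exactly one load; the compiler can neither split nor repeat it. */
#define READ_ONCE(x)	(*(const volatile typeof(x) *)&(x))

unsigned long long guest_field;	/* the guest can rewrite this at any time */

unsigned long long snapshot_guest_field(void)
{
	/*
	 * A plain "check then use" of guest_field compiles to two loads,
	 * and the guest can change the value in between, i.e. the value
	 * that was validated may not be the value that gets consumed.
	 * Snapshot once and operate only on the local copy.
	 */
	return READ_ONCE(guest_field);
}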
Opportunistically reduce the indentation of the macro-defined helpers and clean up the alignment. Fixes: 4e15a0ddc3ff ("KVM: SEV: snapshot the GHCB before accessing it") Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250919223258.1604852-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 22 +++++++++++----------- arch/x86/kvm/svm/svm.h | 25 +++++++++++++++---------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5456a71f9504..a2076ab3fe71 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3299,26 +3299,26 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm); - svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb); + svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm); if (kvm_ghcb_xcr0_is_valid(svm)) { - vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); + vcpu->arch.xcr0 = kvm_ghcb_get_xcr0(svm); vcpu->arch.cpuid_dynamic_bits_dirty = true; } /* Copy the GHCB exit information into the VMCB fields */ - exit_code = ghcb_get_sw_exit_code(ghcb); + exit_code = kvm_ghcb_get_sw_exit_code(svm); control->exit_code = lower_32_bits(exit_code); control->exit_code_hi = upper_32_bits(exit_code); - control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb); - control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb); - svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb); + control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); + control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); + svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); /* Clear the valid entries fields */ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 3c7f208b7935..10d5cbc259e1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -913,16 +913,21 @@ void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted, void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #define DEFINE_KVM_GHCB_ACCESSORS(field) \ - static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \ - { \ - return test_bit(GHCB_BITMAP_IDX(field), \ - (unsigned long *)&svm->sev_es.valid_bitmap); \ - } \ - \ - static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \ - { \ - return kvm_ghcb_##field##_is_valid(svm) ? 
ghcb->save.field : 0; \ } \ +static __always_inline u64 kvm_ghcb_get_##field(struct vcpu_svm *svm) \ +{ \ + return READ_ONCE(svm->sev_es.ghcb->save.field); \ +} \ + \ +static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \ +{ \ + return test_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&svm->sev_es.valid_bitmap); \ +} \ + \ +static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm) \ +{ \ + return kvm_ghcb_##field##_is_valid(svm) ? kvm_ghcb_get_##field(svm) : 0; \ +} DEFINE_KVM_GHCB_ACCESSORS(cpl) DEFINE_KVM_GHCB_ACCESSORS(rax) From 4135a9a8ccba2b685f2301429ea765fa0f78eb89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:10 -0700 Subject: [PATCH 079/142] KVM: SEV: Validate XCR0 provided by guest in GHCB Use __kvm_set_xcr() to propagate XCR0 changes from the GHCB to KVM's software model in order to validate the new XCR0 against KVM's view of the supported XCR0. Allowing garbage is thankfully mostly benign, as kvm_load_{guest,host}_xsave_state() bail early for vCPUs with protected state, xstate_required_size() will simply provide garbage back to the guest, and attempting to save/restore the bad value via KVM_{G,S}ET_XCRS will only harm the guest (setting XCR0 will fail). However, allowing the guest to put junk into a field that KVM assumes is valid is a CVE waiting to happen. And as a bonus, using the proper API eliminates the ugly open coding of setting arch.cpuid_dynamic_bits_dirty. Simply ignore bad values, as either the guest managed to get an unsupported value into hardware, or the guest is misbehaving and providing pure garbage. In either case, KVM can't fix the broken guest. Note, using __kvm_set_xcr() also avoids recomputing dynamic CPUID bits if XCR0 isn't actually changing (relative to KVM's previous snapshot).
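A hedged sketch of the validation __kvm_set_xcr() provides (simplified; the real function also checks inter-feature dependencies, e.g. that YMM state requires SSE state):

#define XFEATURE_MASK_FP	(1ULL << 0)	/* x87; architecturally must stay set */

static int set_xcr0_sketch(unsigned long long *guest_xcr0,
			   unsigned long long new_xcr0,
			   unsigned long long supported_xcr0)
{
	if (!(new_xcr0 & XFEATURE_MASK_FP))
		return 1;		/* x87 can never be cleared */
	if (new_xcr0 & ~supported_xcr0)
		return 1;		/* reject bits KVM doesn't support */

	*guest_xcr0 = new_xcr0;		/* commit to the software model */
	return 0;
}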
Cc: Tom Lendacky Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250919223258.1604852-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/sev.c | 6 ++---- arch/x86/kvm/x86.c | 3 ++- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f19a76d3ca0e..822a5596a4a0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2187,6 +2187,7 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a2076ab3fe71..d2ffacd43234 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3307,10 +3307,8 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm); - if (kvm_ghcb_xcr0_is_valid(svm)) { - vcpu->arch.xcr0 = kvm_ghcb_get_xcr0(svm); - vcpu->arch.cpuid_dynamic_bits_dirty = true; - } + if (kvm_ghcb_xcr0_is_valid(svm)) + __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm)); /* Copy the GHCB exit information into the VMCB fields */ exit_code = kvm_ghcb_get_sw_exit_code(svm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1c49bc681c4..1d7faf8bc785 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1237,7 +1237,7 @@ static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) } #endif -static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -1281,6 +1281,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.cpuid_dynamic_bits_dirty = true; return 0; } +EXPORT_SYMBOL_GPL(__kvm_set_xcr); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { From 5b66e335ead6472f336b4d7d9cbf14488b844f27 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:16:49 -0700 Subject: [PATCH 080/142] KVM: SEV: Reject non-positive effective lengths during LAUNCH_UPDATE Check for an invalid length during LAUNCH_UPDATE at the start of snp_launch_update() instead of subtly relying on kvm_gmem_populate() to detect the bad state. Code that directly handles userspace input absolutely should sanitize those inputs; failure to do so is asking for bugs where KVM consumes an invalid "npages". Keep the check in gmem, but wrap it in a WARN to flag any bad usage by the caller. Note, this is technically an ABI change as KVM would previously allow a length of '0'. But allowing a length of '0' is nonsensical and creates pointless conundrums in KVM. E.g. an empty range is arguably neither private nor shared, but LAUNCH_UPDATE will fail if the starting gpa can't be made private. In practice, no known or well-behaved VMM passes a length of '0'. Note #2, the PAGE_ALIGNED(params.len) check ensures that lengths between 1 and 4095 (inclusive) are also rejected, i.e. that KVM won't end up with npages=0 when doing "npages = params.len / PAGE_SIZE". 
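The arithmetic behind Note #2, sketched (assuming a 4KiB PAGE_SIZE; -1 stands in for -EINVAL):

#define PAGE_SIZE	4096ULL
#define PAGE_ALIGNED(x)	(((x) & (PAGE_SIZE - 1)) == 0)

/*
 * len == 0                 -> rejected by !len
 * len in [1, 4095]         -> rejected by !PAGE_ALIGNED(len)
 * len >= 4096 and aligned  -> npages = len / PAGE_SIZE >= 1
 */
static long long len_to_npages(unsigned long long len)
{
	if (!len || !PAGE_ALIGNED(len))
		return -1;
	return len / PAGE_SIZE;
}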
Cc: Thomas Lendacky Cc: Michael Roth Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250919211649.1575654-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 2 +- virt/kvm/guest_memfd.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d2ffacd43234..019d920fe442 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2353,7 +2353,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, params.gfn_start, params.len, params.type, params.flags); - if (!PAGE_ALIGNED(params.len) || params.flags || + if (!params.len || !PAGE_ALIGNED(params.len) || params.flags || (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 7d85cc33c0bb..79552467add5 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -639,7 +639,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long long i; lockdep_assert_held(&kvm->slots_lock); - if (npages < 0) + + if (WARN_ON_ONCE(npages <= 0)) return -EINVAL; slot = gfn_to_memslot(kvm, start_gfn); From 9bc366350734246301b090802fc71f9924daad39 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Tue, 23 Sep 2025 08:37:37 -0700 Subject: [PATCH 081/142] KVM: x86: Add helper to retrieve current value of user return MSR In the user return MSR support, the cached value is always the hardware value of the specific MSR. Therefore, add a helper to retrieve the cached value so that callers can avoid a RDMSR, e.g. to allow SEV-ES guests to restore the correct host hardware value without reading the MSR.
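The invariant being relied upon, sketched with a single hypothetical slot (the real code keeps a per-CPU array of slots and reads via this_cpu_ptr()):

static unsigned long long tsc_aux_curr;	/* mirrors the hardware MSR */

/* All writes funnel through one choke point, keeping the mirror exact... */
static void set_uret_msr_sketch(unsigned long long val)
{
	/* wrmsrl(MSR_TSC_AUX, val); hardware write elided in this sketch */
	tsc_aux_curr = val;
}

/* ...so reading the current hardware value never needs a RDMSR. */
static unsigned long long get_uret_msr_sketch(void)
{
	return tsc_aux_curr;
}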
Cc: stable@vger.kernel.org Signed-off-by: Hou Wenlong [sean: drop "cache" from the name, make it a one-liner, tag for stable] Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250923153738.1875174-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 822a5596a4a0..0d6a4af79b78 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2357,6 +2357,7 @@ int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); void kvm_user_return_msr_update_cache(unsigned int index, u64 val); +u64 kvm_get_user_return_msr(unsigned int slot); static inline bool kvm_is_supported_user_return_msr(u32 msr) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1d7faf8bc785..5ac2183b9993 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -677,6 +677,12 @@ void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) } EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache); +u64 kvm_get_user_return_msr(unsigned int slot) +{ + return this_cpu_ptr(user_return_msrs)->values[slot].curr; +} +EXPORT_SYMBOL_GPL(kvm_get_user_return_msr); + static void drop_user_return_notifiers(void) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); From 29da8c823abffdacb71c7c07ec48fcf9eb38757c Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Tue, 23 Sep 2025 08:37:38 -0700 Subject: [PATCH 082/142] KVM: SVM: Re-load current, not host, TSC_AUX on #VMEXIT from SEV-ES guest Prior to running an SEV-ES guest, set TSC_AUX in the host save area to the current value in hardware, as tracked by the user return infrastructure, instead of always loading the host's desired value for the CPU. If the pCPU is also running a non-SEV-ES vCPU, loading the host's value on #VMEXIT could clobber the other vCPU's value, e.g. if the SEV-ES vCPU preempted the non-SEV-ES vCPU, in which case KVM expects the other vCPU's TSC_AUX value to be resident in hardware. Note, unlike TDX, which blindly _zeroes_ TSC_AUX on TD-Exit, SEV-ES CPUs can load an arbitrary value. Stuff the current value in the host save area instead of refreshing the user return cache so that KVM doesn't need to track whether or not the vCPU actually entered the guest and thus loaded TSC_AUX from the host save area. Opportunistically tag tsc_aux_uret_slot as read-only after init to guard against unexpected modifications, and to make it obvious that using the variable in sev_es_prepare_switch_to_guest() is safe.
Fixes: 916e3e5f26ab ("KVM: SVM: Do not use user return MSR support for virtualized TSC_AUX") Cc: stable@vger.kernel.org Suggested-by: Lai Jiangshan Signed-off-by: Hou Wenlong [sean: handle the SEV-ES case in sev_es_prepare_switch_to_guest()] Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250923153738.1875174-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 10 ++++++++++ arch/x86/kvm/svm/svm.c | 25 ++++++------------------- arch/x86/kvm/svm/svm.h | 2 ++ 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 019d920fe442..5529e4c3362b 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4666,6 +4666,16 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); } + + /* + * TSC_AUX is always virtualized for SEV-ES guests when the feature is + * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area. + * Set the save area to the current hardware value, i.e. the current + * user return value, so that the correct value is restored on #VMEXIT. + */ + if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) && + !WARN_ON_ONCE(tsc_aux_uret_slot < 0)) + hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot); } void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b237b4081c91..6f486fb82144 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -195,7 +195,7 @@ static DEFINE_MUTEX(vmcb_dump_mutex); * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to * defer the restoration of TSC_AUX until the CPU returns to userspace. */ -static int tsc_aux_uret_slot __read_mostly = -1; +int tsc_aux_uret_slot __ro_after_init = -1; static int get_npt_level(void) { @@ -577,18 +577,6 @@ static int svm_enable_virtualization_cpu(void) amd_pmu_enable_virt(); - /* - * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type - * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests. - * Since Linux does not change the value of TSC_AUX once set, prime the - * TSC_AUX field now to avoid a RDMSR on every vCPU run. - */ - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - u32 __maybe_unused msr_hi; - - rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); - } - return 0; } @@ -1406,10 +1394,10 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); /* - * TSC_AUX is always virtualized for SEV-ES guests when the feature is - * available. The user return MSR support is not required in this case - * because TSC_AUX is restored on #VMEXIT from the host save area - * (which has been initialized in svm_enable_virtualization_cpu()). + * TSC_AUX is always virtualized (context switched by hardware) for + * SEV-ES guests when the feature is available. For non-SEV-ES guests, + * context switch TSC_AUX via the user_return MSR infrastructure (not + * all CPUs support TSC_AUX virtualization). */ if (likely(tsc_aux_uret_slot >= 0) && (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) @@ -3004,8 +2992,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * TSC_AUX is always virtualized for SEV-ES guests when the * feature is available. 
The user return MSR support is not * required in this case because TSC_AUX is restored on #VMEXIT - * from the host save area (which has been initialized in - * svm_enable_virtualization_cpu()). + * from the host save area. */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) break; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 10d5cbc259e1..ec3fb318ca83 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -52,6 +52,8 @@ extern bool x2avic_enabled; extern bool vnmi; extern int lbrv; +extern int tsc_aux_uret_slot __ro_after_init; + /* * Clean bits in VMCB. * VMCB_ALL_CLEAN_MASK might also need to From 44bfe1f0490d5620c7962ab7384797672b4c4293 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:59:28 -0700 Subject: [PATCH 083/142] KVM: SVM: Make svm_x86_ops globally visible, clean up on-HyperV usage Make svm_x86_ops globally visible in anticipation of modifying the struct in avic.c, and clean up the KVM-on-HyperV usage, as declaring _and using_ a local variable in a header that's only defined in one specific .c-file is all kinds of ugly. Opportunistically make svm_hv_enable_l2_tlb_flush() local to svm_onhyperv.c, as the only reason it was visible was due to the aforementioned shenanigans in svm_onhyperv.h. Alternatively, svm_x86_ops could be explicitly passed to svm_hv_hardware_setup() as a parameter. While that approach is slightly safer, e.g. avoids "hidden" updates, for better or worse, the Intel side of KVM has already chosen to expose vt_x86_ops (and vt_init_ops). Given that svm_x86_ops is only truly consumed by kvm_ops_update, the odds of a "hidden" update causing problems are extremely low. So, absent a strong reason to rework the VMX/TDX code, make svm_x86_ops visible, as having all updates use exactly "svm_x86_ops." is advantageous in its own right. No functional change intended. Link: https://lore.kernel.org/r/20250919215934.1590410-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 2 ++ arch/x86/kvm/svm/svm_onhyperv.c | 28 +++++++++++++++++++++++++++- arch/x86/kvm/svm/svm_onhyperv.h | 31 +------------------------------ 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6f486fb82144..bfbd34818412 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5012,7 +5012,7 @@ static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) return page_address(page); } -static struct kvm_x86_ops svm_x86_ops __initdata = { +struct kvm_x86_ops svm_x86_ops __initdata = { .name = KBUILD_MODNAME, .check_processor_compatibility = svm_check_processor_compat, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index ec3fb318ca83..bc46a3539487 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -54,6 +54,8 @@ extern int lbrv; extern int tsc_aux_uret_slot __ro_after_init; +extern struct kvm_x86_ops svm_x86_ops __initdata; + /* * Clean bits in VMCB. 
* VMCB_ALL_CLEAN_MASK might also need to diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 3971b3ea5d04..a8e78c0e5956 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -15,7 +15,7 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) +static int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_vmcb_enlightenments *hve; hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); @@ -35,3 +35,29 @@ int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) return 0; } +__init void svm_hv_hardware_setup(void) +{ + if (npt_enabled && + ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { + pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); + svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { + int cpu; + + pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); + for_each_online_cpu(cpu) { + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->nested_control.features.directhypercall = 1; + } + svm_x86_ops.enable_l2_tlb_flush = + svm_hv_enable_l2_tlb_flush; + } +} diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index f85bc617ffe4..08f14e6f195c 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -13,9 +13,7 @@ #include "kvm_onhyperv.h" #include "svm/hyperv.h" -static struct kvm_x86_ops svm_x86_ops; - -int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu); +__init void svm_hv_hardware_setup(void); static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) { @@ -40,33 +38,6 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) hve->hv_enlightenments_control.msr_bitmap = 1; } -static inline __init void svm_hv_hardware_setup(void) -{ - if (npt_enabled && - ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { - pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); - svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; - svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; - } - - if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { - int cpu; - - pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); - for_each_online_cpu(cpu) { - struct hv_vp_assist_page *vp_ap = - hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->nested_control.features.directhypercall = 1; - } - svm_x86_ops.enable_l2_tlb_flush = - svm_hv_enable_l2_tlb_flush; - } -} - static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct kvm_vcpu *vcpu) { From eb44ea6a7aace13becd8d99905284ad272b3bd98 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:59:29 -0700 Subject: [PATCH 084/142] KVM: SVM: Move x2AVIC MSR interception helper to avic.c Move svm_set_x2apic_msr_interception() to avic.c as it's only relevant when x2AVIC is enabled/supported and only called by AVIC code. In addition to scoping AVIC code to avic.c, this will allow burying the global x2avic_enabled variable in avic. Opportunistically rename the helper to explicitly scope it to "avic". No functional change intended. 
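The linkage pattern at play, sketched outside of KVM (ops_sketch and the function names are hypothetical):

struct ops_sketch {
	void (*enable_l2_tlb_flush)(void);
};

/* Header (sketch): one extern declaration shared by every .c file. */
extern struct ops_sketch ops_sketch_instance;

/* Primary .c file (sketch): the single definition. */
struct ops_sketch ops_sketch_instance = { .enable_l2_tlb_flush = 0 };

/* Another .c file (sketch): patch hooks directly, no local shadow copy. */
static void hv_flush_sketch(void) { }

void hv_setup_sketch(void)
{
	ops_sketch_instance.enable_l2_tlb_flush = hv_flush_sketch;
}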
Reviewed-by: Naveen N Rao (AMD) Tested-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250919215934.1590410-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 57 ++++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/svm/svm.c | 49 ----------------------------------- arch/x86/kvm/svm/svm.h | 1 - 3 files changed, 54 insertions(+), 53 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a34c5c3b164e..478a18208a76 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -79,6 +79,57 @@ static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); bool x2avic_enabled; + +static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, + bool intercept) +{ + static const u32 x2avic_passthrough_msrs[] = { + X2APIC_MSR(APIC_ID), + X2APIC_MSR(APIC_LVR), + X2APIC_MSR(APIC_TASKPRI), + X2APIC_MSR(APIC_ARBPRI), + X2APIC_MSR(APIC_PROCPRI), + X2APIC_MSR(APIC_EOI), + X2APIC_MSR(APIC_RRR), + X2APIC_MSR(APIC_LDR), + X2APIC_MSR(APIC_DFR), + X2APIC_MSR(APIC_SPIV), + X2APIC_MSR(APIC_ISR), + X2APIC_MSR(APIC_TMR), + X2APIC_MSR(APIC_IRR), + X2APIC_MSR(APIC_ESR), + X2APIC_MSR(APIC_ICR), + X2APIC_MSR(APIC_ICR2), + + /* + * Note! Always intercept LVTT, as TSC-deadline timer mode + * isn't virtualized by hardware, and the CPU will generate a + * #GP instead of a #VMEXIT. + */ + X2APIC_MSR(APIC_LVTTHMR), + X2APIC_MSR(APIC_LVTPC), + X2APIC_MSR(APIC_LVT0), + X2APIC_MSR(APIC_LVT1), + X2APIC_MSR(APIC_LVTERR), + X2APIC_MSR(APIC_TMICT), + X2APIC_MSR(APIC_TMCCT), + X2APIC_MSR(APIC_TDCR), + }; + int i; + + if (intercept == svm->x2avic_msrs_intercepted) + return; + + if (!x2avic_enabled) + return; + + for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) + svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], + MSR_TYPE_RW, intercept); + + svm->x2avic_msrs_intercepted = intercept; +} + static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -99,7 +150,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) vmcb->control.int_ctl |= X2APIC_MODE_MASK; vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; /* Disabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, false); + avic_set_x2apic_msr_interception(svm, false); } else { /* * Flush the TLB, the guest may have inserted a non-APIC @@ -110,7 +161,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) /* For xAVIC and hybrid-xAVIC modes */ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; /* Enabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, true); + avic_set_x2apic_msr_interception(svm, true); } } @@ -130,7 +181,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm) return; /* Enabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, true); + avic_set_x2apic_msr_interception(svm, true); } /* Note: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bfbd34818412..44032c78aab0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -724,55 +724,6 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); } -void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) -{ - static const u32 x2avic_passthrough_msrs[] = { - X2APIC_MSR(APIC_ID), - X2APIC_MSR(APIC_LVR), - X2APIC_MSR(APIC_TASKPRI), - X2APIC_MSR(APIC_ARBPRI), - X2APIC_MSR(APIC_PROCPRI), - X2APIC_MSR(APIC_EOI), - 
X2APIC_MSR(APIC_RRR), - X2APIC_MSR(APIC_LDR), - X2APIC_MSR(APIC_DFR), - X2APIC_MSR(APIC_SPIV), - X2APIC_MSR(APIC_ISR), - X2APIC_MSR(APIC_TMR), - X2APIC_MSR(APIC_IRR), - X2APIC_MSR(APIC_ESR), - X2APIC_MSR(APIC_ICR), - X2APIC_MSR(APIC_ICR2), - - /* - * Note! Always intercept LVTT, as TSC-deadline timer mode - * isn't virtualized by hardware, and the CPU will generate a - * #GP instead of a #VMEXIT. - */ - X2APIC_MSR(APIC_LVTTHMR), - X2APIC_MSR(APIC_LVTPC), - X2APIC_MSR(APIC_LVT0), - X2APIC_MSR(APIC_LVT1), - X2APIC_MSR(APIC_LVTERR), - X2APIC_MSR(APIC_TMICT), - X2APIC_MSR(APIC_TMCCT), - X2APIC_MSR(APIC_TDCR), - }; - int i; - - if (intercept == svm->x2avic_msrs_intercepted) - return; - - if (!x2avic_enabled) - return; - - for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) - svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], - MSR_TYPE_RW, intercept); - - svm->x2avic_msrs_intercepted = intercept; -} - void svm_vcpu_free_msrpm(void *msrpm) { __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index bc46a3539487..cb1d26cb5113 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -703,7 +703,6 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); -void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); From a9095e4fc4368052593c84472a8d374fefd3df71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:59:30 -0700 Subject: [PATCH 085/142] KVM: SVM: Update "APICv in x2APIC without x2AVIC" in avic.c, not svm.c Set the "allow_apicv_in_x2apic_without_x2apic_virtualization" flag as part of avic_hardware_setup() instead of handling in svm_hardware_setup(), and make x2avic_enabled local to avic.c (setting the flag was the only use in svm.c). Tag avic_hardware_setup() with __init as necessary (it should have been tagged __init long ago). No functional change intended (aside from the side effects of tagging avic_hardware_setup() with __init). Acked-by: Naveen N Rao (AMD) Tested-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250919215934.1590410-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 6 ++++-- arch/x86/kvm/svm/svm.c | 2 -- arch/x86/kvm/svm/svm.h | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 478a18208a76..b4577401ce5f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -77,7 +77,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); -bool x2avic_enabled; +static bool x2avic_enabled; static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, @@ -1147,7 +1147,7 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) * - Hypervisor can support both xAVIC and x2AVIC in the same guest. * - The mode can be switched at run-time. 
*/ -bool avic_hardware_setup(void) +bool __init avic_hardware_setup(void) { if (!npt_enabled) return false; @@ -1182,6 +1182,8 @@ bool avic_hardware_setup(void) x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); if (x2avic_enabled) pr_info("x2AVIC enabled\n"); + else + svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; /* * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 44032c78aab0..3fd2f4097a3b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5337,8 +5337,6 @@ static __init int svm_hardware_setup(void) svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; - } else if (!x2avic_enabled) { - svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; } if (vls) { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index cb1d26cb5113..739f4f52f46d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -48,7 +48,6 @@ extern bool npt_enabled; extern int nrips; extern int vgif; extern bool intercept_smi; -extern bool x2avic_enabled; extern bool vnmi; extern int lbrv; @@ -804,7 +803,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \ ) -bool avic_hardware_setup(void); +bool __init avic_hardware_setup(void); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); From ce4253e21fa8a4468474e26884196770cb560eae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:59:31 -0700 Subject: [PATCH 086/142] KVM: SVM: Always print "AVIC enabled" separately, even when force enabled Print the customary "AVIC enabled" informational message even when AVIC is force enabled on a system that doesn't advertise support for AVIC in CPUID, as not printing the standard message can confuse users and tools. Opportunistically clean up the scary message when AVIC is force enabled, but keep it as a separate message so that it is printed at level "warn", versus the standard message only being printed for level "info". Suggested-by: Naveen N Rao (AMD) Reviewed-by: Naveen N Rao (AMD) Tested-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250919215934.1590410-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b4577401ce5f..b8b73c4103c6 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1167,16 +1167,15 @@ bool __init avic_hardware_setup(void) return false; } - if (boot_cpu_has(X86_FEATURE_AVIC)) { - pr_info("AVIC enabled\n"); - } else if (force_avic) { - /* - * Some older systems does not advertise AVIC support. - * See Revision Guide for specific AMD processor for more detail. - */ - pr_warn("AVIC is not supported in CPUID but force enabled"); - pr_warn("Your system might crash and burn"); - } + /* + * Print a scary message if AVIC is force enabled to make it abundantly + * clear that ignoring CPUID could have repercussions. See Revision + * Guide for specific AMD processor for more details. + */ + if (!boot_cpu_has(X86_FEATURE_AVIC)) + pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n"); + + pr_info("AVIC enabled\n"); /* AVIC is a prerequisite for x2AVIC.
*/ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); From ad65dca2ca4cf8c377135362c0c1e031ad92019d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:59:32 -0700 Subject: [PATCH 087/142] KVM: SVM: Don't advise the user to do force_avic=y (when x2AVIC is detected) Don't advise the end user to try to force enable AVIC when x2AVIC is reported as supported in CPUID, as forcefully enabling AVIC isn't something that should be done lightly. E.g. some Zen4 client systems hide AVIC but leave x2AVIC behind, and while such a configuration is indeed due to buggy firmware in the sense that reporting x2AVIC without AVIC is nonsensical, KVM has no idea _why_ firmware disabled AVIC in the first place. Suggesting that the user try to run with force_avic=y is sketchy even if the user explicitly tries to enable AVIC, and will be downright irresponsible once KVM starts enabling AVIC by default. Alternatively, KVM could print the message only when the user explicitly asks for AVIC, but running with force_avic=y isn't something that should be encouraged for random users. force_avic is a useful knob for developers and perhaps even advanced users, but isn't something that KVM should advertise broadly. Opportunistically append a newline to the pr_warn() so that it prints out immediately, and tweak the message to say that AVIC is unsupported instead of disabled (disabled suggests that the kernel/KVM is somehow responsible). Suggested-by: Naveen N Rao (AMD) Reviewed-by: Naveen N Rao (AMD) Tested-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250919215934.1590410-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b8b73c4103c6..35dde7d89f56 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1154,10 +1154,8 @@ bool __init avic_hardware_setup(void) /* AVIC is a prerequisite for x2AVIC. */ if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) { - if (boot_cpu_has(X86_FEATURE_X2AVIC)) { - pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); - pr_warn(FW_BUG "Try enable AVIC using force_avic option"); - } + if (boot_cpu_has(X86_FEATURE_X2AVIC)) + pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n"); return false; } From b14665353162db70e445aacdd18b0566edba00c2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:59:33 -0700 Subject: [PATCH 088/142] KVM: SVM: Move global "avic" variable to avic.c Move "avic" to avic.c so that it's colocated with the other AVIC specific globals and module params, and so that avic_hardware_setup() is a bit more self-contained, e.g. similar to sev_hardware_setup(). Deliberately set enable_apicv in svm.c as it's already globally visible (defined by kvm.ko, not by kvm-amd.ko), and to clearly capture the dependency on enable_apicv being initialized (svm_hardware_setup() clears several AVIC-specific hooks when enable_apicv is disabled). Alternatively, clearing of the hooks (and enable_ipiv) could be moved to avic_hardware_setup(), but that's not obviously better, e.g. it's helpful to isolate the setting of enable_apicv when reading code from the generic x86 side of the world. No functional change intended.
Acked-by: Naveen N Rao (AMD) Tested-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250919215934.1590410-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 33 +++++++++++++++++++++++++-------- arch/x86/kvm/svm/svm.c | 11 +---------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 35dde7d89f56..ec214062d136 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -64,6 +64,14 @@ static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); +/* + * enable / disable AVIC. Because the defaults differ for APICv + * support between VMX and SVM we cannot use module_param_named. + */ +static bool avic; +module_param(avic, bool, 0444); +module_param(enable_ipiv, bool, 0444); + static bool force_avic; module_param_unsafe(force_avic, bool, 0444); @@ -1141,15 +1149,9 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_vcpu_load(vcpu, vcpu->cpu); } -/* - * Note: - * - The module param avic enable both xAPIC and x2APIC mode. - * - Hypervisor can support both xAVIC and x2AVIC in the same guest. - * - The mode can be switched at run-time. - */ -bool __init avic_hardware_setup(void) +static bool __init avic_want_avic_enabled(void) { - if (!npt_enabled) + if (!avic || !npt_enabled) return false; /* AVIC is a prerequisite for x2AVIC. */ @@ -1173,6 +1175,21 @@ bool __init avic_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_AVIC)) pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n"); + return true; +} + +/* + * Note: + * - The module param avic enable both xAPIC and x2APIC mode. + * - Hypervisor can support both xAVIC and x2AVIC in the same guest. + * - The mode can be switched at run-time. + */ +bool __init avic_hardware_setup(void) +{ + avic = avic_want_avic_enabled(); + if (!avic) + return false; + pr_info("AVIC enabled\n"); /* AVIC is a prerequisite for x2AVIC. */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3fd2f4097a3b..748881a3dedb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -158,14 +158,6 @@ module_param(lbrv, int, 0444); static int tsc_scaling = true; module_param(tsc_scaling, int, 0444); -/* - * enable / disable AVIC. Because the defaults differ for APICv - * support between VMX and SVM we cannot use module_param_named. - */ -static bool avic; -module_param(avic, bool, 0444); -module_param(enable_ipiv, bool, 0444); - module_param(enable_device_posted_irqs, bool, 0444); bool __read_mostly dump_invalid_vmcb; @@ -5330,8 +5322,7 @@ static __init int svm_hardware_setup(void) goto err; } - enable_apicv = avic = avic && avic_hardware_setup(); - + enable_apicv = avic_hardware_setup(); if (!enable_apicv) { enable_ipiv = false; svm_x86_ops.vcpu_blocking = NULL; From ca2967de5a5b098b43c5ad665672945ce7e7d4f7 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 19 Sep 2025 14:59:34 -0700 Subject: [PATCH 089/142] KVM: SVM: Enable AVIC by default for Zen4+ if x2AVIC is supported AVIC and x2AVIC are fully functional since Zen 4, with no known hardware errata. Enable AVIC and x2AVIC by default on Zen4+ so long as x2AVIC is supported (to avoid enabling partial support for APIC virtualization by default). Internally, convert "avic" to an integer so that KVM can identify if the user has asked to explicitly enable or disable AVIC, i.e. so that KVM doesn't override an explicit 'y' from the user.
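The tri-state scheme, sketched (hypothetical names; the real code implements this via kernel_param_ops, as described below):

#define AVIC_AUTO_MODE	(-1)

static int avic_sketch = AVIC_AUTO_MODE;	/* -1 = auto, 0 = off, 1 = on */

static int resolve_avic_sketch(int hw_is_zen4_plus_with_x2avic)
{
	if (avic_sketch == AVIC_AUTO_MODE)
		return hw_is_zen4_plus_with_x2avic;

	return !!avic_sketch;	/* honor an explicit y/n from the user */
}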
Arbitrarily use -1 to denote auto-mode, and accept the string "auto" for the module param in addition to standard boolean values, i.e. continue to allow the user to configure the "avic" module parameter to explicitly enable/disable AVIC. To again maintain backward compatibility with a standard boolean param, set KERNEL_PARAM_OPS_FL_NOARG, which tells the params infrastructure to allow empty values for %true, i.e. to interpret a bare "avic" as "avic=y". Take care to check for a NULL @val when looking for "auto"! Lastly, always print "avic" as a boolean, since auto-mode is resolved during module initialization, i.e. the user should never see "auto" in sysfs. Signed-off-by: Naveen N Rao (AMD) Tested-by: Naveen N Rao (AMD) Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20250919215934.1590410-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ec214062d136..f286b5706d7c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -64,12 +64,32 @@ static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); +#define AVIC_AUTO_MODE -1 + +static int avic_param_set(const char *val, const struct kernel_param *kp) +{ + if (val && sysfs_streq(val, "auto")) { + *(int *)kp->arg = AVIC_AUTO_MODE; + return 0; + } + + return param_set_bint(val, kp); +} + +static const struct kernel_param_ops avic_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = avic_param_set, + .get = param_get_bool, +}; + /* - * enable / disable AVIC. Because the defaults differ for APICv - * support between VMX and SVM we cannot use module_param_named. + * Enable / disable AVIC. In "auto" mode (default behavior), AVIC is enabled + * for Zen4+ CPUs with x2AVIC (and all other criteria for enablement are met). */ -static bool avic; -module_param(avic, bool, 0444); +static int avic = AVIC_AUTO_MODE; +module_param_cb(avic, &avic_ops, &avic, 0444); +__MODULE_PARM_TYPE(avic, "bool"); + module_param(enable_ipiv, bool, 0444); static bool force_avic; @@ -1151,6 +1171,18 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) static bool __init avic_want_avic_enabled(void) { + /* + * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is + * supported (to avoid enabling partial support by default, and because + * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for + * family 0x19 and later (Zen5+), as the kernel's synthetic ZenX flags + * aren't inclusive of previous generations, i.e. the kernel will set + * at most one ZenX feature flag. + */ + if (avic == AVIC_AUTO_MODE) + avic = boot_cpu_has(X86_FEATURE_X2AVIC) && + (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4)); + if (!avic || !npt_enabled) return false; From 06f2969c6a1237f05f8ba4324b6ddc2570a808d0 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:11 -0700 Subject: [PATCH 090/142] KVM: x86: Introduce KVM_{G,S}ET_ONE_REG uAPIs support Enable KVM_{G,S}ET_ONE_REG uAPIs so that userspace can access MSRs and other non-MSR registers through them, along with support for KVM_GET_REG_LIST to enumerate support for KVM-defined registers. This is in preparation for allowing userspace to read/write the guest SSP register, which is needed for the upcoming CET virtualization support. Currently, two types of registers are supported: KVM_X86_REG_TYPE_MSR and KVM_X86_REG_TYPE_KVM. 
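For illustration only (this example is not part of the patch; it assumes the generic ONE_REG constants KVM_REG_X86 == 0x2000000000000000 and KVM_REG_SIZE_U64 == 0x0030000000000000 from <linux/kvm.h>), userspace could compose an MSR-type register id per the encoding described above like so:

/*
 * Hypothetical helper mirroring the encoding introduced by this patch:
 * arch, size, type (2 == MSR) and the MSR index packed into one id.
 */
#define EXAMPLE_X86_ONE_REG_MSR(index) \
	(0x2000000000000000ULL |	/* KVM_REG_X86 */ \
	 0x0030000000000000ULL |	/* KVM_REG_SIZE_U64 */ \
	 (2ULL << 32) |			/* KVM_X86_REG_TYPE_MSR */ \
	 (__u64)(index))

/* e.g. MSR_IA32_XSS (0xda0) encodes to 0x2030000200000da0. */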
All MSRs are in the former type; the latter type is added for registers that lack existing KVM uAPIs to access them. The "KVM" in the name is intended to be vague to give KVM flexibility to include other potential registers. More precise names like "SYNTHETIC" and "SYNTHETIC_MSR" were considered, but were deemed too confusing (e.g. can be conflated with synthetic guest-visible MSRs) and may put KVM into a corner (e.g. if KVM wants to change how a KVM-defined register is modeled internally). Enumerate only KVM-defined registers in KVM_GET_REG_LIST to avoid duplicating KVM_GET_MSR_INDEX_LIST, and so that KVM can return _only_ registers that are fully supported (KVM_GET_REG_LIST is vCPU-scoped, i.e. can be precise, whereas KVM_GET_MSR_INDEX_LIST is system-scoped). Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Link: https://lore.kernel.org/all/20240219074733.122080-18-weijiang.yang@intel.com [1] Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20250919223258.1604852-5-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 6 +- arch/x86/include/uapi/asm/kvm.h | 26 +++++++++ arch/x86/kvm/x86.c | 100 ++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index b3f2395a62d1..0e82fc5abd7b 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2908,6 +2908,8 @@ such as set vcpu counter or reset vcpu, and they have the following id bit patte 0x9030 0000 0002 +x86 MSR registers have the following id bit patterns:: + 0x2030 0002 4.69 KVM_GET_ONE_REG -------------------- @@ -3588,7 +3590,7 @@ VCPU matching underlying host. --------------------- :Capability: basic -:Architectures: arm64, mips, riscv +:Architectures: arm64, mips, riscv, x86 (if KVM_CAP_ONE_REG) :Type: vcpu ioctl :Parameters: struct kvm_reg_list (in/out) :Returns: 0 on success; -1 on error @@ -3631,6 +3633,8 @@ Note that s390 does not support KVM_GET_REG_LIST for historical reasons - KVM_REG_S390_GBEA +Note, for x86, all MSRs enumerated by KVM_GET_MSR_INDEX_LIST are supported as +type KVM_X86_REG_TYPE_MSR, but are NOT enumerated via KVM_GET_REG_LIST. 4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) ----------------------------------------- diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0f15d683817d..aae1033c8afa 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -411,6 +411,32 @@ struct kvm_xcrs { __u64 padding[16]; }; +#define KVM_X86_REG_TYPE_MSR 2 +#define KVM_X86_REG_TYPE_KVM 3 + +#define KVM_X86_KVM_REG_SIZE(reg) \ +({ \ + reg == KVM_REG_GUEST_SSP ? KVM_REG_SIZE_U64 : 0; \ +}) + +#define KVM_X86_REG_TYPE_SIZE(type, reg) \ +({ \ + __u64 type_size = (__u64)type << 32; \ + \ + type_size |= type == KVM_X86_REG_TYPE_MSR ? KVM_REG_SIZE_U64 : \ + type == KVM_X86_REG_TYPE_KVM ? 
KVM_X86_KVM_REG_SIZE(reg) : \ + 0; \ + type_size; \ +}) + +#define KVM_X86_REG_ID(type, index) \ + (KVM_REG_X86 | KVM_X86_REG_TYPE_SIZE(type, index) | index) + +#define KVM_X86_REG_MSR(index) \ + KVM_X86_REG_ID(KVM_X86_REG_TYPE_MSR, index) +#define KVM_X86_REG_KVM(index) \ + KVM_X86_REG_ID(KVM_X86_REG_TYPE_KVM, index) + #define KVM_SYNC_X86_REGS (1UL << 0) #define KVM_SYNC_X86_SREGS (1UL << 1) #define KVM_SYNC_X86_EVENTS (1UL << 2) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f718834bc00b..bc245e0b0443 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4735,6 +4735,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_MEMORY_FAULT_INFO: case KVM_CAP_X86_GUEST_MODE: + case KVM_CAP_ONE_REG: r = 1; break; case KVM_CAP_PRE_FAULT_MEMORY: @@ -5913,6 +5914,98 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } } +struct kvm_x86_reg_id { + __u32 index; + __u8 type; + __u8 rsvd1; + __u8 rsvd2:4; + __u8 size:4; + __u8 x86; +}; + +static int kvm_translate_kvm_reg(struct kvm_x86_reg_id *reg) +{ + return -EINVAL; +} + +static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (do_get_msr(vcpu, msr, &val)) + return -EINVAL; + + if (put_user(val, user_val)) + return -EFAULT; + + return 0; +} + +static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (get_user(val, user_val)) + return -EFAULT; + + if (do_set_msr(vcpu, msr, &val)) + return -EINVAL; + + return 0; +} + +static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, + void __user *argp) +{ + struct kvm_one_reg one_reg; + struct kvm_x86_reg_id *reg; + u64 __user *user_val; + int r; + + if (copy_from_user(&one_reg, argp, sizeof(one_reg))) + return -EFAULT; + + if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86) + return -EINVAL; + + reg = (struct kvm_x86_reg_id *)&one_reg.id; + if (reg->rsvd1 || reg->rsvd2) + return -EINVAL; + + if (reg->type == KVM_X86_REG_TYPE_KVM) { + r = kvm_translate_kvm_reg(reg); + if (r) + return r; + } + + if (reg->type != KVM_X86_REG_TYPE_MSR) + return -EINVAL; + + if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64) + return -EINVAL; + + guard(srcu)(&vcpu->kvm->srcu); + + user_val = u64_to_user_ptr(one_reg.addr); + if (ioctl == KVM_GET_ONE_REG) + r = kvm_get_one_msr(vcpu, reg->index, user_val); + else + r = kvm_set_one_msr(vcpu, reg->index, user_val); + + return r; +} + +static int kvm_get_reg_list(struct kvm_vcpu *vcpu, + struct kvm_reg_list __user *user_list) +{ + u64 nr_regs = 0; + + if (put_user(nr_regs, &user_list->n)) + return -EFAULT; + + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -6029,6 +6122,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } + case KVM_GET_ONE_REG: + case KVM_SET_ONE_REG: + r = kvm_get_set_one_reg(vcpu, ioctl, argp); + break; + case KVM_GET_REG_LIST: + r = kvm_get_reg_list(vcpu, argp); + break; case KVM_TPR_ACCESS_REPORTING: { struct kvm_tpr_access_ctl tac; From c0a5f298912222f3bb4e8f5dc67c6a1f0e93d83f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:12 -0700 Subject: [PATCH 091/142] KVM: x86: Report XSS as to-be-saved if there are supported features Add MSR_IA32_XSS to list of MSRs reported to userspace if supported_xss is non-zero, i.e. KVM supports at least one XSS based feature. 
Until the CET virtualization series is in place, guest IA32_XSS is guaranteed to be 0, i.e., XSAVES/XRSTORS is executed in non-root mode with XSS == 0, which is equivalent to XSAVE/XRSTOR. Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Reviewed-by: Xiaoyao Li Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bc245e0b0443..757878a222a7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -332,7 +332,7 @@ static const u32 msrs_to_save_base[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, - MSR_IA32_XFD, MSR_IA32_XFD_ERR, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, }; static const u32 msrs_to_save_pmu[] = { @@ -7503,6 +7503,10 @@ static void kvm_probe_msr_to_save(u32 msr_index) if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) return; break; + case MSR_IA32_XSS: + if (!kvm_caps.supported_xss) + return; + break; default: break; } From 338543cbe033e56dcc8c13adcdf6c228953c0829 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Sep 2025 15:32:13 -0700 Subject: [PATCH 092/142] KVM: x86: Check XSS validity against guest CPUIDs Maintain per-guest valid XSS bits and check XSS validity against them rather than against KVM capabilities. This prevents setting bits that are supported by KVM but not supported for a given guest. Opportunistically return KVM_MSR_RET_UNSUPPORTED on IA32_XSS MSR accesses if guest CPUID doesn't enumerate X86_FEATURE_XSAVES. Since KVM_MSR_RET_UNSUPPORTED takes care of host_initiated cases, drop the host_initiated check.
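As a minimal sketch of the resulting check (illustrative only, not the kernel code): a write to IA32_XSS is accepted only if it sets no bits outside the guest's CPUID-derived mask.

static bool xss_write_is_valid(u64 data, u32 cpuid_0xd_1_ecx,
			       u32 cpuid_0xd_1_edx, u64 kvm_supported_xss)
{
	/* guest mask = CPUID.(EAX=0xD,ECX=1).{ECX,EDX} & KVM's supported bits */
	u64 guest_supported_xss = (cpuid_0xd_1_ecx |
				   ((u64)cpuid_0xd_1_edx << 32)) &
				  kvm_supported_xss;

	return !(data & ~guest_supported_xss);
}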
Signed-off-by: Chao Gao Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/cpuid.c | 12 ++++++++++++ arch/x86/kvm/x86.c | 7 +++---- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b1e2a2e033f0..5865c9b77b6d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -815,7 +815,6 @@ struct kvm_vcpu_arch { bool at_instruction_boundary; bool tpr_access_reporting; bool xfd_no_write_intercept; - u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; u64 perf_capabilities; @@ -876,6 +875,8 @@ struct kvm_vcpu_arch { u64 xcr0; u64 guest_supported_xcr0; + u64 ia32_xss; + u64 guest_supported_xss; struct kvm_pio_request pio; void *pio_data; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index efee08fad72e..6b8b5d8b13cc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -263,6 +263,17 @@ static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu) return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } +static u64 cpuid_get_supported_xss(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry_index(vcpu, 0xd, 1); + if (!best) + return 0; + + return (best->ecx | ((u64)best->edx << 32)) & kvm_caps.supported_xss; +} + static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entry, unsigned int x86_feature, @@ -424,6 +435,7 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu); + vcpu->arch.guest_supported_xss = cpuid_get_supported_xss(vcpu); vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 757878a222a7..6ae12e8c9d05 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3984,15 +3984,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_XSS: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return KVM_MSR_RET_UNSUPPORTED; /* * KVM supports exposing PT to the guest, but does not support * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data & ~kvm_caps.supported_xss) + if (data & ~vcpu->arch.guest_supported_xss) return 1; vcpu->arch.ia32_xss = data; vcpu->arch.cpuid_dynamic_bits_dirty = true; From 9622e116d0d25f1ddc47bf0328612cc7d87d20f2 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:14 -0700 Subject: [PATCH 093/142] KVM: x86: Refresh CPUID on write to guest MSR_IA32_XSS Update CPUID.(EAX=0DH,ECX=1).EBX to reflect the current required xstate size after an XSS MSR modification. CPUID(EAX=0DH,ECX=1).EBX reports the required storage size of all enabled xstate features in (XCR0 | IA32_XSS). The CPUID value can be used by the guest to allocate a sufficiently sized xsave buffer. Note, KVM does not yet support any XSS based features, i.e. supported_xss is guaranteed to be zero at this time. Opportunistically skip CPUID updates if the XSS value doesn't change.
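A hypothetical guest-side view of this behavior (illustrative, standard CPUID usage, not code from the patch): after a WRMSR to IA32_XSS, re-reading CPUID.(EAX=0xD,ECX=1).EBX yields the buffer size covering all states enabled in (XCR0 | IA32_XSS).

static unsigned int xsaves_buffer_size(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=0xD,ECX=1).EBX: size for states in XCR0 | IA32_XSS */
	asm volatile("cpuid"
		     : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
		     : "0"(0xd), "2"(1));
	return ebx;
}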
Suggested-by: Sean Christopherson Co-developed-by: Zhang Yi Z Signed-off-by: Zhang Yi Z Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Reviewed-by: Xiaoyao Li Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6b8b5d8b13cc..32fde9e80c28 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -316,7 +316,8 @@ static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) - best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + best->ebx = xstate_required_size(vcpu->arch.xcr0 | + vcpu->arch.ia32_xss, true); } static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6ae12e8c9d05..d142cbc71aaa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3993,6 +3993,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ if (data & ~vcpu->arch.guest_supported_xss) return 1; + if (vcpu->arch.ia32_xss == data) + break; vcpu->arch.ia32_xss = data; vcpu->arch.cpuid_dynamic_bits_dirty = true; break; From 779ed05511f2931b89fcd0087aba2fcca8890715 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:15 -0700 Subject: [PATCH 094/142] KVM: x86: Initialize kvm_caps.supported_xss Set original kvm_caps.supported_xss to (host_xss & KVM_SUPPORTED_XSS) if XSAVES is supported. host_xss contains the host supported xstate feature bits for thread FPU context switch, KVM_SUPPORTED_XSS includes all KVM enabled XSS feature bits, the resulting value represents the supervisor xstates that are available to guest and are backed by host FPU framework for swapping {guest,host} XSAVE-managed registers/MSRs. [sean: relocate and enhance comment about PT / XSS[8] ] Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Reviewed-by: Xiaoyao Li Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d142cbc71aaa..831c5e488de3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -217,6 +217,14 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) +/* + * Note, KVM supports exposing PT to the guest, but does not support context + * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping + * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support + * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs). 
+ */ +#define KVM_SUPPORTED_XSS 0 + bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); @@ -3986,11 +3994,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_XSS: if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return KVM_MSR_RET_UNSUPPORTED; - /* - * KVM supports exposing PT to the guest, but does not support - * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than - * XSAVES/XRSTORS to save/restore PT MSRs. - */ + if (data & ~vcpu->arch.guest_supported_xss) return 1; if (vcpu->arch.ia32_xss == data) @@ -9822,14 +9826,17 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } + + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + rdmsrq(MSR_IA32_XSS, kvm_host.xss); + kvm_caps.supported_xss = kvm_host.xss & KVM_SUPPORTED_XSS; + } + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; rdmsrq_safe(MSR_EFER, &kvm_host.efer); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrq(MSR_IA32_XSS, kvm_host.xss); - kvm_init_pmu_capability(ops->pmu_ops); if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) From e44eb58334bbc28d790ed2851385cc86ea76dc12 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:16 -0700 Subject: [PATCH 095/142] KVM: x86: Load guest FPU state when accessing XSAVE-managed MSRs Load the guest's FPU state if userspace is accessing MSRs whose values are managed by XSAVES. Introduce two helpers, kvm_{get,set}_xstate_msr(), to facilitate access to such MSRs. If MSRs supported in kvm_caps.supported_xss are passed through to the guest, the guest MSRs are swapped with the host's before the vCPU exits to userspace and after it re-enters the kernel, before the next VM-Entry. Because the modified code is also used for the KVM_GET_MSRS device ioctl(), explicitly check @vcpu is non-null before attempting to load guest state. The XSAVE-managed MSRs cannot be retrieved via the device ioctl() without loading guest FPU state (which doesn't exist). Note that guest_cpuid_has() is not queried as host userspace is allowed to access MSRs that have not been exposed to the guest, e.g. it might do KVM_SET_MSRS prior to KVM_SET_CPUID2. The two helpers are placed here to make it explicit that accessing XSAVE-managed MSRs requires special checks and handling to guarantee correct reads and writes of the MSRs.
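To make the userspace-facing effect concrete, a rough sketch (assumptions: vcpu_fd is an open KVM vCPU fd, and MSR_IA32_PL3_SSP is the architectural index 0x6a7 per the SDM); the ioctl below works even though the MSR value lives in guest XSTATE:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

#define MSR_IA32_PL3_SSP 0x6a7	/* architectural index, per the SDM */

static int read_pl3_ssp(int vcpu_fd, __u64 *val)
{
	char buf[sizeof(struct kvm_msrs) + sizeof(struct kvm_msr_entry)] = {};
	struct kvm_msrs *msrs = (struct kvm_msrs *)buf;

	msrs->nmsrs = 1;
	msrs->entries[0].index = MSR_IA32_PL3_SSP;

	/* KVM_GET_MSRS returns the number of MSRs successfully read. */
	if (ioctl(vcpu_fd, KVM_GET_MSRS, msrs) != 1)
		return -1;

	*val = msrs->entries[0].data;
	return 0;
}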
Co-developed-by: Yang Weijiang Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: drop S_CET, add big comment, move accessors to x86.c] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Reviewed-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250919223258.1604852-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 87 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 831c5e488de3..c2e11f3d50fb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -136,6 +136,9 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static DEFINE_MUTEX(vendor_module_lock); +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + struct kvm_x86_ops kvm_x86_ops __read_mostly; #define KVM_X86_OP(func) \ @@ -3801,6 +3804,67 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } +/* + * Returns true if the MSR in question is managed via XSTATE, i.e. is context + * switched with the rest of guest FPU state. Note! S_CET is _not_ context + * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS. + * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields, + * the value saved/restored via XSTATE is always the host's value. That detail + * is _extremely_ important, as the guest's S_CET must _never_ be resident in + * hardware while executing in the host. Loading guest values for U_CET and + * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to + * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower + * privilege levels, i.e. are effectively only consumed by userspace as well. + */ +static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (!vcpu) + return false; + + switch (msr) { + case MSR_IA32_U_CET: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) || + guest_cpu_cap_has(vcpu, X86_FEATURE_IBT); + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + default: + return false; + } +} + +/* + * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an + * MSR that is managed via XSTATE. Note, the caller is responsible for doing + * the initial FPU load, this helper only ensures that guest state is resident + * in hardware (the kernel can load its FPU state in IRQ context). 
+ */ +static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, + int access) +{ + BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W); + + KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm); + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); + + kvm_fpu_get(); + if (access == MSR_TYPE_R) + rdmsrq(msr_info->index, msr_info->data); + else + wrmsrq(msr_info->index, msr_info->data); + kvm_fpu_put(); +} + +static __maybe_unused void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); +} + +static __maybe_unused void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; @@ -4551,11 +4615,25 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { + bool fpu_loaded = false; int i; - for (i = 0; i < msrs->nmsrs; ++i) + for (i = 0; i < msrs->nmsrs; ++i) { + /* + * If userspace is accessing one or more XSTATE-managed MSRs, + * temporarily load the guest's FPU state so that the guest's + * MSR value(s) is resident in hardware and thus can be accessed + * via RDMSR/WRMSR. + */ + if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) { + kvm_load_guest_fpu(vcpu); + fpu_loaded = true; + } if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; + } + if (fpu_loaded) + kvm_put_guest_fpu(vcpu); return i; } @@ -5965,6 +6043,7 @@ static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, struct kvm_one_reg one_reg; struct kvm_x86_reg_id *reg; u64 __user *user_val; + bool load_fpu; int r; if (copy_from_user(&one_reg, argp, sizeof(one_reg))) @@ -5991,12 +6070,18 @@ static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, guard(srcu)(&vcpu->kvm->srcu); + load_fpu = is_xstate_managed_msr(vcpu, reg->index); + if (load_fpu) + kvm_load_guest_fpu(vcpu); + user_val = u64_to_user_ptr(one_reg.addr); if (ioctl == KVM_GET_ONE_REG) r = kvm_get_one_msr(vcpu, reg->index, user_val); else r = kvm_set_one_msr(vcpu, reg->index, user_val); + if (load_fpu) + kvm_put_guest_fpu(vcpu); return r; } From 586ef9dcbb28fc0bc6beb496e6dc8a54276e7a32 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:17 -0700 Subject: [PATCH 096/142] KVM: x86: Add fault checks for guest CR4.CET setting Check potential faults for CR4.CET setting per Intel SDM requirements. CET can be enabled if and only if CR0.WP == 1, i.e. setting CR4.CET == 1 faults if CR0.WP == 0 and setting CR0.WP == 0 fails if CR4.CET == 1. 
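Expressed as a single predicate (a sketch of the invariant, using the kernel's existing X86_CR0_WP and X86_CR4_CET bit definitions; not the patch's code, which checks each transition at its own entry point):

static bool cet_cr_state_is_valid(unsigned long cr0, unsigned long cr4)
{
	/* CR4.CET == 1 is legal only while CR0.WP == 1. */
	return !(cr4 & X86_CR4_CET) || (cr0 & X86_CR0_WP);
}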
Signed-off-by: Yang Weijiang Reviewed-by: Chao Gao Reviewed-by: Maxim Levitsky Reviewed-by: Xiaoyao Li Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20250919223258.1604852-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c2e11f3d50fb..6d67c969e18a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1176,6 +1176,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) return 1; + if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) + return 1; + kvm_x86_call(set_cr0)(vcpu, cr0); kvm_post_set_cr0(vcpu, old_cr0, cr0); @@ -1376,6 +1379,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } + if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP)) + return 1; + kvm_x86_call(set_cr4)(vcpu, cr4); kvm_post_set_cr4(vcpu, old_cr4, cr4); From 6a11c860d8a4aa1c7351246bd1ee7b41f26399b6 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:18 -0700 Subject: [PATCH 097/142] KVM: x86: Report KVM supported CET MSRs as to-be-saved Add CET MSRs to the list of MSRs reported to userspace if the feature, i.e. IBT or SHSTK, associated with the MSRs is supported by KVM. Suggested-by: Chao Gao Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d67c969e18a..5f23d2d2731d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -344,6 +344,10 @@ static const u32 msrs_to_save_base[] = { MSR_IA32_UMWAIT_CONTROL, MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, + + MSR_IA32_U_CET, MSR_IA32_S_CET, + MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, + MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, }; static const u32 msrs_to_save_pmu[] = { @@ -7603,6 +7607,20 @@ static void kvm_probe_msr_to_save(u32 msr_index) if (!kvm_caps.supported_xss) return; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + return; + break; + case MSR_IA32_INT_SSP_TAB: + if (!kvm_cpu_cap_has(X86_FEATURE_LM)) + return; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK)) + return; + break; default: break; } From d6c387fc396b3d2c5aa00cb5b46f6401fb86bb43 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:19 -0700 Subject: [PATCH 098/142] KVM: VMX: Introduce CET VMCS fields and control bits Control-flow Enforcement Technology (CET) is a CPU feature used to prevent Return/CALL/Jump-Oriented Programming (ROP/COP/JOP) attacks. It provides two sub-features (SHSTK, IBT) to defend against ROP/COP/JOP style control-flow subversion attacks. Shadow Stack (SHSTK): A shadow stack is a second stack used exclusively for control transfer operations. The shadow stack is separate from the data/normal stack and can be enabled individually in user and kernel mode. When shadow stack is enabled, CALL pushes the return address on both the data and shadow stack. RET pops the return address from both stacks and compares them. If the return addresses from the two stacks do not match, the processor generates a #CP. Indirect Branch Tracking (IBT): IBT introduces an instruction (ENDBRANCH) to mark valid target addresses of indirect branches (CALL, JMP, etc.). If an indirect branch is executed and the next instruction is _not_ an ENDBRANCH, the processor generates a #CP. This instruction behaves as a NOP on platforms without CET. Several new CET MSRs are defined to support CET: MSR_IA32_{U,S}_CET: CET settings for {user,supervisor} CET respectively. MSR_IA32_PL{0,1,2,3}_SSP: SHSTK pointer linear address for CPL{0,1,2,3}. MSR_IA32_INT_SSP_TAB: Linear address of the SHSTK pointer table, whose entries are indexed by the IST field of the interrupt gate descriptor. Two XSAVES state bits are introduced for CET: IA32_XSS:[bit 11]: Control saving/restoring user mode CET states IA32_XSS:[bit 12]: Control saving/restoring supervisor mode CET states. Six VMCS fields are introduced for CET: {HOST,GUEST}_S_CET: Stores CET settings for kernel mode. {HOST,GUEST}_SSP: Stores current active SSP. {HOST,GUEST}_INTR_SSP_TABLE: Stores current active MSR_IA32_INT_SSP_TAB. On Intel platforms, two additional bits are defined in VM_EXIT and VM_ENTRY control fields: If VM_EXIT_LOAD_CET_STATE = 1, host CET states are loaded from the following VMCS fields at VM-Exit: HOST_S_CET HOST_SSP HOST_INTR_SSP_TABLE If VM_ENTRY_LOAD_CET_STATE = 1, guest CET states are loaded from the following VMCS fields at VM-Entry: GUEST_S_CET GUEST_SSP GUEST_INTR_SSP_TABLE Co-developed-by: Zhang Yi Z Signed-off-by: Zhang Yi Z Signed-off-by: Yang Weijiang Reviewed-by: Chao Gao Reviewed-by: Maxim Levitsky Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cca7d6641287..ce10a7e2d3d9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -106,6 +106,7 @@ #define VM_EXIT_CLEAR_BNDCFGS 0x00800000 #define VM_EXIT_PT_CONCEAL_PIP 0x01000000 #define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 +#define VM_EXIT_LOAD_CET_STATE 0x10000000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -119,6 +120,7 @@ #define VM_ENTRY_LOAD_BNDCFGS 0x00010000 #define VM_ENTRY_PT_CONCEAL_PIP 0x00020000 #define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000 +#define VM_ENTRY_LOAD_CET_STATE 0x00100000 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff @@ -369,6 +371,9 @@ enum vmcs_field { GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, GUEST_SYSENTER_ESP = 0x00006824, GUEST_SYSENTER_EIP = 0x00006826, + GUEST_S_CET = 0x00006828, + GUEST_SSP = 0x0000682a, + GUEST_INTR_SSP_TABLE = 0x0000682c, HOST_CR0 = 0x00006c00, HOST_CR3 = 0x00006c02, HOST_CR4 = 0x00006c04, @@ -381,6 +386,9 @@ HOST_IA32_SYSENTER_EIP = 0x00006c12, HOST_RSP = 0x00006c14, HOST_RIP = 0x00006c16, + HOST_S_CET = 0x00006c18, + HOST_SSP = 0x00006c1a, + HOST_INTR_SSP_TABLE = 0x00006c1c }; /* From 9d6812d415358372aaaf1dfe95bc30d11e4e95db Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:20 -0700 Subject: [PATCH 099/142] KVM: x86: Enable guest SSP read/write interface with new uAPIs Add a KVM-defined ONE_REG
register, KVM_REG_GUEST_SSP, to let userspace save and restore the guest's Shadow Stack Pointer (SSP). On both Intel and AMD, SSP is a hardware register that can only be accessed by software via dedicated ISA (e.g. RDSSP) or via VMCS/VMCB fields (used by hardware to context switch SSP at entry/exit). As a result, SSP doesn't fit in any of KVM's existing interfaces for saving/restoring state. Internally, treat SSP as a fake/synthetic MSR, as the semantics of writes to SSP follow that of several other Shadow Stack MSRs, e.g. the PLx_SSP MSRs. Use a translation layer to hide the KVM-internal MSR index so that the arbitrary index doesn't become ABI, e.g. so that KVM can rework its implementation as needed, so long as the ONE_REG ABI is maintained. Explicitly reject accesses to SSP if the vCPU doesn't have Shadow Stack support to avoid running afoul of ignore_msrs, which unfortunately applies to host-initiated accesses (which is a discussion for another day). I.e. ensure consistent behavior for KVM-defined registers irrespective of ignore_msrs. Link: https://lore.kernel.org/all/aca9d389-f11e-4811-90cf-d98e345a5cc2@intel.com Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-14-seanjc@google.com Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 8 +++++++ arch/x86/include/uapi/asm/kvm.h | 3 +++ arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++++---- arch/x86/kvm/x86.h | 10 +++++++++ 4 files changed, 54 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0e82fc5abd7b..fac1774031ee 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2911,6 +2911,14 @@ such as set vcpu counter or reset vcpu, and they have the following id bit patte x86 MSR registers have the following id bit patterns:: 0x2030 0002 +Following are the KVM-defined registers for x86: + +======================= ========= ============================================= + Encoding Register Description +======================= ========= ============================================= + 0x2030 0003 0000 0000 SSP Shadow Stack Pointer +======================= ========= ============================================= + 4.69 KVM_GET_ONE_REG -------------------- diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index aae1033c8afa..467116186e71 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -437,6 +437,9 @@ struct kvm_xcrs { #define KVM_X86_REG_KVM(index) \ KVM_X86_REG_ID(KVM_X86_REG_TYPE_KVM, index) +/* KVM-defined registers starting from 0 */ +#define KVM_REG_GUEST_SSP 0 + #define KVM_SYNC_X86_REGS (1UL << 0) #define KVM_SYNC_X86_SREGS (1UL << 1) #define KVM_SYNC_X86_EVENTS (1UL << 2) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5f23d2d2731d..d85bb723f25a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6016,9 +6016,27 @@ struct kvm_x86_reg_id { __u8 x86; }; -static int kvm_translate_kvm_reg(struct kvm_x86_reg_id *reg) +static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu, + struct kvm_x86_reg_id *reg) { - return -EINVAL; + switch (reg->index) { + case KVM_REG_GUEST_SSP: + /* + * FIXME: If host-initiated accesses are ever exempted from + * ignore_msrs (in kvm_do_msr_access()), drop this manual 
check + * and rely on KVM's standard checks to reject accesses to regs + * that don't exist. + */ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return -EINVAL; + + reg->type = KVM_X86_REG_TYPE_MSR; + reg->index = MSR_KVM_INTERNAL_GUEST_SSP; + break; + default: + return -EINVAL; + } + return 0; } static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) @@ -6067,7 +6085,7 @@ static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, return -EINVAL; if (reg->type == KVM_X86_REG_TYPE_KVM) { - r = kvm_translate_kvm_reg(reg); + r = kvm_translate_kvm_reg(vcpu, reg); if (r) return r; } @@ -6098,11 +6116,22 @@ static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, static int kvm_get_reg_list(struct kvm_vcpu *vcpu, struct kvm_reg_list __user *user_list) { - u64 nr_regs = 0; + u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0; + u64 user_nr_regs; + + if (get_user(user_nr_regs, &user_list->n)) + return -EFAULT; if (put_user(nr_regs, &user_list->n)) return -EFAULT; + if (user_nr_regs < nr_regs) + return -E2BIG; + + if (nr_regs && + put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0])) + return -EFAULT; + return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 786e36fcd0fb..a7c9c72fca93 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -101,6 +101,16 @@ do { \ #define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX #define KVM_SVM_DEFAULT_PLE_WINDOW 3000 +/* + * KVM's internal, non-ABI indices for synthetic MSRs. The values themselves + * are arbitrary and have no meaning, the only requirement is that they don't + * conflict with "real" MSRs that KVM supports. Use values at the upper end + * of KVM's reserved paravirtual MSR range to minimize churn, i.e. these values + * will be usable until KVM exhausts its supply of paravirtual MSR indices. + */ + +#define MSR_KVM_INTERNAL_GUEST_SSP 0x4b564dff + static inline unsigned int __grow_ple_window(unsigned int val, unsigned int base, unsigned int modifier, unsigned int max) { From 8b59d0275c964bcbe573dde46bd3c1f20ed2d2bd Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:21 -0700 Subject: [PATCH 100/142] KVM: VMX: Emulate read and write to CET MSRs Add emulation interface for CET MSR access. The emulation code is split into common part and vendor specific part. The former does common checks for MSRs, e.g., accessibility, data validity etc., then passes operation to either XSAVE-managed MSRs via the helpers or CET VMCS fields. SSP can only be read via RDSSP. Writing even requires destructive and potentially faulting operations such as SAVEPREVSSP/RSTORSSP or SETSSBSY/CLRSSBSY. Let the host use a pseudo-MSR that is just a wrapper for the GUEST_SSP field of the VMCS. 
Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: drop call to kvm_set_xstate_msr() for S_CET, consolidate code] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++++ arch/x86/kvm/x86.c | 64 ++++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/x86.h | 23 +++++++++++++++ 3 files changed, 103 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d2fa64bddb57..62bf93dc0825 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2093,6 +2093,15 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; + case MSR_IA32_S_CET: + msr_info->data = vmcs_readl(GUEST_S_CET); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + msr_info->data = vmcs_readl(GUEST_SSP); + break; + case MSR_IA32_INT_SSP_TAB: + msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE); + break; case MSR_IA32_DEBUGCTLMSR: msr_info->data = vmx_guest_debugctl_read(); break; @@ -2411,6 +2420,15 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else vmx->pt_desc.guest.addr_a[index / 2] = data; break; + case MSR_IA32_S_CET: + vmcs_writel(GUEST_S_CET, data); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + vmcs_writel(GUEST_SSP, data); + break; + case MSR_IA32_INT_SSP_TAB: + vmcs_writel(GUEST_INTR_SSP_TABLE, data); + break; case MSR_IA32_PERF_CAPABILITIES: if (data & PERF_CAP_LBR_FMT) { if ((data & PERF_CAP_LBR_FMT) != diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d85bb723f25a..54d280fe9a4b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1890,6 +1890,44 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, data = (u32)data; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + if (!kvm_is_valid_u_s_cet(vcpu, data)) + return 1; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + /* + * Note that the MSR emulation here is flawed when a vCPU + * doesn't support the Intel 64 architecture. The expected + * architectural behavior in this case is that the upper 32 + * bits do not exist and should always read '0'. However, + * because the actual hardware on which the virtual CPU is + * running does support Intel 64, XRSTORS/XSAVES in the + * guest could observe behavior that violates the + * architecture. Intercepting XRSTORS/XSAVES for this + * special case isn't deemed worthwhile. + */ + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + /* + * MSR_IA32_INT_SSP_TAB is not present on processors that do + * not support Intel 64 architecture. 
+ */ + if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + return KVM_MSR_RET_UNSUPPORTED; + if (is_noncanonical_msr_address(data, vcpu)) + return 1; + /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */ + if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4)) + return 1; + break; } msr.data = data; @@ -1934,6 +1972,20 @@ static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + break; } msr.index = index; @@ -3865,12 +3917,12 @@ static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, kvm_fpu_put(); } -static __maybe_unused void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); } -static __maybe_unused void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); } @@ -4256,6 +4308,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_set_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -4605,6 +4661,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.guest_fpu.xfd_err; break; #endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_get_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a7c9c72fca93..076eccba0f7e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -710,4 +710,27 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); +#define CET_US_RESERVED_BITS GENMASK(9, 6) +#define CET_US_SHSTK_MASK_BITS GENMASK(1, 0) +#define CET_US_IBT_MASK_BITS (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10)) +#define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12) + +static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data) +{ + if (data & CET_US_RESERVED_BITS) + return false; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + (data & CET_US_SHSTK_MASK_BITS)) + return false; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && + (data & CET_US_IBT_MASK_BITS)) + return false; + if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4)) + return false; + /* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. 
*/ if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR)) return false; + return true; +} #endif From 1a61bd0d126abbf01c384c44b46bcdd4ef495850 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:22 -0700 Subject: [PATCH 101/142] KVM: x86: Save and reload SSP to/from SMRAM Save CET SSP to SMRAM on SMI and reload it on RSM. KVM emulates hardware architectural behavior when the guest enters/leaves SMM mode, i.e., saves registers to SMRAM at the entry to SMM and reloads them at the exit from SMM. Per the SDM, SSP is one such register on 64-bit architectures, so add support for saving/reloading it. Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/smm.c | 8 ++++++++ arch/x86/kvm/smm.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 5dd8a1646800..b0b14ba37f9a 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -269,6 +269,10 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + kvm_msr_read(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, &smram->ssp)) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } #endif @@ -558,6 +562,10 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, kvm_x86_call(set_interrupt_shadow)(vcpu, 0); ctxt->interruptibility = (u8)smstate->int_shadow; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + kvm_msr_write(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, smstate->ssp)) + return X86EMUL_UNHANDLEABLE; + return X86EMUL_CONTINUE; } #endif diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index 551703fbe200..db3c88f16138 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -116,8 +116,8 @@ struct kvm_smram_state_64 { u32 smbase; u32 reserved4[5]; - /* ssp and svm_* fields below are not implemented by KVM */ u64 ssp; + /* svm_* fields below are not implemented by KVM */ u64 svm_guest_pat; u64 svm_host_efer; u64 svm_host_cr4; From 25f3840483e6c69e4d1a689cf3edc70f93604f2a Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:23 -0700 Subject: [PATCH 102/142] KVM: VMX: Set up interception for CET MSRs Disable interception for CET MSRs that can be accessed via XSAVES/XRSTORS, and exist according to CPUID, as accesses through XSTATE aren't subject to MSR interception checks, i.e. can't be intercepted without intercepting and emulating XSAVES/XRSTORS, and KVM doesn't support emulating XSAVE/XRSTOR instructions. Don't condition interception on the guest actually having XSAVES as there is no benefit to intercepting the accesses (when the MSRs exist). The MSRs in question are either context switched by the CPU on VM-Enter/VM-Exit or by KVM via XSAVES/XRSTORS (KVM requires XSAVES to virtualize SHSTK), i.e. KVM is going to load guest values into hardware irrespective of guest XSAVES support.
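To illustrate why interception can't work here (a guest-side sketch; assumes CET user state is XSS bit 11, as architecturally defined): once IA32_XSS[11] is enabled, XRSTORS loads MSR_IA32_U_CET and MSR_IA32_PL3_SSP straight from the XSAVE area, with no MSR access for KVM to trap.

static inline void xrstors(const void *xsave_area, unsigned long long rfbm)
{
	/* Restores U_CET/PL3_SSP from the area when RFBM includes bit 11. */
	asm volatile("xrstors %0"
		     :
		     : "m"(*(const char (*)[4096])xsave_area),
		       "a"((unsigned int)rfbm),
		       "d"((unsigned int)(rfbm >> 32))
		     : "memory");
}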
Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Reviewed-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250919223258.1604852-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 62bf93dc0825..bde768c7111f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4088,6 +4088,8 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { + bool intercept; + if (!cpu_has_vmx_msr_bitmap()) return; @@ -4133,6 +4135,23 @@ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept); + } + + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) { + intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + + vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); + } + /* * x2APIC and LBR MSR intercepts are modified on-demand and cannot be * filtered by userspace. From 584ba3ffb9843fd12d3b4a33cfe056e2264392a0 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:24 -0700 Subject: [PATCH 103/142] KVM: VMX: Set host constant supervisor states to VMCS fields Save constant values to the HOST_{S_CET,SSP,INTR_SSP_TABLE} fields explicitly. Kernel IBT is supported and the setting in MSR_IA32_S_CET is static post-boot (the exception is the BIOS call case, but the vCPU thread never crosses it), so KVM doesn't need to refresh the HOST_S_CET field before every VM-Enter/VM-Exit sequence. Host supervisor shadow stack is not enabled now and SSP is not accessible to kernel mode, thus it's safe to set the host IA32_INT_SSP_TAB/SSP VMCS fields to 0s. When shadow stack is enabled for CPL3, SSP is reloaded from PL3_SSP before it exits to userspace. Check SDM Vol 2A/B Chapters 3/4 for SYSCALL/SYSRET/SYSENTER/SYSEXIT/RDSSP/CALL, etc. Prevent KVM module loading if host supervisor shadow stack SHSTK_EN is set in MSR_IA32_S_CET as KVM cannot co-exist with it correctly.
Suggested-by: Sean Christopherson Suggested-by: Chao Gao Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: snapshot host S_CET if SHSTK *or* IBT is supported] Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/capabilities.h | 4 ++++ arch/x86/kvm/vmx/vmx.c | 15 +++++++++++++++ arch/x86/kvm/x86.c | 12 ++++++++++++ arch/x86/kvm/x86.h | 1 + 4 files changed, 32 insertions(+) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index f614428dbeda..59c83888bdc0 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -100,6 +100,10 @@ static inline bool cpu_has_load_perf_global_ctrl(void) return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; } +static inline bool cpu_has_load_cet_ctrl(void) +{ + return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_CET_STATE); +} static inline bool cpu_has_vmx_mpx(void) { return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bde768c7111f..4ab066a3c22f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4312,6 +4312,21 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) if (cpu_has_load_ia32_efer()) vmcs_write64(HOST_IA32_EFER, kvm_host.efer); + + /* + * Supervisor shadow stack is not enabled on host side, i.e., + * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM + * description(RDSSP instruction), SSP is not readable in CPL0, + * so resetting the two registers to 0s at VM-Exit does no harm + * to kernel execution. When execution flow exits to userspace, + * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter + * 3 and 4 for details. + */ + if (cpu_has_load_cet_ctrl()) { + vmcs_writel(HOST_S_CET, kvm_host.s_cet); + vmcs_writel(HOST_SSP, 0); + vmcs_writel(HOST_INTR_SSP_TABLE, 0); + } } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54d280fe9a4b..0050509a7de2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9997,6 +9997,18 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return -EIO; } + if (boot_cpu_has(X86_FEATURE_SHSTK) || boot_cpu_has(X86_FEATURE_IBT)) { + rdmsrq(MSR_IA32_S_CET, kvm_host.s_cet); + /* + * Linux doesn't yet support supervisor shadow stacks (SSS), so + * KVM doesn't save/restore the associated MSRs, i.e. KVM may + * clobber the host values. Yell and refuse to load if SSS is + * unexpectedly enabled, e.g. to avoid crashing the host. + */ + if (WARN_ON_ONCE(kvm_host.s_cet & CET_SHSTK_EN)) + return -EIO; + } + memset(&kvm_caps, 0, sizeof(kvm_caps)); x86_emulator_cache = kvm_alloc_emulator_cache(); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 076eccba0f7e..65cbd454c4f1 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -50,6 +50,7 @@ struct kvm_host_values { u64 efer; u64 xcr0; u64 xss; + u64 s_cet; u64 arch_capabilities; }; From 57c3db7e2e26970ee3630a25913368f849ea803a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:25 -0700 Subject: [PATCH 104/142] KVM: x86: Don't emulate instructions affected by CET features Don't emulate branch instructions, e.g. 
CALL/RET/JMP etc., that are affected by Shadow Stacks and/or Indirect Branch Tracking when said features are enabled in the guest, as fully emulating CET would require significant complexity for no practical benefit (KVM shouldn't need to emulate branch instructions on modern hosts). Simply doing nothing isn't an option as that would allow a malicious entity to subvert CET protections via the emulator. To detect instructions that are subject to IBT or affect IBT state, use the existing IsBranch flag along with the source operand type to detect indirect branches, and the existing NearBranch flag to detect far JMPs and CALLs, all of which are effectively indirect. Explicitly check for emulation of IRET, FAR RET (IMM), and SYSEXIT (the ret-like far branches) instead of adding another flag, e.g. IsRet, as it's unlikely the emulator will ever need to check for return-like instructions outside of this one specific flow. Use an allow-list instead of a deny-list because (a) it's a shorter list and (b) so that a missed entry gets a false positive, not a false negative (i.e. reject emulation instead of clobbering CET state). For Shadow Stacks, explicitly track instructions that directly affect the current SSP, as KVM's emulator doesn't have existing flags that can be used to precisely detect such instructions. Alternatively, the em_xxx() helpers could directly check for ShadowStack interactions, but using a dedicated flag is arguably easier to audit, and allows for handling both IBT and SHSTK in one fell swoop. Note! On far transfers, do NOT consult the current privilege level and instead treat SHSTK/IBT as being enabled if they're enabled for User *or* Supervisor mode. On inter-privilege level far transfers, SHSTK and IBT can be in play for the target privilege level, i.e. checking the current privilege could get a false negative, and KVM doesn't know the target privilege level until emulation gets under way. Note #2, FAR JMP from 64-bit mode to compatibility mode interacts with the current SSP, but only to ensure SSP[63:32] == 0. Don't tag FAR JMP as SHSTK, which would be rather confusing and would result in FAR JMP being rejected unnecessarily the vast majority of the time (ignoring that it's unlikely to ever be emulated). A future commit will add the #GP(0) check for the specific FAR JMP scenario. Note #3, task switches also modify SSP and so need to be rejected. That too will be addressed in a future commit. Suggested-by: Chao Gao Originally-by: Yang Weijiang Cc: Mathias Krause Cc: John Allen Cc: Rick Edgecombe Reviewed-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 115 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 102 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 23929151a5b8..48b067b179e0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -178,6 +178,7 @@ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */ +#define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. 
*/ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -4068,8 +4069,8 @@ static const struct opcode group4[] = { static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), - I(SrcMem | NearBranch | IsBranch, em_call_near_abs), - I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far), + I(SrcMem | NearBranch | IsBranch | ShadowStack, em_call_near_abs), + I(SrcMemFAddr | ImplicitOps | IsBranch | ShadowStack, em_call_far), I(SrcMem | NearBranch | IsBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far), I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), @@ -4304,7 +4305,7 @@ static const struct opcode opcode_table[256] = { DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), - I(SrcImmFAddr | No64 | IsBranch, em_call_far), N, + I(SrcImmFAddr | No64 | IsBranch | ShadowStack, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), II(ImplicitOps | Stack, em_popf, popf), I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), @@ -4324,19 +4325,19 @@ static const struct opcode opcode_table[256] = { X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), - I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm), - I(ImplicitOps | NearBranch | IsBranch, em_ret), + I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch | ShadowStack, em_ret_near_imm), + I(ImplicitOps | NearBranch | IsBranch | ShadowStack, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), - I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm), - I(ImplicitOps | IsBranch, em_ret_far), - D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn), + I(ImplicitOps | SrcImmU16 | IsBranch | ShadowStack, em_ret_far_imm), + I(ImplicitOps | IsBranch | ShadowStack, em_ret_far), + D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch | ShadowStack, intn), D(ImplicitOps | No64 | IsBranch), - II(ImplicitOps | IsBranch, em_iret, iret), + II(ImplicitOps | IsBranch | ShadowStack, em_iret, iret), /* 0xD0 - 0xD7 */ G(Src2One | ByteOp, group2), G(Src2One, group2), G(Src2CL | ByteOp, group2), G(Src2CL, group2), @@ -4352,7 +4353,7 @@ static const struct opcode opcode_table[256] = { I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ - I(SrcImm | NearBranch | IsBranch, em_call), + I(SrcImm | NearBranch | IsBranch | ShadowStack, em_call), D(SrcImm | ImplicitOps | NearBranch | IsBranch), I(SrcImmFAddr | No64 | IsBranch, em_jmp_far), D(SrcImmByte | ImplicitOps | NearBranch | IsBranch), @@ -4371,7 +4372,7 @@ static const struct opcode opcode_table[256] = { static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall), + N, I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, @@ -4402,8 +4403,8 @@ static const struct opcode twobyte_table[256] = { IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | EmulateOnUD | 
IsBranch, em_sysenter), - I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit), + I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_sysenter), + I(ImplicitOps | Priv | EmulateOnUD | IsBranch | ShadowStack, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -4514,6 +4515,60 @@ static const struct opcode opcode_map_0f_38[256] = { #undef I2bvIP #undef I6ALU +static bool is_shstk_instruction(struct x86_emulate_ctxt *ctxt) +{ + return ctxt->d & ShadowStack; +} + +static bool is_ibt_instruction(struct x86_emulate_ctxt *ctxt) +{ + u64 flags = ctxt->d; + + if (!(flags & IsBranch)) + return false; + + /* + * All far JMPs and CALLs (including SYSCALL, SYSENTER, and INTn) are + * indirect and thus affect IBT state. All far RETs (including SYSEXIT + * and IRET) are protected via Shadow Stacks and thus don't affect IBT + * state. IRET #GPs when returning to virtual-8086 and IBT or SHSTK is + * enabled, but that should be handled by IRET emulation (in the very + * unlikely scenario that KVM adds support for fully emulating IRET). + */ + if (!(flags & NearBranch)) + return ctxt->execute != em_iret && + ctxt->execute != em_ret_far && + ctxt->execute != em_ret_far_imm && + ctxt->execute != em_sysexit; + + switch (flags & SrcMask) { + case SrcReg: + case SrcMem: + case SrcMem16: + case SrcMem32: + return true; + case SrcMemFAddr: + case SrcImmFAddr: + /* Far branches should be handled above. */ + WARN_ON_ONCE(1); + return true; + case SrcNone: + case SrcImm: + case SrcImmByte: + /* + * Note, ImmU16 is used only for the stack adjustment operand on ENTER + * and RET instructions. ENTER isn't a branch and RET FAR is handled + * by the NearBranch check above. RET itself isn't an indirect branch. + */ + case SrcImmU16: + return false; + default: + WARN_ONCE(1, "Unexpected Src operand '%llx' on branch", + flags & SrcMask); + return false; + } +} + static unsigned imm_size(struct x86_emulate_ctxt *ctxt) { unsigned size; @@ -4943,6 +4998,40 @@ done_prefixes: ctxt->execute = opcode.u.execute; + /* + * Reject emulation if KVM might need to emulate shadow stack updates + * and/or indirect branch tracking enforcement, which the emulator + * doesn't support. + */ + if ((is_ibt_instruction(ctxt) || is_shstk_instruction(ctxt)) && + ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET) { + u64 u_cet = 0, s_cet = 0; + + /* + * Check both User and Supervisor on far transfers as inter- + * privilege level transfers are impacted by CET at the target + * privilege level, and that is not known at this time. The + * expectation is that the guest will not require emulation of + * any CET-affected instructions at any privilege level. 
+ */ + if (!(ctxt->d & NearBranch)) + u_cet = s_cet = CET_SHSTK_EN | CET_ENDBR_EN; + else if (ctxt->ops->cpl(ctxt) == 3) + u_cet = CET_SHSTK_EN | CET_ENDBR_EN; + else + s_cet = CET_SHSTK_EN | CET_ENDBR_EN; + + if ((u_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_U_CET, &u_cet)) || + (s_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_S_CET, &s_cet))) + return EMULATION_FAILED; + + if ((u_cet | s_cet) & CET_SHSTK_EN && is_shstk_instruction(ctxt)) + return EMULATION_FAILED; + + if ((u_cet | s_cet) & CET_ENDBR_EN && is_ibt_instruction(ctxt)) + return EMULATION_FAILED; + } + if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && likely(!(ctxt->d & EmulateOnUD))) return EMULATION_FAILED; From 82c0ec02825814e1ef332d635d3441b07c05b1c9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:26 -0700 Subject: [PATCH 105/142] KVM: x86: Don't emulate task switches when IBT or SHSTK is enabled Exit to userspace with KVM_INTERNAL_ERROR_EMULATION if the guest triggers task switch emulation with Indirect Branch Tracking or Shadow Stacks enabled, as attempting to do the right thing would require non-trivial effort and complexity, KVM doesn't support emulating CET generally, and it's extremely unlikely that any guest will do task switches while also utilizing CET. Defer taking on the complexity until someone cares enough to put in the time and effort to add support. Per the SDM: If shadow stack is enabled, then the SSP of the task is located at the 4 bytes at offset 104 in the 32-bit TSS and is used by the processor to establish the SSP when a task switch occurs from a task associated with this TSS. Note that the processor does not write the SSP of the task initiating the task switch to the TSS of that task, and instead the SSP of the previous task is pushed onto the shadow stack of the new task. Note, per the SDM's pseudocode on TASK SWITCHING, IBT state for the new privilege level is updated. To keep things simple, check both S_CET and U_CET (again, anyone that wants more precise checking can have the honor of implementing support). Reported-by: Binbin Wu Closes: https://lore.kernel.org/all/819bd98b-2a60-4107-8e13-41f1e4c706b1@linux.intel.com Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-20-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0050509a7de2..31aaff9db083 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12175,6 +12175,25 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) { + u64 u_cet, s_cet; + + /* + * Check both User and Supervisor on task switches as inter- + * privilege level task switches are impacted by CET at both + * the current privilege level and the new privilege level, and + * that information is not known at this time. The expectation + * is that the guest won't require emulation of task switches + * while using IBT or Shadow Stacks. 
+ */ + if (__kvm_emulate_msr_read(vcpu, MSR_IA32_U_CET, &u_cet) || + __kvm_emulate_msr_read(vcpu, MSR_IA32_S_CET, &s_cet)) + goto unhandled_task_switch; + + if ((u_cet | s_cet) & (CET_ENDBR_EN | CET_SHSTK_EN)) + goto unhandled_task_switch; + } + init_emulate_ctxt(vcpu); ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, @@ -12184,17 +12203,19 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, * Report an error userspace if MMIO is needed, as KVM doesn't support * MMIO during a task switch (or any other complex operation). */ - if (ret || vcpu->mmio_needed) { - vcpu->mmio_needed = false; - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + if (ret || vcpu->mmio_needed) + goto unhandled_task_switch; kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); return 1; + +unhandled_task_switch: + vcpu->mmio_needed = false; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; } EXPORT_SYMBOL_GPL(kvm_task_switch); From d4c03f63957c66bc95f5f33052f8b4be804631c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:27 -0700 Subject: [PATCH 106/142] KVM: x86: Emulate SSP[63:32]!=0 #GP(0) for FAR JMP to 32-bit mode Emulate the Shadow Stack restriction that the current SSP must be a 32-bit value on a FAR JMP from 64-bit mode to compatibility mode. From the SDM's pseudocode for FAR JMP: IF ShadowStackEnabled(CPL) IF (IA32_EFER.LMA and DEST(segment selector).L) = 0 (* If target is legacy or compatibility mode then the SSP must be in low 4GB *) IF (SSP & 0xFFFFFFFF00000000 != 0); THEN #GP(0); FI; FI; FI; Note, only the current CPL needs to be considered, as FAR JMP can't be used for inter-privilege level transfers, and KVM rejects emulation of all other far branch instructions when Shadow Stacks are enabled. To give the emulator access to GUEST_SSP, special case handling MSR_KVM_INTERNAL_GUEST_SSP in emulator_get_msr() to treat the access as a host access (KVM doesn't allow guest accesses to internal "MSRs"). The ->get_msr() API is only used for implicit accesses from the emulator, i.e. is only used with hardcoded MSR indices, and so any access to MSR_KVM_INTERNAL_GUEST_SSP is guaranteed to be from KVM, i.e. not from the guest via RDMSR. Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-21-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 35 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 9 +++++++++ 2 files changed, 44 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 48b067b179e0..59f93f68718a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1554,6 +1554,37 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return linear_write_system(ctxt, addr, desc, sizeof(*desc)); } +static bool emulator_is_ssp_invalid(struct x86_emulate_ctxt *ctxt, u8 cpl) +{ + const u32 MSR_IA32_X_CET = cpl == 3 ? MSR_IA32_U_CET : MSR_IA32_S_CET; + u64 efer = 0, cet = 0, ssp = 0; + + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET)) + return false; + + if (ctxt->ops->get_msr(ctxt, MSR_EFER, &efer)) + return true; + + /* SSP is guaranteed to be valid if the vCPU was already in 32-bit mode. 
*/ + if (!(efer & EFER_LMA)) + return false; + + if (ctxt->ops->get_msr(ctxt, MSR_IA32_X_CET, &cet)) + return true; + + if (!(cet & CET_SHSTK_EN)) + return false; + + if (ctxt->ops->get_msr(ctxt, MSR_KVM_INTERNAL_GUEST_SSP, &ssp)) + return true; + + /* + * On transfer from 64-bit mode to compatibility mode, SSP[63:32] must + * be 0, i.e. SSP must be a 32-bit value outside of 64-bit mode. + */ + return ssp >> 32; +} + static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, enum x86_transfer_type transfer, @@ -1694,6 +1725,10 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (efer & EFER_LMA) goto exception; } + if (!seg_desc.l && emulator_is_ssp_invalid(ctxt, cpl)) { + err_code = 0; + goto exception; + } /* CS(RPL) <- CPL */ selector = (selector & 0xfffc) | cpl; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 31aaff9db083..0a4e58dddf36 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8741,6 +8741,15 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { + /* + * Treat emulator accesses to the current shadow stack pointer as host- + * initiated, as they aren't true MSR accesses (SSP is "just a reg"), + * and this API is used only for implicit accesses, i.e. not RDMSR, and + * so the index is fully KVM-controlled. + */ + if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP)) + return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); + return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); } From 296599346c671f31854d674db2891ff2e1654b6a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:28 -0700 Subject: [PATCH 107/142] KVM: x86/mmu: WARN on attempt to check permissions for Shadow Stack #PF Add PFERR_SS_MASK, a.k.a. Shadow Stack access, and WARN if KVM attempts to check permissions for a Shadow Stack access as KVM hasn't been taught to understand the magic Writable=0,Dirty=1 combination that is required for Shadow Stack accesses, and likely will never learn. There are no plans to support Shadow Stacks with the Shadow MMU, and the emulator rejects all instructions that affect Shadow Stacks, i.e. it should be impossible for KVM to observe a #PF due to a shadow stack access.
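For illustration only (not part of the diff below), a sketch of the error code a shadow stack access would generate and why it must never reach permission_fault(); the PFERR_* masks are the ones added/used by this patch, while the example scenario is hypothetical:

	/*
	 * Hypothetical: a CALL pushing a return address onto a user shadow
	 * stack would fault with the SS bit set on top of the usual bits:
	 *
	 *	pfec = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
	 *	       PFERR_USER_MASK | PFERR_SS_MASK;
	 *
	 * permission_fault() has no notion of the Writable=0,Dirty=1
	 * protection that backs shadow stack pages, so any such pfec
	 * reaching it indicates a KVM bug, hence the WARN below:
	 *
	 *	WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK |
	 *			     PFERR_RSVD_MASK));
	 */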
Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-22-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5865c9b77b6d..e8d74e949f91 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -267,6 +267,7 @@ enum x86_intercept_stage; #define PFERR_RSVD_MASK BIT(3) #define PFERR_FETCH_MASK BIT(4) #define PFERR_PK_MASK BIT(5) +#define PFERR_SS_MASK BIT(6) #define PFERR_SGX_MASK BIT(15) #define PFERR_GUEST_RMP_MASK BIT_ULL(31) #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index b4b6860ab971..f63074048ec6 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -212,7 +212,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, fault = (mmu->permissions[index] >> pte_access) & 1; - WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); + WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK)); if (unlikely(mmu->pkru_mask)) { u32 pkru_bits, offset; From 843af0f2e4617db197034b27a9a658a59271f19f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:29 -0700 Subject: [PATCH 108/142] KVM: x86/mmu: Pretty print PK, SS, and SGX flags in MMU tracepoints Add PK (Protection Keys), SS (Shadow Stacks), and SGX (Software Guard Extensions) to the set of #PF error flags handled via kvm_mmu_trace_pferr_flags. While KVM doesn't expect PK or SS #PFs in particular, pretty printing their names instead of the raw hex value saves the user from having to go spelunking in the SDM to figure out what's going on. Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmutrace.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index f35a830ce469..764e3015d021 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -51,6 +51,9 @@ { PFERR_PRESENT_MASK, "P" }, \ { PFERR_WRITE_MASK, "W" }, \ { PFERR_USER_MASK, "U" }, \ + { PFERR_PK_MASK, "PK" }, \ + { PFERR_SS_MASK, "SS" }, \ + { PFERR_SGX_MASK, "SGX" }, \ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } From b3744c59ebc5f2697d62844149c1a1c0e274ead0 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:30 -0700 Subject: [PATCH 109/142] KVM: x86: Allow setting CR4.CET if IBT or SHSTK is supported Drop X86_CR4_CET from CR4_RESERVED_BITS and instead mark CET as reserved if and only if IBT *and* SHSTK are unsupported, i.e. allow CR4.CET to be set if IBT or SHSTK is supported. This creates a virtualization hole if the CPU supports both IBT and SHSTK, but the kernel or vCPU model only supports one of the features. However, it's entirely legal for a CPU to have only one of IBT or SHSTK, i.e. the hole is a flaw in the architecture, not in KVM. More importantly, so long as KVM is careful to initialize and context switch both IBT and SHSTK state (when supported in hardware) if either feature is exposed to the guest, a misbehaving guest can only harm itself. E.g. VMX initializes host CET VMCS fields based solely on hardware capabilities.
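For illustration only, a sketch of the resulting guest-visible behavior; guest_cpu_cap_has() is assumed here purely for readability, the actual change below operates on the CR4 reserved-bits mask:

	/* CR4.CET is reserved iff the vCPU has neither SHSTK nor IBT... */
	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
	    !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT))
		cr4_reserved_bits |= X86_CR4_CET;	/* CR4.CET=1 => #GP */
	/* ...i.e. either feature alone is sufficient to allow CR4.CET=1. */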
Signed-off-by: Yang Weijiang Signed-off-by: Mathias Krause Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: split to separate patch, write changelog] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-24-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e8d74e949f91..2f14bdc859fd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -142,7 +142,7 @@ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ - | X86_CR4_LAM_SUP)) + | X86_CR4_LAM_SUP | X86_CR4_CET)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 65cbd454c4f1..f3dc77f006f9 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -680,6 +680,9 @@ static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) __reserved_bits |= X86_CR4_PCIDE; \ if (!__cpu_has(__c, X86_FEATURE_LAM)) \ __reserved_bits |= X86_CR4_LAM_SUP; \ + if (!__cpu_has(__c, X86_FEATURE_SHSTK) && \ + !__cpu_has(__c, X86_FEATURE_IBT)) \ + __reserved_bits |= X86_CR4_CET; \ __reserved_bits; \ }) From 19e6e083f3f9e4ac1794273d72dfb59d19a0fc69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:31 -0700 Subject: [PATCH 110/142] KVM: nVMX: Always forward XSAVES/XRSTORS exits from L2 to L1 Unconditionally forward XSAVES/XRSTORS VM-Exits from L2 to L1, as KVM doesn't utilize the XSS-bitmap (KVM relies on controlling the XSS value in hardware to prevent unauthorized access to XSAVES state). KVM always loads vmcs02 with vmcs12's bitmap, and so any exit _must_ be due to vmcs12's XSS-bitmap. Drop the comment about XSS never being non-zero in anticipation of enabling CET_KERNEL and CET_USER support. Opportunistically WARN if XSAVES is not enabled for L2, as the CPU is supposed to generate #UD before checking the XSS-bitmap. Reviewed-by: Xiaoyao Li Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-25-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2156c9a854f4..846c07380eac 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6570,14 +6570,17 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: return true; - case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + case EXIT_REASON_XSAVES: + case EXIT_REASON_XRSTORS: /* - * This should never happen, since it is not possible to - * set XSS to a non-zero value---neither in L1 nor in L2. - * If if it were, XSS would have to be checked against - * the XSS exit bitmap in vmcs12. + * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize + * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap + * verbatim, i.e. any exit is due to L1's bitmap. WARN if + * XSAVES isn't enabled, as the CPU is supposed to inject #UD + * in that case, before consulting the XSS-bitmap. 
*/ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); + WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); + return true; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, From 69cc3e886582891f9c4d5830f18a2664a7f7cf7c Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:32 -0700 Subject: [PATCH 111/142] KVM: x86: Add XSS support for CET_KERNEL and CET_USER Add CET_KERNEL and CET_USER to KVM's set of supported XSS bits when IBT *or* SHSTK is supported. Like CR4.CET, XFEATURE support for IBT and SHSTK is bundled together under the CET umbrella, and thus prone to virtualization holes if KVM or the guest supports only one of IBT or SHSTK, but hardware supports both. However, again like CR4.CET, such virtualization holes are benign from the host's perspective so long as KVM takes care to always honor the "or" logic. Require CET_KERNEL and CET_USER to come as a pair, and refuse to support IBT or SHSTK if one (or both) of the features is missing, as the (host) kernel expects them to come as a pair, i.e. may get confused and corrupt state if only one of CET_KERNEL or CET_USER is supported. Signed-off-by: Yang Weijiang Signed-off-by: Mathias Krause Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: split to separate patch, write changelog, add XFEATURE_MASK_CET_ALL] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-26-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a4e58dddf36..8b4c69330a87 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -220,13 +220,14 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) +#define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL) /* * Note, KVM supports exposing PT to the guest, but does not support context * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs). */ -#define KVM_SUPPORTED_XSS 0 +#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_ALL) bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); @@ -10104,6 +10105,16 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + + if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + } + if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that @@ -12772,10 +12783,11 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) /* * On INIT, only select XSTATE components are zeroed, most components * are unchanged. Currently, the only components that are zeroed and - * supported by KVM are MPX related. + * supported by KVM are MPX and CET related.
*/ xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) & - (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR | + XFEATURE_MASK_CET_ALL); if (!xfeatures_mask) return; From 1f6f68fcfe43cf26fc0a98fe14e0454cc5c75416 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:33 -0700 Subject: [PATCH 112/142] KVM: x86: Disable support for Shadow Stacks if TDP is disabled Make TDP a hard requirement for Shadow Stacks, as there are no plans to add Shadow Stack support to the Shadow MMU. E.g. KVM hasn't been taught to understand the magic Writable=0,Dirty=1 combination that is required for Shadow Stack accesses, and so enabling Shadow Stacks when using shadow paging will put the guest into an infinite #PF loop (KVM thinks the shadow page tables have a valid mapping, hardware says otherwise). Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-27-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 32fde9e80c28..499c86bd457e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -955,6 +955,14 @@ void kvm_set_cpu_caps(void) if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) kvm_cpu_cap_clear(X86_FEATURE_PKU); + /* + * Shadow Stacks aren't implemented in the Shadow MMU. Shadow Stack + * accesses require "magic" Writable=0,Dirty=1 protection, which KVM + * doesn't know how to emulate or map. + */ + if (!tdp_enabled) + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_init(CPUID_7_EDX, F(AVX512_4VNNIW), F(AVX512_4FMAPS), From f705de12a22c945f55d51baec4ea495aedc5ebd7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Sep 2025 11:47:43 -0700 Subject: [PATCH 113/142] KVM: x86: Initialize allow_smaller_maxphyaddr earlier in setup Initialize allow_smaller_maxphyaddr during hardware setup as soon as KVM knows whether or not TDP will be utilized. To avoid having to teach KVM's emulator all about CET, KVM's upcoming CET virtualization support will be mutually exclusive with allow_smaller_maxphyaddr, i.e. will disable SHSTK and IBT if allow_smaller_maxphyaddr is enabled. In general, allow_smaller_maxphyaddr should be initialized as soon as possible since it's globally visible while its only input is whether or not EPT/NPT is enabled. I.e. there's effectively zero risk of setting allow_smaller_maxphyaddr too early, and substantial risk of setting it too late. Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250922184743.1745778-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 30 +++++++++++++++--------------- arch/x86/kvm/vmx/vmx.c | 16 ++++++++-------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d092102fe1cb..d20e5917b0fe 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5370,6 +5370,21 @@ static __init int svm_hardware_setup(void) get_npt_level(), PG_LEVEL_1G); pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. 
+ * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5449,21 +5464,6 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); - /* - * It seems that on AMD processors PTE's accessed bit is - * being set by the CPU hardware before the NPF vmexit. - * This is not expected behaviour and our tests fail because - * of it. - * A workaround here is to disable support for - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. - * In this case userspace can know if there is support using - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle - * it - * If future AMD CPU models change the behaviour described above, - * this variable can be changed accordingly - */ - allow_smaller_maxphyaddr = !npt_enabled; - kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4ab066a3c22f..4d775983506e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8436,6 +8436,14 @@ __init int vmx_hardware_setup(void) return -EOPNOTSUPP; } + /* + * Shadow paging doesn't have a (further) performance penalty + * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it + * by default + */ + if (!enable_ept) + allow_smaller_maxphyaddr = true; + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; @@ -8665,14 +8673,6 @@ int __init vmx_init(void) vmx_check_vmcs12_offsets(); - /* - * Shadow paging doesn't have a (further) performance penalty - * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it - * by default - */ - if (!enable_ept) - allow_smaller_maxphyaddr = true; - return 0; err_l1d_flush: From 343acdd158a55dab1cfb0d209cd1d70b0cabb8b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:34 -0700 Subject: [PATCH 114/142] KVM: x86: Disable support for IBT and SHSTK if allow_smaller_maxphyaddr is true Make IBT and SHSTK virtualization mutually exclusive with "officially" supporting setups with guest.MAXPHYADDR < host.MAXPHYADDR, i.e. if the allow_smaller_maxphyaddr module param is set. Running a guest with a smaller MAXPHYADDR requires intercepting #PF, and can also trigger emulation of arbitrary instructions. Intercepting and reacting to #PFs doesn't play nice with SHSTK, as KVM's MMU hasn't been taught to handle Shadow Stack accesses, and emulating arbitrary instructions doesn't play nice with IBT or SHSTK, as KVM's emulator doesn't handle the various side effects, e.g. doesn't enforce end-branch markers or model Shadow Stack updates. Note, hiding IBT and SHSTK based solely on allow_smaller_maxphyaddr is overkill, as allow_smaller_maxphyaddr is only problematic if the guest is actually configured to have a smaller MAXPHYADDR. However, KVM's ABI doesn't provide a way to express that IBT and SHSTK may break if enabled in conjunction with guest.MAXPHYADDR < host.MAXPHYADDR. I.e. the alternative is to do nothing in KVM and instead update documentation and hope KVM users are thorough readers. Go with the conservative-but-correct approach; worst case scenario, this restriction can be dropped if there's a strong use case for enabling CET on hosts with allow_smaller_maxphyaddr. 
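For reference, a recap of when the module param ends up being set (taken verbatim from the previous patch's hunks): on VMX, shadow paging sets it, and on SVM it directly tracks NPT:

	/* VMX (vmx_hardware_setup()): */
	if (!enable_ept)
		allow_smaller_maxphyaddr = true;

	/* SVM (svm_hardware_setup()): */
	allow_smaller_maxphyaddr = !npt_enabled;

I.e. with TDP enabled, allow_smaller_maxphyaddr stays false and IBT/SHSTK remain advertised.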
Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-28-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 499c86bd457e..b0731304bd79 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -982,6 +982,16 @@ void kvm_set_cpu_caps(void) F(FLUSH_L1D), ); + /* + * Disable support for IBT and SHSTK if KVM is configured to emulate + * accesses to reserved GPAs, as KVM's emulator doesn't support IBT or + * SHSTK, nor does KVM handle Shadow Stack #PFs (see above). + */ + if (allow_smaller_maxphyaddr) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + } + if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) && boot_cpu_has(X86_FEATURE_AMD_IBPB) && boot_cpu_has(X86_FEATURE_AMD_IBRS)) From e140467bbdafff84f1f346a7c59f404cd9782b82 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:35 -0700 Subject: [PATCH 115/142] KVM: x86: Enable CET virtualization for VMX and advertise to userspace Add support for the LOAD_CET_STATE VM-Enter and VM-Exit controls, the CET XFEATURE bits in XSS, and advertise support for IBT and SHSTK to userspace. Explicitly clear IBT and SHSTK on SVM, as additional work is needed to enable CET on SVM, e.g. to context switch S_CET and other state. Disable KVM's CET support if unrestricted_guest is unsupported/disabled, as KVM does not support emulating CET and running without Unrestricted Guest can result in KVM emulating large swaths of guest code. While it's highly unlikely any guest will trigger emulation while also utilizing IBT or SHSTK, there's zero reason to allow CET without Unrestricted Guest as that combination should only be possible when explicitly disabling unrestricted_guest for testing purposes. Disable CET if VMX_BASIC[bit56] == 0, i.e. if hardware strictly enforces the presence of an Error Code based on exception vector, as attempting to inject a #CP with an Error Code (#CP architecturally has an Error Code) will fail due to the #CP vector historically not having an Error Code. Clear S_CET and SSP-related VMCS fields on "reset" to emulate the architectural behavior of CET MSRs and SSP being reset to 0 after RESET, power-up and INIT. Note, KVM already clears guest CET state that is managed via XSTATE in kvm_xstate_reset().
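To illustrate the VMX_BASIC[bit56] angle, a sketch under stated assumptions (the helper below is hypothetical, not KVM code): without the "no hardware error code consistency check" bit, VM-Entry only allows an error code for the legacy set of vectors, which predates #CP (vector 21):

	/* Hypothetical helper mirroring the VM-Entry consistency check. */
	static bool vm_entry_allows_error_code(u8 vector, bool no_hw_errcode_cc)
	{
		if (no_hw_errcode_cc)	/* VMX_BASIC[56] == 1 */
			return true;

		/* Legacy list: #DF, #TS, #NP, #SS, #GP, #PF, #AC. */
		return vector == 8 || (vector >= 10 && vector <= 14) ||
		       vector == 17;
	}

I.e. injecting #CP (which architecturally has an error code) fails the check on such hardware, hence CET is disabled there.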
Signed-off-by: Yang Weijiang Signed-off-by: Mathias Krause Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: move some bits to separate patches, massage changelog] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-29-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/svm/svm.c | 4 ++++ arch/x86/kvm/vmx/capabilities.h | 5 +++++ arch/x86/kvm/vmx/vmx.c | 30 +++++++++++++++++++++++++++++- arch/x86/kvm/vmx/vmx.h | 6 ++++-- 6 files changed, 45 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index ce10a7e2d3d9..c85c50019523 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -134,6 +134,7 @@ #define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) #define VMX_BASIC_INOUT BIT_ULL(54) #define VMX_BASIC_TRUE_CTLS BIT_ULL(55) +#define VMX_BASIC_NO_HW_ERROR_CODE_CC BIT_ULL(56) static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b0731304bd79..d290dbc96831 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -946,6 +946,7 @@ void kvm_set_cpu_caps(void) VENDOR_F(WAITPKG), F(SGX_LC), F(BUS_LOCK_DETECT), + X86_64_F(SHSTK), ); /* @@ -980,6 +981,7 @@ void kvm_set_cpu_caps(void) F(AMX_INT8), F(AMX_BF16), F(FLUSH_L1D), + F(IBT), ); /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d20e5917b0fe..ff4925a7bf96 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5222,6 +5222,10 @@ static __init void svm_set_cpu_caps(void) kvm_caps.supported_perf_cap = 0; kvm_caps.supported_xss = 0; + /* KVM doesn't yet support CET virtualization for SVM. 
*/ + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { kvm_cpu_cap_set(X86_FEATURE_SVM); diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 59c83888bdc0..02aadb9d730e 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -73,6 +73,11 @@ static inline bool cpu_has_vmx_basic_inout(void) return vmcs_config.basic & VMX_BASIC_INOUT; } +static inline bool cpu_has_vmx_basic_no_hw_errcode_cc(void) +{ + return vmcs_config.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC; +} + static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS && diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4d775983506e..dfa6c5cd2d5c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2602,6 +2602,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE }, }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); @@ -4868,6 +4869,14 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + vmcs_writel(GUEST_SSP, 0); + vmcs_writel(GUEST_INTR_SSP_TABLE, 0); + } + if (kvm_cpu_cap_has(X86_FEATURE_IBT) || + kvm_cpu_cap_has(X86_FEATURE_SHSTK)) + vmcs_writel(GUEST_S_CET, 0); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); @@ -6335,6 +6344,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", + vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP), + vmcs_readl(GUEST_INTR_SSP_TABLE)); pr_err("*** Host State ***\n"); pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); @@ -6365,6 +6378,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); + if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", + vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP), + vmcs_readl(HOST_INTR_SSP_TABLE)); pr_err("*** Control State ***\n"); pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", @@ -7946,7 +7963,6 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7958,6 +7974,18 @@ static __init void vmx_set_cpu_caps(void) if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); + + /* + * Disable CET if unrestricted_guest is unsupported, as KVM doesn't + * enforce CET HW behaviors in the emulator. On platforms with + * VMX_BASIC[bit56] == 0, injecting #CP with an error code at VM-Entry + * fails, so disable CET in that case too.
+ */ + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || + !cpu_has_vmx_basic_no_hw_errcode_cc()) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + } } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 23d6e89b96f2..af8224e074ee 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -484,7 +484,8 @@ static inline u8 vmx_get_rvi(void) VM_ENTRY_LOAD_IA32_EFER | \ VM_ENTRY_LOAD_BNDCFGS | \ VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) + VM_ENTRY_LOAD_IA32_RTIT_CTL | \ + VM_ENTRY_LOAD_CET_STATE) #define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ @@ -506,7 +507,8 @@ static inline u8 vmx_get_rvi(void) VM_EXIT_LOAD_IA32_EFER | \ VM_EXIT_CLEAR_BNDCFGS | \ VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) + VM_EXIT_CLEAR_IA32_RTIT_CTL | \ + VM_EXIT_LOAD_CET_STATE) #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ (PIN_BASED_EXT_INTR_MASK | \ From f7336d47be53d0da1e74143b8befea8c0176f3cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:36 -0700 Subject: [PATCH 116/142] KVM: VMX: Configure nested capabilities after CPU capabilities Swap the order between configuring nested VMX capabilities and base CPU capabilities, so that nested VMX support can be conditioned on core KVM support, e.g. to allow conditioning support for LOAD_CET_STATE on the presence of IBT or SHSTK. Because the sanity checks on nested VMX config performed by vmx_check_processor_compat() run _after_ vmx_hardware_setup(), any use of kvm_cpu_cap_has() when configuring nested VMX support will lead to failures in vmx_check_processor_compat(). While swapping the order of two (or more) configuration flows can lead to a game of whack-a-mole, in this case nested support inarguably should be done after base support. KVM should never condition base support on nested support, because nested support is fully optional, while obviously it's desirable to condition nested support on base support. And there's zero evidence the current ordering was intentional, e.g. commit 66a6950f9995 ("KVM: x86: Introduce kvm_cpu_caps to replace runtime CPUID masking") likely placed the call to kvm_set_cpu_caps() after nested setup because it looked pretty. Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-30-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index dfa6c5cd2d5c..c4b07124689d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8597,6 +8597,13 @@ __init int vmx_hardware_setup(void) setup_default_sgx_lepubkeyhash(); + vmx_set_cpu_caps(); + + /* + * Configure nested capabilities after core CPU capabilities so that + * nested support can be conditional on base support, e.g. so that KVM + * can hide/show features based on kvm_cpu_cap_has(). 
+ */ if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); @@ -8605,8 +8612,6 @@ __init int vmx_hardware_setup(void) return r; } - vmx_set_cpu_caps(); - r = alloc_kvm_area(); if (r && nested) nested_vmx_hardware_unsetup(); From 033cc166f02920b193f7ec989083bce1a20e9abf Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:37 -0700 Subject: [PATCH 117/142] KVM: nVMX: Virtualize NO_HW_ERROR_CODE_CC for L1 event injection to L2 Per the SDM description (Vol.3D, Appendix A.1): "If bit 56 is read as 1, software can use VM entry to deliver a hardware exception with or without an error code, regardless of vector" Modify the has_error_code check performed before injecting events into the nested guest: continue to reject an error code when the guest is in real mode or the exception is not a hardware exception, and enforce the vector-based consistency check only when the platform doesn't enumerate bit 56 in VMX_BASIC; in all other cases, ignore the check to keep the logic consistent with the SDM. Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-31-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 27 ++++++++++++++++++--------- arch/x86/kvm/vmx/nested.h | 5 +++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 846c07380eac..b644f4599f70 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1272,9 +1272,10 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) { const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | VMX_BASIC_INOUT | - VMX_BASIC_TRUE_CTLS; + VMX_BASIC_TRUE_CTLS | + VMX_BASIC_NO_HW_ERROR_CODE_CC; - const u64 reserved_bits = GENMASK_ULL(63, 56) | + const u64 reserved_bits = GENMASK_ULL(63, 57) | GENMASK_ULL(47, 45) | BIT_ULL(31); @@ -2949,7 +2950,6 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, u8 vector = intr_info & INTR_INFO_VECTOR_MASK; u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; - bool should_have_error_code; bool urg = nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST); bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; @@ -2966,12 +2966,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) return -EINVAL; - /* VM-entry interruption-info field: deliver error code */ - should_have_error_code = - intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && - x86_exception_has_error_code(vector); - if (CC(has_error_code != should_have_error_code)) - return -EINVAL; + /* + * Cannot deliver an error code in real mode or if the interrupt + * type is not hardware exception. For all other cases, do the + * consistency check only if the vCPU doesn't enumerate + * VMX_BASIC_NO_HW_ERROR_CODE_CC.
+ */ + if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { + if (CC(has_error_code)) + return -EINVAL; + } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { + if (CC(has_error_code != x86_exception_has_error_code(vector))) + return -EINVAL; + } /* VM-entry exception error code */ if (CC(has_error_code && @@ -7217,6 +7224,8 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; + if (cpu_has_vmx_basic_no_hw_errcode_cc()) + msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; } static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 6eedcfc91070..983484d42ebf 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -309,6 +309,11 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) __kvm_is_valid_cr4(vcpu, val); } +static inline bool nested_cpu_has_no_hw_errcode_cc(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC; +} + /* No difference in the restrictions on guest and host CR4 in VMX operation. */ #define nested_guest_cr4_valid nested_cr4_valid #define nested_host_cr4_valid nested_cr4_valid From 625884996bff1b25a39834fa4935d695d71ef1b1 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:38 -0700 Subject: [PATCH 118/142] KVM: nVMX: Prepare for enabling CET support for nested guest Set up CET MSRs, related VM_ENTRY/EXIT control bits and fixed CR4 settings to enable CET for nested VMs. vmcs12 and vmcs02 need to be synced when L2 exits to L1 or when L1 wants to resume L2, so that the correct CET state can be observed by each. Please note that consistency checks regarding CET state during VM-Entry will be added later to prevent this patch from becoming too large. Advertising the new CET VM_ENTRY/EXIT control bits is also deferred until after the consistency checks are added.
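As a rough sketch of the sync directions (using the helpers added in the diff below):

	/* VM-Enter: vmcs02 takes vmcs12's guest CET state when L1 loads it. */
	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
		vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet,
				     vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);

	/* VM-Exit: vmcs12 snapshots L2's live CET state for L1 to observe. */
	vmcs_read_cet_state(vcpu, &vmcs12->guest_s_cet,
			    &vmcs12->guest_ssp, &vmcs12->guest_ssp_tbl);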
Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Xin Li (Intel) Tested-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250919223258.1604852-32-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 77 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmcs12.c | 6 +++ arch/x86/kvm/vmx/vmcs12.h | 14 ++++++- arch/x86/kvm/vmx/vmx.c | 2 + arch/x86/kvm/vmx/vmx.h | 3 ++ 5 files changed, 101 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b644f4599f70..11e5d3569933 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -721,6 +721,24 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_MPERF, MSR_TYPE_R); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_U_CET, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_S_CET, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL0_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL1_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL2_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL3_SSP, MSR_TYPE_RW); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; @@ -2521,6 +2539,32 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 } } +static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, + u64 *ssp, u64 *ssp_tbl) +{ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + *s_cet = vmcs_readl(GUEST_S_CET); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + *ssp = vmcs_readl(GUEST_SSP); + *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); + } +} + +static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, + u64 ssp, u64 ssp_tbl) +{ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + vmcs_writel(GUEST_S_CET, s_cet); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + vmcs_writel(GUEST_SSP, ssp); + vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); + } +} + static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); @@ -2637,6 +2681,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) + vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet, + vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); + set_cr4_guest_host_mask(vmx); } @@ -2676,6 +2724,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); } + + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) + vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, + vmx->nested.pre_vmenter_ssp, + vmx->nested.pre_vmenter_ssp_tbl); + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & 
VM_ENTRY_LOAD_BNDCFGS))) vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); @@ -3551,6 +3606,12 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) + vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, + &vmx->nested.pre_vmenter_ssp, + &vmx->nested.pre_vmenter_ssp_tbl); + /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* * nested early checks are disabled. In the event of a "late" VM-Fail, @@ -4634,6 +4695,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; + + vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, + &vmcs12->guest_ssp, + &vmcs12->guest_ssp_tbl); } /* @@ -4759,6 +4824,18 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) vmcs_write64(GUEST_BNDCFGS, 0); + /* + * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set; + * otherwise, CET state should be retained across VM-exit, i.e., + * guest values should be propagated from vmcs12 to vmcs01. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) + vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, + vmcs12->host_ssp_tbl); + else + vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, + vmcs12->guest_ssp_tbl); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 106a72c923ca..4233b5ca9461 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -139,6 +139,9 @@ const unsigned short vmcs12_field_offsets[] = { FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), + FIELD(GUEST_S_CET, guest_s_cet), + FIELD(GUEST_SSP, guest_ssp), + FIELD(GUEST_INTR_SSP_TABLE, guest_ssp_tbl), FIELD(HOST_CR0, host_cr0), FIELD(HOST_CR3, host_cr3), FIELD(HOST_CR4, host_cr4), @@ -151,5 +154,8 @@ const unsigned short vmcs12_field_offsets[] = { FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), + FIELD(HOST_S_CET, host_s_cet), + FIELD(HOST_SSP, host_ssp), + FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl), }; const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 56fd150a6f24..4ad6b16525b9 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -117,7 +117,13 @@ struct __packed vmcs12 { natural_width host_ia32_sysenter_eip; natural_width host_rsp; natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ + natural_width host_s_cet; + natural_width host_ssp; + natural_width host_ssp_tbl; + natural_width guest_s_cet; + natural_width guest_ssp; + natural_width guest_ssp_tbl; + natural_width paddingl[2]; /* room for future expansion */ u32 pin_based_vm_exec_control; u32 cpu_based_vm_exec_control; u32 exception_bitmap; @@ -294,6 +300,12 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(host_ia32_sysenter_eip, 656); CHECK_OFFSET(host_rsp, 664);
CHECK_OFFSET(host_rip, 672); + CHECK_OFFSET(host_s_cet, 680); + CHECK_OFFSET(host_ssp, 688); + CHECK_OFFSET(host_ssp_tbl, 696); + CHECK_OFFSET(guest_s_cet, 704); + CHECK_OFFSET(guest_ssp, 712); + CHECK_OFFSET(guest_ssp_tbl, 720); CHECK_OFFSET(pin_based_vm_exec_control, 744); CHECK_OFFSET(cpu_based_vm_exec_control, 748); CHECK_OFFSET(exception_bitmap, 752); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c4b07124689d..ad5981ff7097 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7735,6 +7735,8 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); + cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK)); + cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT)); entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index af8224e074ee..ea93121029f9 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -181,6 +181,9 @@ struct nested_vmx { */ u64 pre_vmenter_debugctl; u64 pre_vmenter_bndcfgs; + u64 pre_vmenter_s_cet; + u64 pre_vmenter_ssp; + u64 pre_vmenter_ssp_tbl; /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold; From 8060b2bd2dd05a19ad7ec248489d374f2bd2b057 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Sep 2025 15:32:39 -0700 Subject: [PATCH 119/142] KVM: nVMX: Add consistency checks for CR0.WP and CR4.CET Add consistency checks for CR4.CET and CR0.WP in guest-state or host-state area in the VMCS12. This ensures that configurations with CR4.CET set and CR0.WP not set result in VM-entry failure, aligning with architectural behavior. Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-33-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 11e5d3569933..51c50ce9e011 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3110,6 +3110,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; + if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) + return -EINVAL; + if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; @@ -3224,6 +3227,9 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; + if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) + return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) From 62f7533a6b3aad9d39c6098dbc3d8e7daeda9399 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Sep 2025 15:32:40 -0700 Subject: [PATCH 120/142] KVM: nVMX: Add consistency checks for CET states Introduce consistency checks for CET states during nested VM-entry. 
A VMCS contains both guest and host CET states, each comprising the
IA32_S_CET MSR, SSP, and IA32_INTERRUPT_SSP_TABLE_ADDR MSR. Various
checks are applied to CET states during VM-entry as documented in SDM
Vol3 Chapter "VM ENTRIES". Implement all these checks during nested
VM-entry to emulate the architectural behavior.

In summary, there are three kinds of checks on guest/host CET states
during VM-entry:

A. Checks applied to both guest states and host states:

 * The IA32_S_CET field must not set any reserved bits; bits 10
   (SUPPRESS) and 11 (TRACKER) cannot both be set.
 * SSP should not have bits 1:0 set.
 * The IA32_INTERRUPT_SSP_TABLE_ADDR field must be canonical.

B. Checks applied to host states only:

 * IA32_S_CET MSR and SSP must be canonical if the CPU enters 64-bit
   mode after VM-exit. Otherwise, IA32_S_CET and SSP must have their
   higher 32 bits cleared.

C. Checks applied to guest states only:

 * IA32_S_CET MSR and SSP are not required to be canonical, i.e. bits
   63:N-1 need not be identical (where N is the CPU's maximum
   linear-address width), but bits 63:N of SSP must be identical.

Tested-by: Mathias Krause
Tested-by: John Allen
Tested-by: Rick Edgecombe
Signed-off-by: Chao Gao
Reviewed-by: Binbin Wu
Link: https://lore.kernel.org/r/20250919223258.1604852-34-seanjc@google.com
[sean: have common helper return 0/-EINVAL, not true/false]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/nested.c | 48 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 51c50ce9e011..a7bd6fc95057 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3100,6 +3100,16 @@ static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
 	return !__is_canonical_address(la, l1_address_bits_on_exit);
 }
 
+static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet,
+					     u64 ssp, u64 ssp_tbl)
+{
+	if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) ||
+	    CC(is_noncanonical_msr_address(ssp_tbl, vcpu)))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 				       struct vmcs12 *vmcs12)
 {
@@ -3169,6 +3179,27 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 	}
 
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) {
+		if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet,
+						      vmcs12->host_ssp,
+						      vmcs12->host_ssp_tbl))
+			return -EINVAL;
+
+		/*
+		 * IA32_S_CET and SSP must be canonical if the host will
+		 * enter 64-bit mode after VM-exit; otherwise, higher
+		 * 32-bits must be all 0s.
+		 */
+		if (ia32e) {
+			if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) ||
+			    CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu)))
+				return -EINVAL;
+		} else {
+			if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32))
+				return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -3279,6 +3310,23 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
 		return -EINVAL;
 
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) {
+		if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet,
+						      vmcs12->guest_ssp,
+						      vmcs12->guest_ssp_tbl))
+			return -EINVAL;
+
+		/*
+		 * Guest SSP must have 63:N bits identical, rather than
+		 * be canonical (i.e., 63:N-1 bits identical), where N is
+		 * the CPU's maximum linear-address width.  Similar to
+		 * is_noncanonical_msr_address(), use the host's
+		 * linear-address width.
+ */ + if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) + return -EINVAL; + } + if (nested_check_guest_non_reg_state(vmcs12)) return -EINVAL; From 42ae6448531b5106e9f29794992856e94a52b5cb Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Sep 2025 15:32:41 -0700 Subject: [PATCH 121/142] KVM: nVMX: Advertise new VM-Entry/Exit control bits for CET state Advertise the LOAD_CET_STATE VM-Entry/Exit control bits in the nested VMX MSRS, as all nested support for CET virtualization, including consistency checks, is in place. Advertise support if and only if KVM supports at least one of IBT or SHSTK. While it's userspace's responsibility to provide a consistent CPU model to the guest, that doesn't mean KVM should set userspace up to fail. Note, the existing {CLEAR,LOAD}_BNDCFGS behavior predates KVM_X86_QUIRK_STUFF_FEATURE_MSRS, i.e. KVM "solved" the inconsistent CPU model problem by overwriting the VMX MSRs provided by userspace. Signed-off-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-35-seanjc@google.com Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a7bd6fc95057..76271962cb70 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7179,13 +7179,17 @@ static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; + /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; } @@ -7201,11 +7205,16 @@ static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_LOAD_CET_STATE; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; + /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; } From 48b2ec0d540c29cebb5119dd2b8e8e7369bc409c Mon Sep 17 00:00:00 2001 From: John Allen Date: Fri, 19 Sep 2025 15:32:42 -0700 Subject: [PATCH 122/142] KVM: SVM: Emulate reads and writes to shadow stack MSRs Emulate shadow stack MSR access by reading and writing to the corresponding fields in the VMCB. 
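For reference, the MSR-to-VMCB mapping implemented below (a summary of the
diff, not additional behavior) is:

	MSR_IA32_S_CET             <->  vmcb->save.s_cet
	MSR_IA32_INT_SSP_TAB       <->  vmcb->save.isst_addr
	MSR_KVM_INTERNAL_GUEST_SSP <->  vmcb->save.ssp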
Signed-off-by: John Allen [sean: mark VMCB_CET dirty/clean as appropriate] Link: https://lore.kernel.org/r/20250919223258.1604852-36-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 21 +++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 3 ++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ff4925a7bf96..b977ea4a46e7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2767,6 +2767,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; + case MSR_IA32_S_CET: + msr_info->data = svm->vmcb->save.s_cet; + break; + case MSR_IA32_INT_SSP_TAB: + msr_info->data = svm->vmcb->save.isst_addr; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + msr_info->data = svm->vmcb->save.ssp; + break; case MSR_TSC_AUX: msr_info->data = svm->tsc_aux; break; @@ -2999,6 +3008,18 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb01.ptr->save.sysenter_esp = (u32)data; svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; + case MSR_IA32_S_CET: + svm->vmcb->save.s_cet = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; + case MSR_IA32_INT_SSP_TAB: + svm->vmcb->save.isst_addr = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + svm->vmcb->save.ssp = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; case MSR_TSC_AUX: /* * TSC_AUX is always virtualized for SEV-ES guests when the diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 10d5cbc259e1..37de6a4a2665 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -74,6 +74,7 @@ enum { * AVIC PHYSICAL_TABLE pointer, * AVIC LOGICAL_TABLE pointer */ + VMCB_CET, /* S_CET, SSP, ISST_ADDR */ VMCB_SW = 31, /* Reserved for hypervisor/software use */ }; @@ -82,7 +83,7 @@ enum { (1U << VMCB_ASID) | (1U << VMCB_INTR) | \ (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \ (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \ - (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \ + (1U << VMCB_LBR) | (1U << VMCB_AVIC) | (1U << VMCB_CET) | \ (1U << VMCB_SW)) /* TPR and CR2 are always written before VMRUN */ From c5ba49458513bd1ecd669d4ec7124e788b19347c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:43 -0700 Subject: [PATCH 123/142] KVM: nSVM: Save/load CET Shadow Stack state to/from vmcb12/vmcb02 Transfer the three CET Shadow Stack VMCB fields (S_CET, ISST_ADDR, and SSP) on VMRUN, #VMEXIT, and loading nested state (saving nested state simply copies the entire save area). SVM doesn't provide a way to disallow L1 from enabling Shadow Stacks for L2, i.e. KVM *must* provide nested support before advertising SHSTK to userspace. 
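In short, the transfer directions implemented below (a summary of the diff,
not new behavior) are:

	nested VMRUN:          vmcb12->save.{s_cet,isst_addr,ssp} -> vmcb02
	nested #VMEXIT:        vmcb02->save.{s_cet,isst_addr,ssp} -> vmcb12
	load of nested state:  from_save -> to_save (svm_copy_vmrun_state())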
Link: https://lore.kernel.org/r/20250919223258.1604852-37-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 826473f2d7c7..a6443feab252 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -636,6 +636,14 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DT); } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) { + vmcb02->save.s_cet = vmcb12->save.s_cet; + vmcb02->save.isst_addr = vmcb12->save.isst_addr; + vmcb02->save.ssp = vmcb12->save.ssp; + vmcb_mark_dirty(vmcb02, VMCB_CET); + } + kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); svm_set_efer(vcpu, svm->nested.save.efer); @@ -1044,6 +1052,12 @@ void svm_copy_vmrun_state(struct vmcb_save_area *to_save, to_save->rsp = from_save->rsp; to_save->rip = from_save->rip; to_save->cpl = 0; + + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + to_save->s_cet = from_save->s_cet; + to_save->isst_addr = from_save->isst_addr; + to_save->ssp = from_save->ssp; + } } void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) @@ -1111,6 +1125,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->save.dr6 = svm->vcpu.arch.dr6; vmcb12->save.cpl = vmcb02->save.cpl; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + vmcb12->save.s_cet = vmcb02->save.s_cet; + vmcb12->save.isst_addr = vmcb02->save.isst_addr; + vmcb12->save.ssp = vmcb02->save.ssp; + } + vmcb12->control.int_state = vmcb02->control.int_state; vmcb12->control.exit_code = vmcb02->control.exit_code; vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; From c7586aa3bed4969cc7345940e5a331efd614f41d Mon Sep 17 00:00:00 2001 From: John Allen Date: Fri, 19 Sep 2025 15:32:44 -0700 Subject: [PATCH 124/142] KVM: SVM: Update dump_vmcb with shadow stack save area additions Add shadow stack VMCB fields to dump_vmcb. PL0_SSP, PL1_SSP, PL2_SSP, PL3_SSP, and U_CET are part of the SEV-ES save area and are encrypted, but can be decrypted and dumped if the guest policy allows debugging. 
Reviewed-by: Maxim Levitsky Signed-off-by: John Allen Link: https://lore.kernel.org/r/20250919223258.1604852-38-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b977ea4a46e7..bea8bfea74f9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3410,6 +3410,10 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "rip:", save->rip, "rflags:", save->rflags); pr_err("%-15s %016llx %-13s %016llx\n", "rsp:", save->rsp, "rax:", save->rax); + pr_err("%-15s %016llx %-13s %016llx\n", + "s_cet:", save->s_cet, "ssp:", save->ssp); + pr_err("%-15s %016llx\n", + "isst_addr:", save->isst_addr); pr_err("%-15s %016llx %-13s %016llx\n", "star:", save01->star, "lstar:", save01->lstar); pr_err("%-15s %016llx %-13s %016llx\n", @@ -3434,6 +3438,13 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx\n", "sev_features", vmsa->sev_features); + pr_err("%-15s %016llx %-13s %016llx\n", + "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp); + pr_err("%-15s %016llx %-13s %016llx\n", + "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp); + pr_err("%-15s %016llx\n", + "u_cet:", vmsa->u_cet); + pr_err("%-15s %016llx %-13s %016llx\n", "rax:", vmsa->rax, "rbx:", vmsa->rbx); pr_err("%-15s %016llx %-13s %016llx\n", From 38c46bdbf99812a07a8a7ef48718f0a03acd8a8a Mon Sep 17 00:00:00 2001 From: John Allen Date: Fri, 19 Sep 2025 15:32:45 -0700 Subject: [PATCH 125/142] KVM: SVM: Pass through shadow stack MSRs as appropriate Pass through XSAVE managed CET MSRs on SVM when KVM supports shadow stack. These cannot be intercepted without also intercepting XSAVE which would likely cause unacceptable performance overhead. MSR_IA32_INT_SSP_TAB is not managed by XSAVE, so it is intercepted. Reviewed-by: Chao Gao Signed-off-by: John Allen Link: https://lore.kernel.org/r/20250919223258.1604852-39-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bea8bfea74f9..1c1792f6a65c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -844,6 +844,17 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); } + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + + svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled); + } + if (sev_es_guest(vcpu->kvm)) sev_es_recalc_msr_intercepts(vcpu); From b5fa221f7b08ca145b0357b77b90dcc998278967 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:46 -0700 Subject: [PATCH 126/142] KVM: SEV: Synchronize MSR_IA32_XSS from the GHCB when it's valid Synchronize XSS from the GHCB to KVM's internal tracking if the guest marks XSS as valid on a #VMGEXIT. Like XCR0, KVM needs an up-to-date copy of XSS in order to compute the required XSTATE size when emulating CPUID.0xD.0x1 for the guest. 
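Roughly how a stale XSS breaks CPUID emulation (a conceptual sketch, not
this patch's code; it mirrors KVM's existing CPUID.0xD.0x1 handling, where
xstate_required_size() is a KVM-internal helper in arch/x86/kvm/cpuid.c):

	/*
	 * CPUID.0xD.0x1:EBX reports the XSAVE area size for the features
	 * enabled in XCR0 | XSS, so a stale vcpu->arch.ia32_xss yields a
	 * stale size.
	 */
	u64 xfeatures = vcpu->arch.xcr0 | vcpu->arch.ia32_xss;
	entry->ebx = xstate_required_size(xfeatures, true /* compacted */);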
Treat the incoming XSS change as an emulated write, i.e. validate the
guest-provided value, to avoid letting the guest load garbage into KVM's
tracking. Simply ignore bad values, as either the guest managed to get an
unsupported value into hardware, or the guest is misbehaving and providing
pure garbage. In either case, KVM can't fix the broken guest.

Explicitly allow access to XSS at all times, as KVM needs to ensure its
copy of XSS stays up-to-date. E.g. KVM supports migration of SEV-ES guests
and so needs to allow the host to save/restore XSS, otherwise a guest that
*knows* its XSS hasn't changed could get stale/bad CPUID emulation if the
guest doesn't provide XSS in the GHCB on every exit. This creates a
hypothetical problem where a guest could request emulation of RDMSR or
WRMSR on XSS, but arguably that's not even a problem, e.g. it would be
entirely reasonable for a guest to request "emulation" as a way to inform
the hypervisor that its XSS value has been modified.

Note, emulating the change as an MSR write also takes care of side
effects, e.g. marking dynamic CPUID bits as dirty.

Suggested-by: John Allen
base-commit: 14298d819d5a6b7180a4089e7d2121ca3551dc6c
Link: https://lore.kernel.org/r/20250919223258.1604852-40-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/sev.c | 3 +++
 arch/x86/kvm/svm/svm.c | 4 ++--
 arch/x86/kvm/svm/svm.h | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index d2ffacd43234..7971e2a4ddd5 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3310,6 +3310,9 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
 	if (kvm_ghcb_xcr0_is_valid(svm))
 		__kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm));
 
+	if (kvm_ghcb_xss_is_valid(svm))
+		__kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm));
+
 	/* Copy the GHCB exit information into the VMCB fields */
 	exit_code = kvm_ghcb_get_sw_exit_code(svm);
 	control->exit_code = lower_32_bits(exit_code);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 1c1792f6a65c..8ca6ef4b35c1 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2721,8 +2721,8 @@ static int svm_get_feature_msr(u32 msr, u64 *data)
 static bool
 sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
-	return sev_es_guest(vcpu->kvm) &&
-	       vcpu->arch.guest_state_protected &&
+	return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected &&
+	       msr_info->index != MSR_IA32_XSS &&
 	       !msr_write_intercepted(vcpu, msr_info->index);
 }
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 37de6a4a2665..99ce9b95f782 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -941,5 +941,6 @@ DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
 DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
 DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
 DEFINE_KVM_GHCB_ACCESSORS(xcr0)
+DEFINE_KVM_GHCB_ACCESSORS(xss)
 
 #endif

From 8db428fd5229ba509d515af10d7f33d7d4a54bfe Mon Sep 17 00:00:00 2001
From: John Allen
Date: Fri, 19 Sep 2025 15:32:47 -0700
Subject: [PATCH 127/142] KVM: SVM: Enable shadow stack virtualization for SVM

Remove the explicit clearing of shadow stack CPU capabilities.
Reviewed-by: Chao Gao Signed-off-by: John Allen Link: https://lore.kernel.org/r/20250919223258.1604852-41-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8ca6ef4b35c1..6068c99bca95 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5263,10 +5263,7 @@ static __init void svm_set_cpu_caps(void) kvm_set_cpu_caps(); kvm_caps.supported_perf_cap = 0; - kvm_caps.supported_xss = 0; - /* KVM doesn't yet support CET virtualization for SVM. */ - kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); /* CPUID 0x80000001 and 0x8000000A (SVM features) */ From d37cc4819a489bfe32008e50e3a76eb967d95d42 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:48 -0700 Subject: [PATCH 128/142] KVM: x86: Add human friendly formatting for #XM, and #VE Add XM_VECTOR and VE_VECTOR pretty-printing for trace_kvm_inj_exception(). Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-42-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 57d79fd31df0..06da19b370c5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -461,8 +461,8 @@ TRACE_EVENT(kvm_inj_virq, #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ - EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(AC), EXS(MC) + EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), EXS(MF), \ + EXS(AC), EXS(MC), EXS(XM), EXS(VE) /* * Tracepoint for kvm interrupt injection: From f2f5519aa4e3ec4e42b009338f773bebb0bfd8a3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:49 -0700 Subject: [PATCH 129/142] KVM: x86: Define Control Protection Exception (#CP) vector Add a CP_VECTOR definition for CET's Control Protection Exception (#CP), along with human friendly formatting for trace_kvm_inj_exception(). 
Reviewed-by: Binbin Wu
Link: https://lore.kernel.org/r/20250919223258.1604852-43-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/include/uapi/asm/kvm.h | 1 +
 arch/x86/kvm/trace.h            | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 467116186e71..73e0e88a0a54 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -35,6 +35,7 @@
 #define MC_VECTOR 18
 #define XM_VECTOR 19
 #define VE_VECTOR 20
+#define CP_VECTOR 21
 
 /* Select x86 specific features in <linux/kvm.h> */
 #define __KVM_HAVE_PIT
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 06da19b370c5..322913dda626 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -462,7 +462,7 @@ TRACE_EVENT(kvm_inj_virq,
 #define kvm_trace_sym_exc \
 	EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
 	EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), EXS(MF), \
-	EXS(AC), EXS(MC), EXS(XM), EXS(VE)
+	EXS(AC), EXS(MC), EXS(XM), EXS(VE), EXS(CP)
 
 /*
  * Tracepoint for kvm interrupt injection:

From fddd07626baa419c259ad5f3537a57188b5bb415 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 15:32:50 -0700
Subject: [PATCH 130/142] KVM: x86: Define AMD's #HV, #VC, and #SX exception
 vectors

Add {HV,VC,SX}_VECTOR definitions for AMD's Hypervisor Injection
Exception, VMM Communication Exception, and SVM Security Exception
vectors, along with human friendly formatting for
trace_kvm_inj_exception().

Note, KVM is all but guaranteed to never observe or inject #SX, and #HV
is also likely to go unused. Add the architectural collateral mostly for
completeness, and on the off chance that hardware goes off the rails.

Link: https://lore.kernel.org/r/20250919223258.1604852-44-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/include/uapi/asm/kvm.h | 4 ++++
 arch/x86/kvm/trace.h            | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 73e0e88a0a54..d420c9c066d4 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -37,6 +37,10 @@
 #define VE_VECTOR 20
 #define CP_VECTOR 21
 
+#define HV_VECTOR 28
+#define VC_VECTOR 29
+#define SX_VECTOR 30
+
 /* Select x86 specific features in <linux/kvm.h> */
 #define __KVM_HAVE_PIT
 #define __KVM_HAVE_IOAPIC
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 322913dda626..e79bc9cb7162 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -462,7 +462,8 @@ TRACE_EVENT(kvm_inj_virq,
 #define kvm_trace_sym_exc \
 	EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
 	EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), EXS(MF), \
-	EXS(AC), EXS(MC), EXS(XM), EXS(VE), EXS(CP)
+	EXS(AC), EXS(MC), EXS(XM), EXS(VE), EXS(CP), \
+	EXS(HV), EXS(VC), EXS(SX)
 
 /*
  * Tracepoint for kvm interrupt injection:

From 9c38ddb3df94acdc86485c7073610f01655309f9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 15:32:52 -0700
Subject: [PATCH 131/142] KVM: selftests: Add an MSR test to exercise
 guest/host and read/write

Add a selftest to verify reads and writes to various MSRs, from both the
guest and host, and expect success/failure based on whether or not the
vCPU supports the MSR according to supported CPUID.
Note, this test is extremely similar to KVM-Unit-Test's "msr" test, but
provides more coverage with respect to host accesses, and will be
extended to provide additional testing of CPUID-based features,
save/restore lists, and KVM_{G,S}ET_ONE_REG, all of which are extremely
difficult to validate in KUT.

If kvm.ignore_msrs=true, skip the unsupported and reserved testcases as
KVM's ABI is a mess; what exactly is supposed to be ignored, and when,
varies wildly.

Reviewed-by: Chao Gao
Link: https://lore.kernel.org/r/20250919223258.1604852-46-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../selftests/kvm/include/x86/processor.h     |   5 +
 tools/testing/selftests/kvm/x86/msrs_test.c   | 322 ++++++++++++++++++
 3 files changed, 328 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86/msrs_test.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index f6fe7a07a0a2..e6e410b938b9 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -87,6 +87,7 @@ TEST_GEN_PROGS_x86 += x86/kvm_clock_test
 TEST_GEN_PROGS_x86 += x86/kvm_pv_test
 TEST_GEN_PROGS_x86 += x86/kvm_buslock_test
 TEST_GEN_PROGS_x86 += x86/monitor_mwait_test
+TEST_GEN_PROGS_x86 += x86/msrs_test
 TEST_GEN_PROGS_x86 += x86/nested_emulation_test
 TEST_GEN_PROGS_x86 += x86/nested_exceptions_test
 TEST_GEN_PROGS_x86 += x86/platform_info_test
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index fbe875eafca5..51cd84b9ca66 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1362,6 +1362,11 @@ static inline bool kvm_is_unrestricted_guest_enabled(void)
 	return get_kvm_intel_param_bool("unrestricted_guest");
 }
 
+static inline bool kvm_is_ignore_msrs(void)
+{
+	return get_kvm_param_bool("ignore_msrs");
+}
+
 uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
 				    int *level);
 uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr);
diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c
new file mode 100644
index 000000000000..345a39030a0a
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/msrs_test.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include
+
+#include
+
+#include "kvm_util.h"
+#include "processor.h"
+
+/* Use HYPERVISOR for MSRs that are emulated unconditionally (as is HYPERVISOR).
*/ +#define X86_FEATURE_NONE X86_FEATURE_HYPERVISOR + +struct kvm_msr { + const struct kvm_x86_cpu_feature feature; + const struct kvm_x86_cpu_feature feature2; + const char *name; + const u64 reset_val; + const u64 write_val; + const u64 rsvd_val; + const u32 index; +}; + +#define ____MSR_TEST(msr, str, val, rsvd, reset, feat, f2) \ +{ \ + .index = msr, \ + .name = str, \ + .write_val = val, \ + .rsvd_val = rsvd, \ + .reset_val = reset, \ + .feature = X86_FEATURE_ ##feat, \ + .feature2 = X86_FEATURE_ ##f2, \ +} + +#define __MSR_TEST(msr, str, val, rsvd, reset, feat) \ + ____MSR_TEST(msr, str, val, rsvd, reset, feat, feat) + +#define MSR_TEST_NON_ZERO(msr, val, rsvd, reset, feat) \ + __MSR_TEST(msr, #msr, val, rsvd, reset, feat) + +#define MSR_TEST(msr, val, rsvd, feat) \ + __MSR_TEST(msr, #msr, val, rsvd, 0, feat) + +#define MSR_TEST2(msr, val, rsvd, feat, f2) \ + ____MSR_TEST(msr, #msr, val, rsvd, 0, feat, f2) + +/* + * Note, use a page aligned value for the canonical value so that the value + * is compatible with MSRs that use bits 11:0 for things other than addresses. + */ +static const u64 canonical_val = 0x123456789000ull; + +/* + * Arbitrary value with bits set in every byte, but not all bits set. This is + * also a non-canonical value, but that's coincidental (any 64-bit value with + * an alternating 0s/1s pattern will be non-canonical). + */ +static const u64 u64_val = 0xaaaa5555aaaa5555ull; + +#define MSR_TEST_CANONICAL(msr, feat) \ + __MSR_TEST(msr, #msr, canonical_val, NONCANONICAL, 0, feat) + +/* + * The main struct must be scoped to a function due to the use of structures to + * define features. For the global structure, allocate enough space for the + * foreseeable future without getting too ridiculous, to minimize maintenance + * costs (bumping the array size every time an MSR is added is really annoying). + */ +static struct kvm_msr msrs[128]; +static int idx; + +static bool ignore_unsupported_msrs; + +static u64 fixup_rdmsr_val(u32 msr, u64 want) +{ + /* + * AMD CPUs drop bits 63:32 on some MSRs that Intel CPUs support. KVM + * is supposed to emulate that behavior based on guest vendor model + * (which is the same as the host vendor model for this test). + */ + if (!host_cpu_is_amd) + return want; + + switch (msr) { + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + case MSR_TSC_AUX: + return want & GENMASK_ULL(31, 0); + default: + return want; + } +} + +static void __rdmsr(u32 msr, u64 want) +{ + u64 val; + u8 vec; + + vec = rdmsr_safe(msr, &val); + __GUEST_ASSERT(!vec, "Unexpected %s on RDMSR(0x%x)", ex_str(vec), msr); + + __GUEST_ASSERT(val == want, "Wanted 0x%lx from RDMSR(0x%x), got 0x%lx", + want, msr, val); +} + +static void __wrmsr(u32 msr, u64 val) +{ + u8 vec; + + vec = wrmsr_safe(msr, val); + __GUEST_ASSERT(!vec, "Unexpected %s on WRMSR(0x%x, 0x%lx)", + ex_str(vec), msr, val); + __rdmsr(msr, fixup_rdmsr_val(msr, val)); +} + +static void guest_test_supported_msr(const struct kvm_msr *msr) +{ + __rdmsr(msr->index, msr->reset_val); + __wrmsr(msr->index, msr->write_val); + GUEST_SYNC(fixup_rdmsr_val(msr->index, msr->write_val)); + + __rdmsr(msr->index, msr->reset_val); +} + +static void guest_test_unsupported_msr(const struct kvm_msr *msr) +{ + u64 val; + u8 vec; + + /* + * KVM's ABI with respect to ignore_msrs is a mess and largely beyond + * repair, just skip the unsupported MSR tests. 
+	 */
+	if (ignore_unsupported_msrs)
+		goto skip_wrmsr_gp;
+
+	if (this_cpu_has(msr->feature2))
+		goto skip_wrmsr_gp;
+
+	vec = rdmsr_safe(msr->index, &val);
+	__GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on RDMSR(0x%x), got %s",
+		       msr->index, ex_str(vec));
+
+	vec = wrmsr_safe(msr->index, msr->write_val);
+	__GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on WRMSR(0x%x, 0x%lx), got %s",
+		       msr->index, msr->write_val, ex_str(vec));
+
+skip_wrmsr_gp:
+	GUEST_SYNC(0);
+}
+
+void guest_test_reserved_val(const struct kvm_msr *msr)
+{
+	/* Skip reserved value checks as well, ignore_msrs is truly a mess. */
+	if (ignore_unsupported_msrs)
+		return;
+
+	/*
+	 * If the CPU will truncate the written value (e.g. SYSENTER on AMD),
+	 * expect success and a truncated value, not #GP.
+	 */
+	if (!this_cpu_has(msr->feature) ||
+	    msr->rsvd_val == fixup_rdmsr_val(msr->index, msr->rsvd_val)) {
+		u8 vec = wrmsr_safe(msr->index, msr->rsvd_val);
+
+		__GUEST_ASSERT(vec == GP_VECTOR,
+			       "Wanted #GP on WRMSR(0x%x, 0x%lx), got %s",
+			       msr->index, msr->rsvd_val, ex_str(vec));
+	} else {
+		__wrmsr(msr->index, msr->rsvd_val);
+		__wrmsr(msr->index, msr->reset_val);
+	}
+}
+
+static void guest_main(void)
+{
+	for (;;) {
+		const struct kvm_msr *msr = &msrs[READ_ONCE(idx)];
+
+		if (this_cpu_has(msr->feature))
+			guest_test_supported_msr(msr);
+		else
+			guest_test_unsupported_msr(msr);
+
+		if (msr->rsvd_val)
+			guest_test_reserved_val(msr);
+
+		GUEST_SYNC(msr->reset_val);
+	}
+}
+
+static void host_test_msr(struct kvm_vcpu *vcpu, u64 guest_val)
+{
+	u64 reset_val = msrs[idx].reset_val;
+	u32 msr = msrs[idx].index;
+	u64 val;
+
+	if (!kvm_cpu_has(msrs[idx].feature))
+		return;
+
+	val = vcpu_get_msr(vcpu, msr);
+	TEST_ASSERT(val == guest_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx",
+		    guest_val, msr, val);
+
+	vcpu_set_msr(vcpu, msr, reset_val);
+
+	val = vcpu_get_msr(vcpu, msr);
+	TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx",
+		    reset_val, msr, val);
+}
+
+static void do_vcpu_run(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	for (;;) {
+		vcpu_run(vcpu);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_SYNC:
+			host_test_msr(vcpu, uc.args[1]);
+			return;
+		case UCALL_PRINTF:
+			pr_info("%s", uc.buffer);
+			break;
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+		case UCALL_DONE:
+			TEST_FAIL("Unexpected UCALL_DONE");
+		default:
+			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+		}
+	}
+}
+
+static void vcpus_run(struct kvm_vcpu **vcpus, const int NR_VCPUS)
+{
+	int i;
+
+	for (i = 0; i < NR_VCPUS; i++)
+		do_vcpu_run(vcpus[i]);
+}
+
+#define MISC_ENABLES_RESET_VAL (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
+
+static void test_msrs(void)
+{
+	const struct kvm_msr __msrs[] = {
+		MSR_TEST_NON_ZERO(MSR_IA32_MISC_ENABLE,
+				  MISC_ENABLES_RESET_VAL | MSR_IA32_MISC_ENABLE_FAST_STRING,
+				  MSR_IA32_MISC_ENABLE_FAST_STRING, MISC_ENABLES_RESET_VAL, NONE),
+		MSR_TEST_NON_ZERO(MSR_IA32_CR_PAT, 0x07070707, 0, 0x7040600070406, NONE),
+
+		/*
+		 * TSC_AUX is supported if RDTSCP *or* RDPID is supported.  Add
+		 * entries for each feature so that TSC_AUX doesn't exist for
+		 * the "unsupported" vCPU, and obviously to test both cases.
+		 */
+		MSR_TEST2(MSR_TSC_AUX, 0x12345678, u64_val, RDTSCP, RDPID),
+		MSR_TEST2(MSR_TSC_AUX, 0x12345678, u64_val, RDPID, RDTSCP),
+
+		MSR_TEST(MSR_IA32_SYSENTER_CS, 0x1234, 0, NONE),
+		/*
+		 * SYSENTER_{ESP,EIP} are technically non-canonical on Intel,
+		 * but KVM doesn't emulate that behavior on emulated writes,
+		 * i.e. this test will observe different behavior if the MSR
+		 * writes are handled by hardware vs. KVM.  KVM's behavior is
+		 * intended (though far from ideal), so don't bother testing
+		 * non-canonical values.
+		 */
+		MSR_TEST(MSR_IA32_SYSENTER_ESP, canonical_val, 0, NONE),
+		MSR_TEST(MSR_IA32_SYSENTER_EIP, canonical_val, 0, NONE),
+
+		MSR_TEST_CANONICAL(MSR_FS_BASE, LM),
+		MSR_TEST_CANONICAL(MSR_GS_BASE, LM),
+		MSR_TEST_CANONICAL(MSR_KERNEL_GS_BASE, LM),
+		MSR_TEST_CANONICAL(MSR_LSTAR, LM),
+		MSR_TEST_CANONICAL(MSR_CSTAR, LM),
+		MSR_TEST(MSR_SYSCALL_MASK, 0xffffffff, 0, LM),
+
+		MSR_TEST_CANONICAL(MSR_IA32_PL0_SSP, SHSTK),
+		MSR_TEST(MSR_IA32_PL0_SSP, canonical_val, canonical_val | 1, SHSTK),
+		MSR_TEST_CANONICAL(MSR_IA32_PL1_SSP, SHSTK),
+		MSR_TEST(MSR_IA32_PL1_SSP, canonical_val, canonical_val | 1, SHSTK),
+		MSR_TEST_CANONICAL(MSR_IA32_PL2_SSP, SHSTK),
+		MSR_TEST(MSR_IA32_PL2_SSP, canonical_val, canonical_val | 1, SHSTK),
+		MSR_TEST_CANONICAL(MSR_IA32_PL3_SSP, SHSTK),
+		MSR_TEST(MSR_IA32_PL3_SSP, canonical_val, canonical_val | 1, SHSTK),
+	};
+
+	/*
+	 * Create two vCPUs, but run them on the same task, to validate KVM's
+	 * context switching of MSR state.  Don't pin the task to a pCPU to
+	 * also validate KVM's handling of cross-pCPU migration.
+	 */
+	const int NR_VCPUS = 2;
+	struct kvm_vcpu *vcpus[NR_VCPUS];
+	struct kvm_vm *vm;
+
+	kvm_static_assert(sizeof(__msrs) <= sizeof(msrs));
+	kvm_static_assert(ARRAY_SIZE(__msrs) <= ARRAY_SIZE(msrs));
+	memcpy(msrs, __msrs, sizeof(__msrs));
+
+	ignore_unsupported_msrs = kvm_is_ignore_msrs();
+
+	vm = vm_create_with_vcpus(NR_VCPUS, guest_main, vcpus);
+
+	sync_global_to_guest(vm, msrs);
+	sync_global_to_guest(vm, ignore_unsupported_msrs);
+
+	for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) {
+		sync_global_to_guest(vm, idx);
+
+		vcpus_run(vcpus, NR_VCPUS);
+		vcpus_run(vcpus, NR_VCPUS);
+	}
+
+	kvm_vm_free(vm);
+}
+
+int main(void)
+{
+	test_msrs();
+}

From 27c41353064f9fdec9276e6b8e618b01110ef282 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 15:32:53 -0700
Subject: [PATCH 132/142] KVM: selftests: Add support for MSR_IA32_{S,U}_CET
 to MSRs test

Extend the MSRs test to support {S,U}_CET, which are a bit of a pain to
handle due to the MSRs existing if IBT *or* SHSTK is supported. To deal
with Intel's wonderful decision to bundle IBT and SHSTK under CET, track
the second feature, but skip only RDMSR #GP tests to avoid false failures
when running on a CPU with only one of IBT or SHSTK (the WRMSR #GP tests
are still valid since the enable bits are per-feature).

Reviewed-by: Chao Gao
Link: https://lore.kernel.org/r/20250919223258.1604852-47-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/x86/msrs_test.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c
index 345a39030a0a..1efaebb986c4 100644
--- a/tools/testing/selftests/kvm/x86/msrs_test.c
+++ b/tools/testing/selftests/kvm/x86/msrs_test.c
@@ -132,13 +132,26 @@ static void guest_test_unsupported_msr(const struct kvm_msr *msr)
 	if (ignore_unsupported_msrs)
 		goto skip_wrmsr_gp;
 
-	if (this_cpu_has(msr->feature2))
-		goto skip_wrmsr_gp;
+	/*
+	 * {S,U}_CET exist if IBT or SHSTK is supported, but with bits that are
+	 * writable only if their associated feature is supported.  Skip the
+	 * RDMSR #GP test if the secondary feature is supported, but perform
+	 * the WRMSR #GP test as the to-be-written value is tied to the primary
+	 * feature.  For all other MSRs, simply do nothing.
+	 */
+	if (this_cpu_has(msr->feature2)) {
+		if (msr->index != MSR_IA32_U_CET &&
+		    msr->index != MSR_IA32_S_CET)
+			goto skip_wrmsr_gp;
+
+		goto skip_rdmsr_gp;
+	}
 
 	vec = rdmsr_safe(msr->index, &val);
 	__GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on RDMSR(0x%x), got %s",
 		       msr->index, ex_str(vec));
 
+skip_rdmsr_gp:
 	vec = wrmsr_safe(msr->index, msr->write_val);
 	__GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on WRMSR(0x%x, 0x%lx), got %s",
 		       msr->index, msr->write_val, ex_str(vec));
@@ -276,6 +289,10 @@ static void test_msrs(void)
 		MSR_TEST_CANONICAL(MSR_CSTAR, LM),
 		MSR_TEST(MSR_SYSCALL_MASK, 0xffffffff, 0, LM),
 
+		MSR_TEST2(MSR_IA32_S_CET, CET_SHSTK_EN, CET_RESERVED, SHSTK, IBT),
+		MSR_TEST2(MSR_IA32_S_CET, CET_ENDBR_EN, CET_RESERVED, IBT, SHSTK),
+		MSR_TEST2(MSR_IA32_U_CET, CET_SHSTK_EN, CET_RESERVED, SHSTK, IBT),
+		MSR_TEST2(MSR_IA32_U_CET, CET_ENDBR_EN, CET_RESERVED, IBT, SHSTK),
 		MSR_TEST_CANONICAL(MSR_IA32_PL0_SSP, SHSTK),
 		MSR_TEST(MSR_IA32_PL0_SSP, canonical_val, canonical_val | 1, SHSTK),
 		MSR_TEST_CANONICAL(MSR_IA32_PL1_SSP, SHSTK),

From a8b9cca99cf454799ef799f8af9bdb55389cfd94 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 15:32:54 -0700
Subject: [PATCH 133/142] KVM: selftests: Extend MSRs test to validate vCPUs
 without supported features

Add a third vCPU to the MSRs test that runs with all features disabled in
the vCPU's CPUID model, to verify that KVM does the right thing with
respect to emulating accesses to MSRs that shouldn't exist. Use the same
VM to verify that KVM is honoring the vCPU model, e.g. isn't looking at
per-VM state when emulating MSR accesses.

Link: https://lore.kernel.org/r/20250919223258.1604852-48-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/x86/msrs_test.c | 28 ++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c
index 1efaebb986c4..ec94c70b3f06 100644
--- a/tools/testing/selftests/kvm/x86/msrs_test.c
+++ b/tools/testing/selftests/kvm/x86/msrs_test.c
@@ -303,12 +303,17 @@ static void test_msrs(void)
 		MSR_TEST(MSR_IA32_PL3_SSP, canonical_val, canonical_val | 1, SHSTK),
 	};
 
+	const struct kvm_x86_cpu_feature feat_none = X86_FEATURE_NONE;
+	const struct kvm_x86_cpu_feature feat_lm = X86_FEATURE_LM;
+
 	/*
-	 * Create two vCPUs, but run them on the same task, to validate KVM's
+	 * Create three vCPUs, but run them on the same task, to validate KVM's
 	 * context switching of MSR state.  Don't pin the task to a pCPU to
-	 * also validate KVM's handling of cross-pCPU migration.
+	 * also validate KVM's handling of cross-pCPU migration.  Use the full
+	 * set of features for the first two vCPUs, but clear all features in
+	 * the third vCPU in order to test both positive and negative paths.
 	 */
-	const int NR_VCPUS = 2;
+	const int NR_VCPUS = 3;
 	struct kvm_vcpu *vcpus[NR_VCPUS];
 	struct kvm_vm *vm;
 
@@ -323,6 +328,23 @@ static void test_msrs(void)
 	sync_global_to_guest(vm, msrs);
 	sync_global_to_guest(vm, ignore_unsupported_msrs);
 
+	/*
+	 * Clear features in the "unsupported features" vCPU.  This needs to be
+	 * done before the first vCPU run as KVM's ABI is that guest CPUID is
+	 * immutable once the vCPU has been run.
+	 */
+	for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) {
+		/*
+		 * Don't clear LM; selftests are 64-bit only, and KVM doesn't
+		 * honor LM=0 for MSRs that are supposed to exist if and only
+		 * if the vCPU is a 64-bit model.  Ditto for NONE; clearing a
+		 * fake feature flag will result in false failures.
+		 */
+		if (memcmp(&msrs[idx].feature, &feat_lm, sizeof(feat_lm)) &&
+		    memcmp(&msrs[idx].feature, &feat_none, sizeof(feat_none)))
+			vcpu_clear_cpuid_feature(vcpus[2], msrs[idx].feature);
+	}
+
 	for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) {
 		sync_global_to_guest(vm, idx);
 

From 80c2b6d8e7bb194744f5fdfc4f0560e053ababda Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 15:32:54 -0700
Subject: [PATCH 134/142] KVM: selftests: Add KVM_{G,S}ET_ONE_REG coverage to
 MSRs test

When KVM_{G,S}ET_ONE_REG are supported, verify that MSRs can be accessed
via ONE_REG and through the dedicated MSR ioctls. For simplicity, run the
test twice, e.g. instead of trying to get MSR values into the exact right
state when switching write methods.

Reviewed-by: Chao Gao
Link: https://lore.kernel.org/r/20250919223258.1604852-49-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/x86/msrs_test.c | 22 ++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c
index ec94c70b3f06..680d381d1bf7 100644
--- a/tools/testing/selftests/kvm/x86/msrs_test.c
+++ b/tools/testing/selftests/kvm/x86/msrs_test.c
@@ -200,6 +200,9 @@ static void guest_main(void)
 	}
 }
 
+static bool has_one_reg;
+static bool use_one_reg;
+
 static void host_test_msr(struct kvm_vcpu *vcpu, u64 guest_val)
 {
 	u64 reset_val = msrs[idx].reset_val;
@@ -213,11 +216,21 @@ static void host_test_msr(struct kvm_vcpu *vcpu, u64 guest_val)
 	TEST_ASSERT(val == guest_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx",
 		    guest_val, msr, val);
 
-	vcpu_set_msr(vcpu, msr, reset_val);
+	if (use_one_reg)
+		vcpu_set_reg(vcpu, KVM_X86_REG_MSR(msr), reset_val);
+	else
+		vcpu_set_msr(vcpu, msr, reset_val);
 
 	val = vcpu_get_msr(vcpu, msr);
 	TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx",
 		    reset_val, msr, val);
+
+	if (!has_one_reg)
+		return;
+
+	val = vcpu_get_reg(vcpu, KVM_X86_REG_MSR(msr));
+	TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_reg(0x%x), got 0x%lx",
+		    reset_val, msr, val);
 }
 
 static void do_vcpu_run(struct kvm_vcpu *vcpu)
@@ -357,5 +370,12 @@ static void test_msrs(void)
 
 int main(void)
 {
+	has_one_reg = kvm_has_cap(KVM_CAP_ONE_REG);
+
 	test_msrs();
+
+	if (has_one_reg) {
+		use_one_reg = true;
+		test_msrs();
+	}
 }

From 3469fd203bac201ad67d9586041689c4c6a87882 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 15:32:55 -0700
Subject: [PATCH 135/142] KVM: selftests: Add coverage for KVM-defined
 registers in MSRs test

Add test coverage for the KVM-defined GUEST_SSP "register" in the MSRs
test. While _KVM's_ goal is to not tie the uAPI of KVM-defined registers
to any particular internal implementation, i.e. to not commit in uAPI to
handling GUEST_SSP as an MSR, treating GUEST_SSP as an MSR for testing
purposes is a-ok and is a natural fit given the semantics of SSP.
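For example (a sketch using the selftest helpers and the KVM_X86_REG_KVM()
encoding added below, not uAPI documentation), host userspace can treat
the KVM-defined register much like an MSR:

	/* Read and write GUEST_SSP via KVM_{G,S}ET_ONE_REG. */
	u64 ssp = vcpu_get_reg(vcpu, KVM_X86_REG_KVM(KVM_REG_GUEST_SSP));
	vcpu_set_reg(vcpu, KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), ssp);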
Reviewed-by: Chao Gao
Link: https://lore.kernel.org/r/20250919223258.1604852-50-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/x86/msrs_test.c | 97 ++++++++++++++++++++-
 1 file changed, 94 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c
index 680d381d1bf7..54ef9c539eb3 100644
--- a/tools/testing/selftests/kvm/x86/msrs_test.c
+++ b/tools/testing/selftests/kvm/x86/msrs_test.c
@@ -17,9 +17,10 @@ struct kvm_msr {
 	const u64 write_val;
 	const u64 rsvd_val;
 	const u32 index;
+	const bool is_kvm_defined;
 };
 
-#define ____MSR_TEST(msr, str, val, rsvd, reset, feat, f2)		\
+#define ____MSR_TEST(msr, str, val, rsvd, reset, feat, f2, is_kvm)	\
 {									\
 	.index = msr,							\
 	.name = str,							\
@@ -28,10 +29,11 @@ struct kvm_msr {
 	.reset_val = reset,						\
 	.feature = X86_FEATURE_ ##feat,					\
 	.feature2 = X86_FEATURE_ ##f2,					\
+	.is_kvm_defined = is_kvm,					\
}
 
 #define __MSR_TEST(msr, str, val, rsvd, reset, feat)			\
-	____MSR_TEST(msr, str, val, rsvd, reset, feat, feat)
+	____MSR_TEST(msr, str, val, rsvd, reset, feat, feat, false)
 
 #define MSR_TEST_NON_ZERO(msr, val, rsvd, reset, feat)			\
 	__MSR_TEST(msr, #msr, val, rsvd, reset, feat)
@@ -40,7 +42,7 @@ struct kvm_msr {
 	__MSR_TEST(msr, #msr, val, rsvd, 0, feat)
 
 #define MSR_TEST2(msr, val, rsvd, feat, f2)				\
-	____MSR_TEST(msr, #msr, val, rsvd, 0, feat, f2)
+	____MSR_TEST(msr, #msr, val, rsvd, 0, feat, f2, false)
 
 /*
  * Note, use a page aligned value for the canonical value so that the value
@@ -58,6 +60,9 @@ static const u64 u64_val = 0xaaaa5555aaaa5555ull;
 #define MSR_TEST_CANONICAL(msr, feat)					\
 	__MSR_TEST(msr, #msr, canonical_val, NONCANONICAL, 0, feat)
 
+#define MSR_TEST_KVM(msr, val, rsvd, feat)				\
+	____MSR_TEST(KVM_REG_ ##msr, #msr, val, rsvd, 0, feat, feat, true)
+
 /*
  * The main struct must be scoped to a function due to the use of structures to
  * define features.  For the global structure, allocate enough space for the
@@ -203,6 +208,83 @@ static void guest_main(void)
 static bool has_one_reg;
 static bool use_one_reg;
 
+#define KVM_X86_MAX_NR_REGS	1
+
+static bool vcpu_has_reg(struct kvm_vcpu *vcpu, u64 reg)
+{
+	struct {
+		struct kvm_reg_list list;
+		u64 regs[KVM_X86_MAX_NR_REGS];
+	} regs = {};
+	int r, i;
+
+	/*
+	 * If KVM_GET_REG_LIST succeeds with n=0, i.e. there are no supported
+	 * regs, then the vCPU obviously doesn't support the reg.
+	 */
+	r = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, &regs.list);
+	if (!r)
+		return false;
+
+	TEST_ASSERT_EQ(errno, E2BIG);
+
+	/*
+	 * KVM x86 is expected to support enumerating a relatively small number
+	 * of regs.  The majority of registers supported by KVM_{G,S}ET_ONE_REG
+	 * are enumerated via other ioctls, e.g. KVM_GET_MSR_INDEX_LIST.  For
+	 * simplicity, hardcode the maximum number of regs and manually update
+	 * the test as necessary.
+ */ + TEST_ASSERT(regs.list.n <= KVM_X86_MAX_NR_REGS, + "KVM reports %llu regs, test expects at most %u regs, stale test?", + regs.list.n, KVM_X86_MAX_NR_REGS); + + vcpu_ioctl(vcpu, KVM_GET_REG_LIST, ®s.list); + for (i = 0; i < regs.list.n; i++) { + if (regs.regs[i] == reg) + return true; + } + + return false; +} + +static void host_test_kvm_reg(struct kvm_vcpu *vcpu) +{ + bool has_reg = vcpu_cpuid_has(vcpu, msrs[idx].feature); + u64 reset_val = msrs[idx].reset_val; + u64 write_val = msrs[idx].write_val; + u64 rsvd_val = msrs[idx].rsvd_val; + u32 reg = msrs[idx].index; + u64 val; + int r; + + if (!use_one_reg) + return; + + TEST_ASSERT_EQ(vcpu_has_reg(vcpu, KVM_X86_REG_KVM(reg)), has_reg); + + if (!has_reg) { + r = __vcpu_get_reg(vcpu, KVM_X86_REG_KVM(reg), &val); + TEST_ASSERT(r && errno == EINVAL, + "Expected failure on get_reg(0x%x)", reg); + rsvd_val = 0; + goto out; + } + + val = vcpu_get_reg(vcpu, KVM_X86_REG_KVM(reg)); + TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_reg(0x%x), got 0x%lx", + reset_val, reg, val); + + vcpu_set_reg(vcpu, KVM_X86_REG_KVM(reg), write_val); + val = vcpu_get_reg(vcpu, KVM_X86_REG_KVM(reg)); + TEST_ASSERT(val == write_val, "Wanted 0x%lx from get_reg(0x%x), got 0x%lx", + write_val, reg, val); + +out: + r = __vcpu_set_reg(vcpu, KVM_X86_REG_KVM(reg), rsvd_val); + TEST_ASSERT(r, "Expected failure on set_reg(0x%x, 0x%lx)", reg, rsvd_val); +} + static void host_test_msr(struct kvm_vcpu *vcpu, u64 guest_val) { u64 reset_val = msrs[idx].reset_val; @@ -314,6 +396,8 @@ static void test_msrs(void) MSR_TEST(MSR_IA32_PL2_SSP, canonical_val, canonical_val | 1, SHSTK), MSR_TEST_CANONICAL(MSR_IA32_PL3_SSP, SHSTK), MSR_TEST(MSR_IA32_PL3_SSP, canonical_val, canonical_val | 1, SHSTK), + + MSR_TEST_KVM(GUEST_SSP, canonical_val, NONCANONICAL, SHSTK), }; const struct kvm_x86_cpu_feature feat_none = X86_FEATURE_NONE; @@ -329,6 +413,7 @@ static void test_msrs(void) const int NR_VCPUS = 3; struct kvm_vcpu *vcpus[NR_VCPUS]; struct kvm_vm *vm; + int i; kvm_static_assert(sizeof(__msrs) <= sizeof(msrs)); kvm_static_assert(ARRAY_SIZE(__msrs) <= ARRAY_SIZE(msrs)); @@ -359,6 +444,12 @@ static void test_msrs(void) } for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) { + if (msrs[idx].is_kvm_defined) { + for (i = 0; i < NR_VCPUS; i++) + host_test_kvm_reg(vcpus[i]); + continue; + } + sync_global_to_guest(vm, idx); vcpus_run(vcpus, NR_VCPUS); From 947ab90c91983c965acb994455734aae19317596 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:57 -0700 Subject: [PATCH 136/142] KVM: selftests: Verify MSRs are (not) in save/restore list when (un)supported Add a check in the MSRs test to verify that KVM's reported support for MSRs with feature bits is consistent between KVM's MSR save/restore lists and KVM's supported CPUID. To deal with Intel's wonderful decision to bundle IBT and SHSTK under CET, track the "second" feature to avoid false failures when running on a CPU with only one of IBT or SHSTK. 
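For background, the save/restore list is what userspace gets from
KVM_GET_MSR_INDEX_LIST on the /dev/kvm fd; a minimal sketch of the
standard two-call pattern (error handling elided) is:

	struct kvm_msr_list hdr = { .nmsrs = 0 };
	struct kvm_msr_list *list;

	/* The first call fails with E2BIG and fills in the required nmsrs. */
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &hdr);
	list = malloc(sizeof(*list) + hdr.nmsrs * sizeof(__u32));
	list->nmsrs = hdr.nmsrs;
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);	/* fills list->indices[] */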
Reviewed-by: Chao Gao
Link: https://lore.kernel.org/r/20250919223258.1604852-51-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/x86/msrs_test.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c
index 54ef9c539eb3..40d918aedce6 100644
--- a/tools/testing/selftests/kvm/x86/msrs_test.c
+++ b/tools/testing/selftests/kvm/x86/msrs_test.c
@@ -444,12 +444,29 @@ static void test_msrs(void)
 	}
 
 	for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) {
-		if (msrs[idx].is_kvm_defined) {
+		struct kvm_msr *msr = &msrs[idx];
+
+		if (msr->is_kvm_defined) {
 			for (i = 0; i < NR_VCPUS; i++)
 				host_test_kvm_reg(vcpus[i]);
 			continue;
 		}
 
+		/*
+		 * Verify KVM_GET_SUPPORTED_CPUID and KVM_GET_MSR_INDEX_LIST
+		 * are consistent with respect to MSRs whose existence is
+		 * enumerated via CPUID.  Skip the check for FS/GS.base MSRs,
+		 * as they aren't reported in the save/restore list since their
+		 * state is managed via SREGS.
+		 */
+		TEST_ASSERT(msr->index == MSR_FS_BASE || msr->index == MSR_GS_BASE ||
+			    kvm_msr_is_in_save_restore_list(msr->index) ==
+			    (kvm_cpu_has(msr->feature) || kvm_cpu_has(msr->feature2)),
+			    "%s %s in save/restore list, but %s according to CPUID", msr->name,
+			    kvm_msr_is_in_save_restore_list(msr->index) ? "is" : "isn't",
+			    (kvm_cpu_has(msr->feature) || kvm_cpu_has(msr->feature2)) ?
+			    "supported" : "unsupported");
+
 		sync_global_to_guest(vm, idx);
 
 		vcpus_run(vcpus, NR_VCPUS);

From d292035fb5d209b78beda356a2a9720154bd7c00 Mon Sep 17 00:00:00 2001
From: Mathias Krause
Date: Fri, 19 Sep 2025 15:32:58 -0700
Subject: [PATCH 137/142] KVM: VMX: Make CR4.CET a guest owned bit

Make CR4.CET a guest-owned bit under VMX by extending
KVM_POSSIBLE_CR4_GUEST_BITS accordingly. There's no need to intercept
changes to CR4.CET, as it's neither included in KVM's MMU role bits, nor
does KVM specifically care about the actual value of a (nested) guest's
CR4.CET, beyond enforcing architectural constraints, i.e. making sure
that CR0.WP=1 if CR4.CET=1.

Intercepting writes to CR4.CET is particularly bad for grsecurity kernels
with KERNEXEC or, even worse, KERNSEAL enabled. These features heavily
make use of read-only kernel objects and use a cpu-local CR0.WP toggle to
override it, when needed. Under a CET-enabled kernel, this also requires
toggling CR4.CET, hence the motivation to make it guest-owned.

Using the old test from [1] gives the following runtime numbers (perf
stat -r 5 ssdd 10 50000):

 * grsec guest on linux-6.16-rc5 + cet patches:
   2.4647 +- 0.0706 seconds time elapsed ( +- 2.86% )

 * grsec guest on linux-6.16-rc5 + cet patches + CR4.CET guest-owned:
   1.5648 +- 0.0240 seconds time elapsed ( +- 1.53% )

Not only does not intercepting CR4.CET make the test run ~35% faster,
it's also more stable with less fluctuation due to fewer VMEXITs.

Therefore, make CR4.CET a guest-owned bit where possible. This change is
VMX-specific, as SVM has no such fine-grained control register intercept
control.

If KVM's assumptions regarding MMU role handling wrt. a guest's CR4.CET
value ever change, the BUILD_BUG_ON()s related to KVM_MMU_CR4_ROLE_BITS
and KVM_POSSIBLE_CR4_GUEST_BITS will catch that early.
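Conceptually, what "guest-owned" buys (a sketch, not code from this
patch): KVM reads guest-owned CR4 bits through the register cache, e.g.
via kvm_read_cr4_bits() from the file modified below, instead of taking a
VM-exit on every guest toggle:

	/* Reads the cached/shadowed value when X86_CR4_CET is guest-owned. */
	unsigned long cr4_cet = kvm_read_cr4_bits(vcpu, X86_CR4_CET);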
Link: https://lore.kernel.org/kvm/20230322013731.102955-1-minipli@grsecurity.net/ [1]
Reviewed-by: Chao Gao
Signed-off-by: Mathias Krause
Reviewed-by: Binbin Wu
Link: https://lore.kernel.org/r/20250919223258.1604852-52-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/kvm_cache_regs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 36a8786db291..8ddb01191d6f 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -7,7 +7,8 @@
 #define KVM_POSSIBLE_CR0_GUEST_BITS	(X86_CR0_TS | X86_CR0_WP)
 #define KVM_POSSIBLE_CR4_GUEST_BITS				  \
 	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR  \
-	 | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
+	 | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE \
+	 | X86_CR4_CET)
 
 #define X86_CR0_PDPTR_BITS    (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG)
 #define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)

From 15463eece957be34da64b4d6fe18fc98981bf487 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 18 Sep 2025 17:32:59 -0700
Subject: [PATCH 138/142] KVM: s390/vfio-ap: Use kvm_is_gpa_in_memslot()
 instead of open coded equivalent

Use kvm_is_gpa_in_memslot() to check the validity of the notification
indicator byte address instead of open coding equivalent logic in the
VFIO AP driver. Opportunistically use a dedicated wrapper that exists and
is exported expressly for the VFIO AP module.

kvm_is_gpa_in_memslot() is generally unsuitable for use outside of KVM;
other drivers typically shouldn't rely on KVM's memslots, and using the
API requires kvm->srcu (or slots_lock) to be held for the entire duration
of the usage, e.g. to avoid TOCTOU bugs. handle_pqap() is a bit of a
special case, as it's explicitly invoked from KVM with kvm->srcu already
held, and the VFIO AP driver is in many ways an extension of KVM that
happens to live in a separate module.

Providing a dedicated API for the VFIO AP driver will allow restricting
the vast majority of generic KVM's exports to KVM submodules (e.g. to
x86's kvm-{amd,intel}.ko vendor modules).

No functional change intended.
Acked-by: Anthony Krowiak Reviewed-by: Christian Borntraeger Link: https://lore.kernel.org/r/20250919003303.1355064-2-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/kvm/priv.c | 8 ++++++++ drivers/s390/crypto/vfio_ap_ops.c | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 95d15416c39d..c2ba3d4398c5 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -722,6 +722,8 @@ extern int kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb, extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); +bool kvm_s390_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa); + static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 9253c70897a8..9a71b6e00948 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -605,6 +605,14 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) } } +#if IS_ENABLED(CONFIG_VFIO_AP) +bool kvm_s390_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) +{ + return kvm_is_gpa_in_memslot(kvm, gpa); +} +EXPORT_SYMBOL_FOR_MODULES(kvm_s390_is_gpa_in_memslot, "vfio_ap"); +#endif + /* * handle_pqap: Handling pqap interception * @vcpu: the vcpu having issue the pqap instruction diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 766557547f83..eb5ff49f6fe7 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -354,7 +354,7 @@ static int vfio_ap_validate_nib(struct kvm_vcpu *vcpu, dma_addr_t *nib) if (!*nib) return -EINVAL; - if (kvm_is_error_hva(gfn_to_hva(vcpu->kvm, *nib >> PAGE_SHIFT))) + if (!kvm_s390_is_gpa_in_memslot(vcpu->kvm, *nib)) return -EINVAL; return 0; From 20c48920583675e67b3824f147726e0fbda735ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:33:00 -0700 Subject: [PATCH 139/142] KVM: Export KVM-internal symbols for sub-modules only Rework the vast majority of KVM's exports to expose symbols only to KVM submodules, i.e. to x86's kvm-{amd,intel}.ko and PPC's kvm-{pr,hv}.ko. With few exceptions, KVM's exported APIs are intended (and safe) for KVM- internal usage only. Keep kvm_get_kvm(), kvm_get_kvm_safe(), and kvm_put_kvm() as normal exports, as they are needed by VFIO, and are generally safe for external usage (though ideally even the get/put APIs would be KVM-internal, and VFIO would pin a VM by grabbing a reference to its associated file). Implement a framework in kvm_types.h in anticipation of providing a macro to restrict KVM-specific kernel exports, i.e. to provide symbol exports for KVM if and only if KVM is built as one or more modules. 
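To illustrate the framework (an expansion sketch based on the definitions
in the diff below):

	/* On x86 with both vendor modules enabled, ... */
	EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_load);
	/* ...expands to: */
	EXPORT_SYMBOL_FOR_MODULES(vcpu_load, "kvm-amd,kvm-intel");
	/* and to nothing when KVM_SUB_MODULES is undefined. */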
Link: https://lore.kernel.org/r/20250919003303.1355064-3-seanjc@google.com
Cc: Nathan Chancellor
Signed-off-by: Sean Christopherson
Signed-off-by: Paolo Bonzini
---
 arch/powerpc/include/asm/Kbuild | 1 -
 arch/powerpc/include/asm/kvm_types.h | 15 ++++
 arch/x86/include/asm/kvm_types.h | 10 +++
 include/linux/kvm_types.h | 25 ++++--
 virt/kvm/eventfd.c | 2 +-
 virt/kvm/guest_memfd.c | 4 +-
 virt/kvm/kvm_main.c | 128 +++++++++++++--------------
 7 files changed, 110 insertions(+), 75 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kvm_types.h

diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index e5fdc336c9b2..2e23533b67e3 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -3,7 +3,6 @@ generated-y += syscall_table_32.h
 generated-y += syscall_table_64.h
 generated-y += syscall_table_spu.h
 generic-y += agp.h
-generic-y += kvm_types.h
 generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
 generic-y += early_ioremap.h
diff --git a/arch/powerpc/include/asm/kvm_types.h b/arch/powerpc/include/asm/kvm_types.h
new file mode 100644
index 000000000000..5d4bffea7d47
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_types.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_PPC_KVM_TYPES_H
+#define _ASM_PPC_KVM_TYPES_H
+
+#if IS_MODULE(CONFIG_KVM_BOOK3S_64_PR) && IS_MODULE(CONFIG_KVM_BOOK3S_64_HV)
+#define KVM_SUB_MODULES kvm-pr,kvm-hv
+#elif IS_MODULE(CONFIG_KVM_BOOK3S_64_PR)
+#define KVM_SUB_MODULES kvm-pr
+#elif IS_MODULE(CONFIG_KVM_BOOK3S_64_HV)
+#define KVM_SUB_MODULES kvm-hv
+#else
+#undef KVM_SUB_MODULES
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
index 08f1b57d3b62..23268a188e70 100644
--- a/arch/x86/include/asm/kvm_types.h
+++ b/arch/x86/include/asm/kvm_types.h
@@ -2,6 +2,16 @@
 #ifndef _ASM_X86_KVM_TYPES_H
 #define _ASM_X86_KVM_TYPES_H
 
+#if IS_MODULE(CONFIG_KVM_AMD) && IS_MODULE(CONFIG_KVM_INTEL)
+#define KVM_SUB_MODULES kvm-amd,kvm-intel
+#elif IS_MODULE(CONFIG_KVM_AMD)
+#define KVM_SUB_MODULES kvm-amd
+#elif IS_MODULE(CONFIG_KVM_INTEL)
+#define KVM_SUB_MODULES kvm-intel
+#else
+#undef KVM_SUB_MODULES
+#endif
+
 #define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
 
 #endif /* _ASM_X86_KVM_TYPES_H */
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 827ecc0b7e10..490464c205b4 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -3,6 +3,23 @@
 #ifndef __KVM_TYPES_H__
 #define __KVM_TYPES_H__
 
+#include <linux/export.h>
+#include <linux/kconfig.h>
+#include <linux/stringify.h>
+#include <asm/kvm_types.h>
+
+#ifdef KVM_SUB_MODULES
+#define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) \
+	EXPORT_SYMBOL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES))
+#else
+#define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol)
+#endif
+
+#ifndef __ASSEMBLER__
+
+#include <linux/mutex.h>
+#include <linux/spinlock_types.h>
+
 struct kvm;
 struct kvm_async_pf;
 struct kvm_device_ops;
@@ -19,13 +36,6 @@ struct kvm_memslots;
 
 enum kvm_mr_change;
 
-#include <linux/bits.h>
-#include <linux/mutex.h>
-#include <linux/types.h>
-#include <linux/spinlock_types.h>
-
-#include <asm/kvm_types.h>
-
 /*
  * Address types:
  *
@@ -116,5 +126,6 @@ struct kvm_vcpu_stat_generic {
 };
 
 #define KVM_STATS_NAME_SIZE 48
+#endif /* !__ASSEMBLER__ */
 
 #endif /* __KVM_TYPES_H__ */
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 6b1133a6617f..a7794ffdb976 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -525,7 +525,7 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
 
 	return false;
 }
-EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_irq_has_notifier);
 
 void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
 {
diff --git
a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 1d323ca178cb..94bafd6c558c 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -702,7 +702,7 @@ out: fput(file); return r; } -EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, @@ -785,5 +785,5 @@ put_folio_and_exit: fput(file); return ret && !i ? ret : i; } -EXPORT_SYMBOL_GPL(kvm_gmem_populate); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c0c5626ab71d..226faeaa8e56 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -77,22 +77,22 @@ MODULE_LICENSE("GPL"); /* Architectures should define their poll value according to the halt latency */ unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; module_param(halt_poll_ns, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns); /* Default doubles per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_grow = 2; module_param(halt_poll_ns_grow, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_grow); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow); /* The start value to grow halt_poll_ns from */ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ module_param(halt_poll_ns_grow_start, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow_start); /* Default halves per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink); /* * Allow direct access (from KVM or the CPU) without MMU notifier protection @@ -170,7 +170,7 @@ void vcpu_load(struct kvm_vcpu *vcpu) kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); } -EXPORT_SYMBOL_GPL(vcpu_load); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { @@ -180,7 +180,7 @@ void vcpu_put(struct kvm_vcpu *vcpu) __this_cpu_write(kvm_running_vcpu, NULL); preempt_enable(); } -EXPORT_SYMBOL_GPL(vcpu_put); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_put); /* TODO: merge with kvm_arch_vcpu_should_kick */ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) @@ -288,7 +288,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_make_all_cpus_request); void kvm_flush_remote_tlbs(struct kvm *kvm) { @@ -309,7 +309,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.generic.remote_tlb_flush; } -EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_flush_remote_tlbs); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { @@ -499,7 +499,7 @@ void kvm_destroy_vcpus(struct kvm *kvm) atomic_set(&kvm->online_vcpus, 0); } -EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_destroy_vcpus); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) @@ -1365,7 +1365,7 @@ void kvm_put_kvm_no_destroy(struct kvm *kvm) { WARN_ON(refcount_dec_and_test(&kvm->users_count)); } -EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_put_kvm_no_destroy); static int kvm_vm_release(struct inode *inode, struct file *filp) { @@ -1397,7 +1397,7 @@ 
out_unlock: } return -EINTR; } -EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_trylock_all_vcpus); int kvm_lock_all_vcpus(struct kvm *kvm) { @@ -1422,7 +1422,7 @@ out_unlock: } return r; } -EXPORT_SYMBOL_GPL(kvm_lock_all_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lock_all_vcpus); void kvm_unlock_all_vcpus(struct kvm *kvm) { @@ -1434,7 +1434,7 @@ void kvm_unlock_all_vcpus(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) mutex_unlock(&vcpu->mutex); } -EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_unlock_all_vcpus); /* * Allocation size is twice as large as the actual dirty bitmap size. @@ -2142,7 +2142,7 @@ int kvm_set_internal_memslot(struct kvm *kvm, return kvm_set_memory_region(kvm, mem); } -EXPORT_SYMBOL_GPL(kvm_set_internal_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_internal_memslot); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region2 *mem) @@ -2201,7 +2201,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, *is_dirty = 1; return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dirty_log); #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ /** @@ -2636,7 +2636,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2670,7 +2670,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn return NULL; } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_memslot); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { @@ -2678,7 +2678,7 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) return kvm_is_visible_memslot(memslot); } -EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_visible_gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2686,7 +2686,7 @@ bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) return kvm_is_visible_memslot(memslot); } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_visible_gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2743,19 +2743,19 @@ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, { return gfn_to_hva_many(slot, gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva_memslot); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_hva); /* * Return the hva of a @gfn and the R/W attribute if possible. 
@@ -2819,7 +2819,7 @@ void kvm_release_page_clean(struct page *page) kvm_set_page_accessed(page); put_page(page); } -EXPORT_SYMBOL_GPL(kvm_release_page_clean); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_clean); void kvm_release_page_dirty(struct page *page) { @@ -2829,7 +2829,7 @@ void kvm_release_page_dirty(struct page *page) kvm_set_page_dirty(page); kvm_release_page_clean(page); } -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_dirty); static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, struct follow_pfnmap_args *map, bool writable) @@ -3073,7 +3073,7 @@ kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, return kvm_follow_pfn(&kfp); } -EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_faultin_pfn); int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages) @@ -3090,7 +3090,7 @@ int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } -EXPORT_SYMBOL_GPL(kvm_prefetch_pages); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prefetch_pages); /* * Don't use this API unless you are absolutely, positively certain that KVM @@ -3112,7 +3112,7 @@ struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) (void)kvm_follow_pfn(&kfp); return refcounted_page; } -EXPORT_SYMBOL_GPL(__gfn_to_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__gfn_to_page); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, bool writable) @@ -3146,7 +3146,7 @@ int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, return map->hva ? 0 : -EFAULT; } -EXPORT_SYMBOL_GPL(__kvm_vcpu_map); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_map); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { @@ -3174,7 +3174,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) map->page = NULL; map->pinned_page = NULL; } -EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_unmap); static int next_segment(unsigned long len, int offset) { @@ -3210,7 +3210,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_page); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len) @@ -3219,7 +3219,7 @@ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_page); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { @@ -3239,7 +3239,7 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) } return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { @@ -3259,7 +3259,7 @@ int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned l } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest); static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, unsigned long len) @@ -3290,7 +3290,7 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t 
gpa, return __kvm_read_guest_atomic(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_atomic); /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */ static int __kvm_write_guest_page(struct kvm *kvm, @@ -3320,7 +3320,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_page); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len) @@ -3329,7 +3329,7 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest_page); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) @@ -3350,7 +3350,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, } return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len) @@ -3371,7 +3371,7 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest); static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, struct gfn_to_hva_cache *ghc, @@ -3420,7 +3420,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, @@ -3451,14 +3451,14 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_offset_cached); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_cached); int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, @@ -3488,14 +3488,14 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_offset_cached); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_cached); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) { @@ -3515,7 +3515,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } return 0; } -EXPORT_SYMBOL_GPL(kvm_clear_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_clear_guest); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, @@ -3540,7 +3540,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, 
set_bit_le(rel_gfn, memslot->dirty_bitmap); } } -EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty_in_slot); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { @@ -3549,7 +3549,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) memslot = gfn_to_memslot(kvm, gfn); mark_page_dirty_in_slot(kvm, memslot, gfn); } -EXPORT_SYMBOL_GPL(mark_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -3558,7 +3558,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); } -EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_mark_page_dirty); void kvm_sigset_activate(struct kvm_vcpu *vcpu) { @@ -3795,7 +3795,7 @@ out: trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_vcpu_halt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_halt); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { @@ -3807,7 +3807,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) return false; } -EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_wake_up); #ifndef CONFIG_S390 /* @@ -3859,7 +3859,7 @@ void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) out: put_cpu(); } -EXPORT_SYMBOL_GPL(__kvm_vcpu_kick); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) @@ -3882,7 +3882,7 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target) return ret; } -EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_yield_to); /* * Helper that checks whether a VCPU is eligible for directed yield. @@ -4037,7 +4037,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) /* Ensure vcpu is not eligible during next spinloop */ kvm_vcpu_set_dy_eligible(me, false); } -EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_on_spin); static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) { @@ -5019,7 +5019,7 @@ bool kvm_are_all_memslots_empty(struct kvm *kvm) return true; } -EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_are_all_memslots_empty); static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) @@ -5474,7 +5474,7 @@ bool file_is_kvm(struct file *file) { return file && file->f_op == &kvm_vm_fops; } -EXPORT_SYMBOL_GPL(file_is_kvm); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm); static int kvm_dev_ioctl_create_vm(unsigned long type) { @@ -5569,10 +5569,10 @@ static struct miscdevice kvm_dev = { #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); -EXPORT_SYMBOL_GPL(enable_virt_at_load); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load); __visible bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); @@ -5723,7 +5723,7 @@ err_cpuhp: --kvm_usage_count; return r; } -EXPORT_SYMBOL_GPL(kvm_enable_virtualization); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization); void kvm_disable_virtualization(void) { @@ -5736,7 +5736,7 @@ void kvm_disable_virtualization(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } -EXPORT_SYMBOL_GPL(kvm_disable_virtualization); 
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { @@ -5885,7 +5885,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } -EXPORT_SYMBOL_GPL(kvm_io_bus_write); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_write); int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) @@ -5954,7 +5954,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } -EXPORT_SYMBOL_GPL(kvm_io_bus_read); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_read); static void __free_bus(struct rcu_head *rcu) { @@ -6078,7 +6078,7 @@ out_unlock: return iodev; } -EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_get_dev); static int kvm_debugfs_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), @@ -6415,7 +6415,7 @@ struct kvm_vcpu *kvm_get_running_vcpu(void) return vcpu; } -EXPORT_SYMBOL_GPL(kvm_get_running_vcpu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_running_vcpu); /** * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. @@ -6550,7 +6550,7 @@ err_cpu_kick_mask: kmem_cache_destroy(kvm_vcpu_cache); return r; } -EXPORT_SYMBOL_GPL(kvm_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init); void kvm_exit(void) { @@ -6573,4 +6573,4 @@ void kvm_exit(void) kvm_async_pf_deinit(); kvm_irqfd_exit(); } -EXPORT_SYMBOL_GPL(kvm_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_exit); From d273b52b6fad95b6b00b93250151c323c19de50d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:33:01 -0700 Subject: [PATCH 140/142] KVM: x86: Move kvm_intr_is_single_vcpu() to lapic.c Move kvm_intr_is_single_vcpu() to lapic.c, drop its export, and make its "fast" helper local to lapic.c. kvm_intr_is_single_vcpu() is only usable if the local APIC is in-kernel, i.e. it most definitely belongs in the local APIC code. No functional change intended. Fixes: cf04ec393ed0 ("KVM: x86: Dedup AVIC vs. 
PI code for identifying target vCPU") Link: https://lore.kernel.org/r/20250919003303.1355064-4-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 --- arch/x86/kvm/irq.c | 28 ---------------------------- arch/x86/kvm/lapic.c | 33 +++++++++++++++++++++++++++++++-- arch/x86/kvm/lapic.h | 4 ++-- 4 files changed, 33 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0eed9b430849..48598d017d6f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2416,9 +2416,6 @@ void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); -bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, - struct kvm_vcpu **dest_vcpu); - static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { /* We can only post Fixed and LowPrio IRQs */ diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a6b122f732be..153134893301 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -354,34 +354,6 @@ int kvm_set_routing_entry(struct kvm *kvm, return 0; } -bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, - struct kvm_vcpu **dest_vcpu) -{ - int r = 0; - unsigned long i; - struct kvm_vcpu *vcpu; - - if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) - return true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_apic_present(vcpu)) - continue; - - if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, - irq->dest_id, irq->dest_mode)) - continue; - - if (++r == 2) - return false; - - *dest_vcpu = vcpu; - } - - return r == 1; -} -EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); - void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, u8 vector, unsigned long *ioapic_handled_vectors) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3b76192b24e9..b5e47c523164 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1237,8 +1237,9 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, * interrupt. * - Otherwise, use remapped mode to inject the interrupt. 
*/ -bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, - struct kvm_vcpu **dest_vcpu) +static bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, + struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; unsigned long bitmap; @@ -1265,6 +1266,34 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, return ret; } +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int r = 0; + unsigned long i; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 50123fe7f58f..282b9b7da98c 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -236,8 +236,8 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, unsigned long *vcpu_bitmap); -bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, - struct kvm_vcpu **dest_vcpu); +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); From 6560468305da263c35ff51cde1dd74d6a228b286 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:33:02 -0700 Subject: [PATCH 141/142] KVM: x86: Drop pointless exports of kvm_arch_xxx() hooks Drop the exporting of several kvm_arch_xxx() hooks that are only called from arch-neutral code, i.e. that are only called from kvm.ko. Link: https://lore.kernel.org/r/20250919003303.1355064-5-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5542b8d83602..a618a30423a7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13922,14 +13922,12 @@ void kvm_arch_register_noncoherent_dma(struct kvm *kvm) if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1) kvm_noncoherent_dma_assignment_start_or_stop(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) { if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count)) kvm_noncoherent_dma_assignment_start_or_stop(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) { @@ -13941,7 +13939,6 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { return (vcpu->arch.msr_kvm_poll_control & 1) == 0; } -EXPORT_SYMBOL_GPL(kvm_arch_no_poll); #ifdef CONFIG_KVM_GUEST_MEMFD /* From 6b36119b94d0b2bb8cea9d512017efafd461d6ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:33:03 -0700 Subject: [PATCH 142/142] KVM: x86: Export KVM-internal symbols for sub-modules only Rework almost all of KVM x86's exports to expose symbols only to KVM's vendor modules, i.e. to kvm-{amd,intel}.ko. 
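The intended effect, as a hedged illustration rather than anything in
the diff itself: a symbol reworked this way resolves only for the named
vendor modules, and modpost refuses to resolve it for any other module,
in-tree or out-of-tree, e.g.:

	/* before: usable by any GPL module */
	EXPORT_SYMBOL_GPL(kvm_set_cr0);

	/* after: resolvable only by kvm-amd.ko and kvm-intel.ko */
	EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0);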
Keep the generic exports that are guarded by CONFIG_KVM_EXTERNAL_WRITE_TRACKING=y, as they're explicitly designed/intended for external usage. Link: https://lore.kernel.org/r/20250919003303.1355064-6-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 10 +- arch/x86/kvm/hyperv.c | 4 +- arch/x86/kvm/irq.c | 6 +- arch/x86/kvm/kvm_onhyperv.c | 6 +- arch/x86/kvm/lapic.c | 40 +++---- arch/x86/kvm/mmu/mmu.c | 36 +++--- arch/x86/kvm/mmu/spte.c | 10 +- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- arch/x86/kvm/pmu.c | 10 +- arch/x86/kvm/smm.c | 2 +- arch/x86/kvm/x86.c | 220 ++++++++++++++++++------------------ 11 files changed, 173 insertions(+), 173 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d290dbc96831..52524e0ca97f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -34,7 +34,7 @@ * aligned to sizeof(unsigned long) because it's not accessed via bitops. */ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; -EXPORT_SYMBOL_GPL(kvm_cpu_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps); struct cpuid_xstate_sizes { u32 eax; @@ -131,7 +131,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( return NULL; } -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_cpuid_entry2); static int kvm_check_cpuid(struct kvm_vcpu *vcpu) { @@ -1261,7 +1261,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_RDPID); } } -EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps); #undef F #undef SCATTERED_F @@ -2085,7 +2085,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, used_max_basic); return exact; } -EXPORT_SYMBOL_GPL(kvm_cpuid); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpuid); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { @@ -2103,4 +2103,4 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_rdx_write(vcpu, edx); return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a471900c7325..38595ecb990d 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -923,7 +923,7 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } -EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_assist_page_enabled); int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { @@ -935,7 +935,7 @@ int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } -EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_get_assist_page); static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 153134893301..7cc8950005b6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -103,7 +103,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_has_injectable_intr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_has_injectable_intr); /* * check if there is pending interrupt without @@ -119,7 +119,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_has_interrupt); /* * Read pending interrupt(from non-APIC source) 
@@ -148,7 +148,7 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v) WARN_ON_ONCE(!irqchip_split(v->kvm)); return get_userspace_extint(v); } -EXPORT_SYMBOL_GPL(kvm_cpu_get_extint); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_get_extint); /* * Read pending interrupt vector and intack. diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index ded0bd688c65..ee53e75a60cb 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -101,13 +101,13 @@ int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages) return __hv_flush_remote_tlbs_range(kvm, &range); } -EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs_range); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_flush_remote_tlbs_range); int hv_flush_remote_tlbs(struct kvm *kvm) { return __hv_flush_remote_tlbs_range(kvm, NULL); } -EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_flush_remote_tlbs); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { @@ -121,4 +121,4 @@ void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) spin_unlock(&kvm_arch->hv_root_tdp_lock); } } -EXPORT_SYMBOL_GPL(hv_track_root_tdp); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_track_root_tdp); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b5e47c523164..0ae7f913d782 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -106,7 +106,7 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) } __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); -EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ); @@ -646,7 +646,7 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) return ((max_updated_irr != -1) && (max_updated_irr == *max_irr)); } -EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { @@ -657,7 +657,7 @@ bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr apic->irr_pending = true; return irr_updated; } -EXPORT_SYMBOL_GPL(kvm_apic_update_irr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_irr); static inline int apic_search_irr(struct kvm_lapic *apic) { @@ -697,7 +697,7 @@ void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec) { apic_clear_irr(vec, vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_clear_irr); static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic) { @@ -779,7 +779,7 @@ void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } -EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_hwapic_isr); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { @@ -790,7 +790,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) */ return apic_find_highest_irr(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, @@ -954,7 +954,7 @@ void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) { apic_update_ppr(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_ppr); static void apic_set_tpr(struct kvm_lapic *apic, 
u32 tpr) { @@ -1065,7 +1065,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, return false; } } -EXPORT_SYMBOL_GPL(kvm_apic_match_dest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_match_dest); static int kvm_vector_to_index(u32 vector, u32 dest_vcpus, const unsigned long *bitmap, u32 bitmap_size) @@ -1292,7 +1292,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, return r == 1; } -EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_intr_is_single_vcpu); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map) @@ -1569,7 +1569,7 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } -EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_eoi_accelerated); static void kvm_icr_to_lapic_irq(struct kvm_lapic *apic, u32 icr_low, u32 icr_high, struct kvm_lapic_irq *irq) @@ -1600,7 +1600,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } -EXPORT_SYMBOL_GPL(kvm_apic_send_ipi); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_send_ipi); static u32 apic_get_tmcct(struct kvm_lapic *apic) { @@ -1717,7 +1717,7 @@ u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) return valid_reg_mask; } -EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask); static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) @@ -1958,7 +1958,7 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } -EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_wait_lapic_expire); static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) { @@ -2272,7 +2272,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) out: preempt_enable(); } -EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { @@ -2525,7 +2525,7 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } -EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_set_eoi); #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) @@ -2608,7 +2608,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } -EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_write_nodecode); void kvm_free_lapic(struct kvm_vcpu *vcpu) { @@ -2746,7 +2746,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) kvm_recalculate_apic_map(vcpu->kvm); return 0; } -EXPORT_SYMBOL_GPL(kvm_apic_set_base); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_base); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { @@ -2794,7 +2794,7 @@ int kvm_alloc_apic_access_page(struct kvm *kvm) return 0; } -EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_alloc_apic_access_page); void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu) { @@ -3058,7 +3058,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } 
-EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_has_interrupt); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { @@ -3117,7 +3117,7 @@ void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector) } } -EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_ack_interrupt); static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s, bool set) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 55335dbd70ce..667d66cf76d5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -110,7 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); -EXPORT_SYMBOL_GPL(tdp_mmu_enabled); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(tdp_mmu_enabled); #endif static int max_huge_page_level __read_mostly; @@ -3865,7 +3865,7 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, write_unlock(&kvm->mmu_lock); } } -EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_roots); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { @@ -3892,7 +3892,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) kvm_mmu_free_roots(kvm, mmu, roots_to_free); } -EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_guest_mode_roots); static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, u8 level) @@ -4876,7 +4876,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, return r; } -EXPORT_SYMBOL_GPL(kvm_handle_page_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_page_fault); #ifdef CONFIG_X86_64 static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, @@ -4966,7 +4966,7 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level return -EIO; } } -EXPORT_SYMBOL_GPL(kvm_tdp_map_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page); long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) @@ -5162,7 +5162,7 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) __clear_sp_write_flooding_count(sp); } } -EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_new_pgd); static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) @@ -5808,7 +5808,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, shadow_mmu_init_context(vcpu, context, cpu_role, root_role); kvm_mmu_new_pgd(vcpu, nested_cr3); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu); static union kvm_cpu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, @@ -5862,7 +5862,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_mmu_new_pgd(vcpu, new_eptp); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) @@ -5927,7 +5927,7 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu) else init_kvm_softmmu(vcpu, cpu_role); } -EXPORT_SYMBOL_GPL(kvm_init_mmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_mmu); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) { @@ -5963,7 +5963,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); kvm_init_mmu(vcpu); } -EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_reset_context); int kvm_mmu_load(struct kvm_vcpu *vcpu) { @@ -5997,7 +5997,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_load); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -6059,7 +6059,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } -EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -6385,7 +6385,7 @@ emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } -EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_page_fault); void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) { @@ -6401,7 +6401,7 @@ void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) pr_cont(", spte[%d] = 0x%llx", level, sptes[level]); pr_cont("\n"); } -EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_print_sptes); static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, hpa_t root_hpa) @@ -6467,7 +6467,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa); } } -EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invalidate_addr); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { @@ -6484,7 +6484,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); ++vcpu->stat.invlpg; } -EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) @@ -6537,7 +6537,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, else max_huge_page_level = PG_LEVEL_2M; } -EXPORT_SYMBOL_GPL(kvm_configure_mmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_configure_mmu); static void free_mmu_pages(struct kvm_mmu *mmu) { @@ -7204,7 +7204,7 @@ restart: return need_tlb_flush; } -EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index df31039b5d63..37647afde7d3 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -22,7 +22,7 @@ bool __read_mostly enable_mmio_caching = true; static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); -EXPORT_SYMBOL_GPL(enable_mmio_caching); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mmio_caching); bool __read_mostly kvm_ad_enabled; @@ -470,13 +470,13 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_mask); void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) { kvm->arch.shadow_mmio_value = mmio_value; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_value); void kvm_mmu_set_me_spte_mask(u64 
me_value, u64 me_mask) { @@ -487,7 +487,7 @@ void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) shadow_me_value = me_value; shadow_me_mask = me_mask; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { @@ -513,7 +513,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0); } -EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_ept_masks); void kvm_mmu_reset_all_pte_masks(void) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7059ac9d58e2..c5734ca5c17d 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1982,7 +1982,7 @@ bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) spte = sptes[leaf]; return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); } -EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped); /* * Returns the last level spte pointer of the shadow page walk for the given diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b7dc5bd981ba..40ac4cb44ed2 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -31,7 +31,7 @@ static struct x86_pmu_capability __read_mostly kvm_host_pmu; /* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support. */ struct x86_pmu_capability __read_mostly kvm_pmu_cap; -EXPORT_SYMBOL_GPL(kvm_pmu_cap); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap); struct kvm_pmu_emulated_event_selectors { u64 INSTRUCTIONS_RETIRED; @@ -373,7 +373,7 @@ void pmc_write_counter(struct kvm_pmc *pmc, u64 val) pmc->counter &= pmc_bitmask(pmc); pmc_update_sample_period(pmc); } -EXPORT_SYMBOL_GPL(pmc_write_counter); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(pmc_write_counter); static int filter_cmp(const void *pa, const void *pb, u64 mask) { @@ -581,7 +581,7 @@ void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc) if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED)) bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1); } -EXPORT_SYMBOL_GPL(kvm_pmu_recalc_pmc_emulation); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_recalc_pmc_emulation); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { @@ -996,13 +996,13 @@ void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) { kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions); } -EXPORT_SYMBOL_GPL(kvm_pmu_instruction_retired); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_instruction_retired); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu) { kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches); } -EXPORT_SYMBOL_GPL(kvm_pmu_branch_retired); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_branch_retired); static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) { diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index b0b14ba37f9a..f623c5986119 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -131,7 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) kvm_mmu_reset_context(vcpu); } -EXPORT_SYMBOL_GPL(kvm_smm_changed); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_smm_changed); void process_smi(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a618a30423a7..4b8138bd4857 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -97,10 +97,10 @@ * vendor module being reloaded with 
different module parameters. */ struct kvm_caps kvm_caps __read_mostly; -EXPORT_SYMBOL_GPL(kvm_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_caps); struct kvm_host_values kvm_host __read_mostly; -EXPORT_SYMBOL_GPL(kvm_host); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_host); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -155,7 +155,7 @@ module_param(ignore_msrs, bool, 0644); bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, 0644); -EXPORT_SYMBOL_GPL(report_ignored_msrs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, 0644); @@ -169,7 +169,7 @@ module_param(tsc_tolerance_ppm, uint, 0644); bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, 0444); -EXPORT_SYMBOL_GPL(enable_vmware_backdoor); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_vmware_backdoor); /* * Flags to manipulate forced emulation behavior (any non-zero value will @@ -184,7 +184,7 @@ module_param(pi_inject_timer, bint, 0644); /* Enable/disable PMU virtualization */ bool __read_mostly enable_pmu = true; -EXPORT_SYMBOL_GPL(enable_pmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); module_param(enable_pmu, bool, 0444); bool __read_mostly eager_page_split = true; @@ -211,7 +211,7 @@ struct kvm_user_return_msrs { }; u32 __read_mostly kvm_nr_uret_msrs; -EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; static struct kvm_user_return_msrs __percpu *user_return_msrs; @@ -230,16 +230,16 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_ALL) bool __read_mostly allow_smaller_maxphyaddr = 0; -EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; -EXPORT_SYMBOL_GPL(enable_apicv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_apicv); bool __read_mostly enable_ipiv = true; -EXPORT_SYMBOL_GPL(enable_ipiv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_ipiv); bool __read_mostly enable_device_posted_irqs = true; -EXPORT_SYMBOL_GPL(enable_device_posted_irqs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_device_posted_irqs); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -628,7 +628,7 @@ int kvm_add_user_return_msr(u32 msr) kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; return kvm_nr_uret_msrs++; } -EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr); int kvm_find_user_return_msr(u32 msr) { @@ -640,7 +640,7 @@ int kvm_find_user_return_msr(u32 msr) } return -1; } -EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { @@ -680,7 +680,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) kvm_user_return_register_notifier(msrs); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) { @@ -689,13 +689,13 @@ void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) msrs->values[slot].curr = value; kvm_user_return_register_notifier(msrs); } -EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache); u64 kvm_get_user_return_msr(unsigned int slot) { return 
 	this_cpu_ptr(user_return_msrs)->values[slot].curr;
 }
-EXPORT_SYMBOL_GPL(kvm_get_user_return_msr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr);
 
 static void drop_user_return_notifiers(void)
 {
@@ -717,7 +717,7 @@ noinstr void kvm_spurious_fault(void)
 	/* Fault while not rebooting.  We want the trace. */
 	BUG_ON(!kvm_rebooting);
 }
-EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spurious_fault);
 
 #define EXCPT_BENIGN		0
 #define EXCPT_CONTRIBUTORY	1
@@ -822,7 +822,7 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
 	ex->has_payload = false;
 	ex->payload = 0;
 }
-EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_deliver_exception_payload);
 
 static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
 				       bool has_error_code, u32 error_code,
@@ -906,7 +906,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
 	kvm_multiple_exception(vcpu, nr, false, 0, false, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_queue_exception);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception);
 
 void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
@@ -914,7 +914,7 @@ void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
 {
 	kvm_multiple_exception(vcpu, nr, false, 0, true, payload);
 }
-EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_p);
 
 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
 				    u32 error_code, unsigned long payload)
@@ -949,7 +949,7 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
 	vcpu->arch.exception.has_payload = false;
 	vcpu->arch.exception.payload = 0;
 }
-EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_requeue_exception);
 
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
@@ -960,7 +960,7 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 
 	return 1;
 }
-EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_complete_insn_gp);
 
 static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
@@ -1010,7 +1010,7 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 	fault_mmu->inject_page_fault(vcpu, fault);
 }
-EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_emulated_page_fault);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
@@ -1022,7 +1022,7 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
 	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_e);
 
 /*
  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
@@ -1044,7 +1044,7 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return false;
 }
-EXPORT_SYMBOL_GPL(kvm_require_dr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr);
 
 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
 {
@@ -1099,7 +1099,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 	return 1;
 }
-EXPORT_SYMBOL_GPL(load_pdptrs);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs);
 
 static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
@@ -1152,7 +1152,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
 	if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
 		kvm_mmu_reset_context(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0);
 
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
@@ -1196,13 +1196,13 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_cr0);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0);
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
 	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 }
-EXPORT_SYMBOL_GPL(kvm_lmsw);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
 
 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
 {
@@ -1225,7 +1225,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
 	    kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
 		wrpkru(vcpu->arch.pkru);
 }
-EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_guest_xsave_state);
 
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 {
@@ -1251,7 +1251,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 	}
 
 }
-EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state);
 
 #ifdef CONFIG_X86_64
 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
@@ -1304,7 +1304,7 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 		vcpu->arch.cpuid_dynamic_bits_dirty = true;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__kvm_set_xcr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_set_xcr);
 
 int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
 {
@@ -1317,7 +1317,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
 
 	return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_xsetbv);
 
 static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
@@ -1365,7 +1365,7 @@ void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned lon
 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4);
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
@@ -1399,7 +1399,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_cr4);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4);
 
 static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
 {
@@ -1491,7 +1491,7 @@ handle_tlb_flush:
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_cr3);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3);
 
 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
@@ -1503,7 +1503,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 		vcpu->arch.cr8 = cr8;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_cr8);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8);
 
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 {
@@ -1512,7 +1512,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 	else
 		return vcpu->arch.cr8;
 }
-EXPORT_SYMBOL_GPL(kvm_get_cr8);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8);
 
 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
 {
@@ -1537,7 +1537,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu)
 	if (dr7 & DR7_BP_EN_MASK)
 		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
 }
-EXPORT_SYMBOL_GPL(kvm_update_dr7);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7);
 
 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
 {
@@ -1578,7 +1578,7 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_dr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr);
 
 unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
 {
@@ -1595,7 +1595,7 @@ unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
 		return vcpu->arch.dr7;
 	}
 }
-EXPORT_SYMBOL_GPL(kvm_get_dr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr);
 
 int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 {
@@ -1611,7 +1611,7 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 	kvm_rdx_write(vcpu, data >> 32);
 	return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc);
 
 /*
  * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
@@ -1750,7 +1750,7 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 	return __kvm_valid_efer(vcpu, efer);
 }
-EXPORT_SYMBOL_GPL(kvm_valid_efer);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer);
 
 static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
@@ -1793,7 +1793,7 @@ void kvm_enable_efer_bits(u64 mask)
 {
 	efer_reserved_bits &= ~mask;
 }
-EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits);
 
 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
 {
@@ -1836,7 +1836,7 @@ out:
 
 	return allowed;
 }
-EXPORT_SYMBOL_GPL(kvm_msr_allowed);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed);
 
 /*
  * Write @data into the MSR specified by @index.  Select MSR specific fault
@@ -2025,13 +2025,13 @@ int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
 {
 	return kvm_get_msr_ignored_check(vcpu, index, data, false);
 }
-EXPORT_SYMBOL_GPL(__kvm_emulate_msr_read);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read);
 
 int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
 {
 	return kvm_set_msr_ignored_check(vcpu, index, data, false);
 }
-EXPORT_SYMBOL_GPL(__kvm_emulate_msr_write);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write);
 
 int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
 {
@@ -2040,7 +2040,7 @@ int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
 
 	return __kvm_emulate_msr_read(vcpu, index, data);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_msr_read);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read);
 
 int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
 {
@@ -2049,7 +2049,7 @@ int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
 
 	return __kvm_emulate_msr_write(vcpu, index, data);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_msr_write);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write);
 
 static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
@@ -2158,7 +2158,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
 	return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1,
 				   complete_fast_rdmsr);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr);
 
 int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
 {
@@ -2166,7 +2166,7 @@ int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
 
 	return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr_imm);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm);
 
 static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
@@ -2194,13 +2194,13 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 	return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu),
 				   kvm_read_edx_eax(vcpu));
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr);
 
 int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
 {
 	return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg));
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr_imm);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm);
 
 int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
 {
@@ -2212,7 +2212,7 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu)
 	/* Treat an INVD instruction as a NOP and just skip it. */
 	return kvm_emulate_as_nop(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_invd);
 
 fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu)
 {
@@ -2221,14 +2221,14 @@ fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu)
 
 	return EXIT_FASTPATH_REENTER_GUEST;
 }
-EXPORT_SYMBOL_GPL(handle_fastpath_invd);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_invd);
 
 int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return 1;
 }
-EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invalid_op);
 
 static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu,
 				     const char *insn)
@@ -2254,13 +2254,13 @@ int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
 {
 	return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_mwait);
 
 int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
 {
 	return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_monitor);
 
 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
@@ -2298,13 +2298,13 @@ fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu)
 	return __handle_fastpath_wrmsr(vcpu, kvm_rcx_read(vcpu),
 				       kvm_read_edx_eax(vcpu));
 }
-EXPORT_SYMBOL_GPL(handle_fastpath_wrmsr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr);
 
 fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
 {
 	return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg));
 }
-EXPORT_SYMBOL_GPL(handle_fastpath_wrmsr_imm);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm);
 
 /*
  * Adapt set_msr() to msr_io()'s calling convention
@@ -2670,7 +2670,7 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 	return vcpu->arch.l1_tsc_offset +
 		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
 }
-EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_l1_tsc);
 
 u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
 {
@@ -2685,7 +2685,7 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
 	nested_offset += l2_offset;
 	return nested_offset;
 }
-EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_offset);
 
 u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
 {
@@ -2695,7 +2695,7 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
 
 	return l1_multiplier;
 }
-EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_multiplier);
 
 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
 {
@@ -3773,7 +3773,7 @@ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
 	if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
 		kvm_vcpu_flush_tlb_guest(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_service_local_tlb_flush_requests);
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
@@ -4327,7 +4327,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common);
 
 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 {
@@ -4680,7 +4680,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common);
 
 /*
  * Read or write a bunch of msrs.  All parameters are kernel addresses.
@@ -7836,7 +7836,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 	u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read);
 
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
 			       struct x86_exception *exception)
@@ -7847,7 +7847,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
 	access |= PFERR_WRITE_MASK;
 	return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write);
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
@@ -7933,7 +7933,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
 					  exception);
 }
-EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_virt);
 
 static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
 			     gva_t addr, void *val, unsigned int bytes,
@@ -8005,7 +8005,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 					   PFERR_WRITE_MASK, exception);
 }
-EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_virt_system);
 
 static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
 				  void *insn, int insn_len)
@@ -8039,7 +8039,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
 
 	return kvm_emulate_instruction(vcpu, emul_type);
 }
-EXPORT_SYMBOL_GPL(handle_ud);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_ud);
 
 static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 			    gpa_t gpa, bool write)
@@ -8518,7 +8518,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
 	kvm_emulate_wbinvd_noskip(vcpu);
 	return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wbinvd);
@@ -9016,7 +9016,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 		kvm_set_rflags(vcpu, ctxt->eflags);
 	}
 }
-EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_realmode_interrupt);
 
 static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
 					   u8 ndata, u8 *insn_bytes, u8 insn_size)
@@ -9081,13 +9081,13 @@ void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
 {
 	prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
 }
-EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_prepare_emulation_failure_exit);
 
 void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
 {
 	__kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_emulation_failure_exit);
 
 void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa)
 {
@@ -9109,7 +9109,7 @@ void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa)
 	run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
 	run->internal.ndata = ndata;
 }
-EXPORT_SYMBOL_GPL(kvm_prepare_event_vectoring_exit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit);
 
 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 {
@@ -9233,7 +9233,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 		r = kvm_vcpu_do_singlestep(vcpu);
 	return r;
 }
-EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_skip_emulated_instruction);
 
 static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
 {
@@ -9364,7 +9364,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
 
 	return r;
 }
-EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(x86_decode_emulated_instruction);
 
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 			    int emulation_type, void *insn, int insn_len)
@@ -9588,14 +9588,14 @@ int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
 {
 	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction);
 
 int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
 					void *insn, int insn_len)
 {
 	return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction_from_buffer);
 
 static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
 {
@@ -9690,7 +9690,7 @@ int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
 		ret = kvm_fast_pio_out(vcpu, size, port);
 	return ret && kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_fast_pio);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fast_pio);
 
 static int kvmclock_cpu_down_prep(unsigned int cpu)
 {
@@ -10147,7 +10147,7 @@ out_free_x86_emulator_cache:
 	kmem_cache_destroy(x86_emulator_cache);
 	return r;
 }
-EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_init);
 
 void kvm_x86_vendor_exit(void)
 {
@@ -10181,7 +10181,7 @@ void kvm_x86_vendor_exit(void)
 	kvm_x86_ops.enable_virtualization_cpu = NULL;
 	mutex_unlock(&vendor_module_lock);
 }
-EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_exit);
 
 #ifdef CONFIG_X86_64
 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
@@ -10245,7 +10245,7 @@ bool kvm_apicv_activated(struct kvm *kvm)
 {
 	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
 }
-EXPORT_SYMBOL_GPL(kvm_apicv_activated);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apicv_activated);
 
 bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
 {
@@ -10255,7 +10255,7 @@ bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
 
 	return (vm_reasons | vcpu_reasons) == 0;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_apicv_activated);
 
 static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
 				       enum kvm_apicv_inhibit reason, bool set)
@@ -10431,7 +10431,7 @@ out:
 	vcpu->run->hypercall.ret = ret;
 	return 1;
 }
-EXPORT_SYMBOL_GPL(____kvm_emulate_hypercall);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(____kvm_emulate_hypercall);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
@@ -10444,7 +10444,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu),
 				       complete_hypercall_exit);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_hypercall);
 
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 {
@@ -10887,7 +10887,7 @@ out:
 	preempt_enable();
 	up_read(&vcpu->kvm->arch.apicv_update_lock);
 }
-EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_update_apicv);
 
 static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 {
@@ -10963,7 +10963,7 @@ void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
 	__kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
 	up_write(&kvm->arch.apicv_update_lock);
 }
-EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_or_clear_apicv_inhibit);
 
 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
@@ -11517,7 +11517,7 @@ bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 
 	return false;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_has_events);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_has_events);
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
@@ -11670,7 +11670,7 @@ int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
 {
 	return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt_noskip);
 
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
@@ -11681,7 +11681,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 	 */
 	return kvm_emulate_halt_noskip(vcpu) && ret;
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt);
 
 fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
 {
@@ -11693,7 +11693,7 @@ fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
 
 	return EXIT_FASTPATH_EXIT_HANDLED;
 }
-EXPORT_SYMBOL_GPL(handle_fastpath_hlt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_hlt);
 
 int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
 {
@@ -11702,7 +11702,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
 	return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
 				  KVM_EXIT_AP_RESET_HOLD) && ret;
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_ap_reset_hold);
 
 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
@@ -12255,7 +12255,7 @@ unhandled_task_switch:
 	vcpu->run->internal.ndata = 0;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_task_switch);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_task_switch);
 
 static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
@@ -12956,7 +12956,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	if (init_event)
 		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_reset);
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 {
@@ -12968,7 +12968,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
 	kvm_rip_write(vcpu, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_deliver_sipi_vector);
 
 void kvm_arch_enable_virtualization(void)
 {
@@ -13086,7 +13086,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
 {
 	return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_reset_bsp);
 
 bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
 {
@@ -13250,7 +13250,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
 
 	return (void __user *)hva;
 }
-EXPORT_SYMBOL_GPL(__x86_set_memory_region);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__x86_set_memory_region);
 
 void kvm_arch_pre_destroy_vm(struct kvm *kvm)
 {
@@ -13658,13 +13658,13 @@ unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
 	return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
 		     kvm_rip_read(vcpu));
 }
-EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip);
 
 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
 {
 	return kvm_get_linear_rip(vcpu) == linear_rip;
 }
-EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 {
@@ -13675,7 +13675,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 		rflags &= ~X86_EFLAGS_TF;
 	return rflags;
 }
-EXPORT_SYMBOL_GPL(kvm_get_rflags);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags);
 
 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
@@ -13690,7 +13690,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	__kvm_set_rflags(vcpu, rflags);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_set_rflags);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags);
 
 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
 {
@@ -13933,7 +13933,7 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
 {
 	return atomic_read(&kvm->arch.noncoherent_dma_count);
 }
-EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_arch_has_noncoherent_dma);
 
 bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
 {
@@ -13989,7 +13989,7 @@ int kvm_spec_ctrl_test_value(u64 value)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value);
 
 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
 {
@@ -14014,7 +14014,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
 	}
 	vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
 }
-EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error);
 
 /*
  * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
@@ -14043,7 +14043,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_memory_failure);
 
 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 {
@@ -14107,7 +14107,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 		return 1;
 	}
 }
-EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invpcid);
 
 static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
 {
@@ -14192,7 +14192,7 @@ int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_write);
 
 int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
 			 void *data)
@@ -14230,7 +14230,7 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_read);
 
 static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
 {
@@ -14318,7 +14318,7 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
 	return in ? kvm_sev_es_ins(vcpu, size, port)
 		  : kvm_sev_es_outs(vcpu, size, port);
 }
-EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_string_io);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);