Merge tag 'kvm-x86-irqs-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM IRQ changes for 6.17

 - Rework irqbypass to track/match producers and consumers via an xarray
   instead of a linked list.  Using a linked list leads to O(n^2) insertion
   times, which is hugely problematic for use cases that create large numbers
   of VMs.  Such use cases typically don't actually use irqbypass, but
   eliminating the pointless registration is a future problem to solve as it
   likely requires new uAPI.

 - Track irqbypass's "token" as "struct eventfd_ctx *" instead of a "void *",
   to avoid making a simple concept unnecessarily difficult to understand.

 - Add CONFIG_KVM_IOAPIC for x86 to allow disabling support for I/O APIC, PIC,
   and PIT emulation at compile time.

 - Drop x86's irq_comm.c, and move a pile of IRQ related code into irq.c.

 - Fix a variety of flaws and bugs in the AVIC device posted IRQ code.

 - Inhibit AVIC if a vCPU's ID is too big (relative to what hardware
   supports) instead of rejecting vCPU creation.

 - Extend enable_ipiv module param support to SVM, by simply leaving IsRunning
   clear in the vCPU's physical ID table entry.

 - Disable IPI virtualization, via enable_ipiv, if the CPU is affected by
   erratum #1235, to allow (safely) enabling AVIC on such CPUs.

 - Dedup x86's device posted IRQ code, as the vast majority of functionality
   can be shared verbatim between SVM and VMX.

 - Harden the device posted IRQ code against bugs and runtime errors.

 - Use vcpu_idx, not vcpu_id, for GA log tag/metadata, to make lookups O(1)
   instead of O(n).

 - Generate GA Log interrupts if and only if the target vCPU is blocking, i.e.
   only if KVM needs a notification in order to wake the vCPU.

 - Decouple device posted IRQs from VFIO device assignment, as binding a VM to
   a VFIO group is not a requirement for enabling device posted IRQs.

 - Clean up and document/comment the irqfd assignment code.

 - Disallow binding multiple irqfds to an eventfd with a priority waiter, i.e.
   ensure an eventfd is bound to at most one irqfd through the entire host,
   and add a selftest to verify eventfd:irqfd bindings are globally unique.
This commit is contained in:
Paolo Bonzini
2025-07-28 11:03:04 -04:00
56 changed files with 1847 additions and 1765 deletions

View File

@@ -204,6 +204,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
int ret = 0;
if (flags & EPOLLIN) {
/*
* WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP,
* as KVM holds irqfds.lock when registering the irqfd with the
* eventfd.
*/
u64 cnt;
eventfd_ctx_do_read(irqfd->eventfd, &cnt);
@@ -225,6 +230,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
/* The eventfd is closing, detach from KVM */
unsigned long iflags;
/*
* Taking irqfds.lock is safe here, as KVM holds a reference to
* the eventfd when registering the irqfd, i.e. this path can't
* be reached while kvm_irqfd_add() is running.
*/
spin_lock_irqsave(&kvm->irqfds.lock, iflags);
/*
@@ -245,22 +255,14 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
return ret;
}
/*
 * poll_table callback: install the irqfd's wait queue entry as a priority
 * waiter on the eventfd's waitqueue so irqfd_wakeup() is invoked on signal.
 */
static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
			poll_table *pt)
{
	struct kvm_kernel_irqfd *tmp;

	tmp = container_of(pt, struct kvm_kernel_irqfd, pt);
	add_wait_queue_priority(wqh, &tmp->wait);
}
/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
int n_entries;
lockdep_assert_held(&kvm->irqfds.lock);
n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
write_seqcount_begin(&irqfd->irq_entry_sc);
@@ -274,6 +276,63 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
write_seqcount_end(&irqfd->irq_entry_sc);
}
/*
 * State passed to the poll_table callback (kvm_irqfd_register()) via
 * vfs_poll().  The callback returns void, so the registration result is
 * reported back through @ret instead of a return value.
 */
struct kvm_irqfd_pt {
	struct kvm_kernel_irqfd *irqfd;	/* irqfd being registered */
	struct kvm *kvm;		/* VM that owns the irqfd */
	poll_table pt;			/* embedded poll_table; container_of() anchor */
	int ret;			/* registration result, 0 on success */
};
/*
 * poll_table callback invoked from vfs_poll() on the eventfd's file.  Sets up
 * the irqfd's routing, registers the irqfd as an exclusive priority waiter on
 * the eventfd's waitqueue, and adds it to kvm->irqfds.items.  Reports the
 * result via p->ret, as poll_table callbacks return void.
 */
static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
			       poll_table *pt)
{
	struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt);
	struct kvm_kernel_irqfd *irqfd = p->irqfd;
	struct kvm *kvm = p->kvm;

	/*
	 * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing,
	 * and irqfds.items.  It does NOT protect registering with the eventfd.
	 */
	spin_lock_irq(&kvm->irqfds.lock);

	/*
	 * Initialize the routing information prior to adding the irqfd to the
	 * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the
	 * irqfd is registered.
	 */
	irqfd_update(kvm, irqfd);

	/*
	 * Add the irqfd as a priority waiter on the eventfd, with a custom
	 * wake-up handler, so that KVM *and only KVM* is notified whenever the
	 * underlying eventfd is signaled.
	 */
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);

	/*
	 * Temporarily lie to lockdep about holding irqfds.lock to avoid a
	 * false positive regarding potential deadlock with irqfd_wakeup()
	 * (see irqfd_wakeup() for details).
	 *
	 * Adding to the wait queue will fail if there is already a priority
	 * waiter, i.e. if the eventfd is associated with another irqfd (in any
	 * VM).  Note, kvm_irqfd_deassign() waits for all in-flight shutdown
	 * jobs to complete, i.e. ensures the irqfd has been removed from the
	 * eventfd's waitqueue before returning to userspace.
	 */
	spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_);
	p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait);
	spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_);
	if (p->ret)
		goto out;

	list_add_tail(&irqfd->list, &kvm->irqfds.items);

out:
	spin_unlock_irq(&kvm->irqfds.lock);
}
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
struct irq_bypass_consumer *cons)
@@ -285,26 +344,20 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start(
{
}
int __attribute__((weak)) kvm_arch_update_irqfd_routing(
struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
return 0;
}
bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
return true;
}
#endif
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct kvm_kernel_irqfd *irqfd, *tmp;
struct kvm_kernel_irqfd *irqfd;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
struct kvm_irqfd_pt irqfd_pt;
int ret;
__poll_t events;
int idx;
@@ -390,57 +443,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
}
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd
* Set the irqfd routing and add it to KVM's list before registering
* the irqfd with the eventfd, so that the routing information is valid
* and stays valid, e.g. if there are GSI routing changes, prior to
* making the irqfd visible, i.e. before it might be signaled.
*
* Note, holding SRCU ensures a stable read of routing information, and
* also prevents irqfd_shutdown() from freeing the irqfd before it's
* fully initialized.
*/
init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
spin_lock_irq(&kvm->irqfds.lock);
ret = 0;
list_for_each_entry(tmp, &kvm->irqfds.items, list) {
if (irqfd->eventfd != tmp->eventfd)
continue;
/* This fd is used for another irq already. */
ret = -EBUSY;
spin_unlock_irq(&kvm->irqfds.lock);
goto fail;
}
idx = srcu_read_lock(&kvm->irq_srcu);
irqfd_update(kvm, irqfd);
list_add_tail(&irqfd->list, &kvm->irqfds.items);
spin_unlock_irq(&kvm->irqfds.lock);
/*
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
* Register the irqfd with the eventfd by polling on the eventfd, and
* simultaneously add the irqfd to KVM's list. If there was an event
* pending on the eventfd prior to registering, manually trigger IRQ
* injection.
*/
events = vfs_poll(fd_file(f), &irqfd->pt);
irqfd_pt.irqfd = irqfd;
irqfd_pt.kvm = kvm;
init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register);
events = vfs_poll(fd_file(f), &irqfd_pt.pt);
ret = irqfd_pt.ret;
if (ret)
goto fail_poll;
if (events & EPOLLIN)
schedule_work(&irqfd->inject);
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
if (kvm_arch_has_irq_bypass()) {
irqfd->consumer.token = (void *)irqfd->eventfd;
irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
irqfd->consumer.start = kvm_arch_irq_bypass_start;
ret = irq_bypass_register_consumer(&irqfd->consumer);
ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
if (ret)
pr_info("irq bypass consumer (token %p) registration fails: %d\n",
irqfd->consumer.token, ret);
pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
irqfd->eventfd, ret);
}
#endif
srcu_read_unlock(&kvm->irq_srcu, idx);
return 0;
fail_poll:
srcu_read_unlock(&kvm->irq_srcu, idx);
fail:
if (irqfd->resampler)
irqfd_resampler_shutdown(irqfd);
@@ -617,13 +667,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
irqfd_update(kvm, irqfd);
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
if (irqfd->producer &&
kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
int ret = kvm_arch_update_irqfd_routing(
irqfd->kvm, irqfd->producer->irq,
irqfd->gsi, 1);
WARN_ON(ret);
}
if (irqfd->producer)
kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
#endif
}

View File

@@ -222,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm,
kvm_arch_irq_routing_update(kvm);
mutex_unlock(&kvm->irq_lock);
kvm_arch_post_irq_routing_update(kvm);
synchronize_srcu_expedited(&kvm->irq_srcu);
new = old;

View File

@@ -22,8 +22,8 @@
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("IRQ bypass manager utility module");
static LIST_HEAD(producers);
static LIST_HEAD(consumers);
static DEFINE_XARRAY(producers);
static DEFINE_XARRAY(consumers);
static DEFINE_MUTEX(lock);
/* @lock must be held when calling connect */
@@ -51,6 +51,10 @@ static int __connect(struct irq_bypass_producer *prod,
if (prod->start)
prod->start(prod);
if (!ret) {
prod->consumer = cons;
cons->producer = prod;
}
return ret;
}
@@ -72,56 +76,49 @@ static void __disconnect(struct irq_bypass_producer *prod,
cons->start(cons);
if (prod->start)
prod->start(prod);
prod->consumer = NULL;
cons->producer = NULL;
}
/**
* irq_bypass_register_producer - register IRQ bypass producer
* @producer: pointer to producer structure
* @eventfd: pointer to the eventfd context associated with the producer
* @irq: Linux IRQ number of the underlying producer device
*
* Add the provided IRQ producer to the list of producers and connect
* with any matching token found on the IRQ consumers list.
* Add the provided IRQ producer to the set of producers and connect with the
* consumer with a matching eventfd, if one exists.
*/
int irq_bypass_register_producer(struct irq_bypass_producer *producer)
int irq_bypass_register_producer(struct irq_bypass_producer *producer,
struct eventfd_ctx *eventfd, int irq)
{
struct irq_bypass_producer *tmp;
unsigned long index = (unsigned long)eventfd;
struct irq_bypass_consumer *consumer;
int ret;
if (!producer->token)
if (WARN_ON_ONCE(producer->eventfd))
return -EINVAL;
might_sleep();
producer->irq = irq;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
guard(mutex)(&lock);
mutex_lock(&lock);
ret = xa_insert(&producers, index, producer, GFP_KERNEL);
if (ret)
return ret;
list_for_each_entry(tmp, &producers, node) {
if (tmp->token == producer->token) {
ret = -EBUSY;
goto out_err;
consumer = xa_load(&consumers, index);
if (consumer) {
ret = __connect(producer, consumer);
if (ret) {
WARN_ON_ONCE(xa_erase(&producers, index) != producer);
return ret;
}
}
list_for_each_entry(consumer, &consumers, node) {
if (consumer->token == producer->token) {
ret = __connect(producer, consumer);
if (ret)
goto out_err;
break;
}
}
list_add(&producer->node, &producers);
mutex_unlock(&lock);
producer->eventfd = eventfd;
return 0;
out_err:
mutex_unlock(&lock);
module_put(THIS_MODULE);
return ret;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
@@ -129,95 +126,65 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
* irq_bypass_unregister_producer - unregister IRQ bypass producer
* @producer: pointer to producer structure
*
* Remove a previously registered IRQ producer from the list of producers
* and disconnect it from any connected IRQ consumer.
* Remove a previously registered IRQ producer (note, it's safe to call this
* even if registration was unsuccessful). Disconnect from the associated
* consumer, if one exists.
*/
void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
{
struct irq_bypass_producer *tmp;
struct irq_bypass_consumer *consumer;
unsigned long index = (unsigned long)producer->eventfd;
if (!producer->token)
if (!producer->eventfd)
return;
might_sleep();
guard(mutex)(&lock);
if (!try_module_get(THIS_MODULE))
return; /* nothing in the list anyway */
if (producer->consumer)
__disconnect(producer, producer->consumer);
mutex_lock(&lock);
list_for_each_entry(tmp, &producers, node) {
if (tmp->token != producer->token)
continue;
list_for_each_entry(consumer, &consumers, node) {
if (consumer->token == producer->token) {
__disconnect(producer, consumer);
break;
}
}
list_del(&producer->node);
module_put(THIS_MODULE);
break;
}
mutex_unlock(&lock);
module_put(THIS_MODULE);
WARN_ON_ONCE(xa_erase(&producers, index) != producer);
producer->eventfd = NULL;
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
/**
* irq_bypass_register_consumer - register IRQ bypass consumer
* @consumer: pointer to consumer structure
* @eventfd: pointer to the eventfd context associated with the consumer
*
* Add the provided IRQ consumer to the list of consumers and connect
* with any matching token found on the IRQ producer list.
* Add the provided IRQ consumer to the set of consumers and connect with the
* producer with a matching eventfd, if one exists.
*/
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
struct eventfd_ctx *eventfd)
{
struct irq_bypass_consumer *tmp;
unsigned long index = (unsigned long)eventfd;
struct irq_bypass_producer *producer;
int ret;
if (!consumer->token ||
!consumer->add_producer || !consumer->del_producer)
if (WARN_ON_ONCE(consumer->eventfd))
return -EINVAL;
might_sleep();
if (!consumer->add_producer || !consumer->del_producer)
return -EINVAL;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
guard(mutex)(&lock);
mutex_lock(&lock);
ret = xa_insert(&consumers, index, consumer, GFP_KERNEL);
if (ret)
return ret;
list_for_each_entry(tmp, &consumers, node) {
if (tmp->token == consumer->token || tmp == consumer) {
ret = -EBUSY;
goto out_err;
producer = xa_load(&producers, index);
if (producer) {
ret = __connect(producer, consumer);
if (ret) {
WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
return ret;
}
}
list_for_each_entry(producer, &producers, node) {
if (producer->token == consumer->token) {
ret = __connect(producer, consumer);
if (ret)
goto out_err;
break;
}
}
list_add(&consumer->node, &consumers);
mutex_unlock(&lock);
consumer->eventfd = eventfd;
return 0;
out_err:
mutex_unlock(&lock);
module_put(THIS_MODULE);
return ret;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
@@ -225,42 +192,23 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
* irq_bypass_unregister_consumer - unregister IRQ bypass consumer
* @consumer: pointer to consumer structure
*
* Remove a previously registered IRQ consumer from the list of consumers
* and disconnect it from any connected IRQ producer.
* Remove a previously registered IRQ consumer (note, it's safe to call this
* even if registration was unsuccessful). Disconnect from the associated
* producer, if one exists.
*/
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
{
struct irq_bypass_consumer *tmp;
struct irq_bypass_producer *producer;
unsigned long index = (unsigned long)consumer->eventfd;
if (!consumer->token)
if (!consumer->eventfd)
return;
might_sleep();
guard(mutex)(&lock);
if (!try_module_get(THIS_MODULE))
return; /* nothing in the list anyway */
if (consumer->producer)
__disconnect(consumer->producer, consumer);
mutex_lock(&lock);
list_for_each_entry(tmp, &consumers, node) {
if (tmp != consumer)
continue;
list_for_each_entry(producer, &producers, node) {
if (producer->token == consumer->token) {
__disconnect(producer, consumer);
break;
}
}
list_del(&consumer->node);
module_put(THIS_MODULE);
break;
}
mutex_unlock(&lock);
module_put(THIS_MODULE);
WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
consumer->eventfd = NULL;
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);