genirq: Introduce common irq_force_complete_move() implementation (751dc837) · Commits · git / linux-nf

arch/x86/kernel/apic/vector.c

+108 −123

Original line number	Diff line number	Diff line
		@@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd,
		return err ? err : IRQ_SET_MASK_OK;
		}

		static void free_moved_vector(struct apic_chip_data *apicd)
		{
		unsigned int vector = apicd->prev_vector;
		unsigned int cpu = apicd->prev_cpu;
		bool managed = apicd->is_managed;

		/*
		* Managed interrupts are usually not migrated away
		* from an online CPU, but CPU isolation 'managed_irq'
		* can make that happen.
		* 1) Activation does not take the isolation into account
		* to keep the code simple
		* 2) Migration away from an isolated CPU can happen when
		* a non-isolated CPU which is in the calculated
		* affinity mask comes online.
		*/
		trace_vector_free_moved(apicd->irq, cpu, vector, managed);
		irq_matrix_free(vector_matrix, cpu, vector, managed);
		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
		hlist_del_init(&apicd->clist);
		apicd->prev_vector = 0;
		apicd->move_in_progress = 0;
		}

		/*
		* Called from fixup_irqs() with @desc->lock held and interrupts disabled.
		*/
		static void apic_force_complete_move(struct irq_data *irqd)
		{
		unsigned int cpu = smp_processor_id();
		struct apic_chip_data *apicd;
		unsigned int vector;

		guard(raw_spinlock)(&vector_lock);
		apicd = apic_chip_data(irqd);
		if (!apicd)
		return;

		/*
		* If prev_vector is empty or the descriptor is neither currently
		* nor previously on the outgoing CPU no action required.
		*/
		vector = apicd->prev_vector;
		if (!vector \|\| (apicd->cpu != cpu && apicd->prev_cpu != cpu))
		return;

		/*
		* This is tricky. If the cleanup of the old vector has not been
		* done yet, then the following setaffinity call will fail with
		* -EBUSY. This can leave the interrupt in a stale state.
		*
		* All CPUs are stuck in stop machine with interrupts disabled so
		* calling __irq_complete_move() would be completely pointless.
		*
		* 1) The interrupt is in move_in_progress state. That means that we
		* have not seen an interrupt since the io_apic was reprogrammed to
		* the new vector.
		*
		* 2) The interrupt has fired on the new vector, but the cleanup IPIs
		* have not been processed yet.
		*/
		if (apicd->move_in_progress) {
		/*
		* In theory there is a race:
		*
		* set_ioapic(new_vector) <-- Interrupt is raised before update
		* is effective, i.e. it's raised on
		* the old vector.
		*
		* So if the target cpu cannot handle that interrupt before
		* the old vector is cleaned up, we get a spurious interrupt
		* and in the worst case the ioapic irq line becomes stale.
		*
		* But in case of cpu hotplug this should be a non issue
		* because if the affinity update happens right before all
		* cpus rendezvous in stop machine, there is no way that the
		* interrupt can be blocked on the target cpu because all cpus
		* loops first with interrupts enabled in stop machine, so the
		* old vector is not yet cleaned up when the interrupt fires.
		*
		* So the only way to run into this issue is if the delivery
		* of the interrupt on the apic/system bus would be delayed
		* beyond the point where the target cpu disables interrupts
		* in stop machine. I doubt that it can happen, but at least
		* there is a theoretical chance. Virtualization might be
		* able to expose this, but AFAICT the IOAPIC emulation is not
		* as stupid as the real hardware.
		*
		* Anyway, there is nothing we can do about that at this point
		* w/o refactoring the whole fixup_irq() business completely.
		* We print at least the irq number and the old vector number,
		* so we have the necessary information when a problem in that
		* area arises.
		*/
		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
		irqd->irq, vector);
		}
		free_moved_vector(apicd);
		}

		#else
		# define apic_set_affinity NULL
		# define apic_force_complete_move NULL
		#endif

		static int apic_retrigger_irq(struct irq_data *irqd)
		@@ -927,35 +1028,12 @@ static struct irq_chip lapic_controller = {
		.irq_ack = apic_ack_edge,
		.irq_set_affinity = apic_set_affinity,
		.irq_compose_msi_msg = x86_vector_msi_compose_msg,
		.irq_force_complete_move = apic_force_complete_move,
		.irq_retrigger = apic_retrigger_irq,
		};

		#ifdef CONFIG_SMP

		static void free_moved_vector(struct apic_chip_data *apicd)
		{
		unsigned int vector = apicd->prev_vector;
		unsigned int cpu = apicd->prev_cpu;
		bool managed = apicd->is_managed;

		/*
		* Managed interrupts are usually not migrated away
		* from an online CPU, but CPU isolation 'managed_irq'
		* can make that happen.
		* 1) Activation does not take the isolation into account
		* to keep the code simple
		* 2) Migration away from an isolated CPU can happen when
		* a non-isolated CPU which is in the calculated
		* affinity mask comes online.
		*/
		trace_vector_free_moved(apicd->irq, cpu, vector, managed);
		irq_matrix_free(vector_matrix, cpu, vector, managed);
		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
		hlist_del_init(&apicd->clist);
		apicd->prev_vector = 0;
		apicd->move_in_progress = 0;
		}

		static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
		{
		struct apic_chip_data *apicd;
		@@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg)
		__vector_schedule_cleanup(apicd);
		}

		/*
		* Called from fixup_irqs() with @desc->lock held and interrupts disabled.
		*/
		void irq_force_complete_move(struct irq_desc *desc)
		{
		unsigned int cpu = smp_processor_id();
		struct apic_chip_data *apicd;
		struct irq_data *irqd;
		unsigned int vector;

		/*
		* The function is called for all descriptors regardless of which
		* irqdomain they belong to. For example if an IRQ is provided by
		* an irq_chip as part of a GPIO driver, the chip data for that
		* descriptor is specific to the irq_chip in question.
		*
		* Check first that the chip_data is what we expect
		* (apic_chip_data) before touching it any further.
		*/
		irqd = irq_domain_get_irq_data(x86_vector_domain,
		irq_desc_get_irq(desc));
		if (!irqd)
		return;

		raw_spin_lock(&vector_lock);
		apicd = apic_chip_data(irqd);
		if (!apicd)
		goto unlock;

		/*
		* If prev_vector is empty or the descriptor is neither currently
		* nor previously on the outgoing CPU no action required.
		*/
		vector = apicd->prev_vector;
		if (!vector \|\| (apicd->cpu != cpu && apicd->prev_cpu != cpu))
		goto unlock;

		/*
		* This is tricky. If the cleanup of the old vector has not been
		* done yet, then the following setaffinity call will fail with
		* -EBUSY. This can leave the interrupt in a stale state.
		*
		* All CPUs are stuck in stop machine with interrupts disabled so
		* calling __irq_complete_move() would be completely pointless.
		*
		* 1) The interrupt is in move_in_progress state. That means that we
		* have not seen an interrupt since the io_apic was reprogrammed to
		* the new vector.
		*
		* 2) The interrupt has fired on the new vector, but the cleanup IPIs
		* have not been processed yet.
		*/
		if (apicd->move_in_progress) {
		/*
		* In theory there is a race:
		*
		* set_ioapic(new_vector) <-- Interrupt is raised before update
		* is effective, i.e. it's raised on
		* the old vector.
		*
		* So if the target cpu cannot handle that interrupt before
		* the old vector is cleaned up, we get a spurious interrupt
		* and in the worst case the ioapic irq line becomes stale.
		*
		* But in case of cpu hotplug this should be a non issue
		* because if the affinity update happens right before all
		* cpus rendezvous in stop machine, there is no way that the
		* interrupt can be blocked on the target cpu because all cpus
		* loops first with interrupts enabled in stop machine, so the
		* old vector is not yet cleaned up when the interrupt fires.
		*
		* So the only way to run into this issue is if the delivery
		* of the interrupt on the apic/system bus would be delayed
		* beyond the point where the target cpu disables interrupts
		* in stop machine. I doubt that it can happen, but at least
		* there is a theoretical chance. Virtualization might be
		* able to expose this, but AFAICT the IOAPIC emulation is not
		* as stupid as the real hardware.
		*
		* Anyway, there is nothing we can do about that at this point
		* w/o refactoring the whole fixup_irq() business completely.
		* We print at least the irq number and the old vector number,
		* so we have the necessary information when a problem in that
		* area arises.
		*/
		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
		irqd->irq, vector);
		}
		free_moved_vector(apicd);
		unlock:
		raw_spin_unlock(&vector_lock);
		}

		#ifdef CONFIG_HOTPLUG_CPU
		/*
		* Note, this is not accurate accounting, but at least good enough to

include/linux/irq.h

+3 −2

Original line number	Diff line number	Diff line
		@@ -486,6 +486,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
		* @ipi_send_mask: send an IPI to destination cpus in cpumask
		* @irq_nmi_setup: function called from core code before enabling an NMI
		* @irq_nmi_teardown: function called from core code after disabling an NMI
		* @irq_force_complete_move: optional function to force complete pending irq move
		* @flags: chip specific flags
		*/
		struct irq_chip {
		@@ -537,6 +538,8 @@ struct irq_chip {
		int (irq_nmi_setup)(struct irq_data data);
		void (irq_nmi_teardown)(struct irq_data data);

		void (irq_force_complete_move)(struct irq_data data);

		unsigned long flags;
		};

		@@ -619,11 +622,9 @@ static inline void irq_move_irq(struct irq_data *data)
		__irq_move_irq(data);
		}
		void irq_move_masked_irq(struct irq_data *data);
		void irq_force_complete_move(struct irq_desc *desc);
		#else
		static inline void irq_move_irq(struct irq_data *data) { }
		static inline void irq_move_masked_irq(struct irq_data *data) { }
		static inline void irq_force_complete_move(struct irq_desc *desc) { }
		#endif

		extern int no_irq_affinity;

kernel/irq/internals.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -442,6 +442,7 @@ static inline struct cpumask irq_desc_get_pending_mask(struct irq_desc desc)
		return desc->pending_mask;
		}
		bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
		void irq_force_complete_move(struct irq_desc *desc);
		#else /* CONFIG_GENERIC_PENDING_IRQ */
		static inline bool irq_can_move_pcntxt(struct irq_data *data)
		{
		@@ -467,6 +468,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
		{
		return false;
		}
		static inline void irq_force_complete_move(struct irq_desc *desc) { }
		#endif /* !CONFIG_GENERIC_PENDING_IRQ */

		#if !defined(CONFIG_IRQ_DOMAIN) \|\| !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)

kernel/irq/migration.c

+10 −0

Original line number	Diff line number	Diff line
		@@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
		return true;
		}

		void irq_force_complete_move(struct irq_desc *desc)
		{
		for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) {
		if (d->chip && d->chip->irq_force_complete_move) {
		d->chip->irq_force_complete_move(d);
		return;
		}
		}
		}

		void irq_move_masked_irq(struct irq_data *idata)
		{
		struct irq_desc *desc = irq_data_to_desc(idata);