Commit 4cb92fa7 authored by Benjamin Block's avatar Benjamin Block Committed by Heiko Carstens
Browse files

s390/pci: Fix cyclic dead-lock in zpci_zdev_put() and zpci_scan_devices()



When triggering PCI device recovery by writing into the SysFS attribute
`recover` of a Physical Function with existing child SR-IOV Virtual
Functions, lockdep is reporting a possible deadlock between three
threads:

         Thread (A)             Thread (B)             Thread (C)
             |                      |                      |
      recover_store()      zpci_scan_devices()    zpci_scan_devices()
lock(pci_rescan_remove_lock)        |                      |
             |                      |                      |
             |                      |            zpci_bus_scan_busses()
             |                      |             lock(zbus_list_lock)
             |              zpci_add_device()              |
             |          lock(zpci_add_remove_lock)         |
             |                      |                      ┴
             |                      |             zpci_bus_scan_bus()
             |                      |         lock(pci_rescan_remove_lock)
             ┴                      |
      zpci_zdev_put()               |
 lock(zpci_add_remove_lock)         |
                                    ┴
                              zpci_bus_get()
                           lock(zbus_list_lock)

In zpci_bus_scan_busses() the `zbus_list_lock` is taken for the whole
duration of the function, which also includes taking
`pci_rescan_remove_lock`, among other things. But `zbus_list_lock` only
really needs to protect the modification of the global registration
`zbus_list`, it can be dropped while the functions within the list
iteration run; this way we break the cycle above.

Break up zpci_bus_scan_busses() into an "iterator" zpci_bus_get_next()
that iterates over `zbus_list` element by element, and acquires and
releases `zbus_list_lock` as necessary, but never keep holding it.
References to `zpci_bus` objects are also acquired and released.

The reference counting on `zpci_bus` objects is also changed so that all
put() and get() operations are done under the protection of
`zbus_list_lock`, and if the operation results in a modification of
`zpci_bus_list`, this modification is done in the same critical section
(apart the very first initialization). This way objects are never seen
on the list that are about to be released and/or half-initialized.

Fixes: 14c87ba8 ("s390/pci: separate zbus registration from scanning")
Suggested-by: default avatarNiklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: default avatarBenjamin Block <bblock@linux.ibm.com>
Reviewed-by: default avatarNiklas Schnelle <schnelle@linux.ibm.com>
Reviewed-by: default avatarGerd Bayer <gbayer@linux.ibm.com>
Signed-off-by: default avatarHeiko Carstens <hca@linux.ibm.com>
parent b1aa01d3
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -748,6 +748,7 @@ ForEachMacros:
  - 'ynl_attr_for_each_nested'
  - 'ynl_attr_for_each_payload'
  - 'zorro_for_each_dev'
  - 'zpci_bus_for_each'

IncludeBlocks: Preserve
IncludeCategories:
+5 −1
Original line number Diff line number Diff line
@@ -1148,6 +1148,7 @@ static void zpci_add_devices(struct list_head *scan_list)

int zpci_scan_devices(void)
{
	struct zpci_bus *zbus;
	LIST_HEAD(scan_list);
	int rc;

@@ -1156,7 +1157,10 @@ int zpci_scan_devices(void)
		return rc;

	zpci_add_devices(&scan_list);
	zpci_bus_scan_busses();
	zpci_bus_for_each(zbus) {
		zpci_bus_scan_bus(zbus);
		cond_resched();
	}
	return 0;
}

+71 −27
Original line number Diff line number Diff line
@@ -153,23 +153,6 @@ int zpci_bus_scan_bus(struct zpci_bus *zbus)
	return ret;
}

/* zpci_bus_scan_busses - Scan all registered busses
 *
 * Scan all available zbusses
 *
 */
void zpci_bus_scan_busses(void)
{
	struct zpci_bus *zbus = NULL;

	mutex_lock(&zbus_list_lock);
	list_for_each_entry(zbus, &zbus_list, bus_next) {
		zpci_bus_scan_bus(zbus);
		cond_resched();
	}
	mutex_unlock(&zbus_list_lock);
}

static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev)
{
	return !s390_pci_no_rid && zdev->rid_available &&
@@ -222,10 +205,29 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, s
	return -ENOMEM;
}

static void zpci_bus_release(struct kref *kref)
/**
 * zpci_bus_release - Un-initialize resources associated with the zbus and
 *		      free memory
 * @kref:	refcount * that is part of struct zpci_bus
 *
 * MUST be called with `zbus_list_lock` held, but the lock is released during
 * run of the function.
 */
static inline void zpci_bus_release(struct kref *kref)
	__releases(&zbus_list_lock)
{
	struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref);

	lockdep_assert_held(&zbus_list_lock);

	list_del(&zbus->bus_next);
	mutex_unlock(&zbus_list_lock);

	/*
	 * At this point no-one should see this object, or be able to get a new
	 * reference to it.
	 */

	if (zbus->bus) {
		pci_lock_rescan_remove();
		pci_stop_root_bus(zbus->bus);
@@ -237,16 +239,19 @@ static void zpci_bus_release(struct kref *kref)
		pci_unlock_rescan_remove();
	}

	mutex_lock(&zbus_list_lock);
	list_del(&zbus->bus_next);
	mutex_unlock(&zbus_list_lock);
	zpci_remove_parent_msi_domain(zbus);
	kfree(zbus);
}

static void zpci_bus_put(struct zpci_bus *zbus)
static inline void __zpci_bus_get(struct zpci_bus *zbus)
{
	lockdep_assert_held(&zbus_list_lock);
	kref_get(&zbus->kref);
}

static inline void zpci_bus_put(struct zpci_bus *zbus)
{
	kref_put(&zbus->kref, zpci_bus_release);
	kref_put_mutex(&zbus->kref, zpci_bus_release, &zbus_list_lock);
}

static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid)
@@ -258,7 +263,7 @@ static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid)
		if (!zbus->multifunction)
			continue;
		if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) {
			kref_get(&zbus->kref);
			__zpci_bus_get(zbus);
			goto out_unlock;
		}
	}
@@ -268,6 +273,44 @@ static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid)
	return zbus;
}

/**
 * zpci_bus_get_next - get the next zbus object from given position in the list
 * @pos:	current position/cursor in the global zbus list
 *
 * Acquires and releases references as the cursor iterates (might also free/
 * release the cursor). Is tolerant of concurrent operations on the list.
 *
 * To begin the iteration, set *@pos to %NULL before calling the function.
 *
 * *@pos is set to %NULL in cases where either the list is empty, or *@pos is
 * the last element in the list.
 *
 * Context: Process context. May sleep.
 */
void zpci_bus_get_next(struct zpci_bus **pos)
{
	struct zpci_bus *curp = *pos, *next = NULL;

	mutex_lock(&zbus_list_lock);
	if (curp)
		next = list_next_entry(curp, bus_next);
	else
		next = list_first_entry(&zbus_list, typeof(*curp), bus_next);

	if (list_entry_is_head(next, &zbus_list, bus_next))
		next = NULL;

	if (next)
		__zpci_bus_get(next);

	*pos = next;
	mutex_unlock(&zbus_list_lock);

	/* zpci_bus_put() might drop refcount to 0 and locks zbus_list_lock */
	if (curp)
		zpci_bus_put(curp);
}

static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid)
{
	struct zpci_bus *zbus;
@@ -279,9 +322,6 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid)
	zbus->topo = topo;
	zbus->topo_is_tid = topo_is_tid;
	INIT_LIST_HEAD(&zbus->bus_next);
	mutex_lock(&zbus_list_lock);
	list_add_tail(&zbus->bus_next, &zbus_list);
	mutex_unlock(&zbus_list_lock);

	kref_init(&zbus->kref);
	INIT_LIST_HEAD(&zbus->resources);
@@ -291,6 +331,10 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid)
	zbus->bus_resource.flags = IORESOURCE_BUS;
	pci_add_resource(&zbus->resources, &zbus->bus_resource);

	mutex_lock(&zbus_list_lock);
	list_add_tail(&zbus->bus_next, &zbus_list);
	mutex_unlock(&zbus_list_lock);

	return zbus;
}

+14 −1
Original line number Diff line number Diff line
@@ -15,7 +15,20 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops);
void zpci_bus_device_unregister(struct zpci_dev *zdev);

int zpci_bus_scan_bus(struct zpci_bus *zbus);
void zpci_bus_scan_busses(void);
void zpci_bus_get_next(struct zpci_bus **pos);

/**
 * zpci_bus_for_each - iterate over all the registered zbus objects
 * @pos:	a struct zpci_bus * as cursor
 *
 * Acquires and releases references as the cursor iterates over the registered
 * objects. Is tolerant against concurrent removals of objects.
 *
 * Context: Process context. May sleep.
 */
#define zpci_bus_for_each(pos)					     \
	for ((pos) = NULL, zpci_bus_get_next(&(pos)); (pos) != NULL; \
	     zpci_bus_get_next(&(pos)))

int zpci_bus_scan_device(struct zpci_dev *zdev);
void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error);