Commit 726a9b67 authored by Song Liu's avatar Song Liu
Browse files

Merge branch 'md-next-rcu-cleanup' into md-next

From Yu Kuai:

md: remove rcu protection to access rdev from conf

The lifetime of rdev:

1. md_import_device() generates an rdev based on the underlying disk;

   mddev_lock()
   rdev = kzalloc();
   rdev->bdev = blkdev_get_by_dev();
   mddev_unlock()

2. bind_rdev_to_array() add this rdev to mddev->disks;

   mddev_lock()
   kobject_add(&rdev->kobj, &mddev->kobj, ...);
   list_add_rcu(&rdev->same_set, &mddev->disks);
   mddev_unlock()

3. remove_and_add_spares() add this rdev to conf;

   mddev_lock()
   rdev_addable();
   pers->hot_add_disk();
   rcu_assign_pointer(conf->rdev, rdev);
   mddev_unlock()

4. Use this array with rdev;

5. remove_and_add_spares() remove rdev from conf;

   // triggered by sysfs/ioctl
   mddev_lock()
   rdev_removeable();
   pers->hot_remove_disk();
    rcu_assign_pointer(conf->rdev, NULL);
    synchronize_rcu();
   mddev_unlock()

   // triggered by daemon
   mddev_lock()
   rdev_removeable();
   synchronize_rcu(); -> this can't protect accessing rdev from conf
   pers->hot_remove_disk();
    rcu_assign_pointer(conf->rdev, NULL);
   mddev_unlock()

6. md_kick_rdev_from_array() remove rdev from mddev->disks;

   mddev_lock()
   list_del_rcu(&rdev->same_set);
   synchronize_rcu();
   list_add(&rdev->same_set, &mddev->deleting)
   mddev_unlock()
    export_rdev

There are two separate rcu protections for rdev, and this patchset removes
the protection of conf (steps 3 and 5), because it's safe to access rdev
from conf in the following cases:

 - If 'reconfig_mutex' is held, because rdev can't be added to or removed
 from conf;
 - If there is normal IO inflight, because mddev_suspend() will wait for
 IO to be done and prevent rdev from being added to or removed from conf;
 - If the sync thread is running, because remove_and_add_spares() can only
 be called from the daemon thread when the sync thread is done, and
 'MD_RECOVERY_RUNNING' is also checked for ioctl/sysfs;
 - If any spinlock or rcu_read_lock() is held, because synchronize_rcu()
 from step 6 prevents rdev from being freed until the spinlock is released
 or rcu_read_unlock() is called;
parents bed9e27b 7ecab28c
Loading
Loading
Loading
Loading
+12 −20
Original line number Diff line number Diff line
@@ -32,17 +32,15 @@ static int multipath_map (struct mpconf *conf)
	 * now we use the first available disk.
	 */

	rcu_read_lock();
	for (i = 0; i < disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
		struct md_rdev *rdev = conf->multipaths[i].rdev;

		if (rdev && test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags)) {
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			return i;
		}
	}
	rcu_read_unlock();

	pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
	return (-1);
@@ -137,14 +135,16 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev)
	struct mpconf *conf = mddev->private;
	int i;

	lockdep_assert_held(&mddev->lock);

	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
		    conf->raid_disks - mddev->degraded);
	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
		seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
		struct md_rdev *rdev = READ_ONCE(conf->multipaths[i].rdev);

		seq_printf(seq, "%s",
			   rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
	}
	rcu_read_unlock();
	seq_putc(seq, ']');
}

@@ -195,6 +195,7 @@ static void print_multipath_conf (struct mpconf *conf)
	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
		 conf->raid_disks);

	lockdep_assert_held(&conf->mddev->reconfig_mutex);
	for (i = 0; i < conf->raid_disks; i++) {
		tmp = conf->multipaths + i;
		if (tmp->rdev)
@@ -231,7 +232,7 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
			rdev->raid_disk = path;
			set_bit(In_sync, &rdev->flags);
			spin_unlock_irq(&conf->device_lock);
			rcu_assign_pointer(p->rdev, rdev);
			WRITE_ONCE(p->rdev, rdev);
			err = 0;
			break;
		}
@@ -257,16 +258,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
			err = -EBUSY;
			goto abort;
		}
		p->rdev = NULL;
		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
			synchronize_rcu();
			if (atomic_read(&rdev->nr_pending)) {
				/* lost the race, try later */
				err = -EBUSY;
				p->rdev = rdev;
				goto abort;
			}
		}
		WRITE_ONCE(p->rdev, NULL);
		err = md_integrity_register(mddev);
	}
abort:
+6 −31
Original line number Diff line number Diff line
@@ -9244,45 +9244,20 @@ static int remove_and_add_spares(struct mddev *mddev,
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;
	bool remove_some = false;

	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		/* Mustn't remove devices when resync thread is running */
		return 0;

	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    test_bit(Faulty, &rdev->flags) &&
		    atomic_read(&rdev->nr_pending)==0) {
			/* Faulty non-Blocked devices with nr_pending == 0
			 * never get nr_pending incremented,
			 * never get Faulty cleared, and never get Blocked set.
			 * So we can synchronize_rcu now rather than once per device
			 */
			remove_some = true;
			set_bit(RemoveSynchronized, &rdev->flags);
		}
	}

	if (remove_some)
		synchronize_rcu();
	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    (test_bit(RemoveSynchronized, &rdev->flags) ||
		     rdev_removeable(rdev))) {
			if (mddev->pers->hot_remove_disk(
				    mddev, rdev) == 0) {
		if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
		    !mddev->pers->hot_remove_disk(mddev, rdev)) {
			sysfs_unlink_rdev(mddev, rdev);
			rdev->saved_raid_disk = rdev->raid_disk;
			rdev->raid_disk = -1;
			removed++;
		}
	}
		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
			clear_bit(RemoveSynchronized, &rdev->flags);
	}

	if (removed && mddev->kobj.sd)
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
+0 −5
Original line number Diff line number Diff line
@@ -190,11 +190,6 @@ enum flag_bits {
				 * than other devices in the array
				 */
	ClusterRemove,
	RemoveSynchronized,	/* synchronize_rcu() was called after
				 * this device was known to be faulty,
				 * so it is safe to remove without
				 * another synchronize_rcu() call.
				 */
	ExternalBbl,            /* External metadata provides bad
				 * block management for a disk
				 */
+23 −48
Original line number Diff line number Diff line
@@ -609,7 +609,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
	int choose_first;
	int choose_next_idle;

	rcu_read_lock();
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on, or below the resync window.
@@ -642,7 +641,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
		unsigned int pending;
		bool nonrot;

		rdev = rcu_dereference(conf->mirrors[disk].rdev);
		rdev = conf->mirrors[disk].rdev;
		if (r1_bio->bios[disk] == IO_BLOCKED
		    || rdev == NULL
		    || test_bit(Faulty, &rdev->flags))
@@ -773,7 +772,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
	}

	if (best_disk >= 0) {
		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
		rdev = conf->mirrors[best_disk].rdev;
		if (!rdev)
			goto retry;
		atomic_inc(&rdev->nr_pending);
@@ -784,7 +783,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect

		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
	}
	rcu_read_unlock();
	*max_sectors = sectors;

	return best_disk;
@@ -1235,14 +1233,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,

	if (r1bio_existed) {
		/* Need to get the block device name carefully */
		struct md_rdev *rdev;
		rcu_read_lock();
		rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
		struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;

		if (rdev)
			snprintf(b, sizeof(b), "%pg", rdev->bdev);
		else
			strcpy(b, "???");
		rcu_read_unlock();
	}

	/*
@@ -1396,10 +1392,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,

	disks = conf->raid_disks * 2;
	blocked_rdev = NULL;
	rcu_read_lock();
	max_sectors = r1_bio->sectors;
	for (i = 0;  i < disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		struct md_rdev *rdev = conf->mirrors[i].rdev;

		/*
		 * The write-behind io is only attempted on drives marked as
@@ -1465,7 +1460,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
		}
		r1_bio->bios[i] = bio;
	}
	rcu_read_unlock();

	if (unlikely(blocked_rdev)) {
		/* Wait for this device to become unblocked */
@@ -1617,15 +1611,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
	struct r1conf *conf = mddev->private;
	int i;

	lockdep_assert_held(&mddev->lock);

	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
		   conf->raid_disks - mddev->degraded);
	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);

		seq_printf(seq, "%s",
			   rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
	}
	rcu_read_unlock();
	seq_printf(seq, "]");
}

@@ -1691,16 +1686,15 @@ static void print_conf(struct r1conf *conf)
	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
		 conf->raid_disks);

	rcu_read_lock();
	lockdep_assert_held(&conf->mddev->reconfig_mutex);
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		struct md_rdev *rdev = conf->mirrors[i].rdev;
		if (rdev)
			pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
				 i, !test_bit(In_sync, &rdev->flags),
				 !test_bit(Faulty, &rdev->flags),
				 rdev->bdev);
	}
	rcu_read_unlock();
}

static void close_sync(struct r1conf *conf)
@@ -1810,7 +1804,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
			 */
			if (rdev->saved_raid_disk < 0)
				conf->fullsync = 1;
			rcu_assign_pointer(p->rdev, rdev);
			WRITE_ONCE(p->rdev, rdev);
			break;
		}
		if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1826,7 +1820,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
		rdev->raid_disk = repl_slot;
		err = 0;
		conf->fullsync = 1;
		rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
		WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
	}

	print_conf(conf);
@@ -1862,16 +1856,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
			err = -EBUSY;
			goto abort;
		}
		p->rdev = NULL;
		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
			synchronize_rcu();
			if (atomic_read(&rdev->nr_pending)) {
				/* lost the race, try later */
				err = -EBUSY;
				p->rdev = rdev;
				goto abort;
			}
		}
		WRITE_ONCE(p->rdev, NULL);
		if (conf->mirrors[conf->raid_disks + number].rdev) {
			/* We just removed a device that is being replaced.
			 * Move down the replacement.  We drain all IO before
@@ -1892,7 +1877,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
				goto abort;
			}
			clear_bit(Replacement, &repl->flags);
			p->rdev = repl;
			WRITE_ONCE(p->rdev, repl);
			conf->mirrors[conf->raid_disks + number].rdev = NULL;
			unfreeze_array(conf);
		}
@@ -2290,8 +2275,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
			sector_t first_bad;
			int bad_sectors;

			rcu_read_lock();
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			rdev = conf->mirrors[d].rdev;
			if (rdev &&
			    (test_bit(In_sync, &rdev->flags) ||
			     (!test_bit(Faulty, &rdev->flags) &&
@@ -2299,15 +2283,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
			    is_badblock(rdev, sect, s,
					&first_bad, &bad_sectors) == 0) {
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				if (sync_page_io(rdev, sect, s<<9,
					 conf->tmppage, REQ_OP_READ, false))
					success = 1;
				rdev_dec_pending(rdev, mddev);
				if (success)
					break;
			} else
				rcu_read_unlock();
			}

			d++;
			if (d == conf->raid_disks * 2)
				d = 0;
@@ -2326,29 +2309,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
			if (d==0)
				d = conf->raid_disks * 2;
			d--;
			rcu_read_lock();
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			rdev = conf->mirrors[d].rdev;
			if (rdev &&
			    !test_bit(Faulty, &rdev->flags)) {
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				r1_sync_page_io(rdev, sect, s,
						conf->tmppage, WRITE);
				rdev_dec_pending(rdev, mddev);
			} else
				rcu_read_unlock();
			}
		}
		d = start;
		while (d != read_disk) {
			if (d==0)
				d = conf->raid_disks * 2;
			d--;
			rcu_read_lock();
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			rdev = conf->mirrors[d].rdev;
			if (rdev &&
			    !test_bit(Faulty, &rdev->flags)) {
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				if (r1_sync_page_io(rdev, sect, s,
						    conf->tmppage, READ)) {
					atomic_add(s, &rdev->corrected_errors);
@@ -2359,8 +2337,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
						rdev->bdev);
				}
				rdev_dec_pending(rdev, mddev);
			} else
				rcu_read_unlock();
			}
		}
		sectors -= s;
		sect += s;
@@ -2741,7 +2718,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,

	r1_bio = raid1_alloc_init_r1buf(conf);

	rcu_read_lock();
	/*
	 * If we get a correctably read error during resync or recovery,
	 * we might want to read from a different device.  So we
@@ -2762,7 +2738,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
		struct md_rdev *rdev;
		bio = r1_bio->bios[i];

		rdev = rcu_dereference(conf->mirrors[i].rdev);
		rdev = conf->mirrors[i].rdev;
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags)) {
			if (i < conf->raid_disks)
@@ -2820,7 +2796,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
				bio->bi_opf |= MD_FAILFAST;
		}
	}
	rcu_read_unlock();
	if (disk < 0)
		disk = wonly;
	r1_bio->read_disk = disk;
+58 −164

File changed.

Preview size limit exceeded, changes collapsed.

Loading