Commit 86b1e613 authored by Jens Axboe's avatar Jens Axboe
Browse files

Merge tag 'md-6.9-20240301' of...

Merge tag 'md-6.9-20240301' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.9/block

Pull MD updates from Song:

"The major changes are:

 1. Refactor raid1 read_balance, by Yu Kuai and Paul Luse.
 2. Clean up and fix for md_ioctl, by Li Nan.
 3. Other small fixes, by Gui-Dong Han and Heming Zhao."

* tag 'md-6.9-20240301' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md: (22 commits)
  md/raid1: factor out helpers to choose the best rdev from read_balance()
  md/raid1: factor out the code to manage sequential IO
  md/raid1: factor out choose_bb_rdev() from read_balance()
  md/raid1: factor out choose_slow_rdev() from read_balance()
  md/raid1: factor out read_first_rdev() from read_balance()
  md/raid1-10: factor out a new helper raid1_should_read_first()
  md/raid1-10: add a helper raid1_check_read_range()
  md/raid1: fix choose next idle in read_balance()
  md/raid1: record nonrot rdevs while adding/removing rdevs to conf
  md/raid1: factor out helpers to add rdev to conf
  md: add a new helper rdev_has_badblock()
  md/raid5: fix atomicity violation in raid5_cache_count
  md/md-bitmap: fix incorrect usage for sb_index
  md: check mddev->pers before calling md_set_readonly()
  md: clean up openers check in do_md_stop() and md_set_readonly()
  md: sync blockdev before stopping raid or setting readonly
  md: factor out a helper to sync mddev
  md: Don't clear MD_CLOSING when the raid is about to stop
  md: return directly before setting did_set_md_closing
  md: clean up invalid BUG_ON in md_ioctl
  ...
parents 13fe8e68 e81faa91
Loading
Loading
Loading
Loading
+6 −3
Original line number Diff line number Diff line
@@ -234,7 +234,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
	sector_t doff;

	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
	if (pg_index == store->file_pages - 1) {
	/* we compare length (page numbers), not page offset. */
	if ((pg_index - store->sb_index) == store->file_pages - 1) {
		unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);

		if (last_page_size == 0)
@@ -438,8 +439,8 @@ static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
	struct page *page = store->filemap[pg_index];

	if (mddev_is_clustered(bitmap->mddev)) {
		pg_index += bitmap->cluster_slot *
			DIV_ROUND_UP(store->bytes, PAGE_SIZE);
		/* go to node bitmap area starting point */
		pg_index += store->sb_index;
	}

	if (store->file)
@@ -952,6 +953,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
	unsigned long index = file_page_index(store, chunk);
	unsigned long node_offset = 0;

	index += store->sb_index;
	if (mddev_is_clustered(bitmap->mddev))
		node_offset = bitmap->cluster_slot * store->file_pages;

@@ -982,6 +984,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
	unsigned long index = file_page_index(store, chunk);
	unsigned long node_offset = 0;

	index += store->sb_index;
	if (mddev_is_clustered(bitmap->mddev))
		node_offset = bitmap->cluster_slot * store->file_pages;

+91 −92
Original line number Diff line number Diff line
@@ -529,6 +529,24 @@ void mddev_resume(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_resume);

/*
 * Mark the array as closing and flush its block device; used before
 * setting the device to readonly or stopping the raid.
 *
 * @opener_num is the number of openers the caller tolerates.
 *
 * Under mddev->open_mutex:
 *  - if the array has a personality and more than @opener_num openers,
 *    fail with -EBUSY;
 *  - if MD_CLOSING was already set, fail with -EBUSY;
 *  - otherwise MD_CLOSING is now set.
 *
 * On success the mutex is dropped first, then part0 of the gendisk is
 * synced and 0 is returned.
 */
static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
{
	int ret = 0;

	mutex_lock(&mddev->open_mutex);
	if (mddev->pers && atomic_read(&mddev->openers) > opener_num)
		ret = -EBUSY;
	else if (test_and_set_bit(MD_CLOSING, &mddev->flags))
		ret = -EBUSY;
	mutex_unlock(&mddev->open_mutex);

	if (ret)
		return ret;

	sync_blockdev(mddev->gendisk->part0);
	return 0;
}

/*
 * Generic flush handling for md
 */
@@ -4464,8 +4482,8 @@ array_state_show(struct mddev *mddev, char *page)
	return sprintf(page, "%s\n", array_states[st]);
}

static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
static int do_md_stop(struct mddev *mddev, int ro);
static int md_set_readonly(struct mddev *mddev);
static int restart_array(struct mddev *mddev);

static ssize_t
@@ -4482,6 +4500,17 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
	case broken:		/* cannot be set */
	case bad_word:
		return -EINVAL;
	case clear:
	case readonly:
	case inactive:
	case read_auto:
		if (!mddev->pers || !md_is_rdwr(mddev))
			break;
		/* writing to sysfs does not open the mddev, so the opener count should be 0 */
		err = mddev_set_closing_and_sync_blockdev(mddev, 0);
		if (err)
			return err;
		break;
	default:
		break;
	}
@@ -4515,14 +4544,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
	case inactive:
		/* stop an active array, return 0 otherwise */
		if (mddev->pers)
			err = do_md_stop(mddev, 2, NULL);
			err = do_md_stop(mddev, 2);
		break;
	case clear:
		err = do_md_stop(mddev, 0, NULL);
		err = do_md_stop(mddev, 0);
		break;
	case readonly:
		if (mddev->pers)
			err = md_set_readonly(mddev, NULL);
			err = md_set_readonly(mddev);
		else {
			mddev->ro = MD_RDONLY;
			set_disk_ro(mddev->gendisk, 1);
@@ -4532,7 +4561,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
	case read_auto:
		if (mddev->pers) {
			if (md_is_rdwr(mddev))
				err = md_set_readonly(mddev, NULL);
				err = md_set_readonly(mddev);
			else if (mddev->ro == MD_RDONLY)
				err = restart_array(mddev);
			if (err == 0) {
@@ -4581,6 +4610,11 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	}
	mddev_unlock(mddev);

	if (st == readonly || st == read_auto || st == inactive ||
	    (err && st == clear))
		clear_bit(MD_CLOSING, &mddev->flags);

	return err ?: len;
}
static struct md_sysfs_entry md_array_state =
@@ -6265,7 +6299,15 @@ static void md_clean(struct mddev *mddev)
	mddev->persistent = 0;
	mddev->level = LEVEL_NONE;
	mddev->clevel[0] = 0;
	/*
	 * Don't clear MD_CLOSING, or mddev can be opened again.
	 * 'hold_active != 0' means mddev is still in the creation
	 * process and will be used later.
	 */
	if (mddev->hold_active)
		mddev->flags = 0;
	else
		mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
	mddev->sb_flags = 0;
	mddev->ro = MD_RDWR;
	mddev->metadata_type[0] = 0;
@@ -6378,7 +6420,8 @@ void md_stop(struct mddev *mddev)

EXPORT_SYMBOL_GPL(md_stop);

static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
/* ensure 'mddev->pers' exist before calling md_set_readonly() */
static int md_set_readonly(struct mddev *mddev)
{
	int err = 0;
	int did_freeze = 0;
@@ -6396,15 +6439,12 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
	mddev_lock_nointr(mddev);

	mutex_lock(&mddev->open_mutex);
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
		pr_warn("md: %s still in use.\n",mdname(mddev));
		err = -EBUSY;
		goto out;
	}

	if (mddev->pers) {
	__md_stop_writes(mddev);

	if (mddev->ro == MD_RDONLY) {
@@ -6414,16 +6454,14 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)

	mddev->ro = MD_RDONLY;
	set_disk_ro(mddev->gendisk, 1);
	}

out:
	if ((mddev->pers && !err) || did_freeze) {
	if (!err || did_freeze) {
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	}

	mutex_unlock(&mddev->open_mutex);
	return err;
}

@@ -6431,8 +6469,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 *   0 - completely stop and dis-assemble array
 *   2 - stop but do not disassemble array
 */
static int do_md_stop(struct mddev *mddev, int mode,
		      struct block_device *bdev)
static int do_md_stop(struct mddev *mddev, int mode)
{
	struct gendisk *disk = mddev->gendisk;
	struct md_rdev *rdev;
@@ -6445,12 +6482,9 @@ static int do_md_stop(struct mddev *mddev, int mode,

	stop_sync_thread(mddev, true, false);

	mutex_lock(&mddev->open_mutex);
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
	    mddev->sysfs_active ||
	if (mddev->sysfs_active ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
		pr_warn("md: %s still in use.\n",mdname(mddev));
		mutex_unlock(&mddev->open_mutex);
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -6472,13 +6506,11 @@ static int do_md_stop(struct mddev *mddev, int mode,
				sysfs_unlink_rdev(mddev, rdev);

		set_capacity_and_notify(disk, 0);
		mutex_unlock(&mddev->open_mutex);
		mddev->changed = 1;

		if (!md_is_rdwr(mddev))
			mddev->ro = MD_RDWR;
	} else
		mutex_unlock(&mddev->open_mutex);
	}
	/*
	 * Free resources if final stop
	 */
@@ -6524,7 +6556,7 @@ static void autorun_array(struct mddev *mddev)
	err = do_md_run(mddev);
	if (err) {
		pr_warn("md: do_md_run() returned %d\n", err);
		do_md_stop(mddev, 0, NULL);
		do_md_stop(mddev, 0);
	}
}

@@ -7522,16 +7554,17 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
	return 0;
}

static inline bool md_ioctl_valid(unsigned int cmd)
static inline int md_ioctl_valid(unsigned int cmd)
{
	switch (cmd) {
	case ADD_NEW_DISK:
	case GET_ARRAY_INFO:
	case GET_BITMAP_FILE:
	case GET_DISK_INFO:
	case RAID_VERSION:
		return 0;
	case ADD_NEW_DISK:
	case GET_BITMAP_FILE:
	case HOT_ADD_DISK:
	case HOT_REMOVE_DISK:
	case RAID_VERSION:
	case RESTART_ARRAY_RW:
	case RUN_ARRAY:
	case SET_ARRAY_INFO:
@@ -7540,9 +7573,11 @@ static inline bool md_ioctl_valid(unsigned int cmd)
	case STOP_ARRAY:
	case STOP_ARRAY_RO:
	case CLUSTERED_DISK_NACK:
		return true;
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		return 0;
	default:
		return false;
		return -ENOTTY;
	}
}

@@ -7600,31 +7635,17 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
	int err = 0;
	void __user *argp = (void __user *)arg;
	struct mddev *mddev = NULL;
	bool did_set_md_closing = false;

	if (!md_ioctl_valid(cmd))
		return -ENOTTY;

	switch (cmd) {
	case RAID_VERSION:
	case GET_ARRAY_INFO:
	case GET_DISK_INFO:
		break;
	default:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
	}
	err = md_ioctl_valid(cmd);
	if (err)
		return err;

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd) {
	case RAID_VERSION:
		err = get_version(argp);
		goto out;
	default:;
	}
	if (cmd == RAID_VERSION)
		return get_version(argp);

	/*
	 * Commands creating/starting a new array:
@@ -7632,35 +7653,23 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,

	mddev = bdev->bd_disk->private_data;

	if (!mddev) {
		BUG();
		goto out;
	}

	/* Some actions do not require the mutex */
	switch (cmd) {
	case GET_ARRAY_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_array_info(mddev, argp);
		goto out;
			return -ENODEV;
		return get_array_info(mddev, argp);

	case GET_DISK_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_disk_info(mddev, argp);
		goto out;
			return -ENODEV;
		return get_disk_info(mddev, argp);

	case SET_DISK_FAULTY:
		err = set_disk_faulty(mddev, new_decode_dev(arg));
		goto out;
		return set_disk_faulty(mddev, new_decode_dev(arg));

	case GET_BITMAP_FILE:
		err = get_bitmap_file(mddev, argp);
		goto out;

		return get_bitmap_file(mddev, argp);
	}

	if (cmd == HOT_REMOVE_DISK)
@@ -7673,20 +7682,9 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
		/* Need to flush page cache, and ensure no-one else opens
		 * and writes
		 */
		mutex_lock(&mddev->open_mutex);
		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
			mutex_unlock(&mddev->open_mutex);
			err = -EBUSY;
			goto out;
		}
		if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
			mutex_unlock(&mddev->open_mutex);
			err = -EBUSY;
			goto out;
		}
		did_set_md_closing = true;
		mutex_unlock(&mddev->open_mutex);
		sync_blockdev(bdev);
		err = mddev_set_closing_and_sync_blockdev(mddev, 1);
		if (err)
			return err;
	}

	if (!md_is_rdwr(mddev))
@@ -7727,11 +7725,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
		goto unlock;

	case STOP_ARRAY:
		err = do_md_stop(mddev, 0, bdev);
		err = do_md_stop(mddev, 0);
		goto unlock;

	case STOP_ARRAY_RO:
		err = md_set_readonly(mddev, bdev);
		if (mddev->pers)
			err = md_set_readonly(mddev);
		goto unlock;

	case HOT_REMOVE_DISK:
@@ -7826,7 +7825,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
				     mddev_unlock(mddev);

out:
	if(did_set_md_closing)
	if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
		clear_bit(MD_CLOSING, &mddev->flags);
	return err;
}
+11 −0
Original line number Diff line number Diff line
@@ -207,6 +207,7 @@ enum flag_bits {
				 * check if there is collision between raid1
				 * serial bios.
				 */
	Nonrot,			/* non-rotational device (SSD) */
};

static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -222,6 +223,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
	}
	return 0;
}

/*
 * Convenience wrapper around is_badblock() for callers that only need
 * its return value: the first-bad position and bad-sector count that
 * is_badblock() reports through its out parameters are discarded.
 */
static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
				    int sectors)
{
	sector_t unused_first_bad;
	int unused_bad_sectors;

	return is_badblock(rdev, s, sectors, &unused_first_bad,
			   &unused_bad_sectors);
}

extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			      int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+69 −0
Original line number Diff line number Diff line
@@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)

	return false;
}

/**
 * raid1_check_read_range() - work out how much of a read range is usable
 * @rdev: the rdev to read from;
 * @this_sector: first sector of the read;
 * @len: in: requested read length; out: may be rewritten, see 3b);
 *
 * Helper function for read_balance(). The range is checked with
 * is_badblock() and the result is one of:
 *
 * 1) No bad blocks overlap the range: *@len is returned;
 * 2) The bad range starts at or before @this_sector and reaches the end
 *    of the read range (or beyond): 0 is returned, *@len is untouched;
 * 3) The bad range covers only part of the read range:
 *  a) it starts after @this_sector: the length of the leading good
 *     region is returned;
 *  b) it starts at or before @this_sector and ends inside the read
 *     range: 0 is returned and *@len is set to the number of sectors
 *     from @this_sector up to the end of the bad range.
 */
static inline int raid1_check_read_range(struct md_rdev *rdev,
					 sector_t this_sector, int *len)
{
	sector_t bad_start;
	sector_t bad_end;
	int bad_len;

	/* case 1: nothing bad in the range */
	if (!is_badblock(rdev, this_sector, *len, &bad_start, &bad_len))
		return *len;

	/* case 3a: good sectors come first, hand back how many */
	if (bad_start > this_sector)
		return bad_start - this_sector;

	/*
	 * The read starts inside the bad range. Only when the bad range
	 * ends before the read does (case 3b) is *len rewritten, to the
	 * distance that has to be skipped to reach good sectors.
	 */
	bad_end = bad_start + bad_len;
	if (bad_end < this_sector + *len)
		*len = bad_end - this_sector;

	return 0;
}

/*
 * Decide whether a read should simply use the first rdev instead of
 * being balanced across devices.
 *
 * Returns true when either:
 *  - the read range [this_sector, this_sector + len) extends past
 *    mddev->recovery_cp, or
 *  - the array is clustered and md_cluster_ops->area_resyncing()
 *    reports the range as being resynced.
 * Otherwise returns false.
 */
static inline bool raid1_should_read_first(struct mddev *mddev,
					   sector_t this_sector, int len)
{
	sector_t end_sector = this_sector + len;

	if (mddev->recovery_cp < end_sector)
		return true;

	return mddev_is_clustered(mddev) &&
	       md_cluster_ops->area_resyncing(mddev, READ, this_sector,
					      end_sector);
}
+334 −216

File changed.

Preview size limit exceeded, changes collapsed.

Loading