Commit 0d56d9ca authored by Jens Axboe
Browse files

Merge tag 'md-7.0-20260127' of...

Merge tag 'md-7.0-20260127' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux into for-7.0/block

Pull MD updates from Yu:

"Bug Fixes:
 - Fix raid5_run() to return error when log_init() fails (Yu Kuai)
 - Fix IO hang with degraded array with llbitmap (Yu Kuai)
 - Fix percpu_ref not resurrected on suspend timeout in llbitmap
   (Yu Kuai)
 - Fix GPF in write_page caused by resize race (Jack Wang)
 - Fix NULL pointer dereference in process_metadata_update
   (Jiasheng Jiang)
 - Fix hang when stopping arrays with metadata through dm-raid
   (Heinz Mauelshagen)
 - Fix any_working flag handling in raid10_sync_request (Li Nan)

 Cleanups & Refactoring:
 - Refactor sync/recovery code path, improve error handling for
   badblocks, and remove unused recovery_disabled field (Li Nan)
 - Consolidate mddev boolean fields into mddev_flags (Yu Kuai)

 Improvements:
 - Use mempool to allocate stripe_request_ctx and make sure max_sectors
   is not less than io_opt in raid5 (Yu Kuai)"

* tag 'md-7.0-20260127' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux: (23 commits)
  md raid: fix hang when stopping arrays with metadata through dm-raid
  md-cluster: fix NULL pointer dereference in process_metadata_update
  md/bitmap: fix GPF in write_page caused by resize race
  md/md-llbitmap: fix percpu_ref not resurrected on suspend timeout
  md/raid5: fix IO hang with degraded array with llbitmap
  md: remove recovery_disabled
  md/raid10: cleanup skip handling in raid10_sync_request
  md/raid10: fix any_working flag handling in raid10_sync_request
  md: move finish_reshape to md_finish_sync()
  md: factor out sync completion update into helper
  md: remove MD_RECOVERY_ERROR handling and simplify resync_offset update
  md: update curr_resync_completed even when MD_RECOVERY_INTR is set
  md: mark rdev Faulty when badblocks setting fails
  md: break remaining operations on badblocks set failure in narrow_write_error
  md/raid1,raid10: support narrow_write_error when badblocks is disabled
  md: factor error handling out of md_done_sync into helper
  md/raid1: simplify uptodate handling in end_sync_write
  md/raid5: make sure max_sectors is not less than io_opt
  md/raid5: use mempool to allocate stripe_request_ctx
  md: merge mddev serialize_policy into mddev_flags
  ...
parents 72a41750 cefcb929
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -2085,7 +2085,7 @@ static void bitmap_destroy(struct mddev *mddev)
		return;

	bitmap_wait_behind_writes(mddev);
	if (!mddev->serialize_policy)
	if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
		mddev_destroy_serial_pool(mddev, NULL);

	mutex_lock(&mddev->bitmap_info.mutex);
@@ -2453,6 +2453,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
		memcpy(page_address(store.sb_page),
		       page_address(bitmap->storage.sb_page),
		       sizeof(bitmap_super_t));
	mutex_lock(&bitmap->mddev->bitmap_info.mutex);
	spin_lock_irq(&bitmap->counts.lock);
	md_bitmap_file_unmap(&bitmap->storage);
	bitmap->storage = store;
@@ -2560,7 +2561,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
	}
	spin_unlock_irq(&bitmap->counts.lock);

	mutex_unlock(&bitmap->mddev->bitmap_info.mutex);
	if (!init) {
		__bitmap_unplug(bitmap);
		bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
@@ -2809,7 +2810,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
	mddev->bitmap_info.max_write_behind = backlog;
	if (!backlog && mddev->serial_info_pool) {
		/* serial_info_pool is not needed if backlog is zero */
		if (!mddev->serialize_policy)
		if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
			mddev_destroy_serial_pool(mddev, NULL);
	} else if (backlog && !mddev->serial_info_pool) {
		/* serial_info_pool is needed since backlog is not zero */
+6 −1
Original line number Diff line number Diff line
@@ -549,8 +549,13 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg

	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);

	/* daemaon thread must exist */
	thread = rcu_dereference_protected(mddev->thread, true);
	if (!thread) {
		pr_warn("md-cluster: Received metadata update but MD thread is not ready\n");
		dlm_unlock_sync(cinfo->no_new_dev_lockres);
		return;
	}

	wait_event(thread->wqueue,
		   (got_lock = mddev_trylock(mddev)) ||
		    test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
+3 −1
Original line number Diff line number Diff line
@@ -712,8 +712,10 @@ static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
	percpu_ref_kill(&pctl->active);

	if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
			llbitmap->mddev->bitmap_info.daemon_sleep * HZ))
			llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) {
		percpu_ref_resurrect(&pctl->active);
		return -ETIMEDOUT;
	}

	return 0;
}
+99 −89
Original line number Diff line number Diff line
@@ -279,7 +279,8 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)

		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				if (!test_bit(MD_SERIALIZE_POLICY,
					      &mddev->flags) ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
@@ -2617,9 +2618,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
	list_add_rcu(&rdev->same_set, &mddev->disks);
	bd_link_disk_holder(rdev->bdev, mddev->gendisk);

	/* May as well allow recovery to be retried once */
	mddev->recovery_disabled++;

	return 0;

 fail:
@@ -5864,11 +5862,11 @@ __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,

static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d\n", mddev->fail_last_dev);
	return sprintf(page, "%d\n", test_bit(MD_FAILLAST_DEV, &mddev->flags));
}

/*
 * Setting fail_last_dev to true to allow last device to be forcibly removed
 * Setting MD_FAILLAST_DEV to allow last device to be forcibly removed
 * from RAID1/RAID10.
 */
static ssize_t
@@ -5881,8 +5879,10 @@ fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
	if (ret)
		return ret;

	if (value != mddev->fail_last_dev)
		mddev->fail_last_dev = value;
	if (value)
		set_bit(MD_FAILLAST_DEV, &mddev->flags);
	else
		clear_bit(MD_FAILLAST_DEV, &mddev->flags);

	return len;
}
@@ -5895,11 +5895,12 @@ static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
	if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
		return sprintf(page, "n/a\n");
	else
		return sprintf(page, "%d\n", mddev->serialize_policy);
		return sprintf(page, "%d\n",
			       test_bit(MD_SERIALIZE_POLICY, &mddev->flags));
}

/*
 * Setting serialize_policy to true to enforce write IO is not reordered
 * Setting MD_SERIALIZE_POLICY enforce write IO is not reordered
 * for raid1.
 */
static ssize_t
@@ -5912,7 +5913,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
	if (err)
		return err;

	if (value == mddev->serialize_policy)
	if (value == test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
		return len;

	err = mddev_suspend_and_lock(mddev);
@@ -5924,11 +5925,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
		goto unlock;
	}

	if (value)
	if (value) {
		mddev_create_serial_pool(mddev, NULL);
	else
		set_bit(MD_SERIALIZE_POLICY, &mddev->flags);
	} else {
		mddev_destroy_serial_pool(mddev, NULL);
	mddev->serialize_policy = value;
		clear_bit(MD_SERIALIZE_POLICY, &mddev->flags);
	}
unlock:
	mddev_unlock_and_resume(mddev);
	return err ?: len;
@@ -6502,7 +6505,7 @@ int md_run(struct mddev *mddev)
	 * the only valid external interface is through the md
	 * device.
	 */
	mddev->has_superblocks = false;
	clear_bit(MD_HAS_SUPERBLOCK, &mddev->flags);
	rdev_for_each(rdev, mddev) {
		if (test_bit(Faulty, &rdev->flags))
			continue;
@@ -6515,7 +6518,7 @@ int md_run(struct mddev *mddev)
		}

		if (rdev->sb_page)
			mddev->has_superblocks = true;
			set_bit(MD_HAS_SUPERBLOCK, &mddev->flags);

		/* perform some consistency tests on the device.
		 * We don't want the data to overlap the metadata,
@@ -6848,6 +6851,7 @@ static void __md_stop_writes(struct mddev *mddev)
{
	timer_delete_sync(&mddev->safemode_timer);

	if (md_is_rdwr(mddev) || !mddev_is_dm(mddev)) {
		if (mddev->pers && mddev->pers->quiesce) {
			mddev->pers->quiesce(mddev, 1);
			mddev->pers->quiesce(mddev, 0);
@@ -6855,6 +6859,7 @@ static void __md_stop_writes(struct mddev *mddev)

		if (md_bitmap_enabled(mddev, true))
			mddev->bitmap_ops->flush(mddev);
	}

	if (md_is_rdwr(mddev) &&
	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
@@ -6865,7 +6870,7 @@ static void __md_stop_writes(struct mddev *mddev)
		md_update_sb(mddev, 1);
	}
	/* disable policy to guarantee rdevs free resources for serialization */
	mddev->serialize_policy = 0;
	clear_bit(MD_SERIALIZE_POLICY, &mddev->flags);
	mddev_destroy_serial_pool(mddev, NULL);
}

@@ -9068,19 +9073,21 @@ static bool is_mddev_idle(struct mddev *mddev, int init)
	return idle;
}

void md_done_sync(struct mddev *mddev, int blocks, int ok)
void md_done_sync(struct mddev *mddev, int blocks)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
}
EXPORT_SYMBOL(md_done_sync);

void md_sync_error(struct mddev *mddev)
{
	// stop recovery, signal do_sync ....
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_sync_error);

/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
@@ -9125,7 +9132,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
	rcu_read_unlock();
	if (did_change)
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	if (!mddev->has_superblocks)
	if (!test_bit(MD_HAS_SUPERBLOCK, &mddev->flags))
		return;
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
@@ -9430,6 +9437,53 @@ static bool sync_io_within_limit(struct mddev *mddev)
	       (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
}

/*
 * Update sync offset and mddev status when sync completes.
 *
 * @mddev:  array whose sync pass has ended
 * @action: the sync action that was running; selects which fields of
 *          @mddev (and of its rdevs) are updated below
 *
 * MD_RECOVERY_INTR is tested in every case that does work: when it is set,
 * curr_resync is not advanced to MaxSector and the reshape size update is
 * skipped.  ACTION_CHECK and unknown actions change nothing.
 */
static void md_finish_sync(struct mddev *mddev, enum sync_action action)
{
	struct md_rdev *rdev;

	switch (action) {
	case ACTION_RESYNC:
	case ACTION_REPAIR:
		/*
		 * Without an interruption curr_resync becomes MaxSector;
		 * otherwise it keeps the value it already holds.  Either way
		 * that value is what gets stored in resync_offset.
		 */
		if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			mddev->curr_resync = MaxSector;
		mddev->resync_offset = mddev->curr_resync;
		break;
	case ACTION_RECOVER:
		if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			mddev->curr_resync = MaxSector;
		/*
		 * Walk the rdev list under RCU and copy curr_resync into
		 * recovery_offset of every rdev for which
		 * rdev_needs_recovery() returns true.  Nothing is written
		 * when delta_disks is negative.
		 */
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev)
			if (mddev->delta_disks >= 0 &&
			    rdev_needs_recovery(rdev, mddev->curr_resync))
				rdev->recovery_offset = mddev->curr_resync;
		rcu_read_unlock();
		break;
	case ACTION_RESHAPE:
		/*
		 * Size update: only for an uninterrupted reshape with
		 * delta_disks > 0, on a personality that provides both
		 * finish_reshape and size, and not for dm-owned arrays.
		 * The array size is recomputed with the mddev lock held; the
		 * gendisk capacity is updated afterwards unless the array is
		 * clustered.
		 */
		if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
		    mddev->delta_disks > 0 &&
		    mddev->pers->finish_reshape &&
		    mddev->pers->size &&
		    !mddev_is_dm(mddev)) {
			mddev_lock_nointr(mddev);
			md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
			mddev_unlock(mddev);
			if (!mddev_is_clustered(mddev))
				set_capacity_and_notify(mddev->gendisk,
							mddev->array_sectors);
		}
		/*
		 * The personality hook is called whenever it exists, whether
		 * or not MD_RECOVERY_INTR is set.
		 */
		if (mddev->pers->finish_reshape)
			mddev->pers->finish_reshape(mddev);
		break;
	/* ACTION_CHECK and any other action: nothing to update here. */
	case ACTION_CHECK:
	default:
		break;
	}
}

#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -9445,7 +9499,6 @@ void md_do_sync(struct md_thread *thread)
	int last_mark,m;
	sector_t last_check;
	int skipped = 0;
	struct md_rdev *rdev;
	enum sync_action action;
	const char *desc;
	struct blk_plug plug;
@@ -9731,65 +9784,21 @@ void md_do_sync(struct md_thread *thread)
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
		/* All sync IO completes after recovery_active becomes 0 */
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify_dirent_safe(mddev->sysfs_completed);
	}
	mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);

	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > MD_RESYNC_ACTIVE) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->resync_offset) {
					pr_debug("md: checkpointing %s of %s.\n",
						 desc, mdname(mddev));
					if (test_bit(MD_RECOVERY_ERROR,
						&mddev->recovery))
						mddev->resync_offset =
							mddev->curr_resync_completed;
					else
						mddev->resync_offset =
							mddev->curr_resync;
				}
			} else
				mddev->resync_offset = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
				rcu_read_lock();
				rdev_for_each_rcu(rdev, mddev)
					if (mddev->delta_disks >= 0 &&
					    rdev_needs_recovery(rdev, mddev->curr_resync))
						rdev->recovery_offset = mddev->curr_resync;
				rcu_read_unlock();
			}
		}
	}
	if (mddev->curr_resync > MD_RESYNC_ACTIVE)
		md_finish_sync(mddev, action);
 skip:
	/* set CHANGE_PENDING here since maybe another update is needed,
	 * so other nodes are informed. It should be harmless for normal
	 * raid */
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
			mddev->delta_disks > 0 &&
			mddev->pers->finish_reshape &&
			mddev->pers->size &&
			!mddev_is_dm(mddev)) {
		mddev_lock_nointr(mddev);
		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
		mddev_unlock(mddev);
		if (!mddev_is_clustered(mddev))
			set_capacity_and_notify(mddev->gendisk,
						mddev->array_sectors);
	}

	spin_lock(&mddev->lock);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
@@ -10304,7 +10313,7 @@ void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;
	sector_t old_dev_sectors = mddev->dev_sectors;
	bool is_reshaped = false;
	bool is_reshaped = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);

	/* resync has finished, collect result */
	md_unregister_thread(mddev, &mddev->sync_thread);
@@ -10320,12 +10329,6 @@ void md_reap_sync_thread(struct mddev *mddev)
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape) {
		mddev->pers->finish_reshape(mddev);
		if (mddev_is_clustered(mddev))
			is_reshaped = true;
	}

	/* If array is no-longer degraded, then any saved_raid_disk
	 * information must be scrapped.
@@ -10352,8 +10355,9 @@ void md_reap_sync_thread(struct mddev *mddev)
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
	if (mddev_is_clustered(mddev) && is_reshaped &&
	    mddev->pers->finish_reshape &&
	    !test_bit(MD_CLOSING, &mddev->flags))
		mddev->cluster_ops->update_size(mddev, old_dev_sectors);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -10413,8 +10417,14 @@ bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
	else
		s += rdev->data_offset;

	if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
	if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) {
		/*
		 * Mark the disk as Faulty when setting badblocks fails,
		 * otherwise, bad sectors may be read.
		 */
		md_error(mddev, rdev);
		return false;
	}

	/* Make sure they get written out promptly */
	if (test_bit(ExternalBbl, &rdev->flags))
+12 −13
Original line number Diff line number Diff line
@@ -22,6 +22,10 @@
#include <trace/events/block.h>

#define MaxSector (~(sector_t)0)
/*
 * Number of guaranteed raid bios in case of extreme VM load:
 */
#define	NR_RAID_BIOS 256

enum md_submodule_type {
	MD_PERSONALITY = 0,
@@ -340,6 +344,9 @@ struct md_cluster_operations;
 *		   array is ready yet.
 * @MD_BROKEN: This is used to stop writes and mark array as failed.
 * @MD_DELETED: This device is being deleted
 * @MD_HAS_SUPERBLOCK: There is a persistent superblock on the member disks.
 * @MD_FAILLAST_DEV: Allow last rdev to be removed.
 * @MD_SERIALIZE_POLICY: Enforce write IO is not reordered, just used by raid1.
 *
 * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
 */
@@ -356,6 +363,9 @@ enum mddev_flags {
	MD_BROKEN,
	MD_DO_DELETE,
	MD_DELETED,
	MD_HAS_SUPERBLOCK,
	MD_FAILLAST_DEV,
	MD_SERIALIZE_POLICY,
};

enum mddev_sb_flags {
@@ -495,12 +505,6 @@ struct mddev {
	int				ok_start_degraded;

	unsigned long			recovery;
	/* If a RAID personality determines that recovery (of a particular
	 * device) will fail due to a read error on the source device, it
	 * takes a copy of this number and does not attempt recovery again
	 * until this number changes.
	 */
	int				recovery_disabled;

	int				in_sync;	/* know to not need resync */
	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
@@ -622,10 +626,6 @@ struct mddev {

	/* The sequence number for sync thread */
	atomic_t sync_seq;

	bool	has_superblocks:1;
	bool	fail_last_dev:1;
	bool	serialize_policy:1;
};

enum recovery_flags {
@@ -646,8 +646,6 @@ enum recovery_flags {
	MD_RECOVERY_FROZEN,
	/* waiting for pers->start() to finish */
	MD_RECOVERY_WAIT,
	/* interrupted because io-error */
	MD_RECOVERY_ERROR,

	/* flags determines sync action, see details in enum sync_action */

@@ -912,7 +910,8 @@ extern const char *md_sync_action_name(enum sync_action action);
extern void md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_done_sync(struct mddev *mddev, int blocks);
extern void md_sync_error(struct mddev *mddev);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
Loading