Commit d0cc5f58 authored by Jens Axboe
Browse files

Merge tag 'md-7.1-20260407' of...

Merge tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux into for-7.1/block

Pull MD changes from Yu Kuai:

"Bug Fixes:
 - avoid a sysfs deadlock when clearing array state (Yu Kuai)
 - validate raid5 journal payloads before reading metadata (Junrui Luo)
 - fall back to the correct bitmap operations after version mismatches
   (Yu Kuai)
 - serialize overlapping writes on writemostly raid1 disks (Xiao Ni)
 - wake raid456 reshape waiters before suspend (Yu Kuai)
 - prevent retry_aligned_read() from triggering soft lockups
   (Chia-Ming Chang)

 Improvements:
 - switch raid0 strip zone and devlist allocations to kvmalloc helpers
   (Gregory Price)
 - track clean unwritten stripes for proactive RAID5 parity building
   (Yu Kuai)
 - speed up initial llbitmap sync with write_zeroes_unmap support
   (Yu Kuai)

 Cleanups:
 - remove the unused static md workqueue definition
   (Abd-Alrhman Masalkhi)"

* tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux:
  md/raid5: fix soft lockup in retry_aligned_read()
  md: wake raid456 reshape waiters before suspend
  md/raid1: serialize overlap io for writemostly disk
  md/md-llbitmap: optimize initial sync with write_zeroes_unmap support
  md/md-llbitmap: add CleanUnwritten state for RAID-5 proactive parity building
  md: add fallback to correct bitmap_ops on version mismatch
  md/raid5: validate payload size before accessing journal metadata
  md: remove unused static md_wq workqueue
  md/raid0: use kvzalloc/kvfree for strip_zone and devlist allocations
  md: fix array_state=clear sysfs deadlock
parents 2d148a21 7f9f7c69
Loading
Loading
Loading
Loading
+189 −13
Original line number Diff line number Diff line
@@ -208,6 +208,20 @@ enum llbitmap_state {
	BitNeedSync,
	/* data is synchronizing */
	BitSyncing,
	/*
	 * Proactive sync requested for unwritten region (raid456 only).
	 * Triggered via sysfs when user wants to pre-build XOR parity
	 * for regions that have never been written.
	 */
	BitNeedSyncUnwritten,
	/* Proactive sync in progress for unwritten region */
	BitSyncingUnwritten,
	/*
	 * XOR parity has been pre-built for a region that has never had
	 * user data written. When user writes to this region, it transitions
	 * to BitDirty.
	 */
	BitCleanUnwritten,
	BitStateCount,
	BitNone = 0xff,
};
@@ -232,6 +246,12 @@ enum llbitmap_action {
	 * BitNeedSync.
	 */
	BitmapActionStale,
	/*
	 * Proactive sync trigger for raid456 - builds XOR parity for
	 * Unwritten regions without requiring user data write first.
	 */
	BitmapActionProactiveSync,
	BitmapActionClearUnwritten,
	BitmapActionCount,
	/* Init state is BitUnwritten */
	BitmapActionInit,
@@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitNone,
		[BitmapActionStale]		= BitNone,
		[BitmapActionProactiveSync]	= BitNeedSyncUnwritten,
		[BitmapActionClearUnwritten]	= BitNone,
	},
	[BitClean] = {
		[BitmapActionStartwrite]	= BitDirty,
@@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
		[BitmapActionProactiveSync]	= BitNone,
		[BitmapActionClearUnwritten]	= BitNone,
	},
	[BitDirty] = {
		[BitmapActionStartwrite]	= BitNone,
@@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
		[BitmapActionDaemon]		= BitClean,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
		[BitmapActionProactiveSync]	= BitNone,
		[BitmapActionClearUnwritten]	= BitNone,
	},
	[BitNeedSync] = {
		[BitmapActionStartwrite]	= BitNone,
@@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNone,
		[BitmapActionProactiveSync]	= BitNone,
		[BitmapActionClearUnwritten]	= BitNone,
	},
	[BitSyncing] = {
		[BitmapActionStartwrite]	= BitNone,
@@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
		[BitmapActionProactiveSync]	= BitNone,
		[BitmapActionClearUnwritten]	= BitNone,
	},
	[BitNeedSyncUnwritten] = {
		[BitmapActionStartwrite]	= BitNeedSync,
		[BitmapActionStartsync]		= BitSyncingUnwritten,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitUnwritten,
		[BitmapActionReload]		= BitUnwritten,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitUnwritten,
		[BitmapActionProactiveSync]	= BitNone,
		[BitmapActionClearUnwritten]	= BitUnwritten,
	},
	[BitSyncingUnwritten] = {
		[BitmapActionStartwrite]	= BitSyncing,
		[BitmapActionStartsync]		= BitSyncingUnwritten,
		[BitmapActionEndsync]		= BitCleanUnwritten,
		[BitmapActionAbortsync]		= BitUnwritten,
		[BitmapActionReload]		= BitUnwritten,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitUnwritten,
		[BitmapActionProactiveSync]	= BitNone,
		[BitmapActionClearUnwritten]	= BitUnwritten,
	},
	[BitCleanUnwritten] = {
		[BitmapActionStartwrite]	= BitDirty,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitUnwritten,
		[BitmapActionProactiveSync]	= BitNone,
		[BitmapActionClearUnwritten]	= BitUnwritten,
	},
};

@@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
			pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
			break;
		case BitClean:
		case BitCleanUnwritten:
			pctl->state[pos] = BitDirty;
			break;
		}
@@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
}

static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
				    int offset)
				    int offset, bool infect)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
	unsigned int io_size = llbitmap->io_size;
@@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
	 * resync all the dirty bits, hence skip infect new dirty bits to
	 * prevent resync unnecessary data.
	 */
	if (llbitmap->mddev->degraded) {
	if (llbitmap->mddev->degraded || !infect) {
		set_bit(block, pctl->dirty);
		return;
	}
@@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,

	llbitmap->pctl[idx]->state[bit] = state;
	if (state == BitDirty || state == BitNeedSync)
		llbitmap_set_page_dirty(llbitmap, idx, bit);
		llbitmap_set_page_dirty(llbitmap, idx, bit, true);
	else if (state == BitNeedSyncUnwritten)
		llbitmap_set_page_dirty(llbitmap, idx, bit, false);
}

static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
@@ -585,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap)
	return 0;
}

/*
 * Check if all underlying disks support write_zeroes with unmap.
 */
static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0)
			return false;
	}

	return true;
}

/*
 * Zero the data region of every usable member disk.
 *
 * For each disk that has raid_disk >= 0 and is not Faulty, issue
 * blkdev_issue_zeroout() over mddev->dev_sectors sectors starting at the
 * disk's data_offset. All-zero members keep RAID-456 parity consistent
 * (0 XOR 0 = 0).
 *
 * Returns true if every zeroout succeeded. On the first failure a warning
 * is logged and false is returned immediately; disks zeroed before the
 * failure are left zeroed.
 */
static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	sector_t nr_sectors = mddev->dev_sectors;
	struct md_rdev *disk;

	rdev_for_each(disk, mddev) {
		int err;

		if (disk->raid_disk < 0 || test_bit(Faulty, &disk->flags))
			continue;

		err = blkdev_issue_zeroout(disk->bdev, disk->data_offset,
					   nr_sectors, GFP_KERNEL, 0);
		if (!err)
			continue;

		pr_warn("md/llbitmap: failed to zero disk %pg: %d\n",
			disk->bdev, err);
		return false;
	}

	return true;
}

static void llbitmap_init_state(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	enum llbitmap_state state = BitUnwritten;
	unsigned long i;

	if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
	if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) {
		state = BitClean;
	} else if (raid_is_456(mddev) &&
		   llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) {
		/*
		 * All disks support write_zeroes with unmap. Zero all disks
		 * to ensure parity consistency, then set BitCleanUnwritten
		 * to skip initial sync.
		 */
		if (llbitmap_zero_all_disks(llbitmap))
			state = BitCleanUnwritten;
	}

	for (i = 0; i < llbitmap->chunks; i++)
		llbitmap_write(llbitmap, state, i);
@@ -627,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
			goto write_bitmap;
		}

		if (c == BitNeedSync)
		if (c == BitNeedSync || c == BitNeedSyncUnwritten)
			need_resync = !mddev->degraded;

		state = state_machine[c][action];

write_bitmap:
		if (unlikely(mddev->degraded)) {
			/* For degraded array, mark new data as need sync. */
@@ -658,8 +786,7 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
		}

		llbitmap_write(llbitmap, state, start);

		if (state == BitNeedSync)
		if (state == BitNeedSync || state == BitNeedSyncUnwritten)
			need_resync = !mddev->degraded;
		else if (state == BitDirty &&
			 !timer_pending(&llbitmap->pending_timer))
@@ -1229,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
	unsigned long p = offset >> llbitmap->chunkshift;
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	return c == BitClean || c == BitDirty;
	return c == BitClean || c == BitDirty || c == BitCleanUnwritten;
}

static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
@@ -1243,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
	if (c == BitUnwritten)
		return blocks;

	/* Skip CleanUnwritten - no user data, will be reset after recovery */
	if (c == BitCleanUnwritten)
		return blocks;

	/* For degraded array, don't skip */
	if (mddev->degraded)
		return 0;
@@ -1261,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	enum llbitmap_state state;

	/*
	 * Before recovery starts, convert CleanUnwritten to Unwritten.
	 * This ensures the new disk won't have stale parity data.
	 */
	if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery))
		llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
				       BitmapActionClearUnwritten);


	/*
	 * Handle one bit at a time, this is much simpler. And it doesn't matter
	 * if md_do_sync() loop more times.
	 */
	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	return llbitmap_state_machine(llbitmap, p, p,
				      BitmapActionStartsync) == BitSyncing;
	state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync);
	return state == BitSyncing || state == BitSyncingUnwritten;
}

/* Something is wrong, sync_thread stop at @offset */
@@ -1474,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page)
	}

	mutex_unlock(&mddev->bitmap_info.mutex);
	return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
	return sprintf(page,
		       "unwritten %d\nclean %d\ndirty %d\n"
		       "need sync %d\nsyncing %d\n"
		       "need sync unwritten %d\nsyncing unwritten %d\n"
		       "clean unwritten %d\n",
		       bits[BitUnwritten], bits[BitClean], bits[BitDirty],
		       bits[BitNeedSync], bits[BitSyncing]);
		       bits[BitNeedSync], bits[BitSyncing],
		       bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten],
		       bits[BitCleanUnwritten]);
}

static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
@@ -1549,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)

static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);

/*
 * sysfs store handler for "proactive_sync".
 *
 * The written contents (@buf) are not inspected: any write applies
 * BitmapActionProactiveSync to the whole bitmap, chunks 0 through
 * llbitmap->chunks - 1, while holding mddev->bitmap_info.mutex.
 *
 * Returns @len on success, -EINVAL if the array is not RAID-456, or
 * -ENODEV if there is no llbitmap or its page control array is missing.
 */
static ssize_t
proactive_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct llbitmap *llbitmap;
	ssize_t ret = len;

	/* Proactive parity building only applies to RAID-456 */
	if (!raid_is_456(mddev))
		return -EINVAL;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (llbitmap && llbitmap->pctl)
		/* Request proactive sync across every chunk of the bitmap */
		llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
				       BitmapActionProactiveSync);
	else
		ret = -ENODEV;
	mutex_unlock(&mddev->bitmap_info.mutex);

	return ret;
}

/*
 * "proactive_sync" sysfs entry: mode 0200 with no show handler (NULL), so it
 * can only be written; writes are handled by proactive_sync_store().
 */
static struct md_sysfs_entry llbitmap_proactive_sync =
	__ATTR(proactive_sync, 0200, NULL, proactive_sync_store);

/* NULL-terminated array of the llbitmap sysfs attributes. */
static struct attribute *md_llbitmap_attrs[] = {
	&llbitmap_bits.attr,
	&llbitmap_metadata.attr,
	&llbitmap_daemon_sleep.attr,
	&llbitmap_barrier_idle.attr,
	&llbitmap_proactive_sync.attr,
	NULL
};

+128 −11
Original line number Diff line number Diff line
@@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule);
static const struct kobj_type md_ktype;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;

/*
 * This workqueue is used for sync_work to register new sync_thread, and for
@@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev)

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
@@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible)
	}

	percpu_ref_kill(&mddev->active_io);

	/*
	 * RAID456 IO can sleep in wait_for_reshape while still holding an
	 * active_io reference. If reshape is already interrupted or frozen,
	 * wake those waiters so they can abort and drop the reference instead
	 * of deadlocking suspend.
	 */
	if (mddev->pers && mddev->pers->prepare_suspend &&
	    reshape_interrupted(mddev))
		mddev->pers->prepare_suspend(mddev);

	if (interruptible)
		err = wait_event_interruptible(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
@@ -6130,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
	}
	spin_unlock(&all_mddevs_lock);
	rv = entry->store(mddev, page, length);
	mddev_put(mddev);

	/*
	 * For "array_state=clear", dropping the extra kobject reference from
	 * sysfs_break_active_protection() can trigger md kobject deletion.
	 * Restore active protection before mddev_put() so deletion happens
	 * after the sysfs write path fully unwinds.
	 */
	if (kn)
		sysfs_unbreak_active_protection(kn);
	mddev_put(mddev);

	return rv;
}
@@ -6449,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t)

static int start_dirty_degraded;

/*
 * Read bitmap superblock and return the bitmap_id based on disk version.
 * This is used as fallback when default bitmap version and on-disk version
 * doesn't match, and mdadm is not the latest version to set bitmap_type.
 */
static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct page *sb_page;
	bitmap_super_t *sb;
	enum md_submodule_id id = ID_BITMAP_NONE;
	sector_t sector;
	u32 version;

	if (!mddev->bitmap_info.offset)
		return ID_BITMAP_NONE;

	sb_page = alloc_page(GFP_KERNEL);
	if (!sb_page) {
		pr_warn("md: %s: failed to allocate memory for bitmap\n",
			mdname(mddev));
		return ID_BITMAP_NONE;
	}

	sector = mddev->bitmap_info.offset;

	rdev_for_each(rdev, mddev) {
		u32 iosize;

		if (!test_bit(In_sync, &rdev->flags) ||
		    test_bit(Faulty, &rdev->flags) ||
		    test_bit(Bitmap_sync, &rdev->flags))
			continue;

		iosize = roundup(sizeof(bitmap_super_t),
				 bdev_logical_block_size(rdev->bdev));
		if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ,
				 true))
			goto read_ok;
	}
	pr_warn("md: %s: failed to read bitmap from any device\n",
		mdname(mddev));
	goto out;

read_ok:
	sb = kmap_local_page(sb_page);
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
		pr_warn("md: %s: invalid bitmap magic 0x%x\n",
			mdname(mddev), le32_to_cpu(sb->magic));
		goto out_unmap;
	}

	version = le32_to_cpu(sb->version);
	switch (version) {
	case BITMAP_MAJOR_LO:
	case BITMAP_MAJOR_HI:
	case BITMAP_MAJOR_CLUSTERED:
		id = ID_BITMAP;
		break;
	case BITMAP_MAJOR_LOCKLESS:
		id = ID_LLBITMAP;
		break;
	default:
		pr_warn("md: %s: unknown bitmap version %u\n",
			mdname(mddev), version);
		break;
	}

out_unmap:
	kunmap_local(sb);
out:
	__free_page(sb_page);
	return id;
}

static int md_bitmap_create(struct mddev *mddev)
{
	enum md_submodule_id orig_id = mddev->bitmap_id;
	enum md_submodule_id sb_id;
	int err;

	if (mddev->bitmap_id == ID_BITMAP_NONE)
		return -EINVAL;

	if (!mddev_set_bitmap_ops(mddev))
		return -ENOENT;

	return mddev->bitmap_ops->create(mddev);
	err = mddev->bitmap_ops->create(mddev);
	if (!err)
		return 0;

	/*
	 * Create failed, if default bitmap version and on-disk version
	 * doesn't match, and mdadm is not the latest version to set
	 * bitmap_type, set bitmap_ops based on the disk version.
	 */
	mddev_clear_bitmap_ops(mddev);

	sb_id = md_bitmap_get_id_from_sb(mddev);
	if (sb_id == ID_BITMAP_NONE || sb_id == orig_id)
		return err;

	pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n",
		mdname(mddev), orig_id, sb_id);

	mddev->bitmap_id = sb_id;
	if (!mddev_set_bitmap_ops(mddev)) {
		mddev->bitmap_id = orig_id;
		return -ENOENT;
	}

	err = mddev->bitmap_ops->create(mddev);
	if (err) {
		mddev_clear_bitmap_ops(mddev);
		mddev->bitmap_id = orig_id;
	}

	return err;
}

static void md_bitmap_destroy(struct mddev *mddev)
@@ -10505,10 +10629,6 @@ static int __init md_init(void)
		goto err_bitmap;

	ret = -ENOMEM;
	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0);
	if (!md_misc_wq)
		goto err_misc_wq;
@@ -10533,8 +10653,6 @@ static int __init md_init(void)
err_md:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	md_llbitmap_exit();
err_bitmap:
	md_bitmap_exit();
@@ -10843,7 +10961,6 @@ static __exit void md_exit(void)
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_wq);
	md_bitmap_exit();
}

+4 −1
Original line number Diff line number Diff line
@@ -126,7 +126,6 @@ enum sync_action {
struct serial_in_rdev {
	struct rb_root_cached serial_rb;
	spinlock_t serial_lock;
	wait_queue_head_t serial_io_wait;
};

/*
@@ -381,7 +380,11 @@ struct serial_info {
	struct rb_node node;
	sector_t start;		/* start sector of rb node */
	sector_t last;		/* end sector of rb node */
	sector_t wnode_start; /* address of waiting nodes on the same list */
	sector_t _subtree_last; /* highest sector in subtree of rb node */
	struct list_head	list_node;
	struct list_head	waiters;
	struct completion	ready;
};

/*
+9 −9
Original line number Diff line number Diff line
@@ -143,10 +143,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
	}

	err = -ENOMEM;
	conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones);
	conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones);
	if (!conf->strip_zone)
		goto abort;
	conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *),
	conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *),
					     conf->nr_strip_zones,
					     mddev->raid_disks),
				 GFP_KERNEL);
@@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)

	return 0;
abort:
	kfree(conf->strip_zone);
	kfree(conf->devlist);
	kvfree(conf->strip_zone);
	kvfree(conf->devlist);
	kfree(conf);
	*private_conf = ERR_PTR(err);
	return err;
@@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv)
{
	struct r0conf *conf = priv;

	kfree(conf->strip_zone);
	kfree(conf->devlist);
	kvfree(conf->strip_zone);
	kvfree(conf->devlist);
	kfree(conf);
}

+35 −12
Original line number Diff line number Diff line
@@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
		     START, LAST, static inline, raid1_rb);

static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
				struct serial_info *si, int idx)
				struct serial_info *si)
{
	unsigned long flags;
	int ret = 0;
	sector_t lo = r1_bio->sector;
	sector_t hi = lo + r1_bio->sectors - 1;
	int idx = sector_to_idx(r1_bio->sector);
	struct serial_in_rdev *serial = &rdev->serial[idx];
	struct serial_info *head_si;

	spin_lock_irqsave(&serial->serial_lock, flags);
	/* collision happened */
	if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
	head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
	if (head_si && head_si != si) {
		si->start = lo;
		si->last = hi;
		si->wnode_start = head_si->wnode_start;
		list_add_tail(&si->list_node, &head_si->waiters);
		ret = -EBUSY;
	else {
	} else if (!head_si) {
		si->start = lo;
		si->last = hi;
		si->wnode_start = si->start;
		raid1_rb_insert(si, &serial->serial_rb);
	}
	spin_unlock_irqrestore(&serial->serial_lock, flags);
@@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
{
	struct mddev *mddev = rdev->mddev;
	struct serial_info *si;
	int idx = sector_to_idx(r1_bio->sector);
	struct serial_in_rdev *serial = &rdev->serial[idx];

	if (WARN_ON(!mddev->serial_info_pool))
		return;
	si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
	wait_event(serial->serial_io_wait,
		   check_and_add_serial(rdev, r1_bio, si, idx) == 0);
	INIT_LIST_HEAD(&si->waiters);
	INIT_LIST_HEAD(&si->list_node);
	init_completion(&si->ready);
	while (check_and_add_serial(rdev, r1_bio, si)) {
		wait_for_completion(&si->ready);
		reinit_completion(&si->ready);
	}
}

static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
	struct serial_info *si;
	struct serial_info *si, *iter_si;
	unsigned long flags;
	int found = 0;
	struct mddev *mddev = rdev->mddev;
@@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
	for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
	     si; si = raid1_rb_iter_next(si, lo, hi)) {
		if (si->start == lo && si->last == hi) {
			raid1_rb_remove(si, &serial->serial_rb);
			mempool_free(si, mddev->serial_info_pool);
			found = 1;
			break;
		}
	}
	if (!found)
	if (found) {
		raid1_rb_remove(si, &serial->serial_rb);
		if (!list_empty(&si->waiters)) {
			list_for_each_entry(iter_si, &si->waiters, list_node) {
				if (iter_si->wnode_start == si->wnode_start) {
					list_del_init(&iter_si->list_node);
					list_splice_init(&si->waiters, &iter_si->waiters);
					raid1_rb_insert(iter_si, &serial->serial_rb);
					complete(&iter_si->ready);
					break;
				}
			}
		}
		mempool_free(si, mddev->serial_info_pool);
	} else {
		WARN(1, "The write IO is not recorded for serialization\n");
	}
	spin_unlock_irqrestore(&serial->serial_lock, flags);
	wake_up(&serial->serial_io_wait);
}

/*
Loading