Commit 79b24810 authored by Jens Axboe's avatar Jens Axboe
Browse files

Merge tag 'md-6.18-20250909' of...

Merge tag 'md-6.18-20250909' of gitolite.kernel.org:pub/scm/linux/kernel/git/mdraid/linux into for-6.18/block

Pull MD changes from Yu Kuai:

"Redundant data is used to enhance data fault tolerance, and the storage
 method for redundant data varies depending on the RAID level. It is
 important to maintain the consistency of redundant data.

 Bitmap is used to record which data blocks have been synchronized and
 which ones need to be resynchronized or recovered. Each bit in the
 bitmap represents a segment of data in the array. When a bit is set,
 it indicates that the multiple redundant copies of that data segment
 may not be consistent. Data synchronization can be performed based on
 the bitmap after power failure or readding a disk. If there is no
 bitmap, a full disk synchronization is required.

 Due to known performance issues with md-bitmap and the unreasonable
 implementations:

 - self-managed IO submitting like filemap_write_page();
 - global spin_lock

 I have decided not to continue optimizing the current bitmap
 implementation. Instead, this new bitmap is designed without locking in
 the IO fast path, so it can be used with fast disks.

 Key features for the new bitmap:
  - IO fastpath is lockless, if user issues lots of write IO to the same
    bitmap bit in a short time, only the first write has additional
    overhead to update bitmap bit, no additional overhead for the
    following writes;
  - supports resyncing or recovering only written data, which means that
    when creating a new array or replacing a disk with a new one, there is
    no need to do a full disk resync/recovery;"

* tag 'md-6.18-20250909' of gitolite.kernel.org:pub/scm/linux/kernel/git/mdraid/linux: (24 commits)
  md/md-llbitmap: introduce new lockless bitmap
  md/md-bitmap: make method bitmap_ops->daemon_work optional
  md: add a new recovery_flag MD_RECOVERY_LAZY_RECOVER
  md/md-bitmap: add a new method blocks_synced() in bitmap_operations
  md/md-bitmap: add a new method skip_sync_blocks() in bitmap_operations
  md/md-bitmap: delay registration of bitmap_ops until creating bitmap
  md/md-bitmap: add a new sysfs api bitmap_type
  md: add a new mddev field 'bitmap_id'
  md/md-bitmap: support discard for bitmap ops
  md: factor out a helper raid_is_456()
  md: add a new parameter 'offset' to md_super_write()
  md/md-bitmap: introduce CONFIG_MD_BITMAP
  md: check before referencing mddev->bitmap_ops
  md/dm-raid: check before referencing mddev->bitmap_ops
  md/raid5: check before referencing mddev->bitmap_ops
  md/raid10: check before referencing mddev->bitmap_ops
  md/raid1: check before referencing mddev->bitmap_ops
  md/raid1: check bitmap before behind write
  md/md-bitmap: handle the case bitmap is not enabled before end_sync()
  md/md-bitmap: handle the case bitmap is not enabled before start_sync()
  ...
parents d0d1d522 5ab829f1
Loading
Loading
Loading
Loading
+61 −25
Original line number Diff line number Diff line
@@ -347,6 +347,54 @@ All md devices contain:
     active-idle
         like active, but no writes have been seen for a while (safe_mode_delay).

  consistency_policy
     This indicates how the array maintains consistency in case of unexpected
     shutdown. It can be:

     none
       Array has no redundancy information, e.g. raid0, linear.

     resync
       Full resync is performed and all redundancy is regenerated when the
       array is started after unclean shutdown.

     bitmap
       Resync assisted by a write-intent bitmap.

     journal
       For raid4/5/6, journal device is used to log transactions and replay
       after unclean shutdown.

     ppl
       For raid5 only, Partial Parity Log is used to close the write hole and
       eliminate resync.

     The accepted values when writing to this file are ``ppl`` and ``resync``,
     used to enable and disable PPL.

  uuid
     This indicates the UUID of the array in the following format:
     xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx

  bitmap_type
     [RW] When read, this file will display the current and available
     bitmaps for this array. The currently active bitmap will be enclosed
     in [] brackets. Writing a bitmap name or ID to this file will switch
     control of this array to that new bitmap. Note that switching to a new
     bitmap on an already created array is forbidden.

     none
         No bitmap
     bitmap
         The default internal bitmap
     llbitmap
         The lockless internal bitmap

If bitmap_type is not none, then additional bitmap attributes bitmap/xxx or
llbitmap/xxx will be created after the md device KOBJ_CHANGE event.

If bitmap_type is bitmap, then the md device will also contain:

  bitmap/location
     This indicates where the write-intent bitmap for the array is
     stored.
@@ -401,35 +449,23 @@ All md devices contain:
     once the array becomes non-degraded, and this fact has been
     recorded in the metadata.

  consistency_policy
     This indicates how the array maintains consistency in case of unexpected
     shutdown. It can be:

     none
       Array has no redundancy information, e.g. raid0, linear.

     resync
       Full resync is performed and all redundancy is regenerated when the
       array is started after unclean shutdown.

     bitmap
       Resync assisted by a write-intent bitmap.
If bitmap_type is llbitmap, then the md device will also contain:

     journal
       For raid4/5/6, journal device is used to log transactions and replay
       after unclean shutdown.
  llbitmap/bits
     This is read-only. It shows the status of the bitmap bits, i.e. the
     number of bits with each value.

     ppl
       For raid5 only, Partial Parity Log is used to close the write hole and
       eliminate resync.

     The accepted values when writing to this file are ``ppl`` and ``resync``,
     used to enable and disable PPL.
  llbitmap/metadata
     This is read-only. It shows the bitmap metadata, including chunksize,
     chunkshift, chunks, offset and daemon_sleep.

  uuid
     This indicates the UUID of the array in the following format:
     xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
  llbitmap/daemon_sleep
     This is read-write. It is the interval in seconds at which the daemon
     function is triggered to clear dirty bits.

  llbitmap/barrier_idle
     This is read-write. It is the time in seconds after which the page
     barrier will be idled, meaning that dirty bits in the page will be
     cleared.

As component devices are added to an md array, they appear in the ``md``
directory as new directories named::
+29 −0
Original line number Diff line number Diff line
@@ -37,6 +37,32 @@ config BLK_DEV_MD

	  If unsure, say N.

config MD_BITMAP
	bool "MD RAID bitmap support"
	default y
	depends on BLK_DEV_MD
	help
	  If you say Y here, support for the write intent bitmap will be
	  enabled. The bitmap can be used to optimize resync speed after power
	  failure or readding a disk, limiting it to recorded dirty sectors in
	  bitmap.

	  This feature can be added to existing MD array or MD array can be
	  created with bitmap via mdadm(8).

	  If unsure, say Y.

config MD_LLBITMAP
	bool "MD RAID lockless bitmap support"
	depends on BLK_DEV_MD
	help
	  If you say Y here, support for the lockless write intent bitmap will
	  be enabled.

	  Note, this is an experimental feature.

	  If unsure, say N.

config MD_AUTODETECT
	bool "Autodetect RAID arrays during kernel boot"
	depends on BLK_DEV_MD=y
@@ -54,6 +80,7 @@ config MD_AUTODETECT
config MD_BITMAP_FILE
	bool "MD bitmap file support (deprecated)"
	default y
	depends on MD_BITMAP
	help
	  If you say Y here, support for write intent bitmaps in files on an
	  external file system is enabled.  This is an alternative to the internal
@@ -174,6 +201,7 @@ config MD_RAID456

config MD_CLUSTER
	tristate "Cluster Support for MD"
	select MD_BITMAP
	depends on BLK_DEV_MD
	depends on DLM
	default n
@@ -393,6 +421,7 @@ config DM_RAID
       select MD_RAID1
       select MD_RAID10
       select MD_RAID456
       select MD_BITMAP
       select BLK_DEV_MD
	help
	 A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
+3 −1
Original line number Diff line number Diff line
@@ -27,7 +27,9 @@ dm-clone-y += dm-clone-target.o dm-clone-metadata.o
dm-verity-y	+= dm-verity-target.o
dm-zoned-y	+= dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o

md-mod-y	+= md.o md-bitmap.o
md-mod-y	+= md.o
md-mod-$(CONFIG_MD_BITMAP)	+= md-bitmap.o
md-mod-$(CONFIG_MD_LLBITMAP)	+= md-llbitmap.o
raid456-y	+= raid5.o raid5-cache.o raid5-ppl.o
linear-y       += md-linear.o

+11 −7
Original line number Diff line number Diff line
@@ -3953,10 +3953,12 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
	    !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
		struct mddev *mddev = &rs->md;

		if (md_bitmap_enabled(mddev, false)) {
			r = mddev->bitmap_ops->load(mddev);
			if (r)
				DMERR("Failed to load bitmap");
		}
	}

	return r;
}
@@ -4070,11 +4072,13 @@ static int raid_preresume(struct dm_target *ti)
	       mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
		int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;

		if (md_bitmap_enabled(mddev, false)) {
			r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
					      chunksize, false);
						      chunksize);
			if (r)
				DMERR("Failed to resize bitmap");
		}
	}

	/* Check for any resize/reshape on @rs and adjust/initiate */
	if (mddev->resync_offset && mddev->resync_offset < MaxSector) {
+44 −45
Original line number Diff line number Diff line
@@ -34,15 +34,6 @@
#include "md-bitmap.h"
#include "md-cluster.h"

#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
 * with version 3, it is host-endian which is non-portable
 * Version 5 is currently set only for clustered devices
 */
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define	BITMAP_MAJOR_HOSTENDIAN 3

/*
 * in-memory bitmap:
 *
@@ -224,6 +215,8 @@ struct bitmap {
	int cluster_slot;
};

static struct workqueue_struct *md_bitmap_wq;

static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
			   int chunksize, bool init);

@@ -232,20 +225,19 @@ static inline char *bmname(struct bitmap *bitmap)
	return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
}

static bool __bitmap_enabled(struct bitmap *bitmap)
static bool bitmap_enabled(void *data, bool flush)
{
	return bitmap->storage.filemap &&
	       !test_bit(BITMAP_STALE, &bitmap->flags);
}

static bool bitmap_enabled(struct mddev *mddev)
{
	struct bitmap *bitmap = mddev->bitmap;
	struct bitmap *bitmap = data;

	if (!bitmap)
		return false;
	if (!flush)
		return true;

	return __bitmap_enabled(bitmap);
	/*
	 * If the caller wants to flush bitmap pages to the underlying disks,
	 * check whether there are cached pages in the filemap.
	 */
	return !test_bit(BITMAP_STALE, &bitmap->flags) &&
	       bitmap->storage.filemap != NULL;
}

/*
@@ -484,7 +476,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
			return -EINVAL;
	}

	md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page);
	md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit),
			  page, 0);
	return 0;
}

@@ -1244,7 +1237,7 @@ static void __bitmap_unplug(struct bitmap *bitmap)
	int dirty, need_write;
	int writing = 0;

	if (!__bitmap_enabled(bitmap))
	if (!bitmap_enabled(bitmap, true))
		return;

	/* look at each page to see if there are any set bits that need to be
@@ -1788,15 +1781,9 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
				sector_t *blocks, bool degraded)
{
	bitmap_counter_t *bmc;
	bool rv;
	bool rv = false;

	if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
		*blocks = 1024;
		return true; /* always resync if no bitmap */
	}
	spin_lock_irq(&bitmap->counts.lock);

	rv = false;
	bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
	if (bmc) {
		/* locked */
@@ -1845,10 +1832,6 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
	bitmap_counter_t *bmc;
	unsigned long flags;

	if (bitmap == NULL) {
		*blocks = 1024;
		return;
	}
	spin_lock_irqsave(&bitmap->counts.lock, flags);
	bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
	if (bmc == NULL)
@@ -2060,9 +2043,6 @@ static void bitmap_start_behind_write(struct mddev *mddev)
	struct bitmap *bitmap = mddev->bitmap;
	int bw;

	if (!bitmap)
		return;

	atomic_inc(&bitmap->behind_writes);
	bw = atomic_read(&bitmap->behind_writes);
	if (bw > bitmap->behind_writes_used)
@@ -2076,9 +2056,6 @@ static void bitmap_end_behind_write(struct mddev *mddev)
{
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap)
		return;

	if (atomic_dec_and_test(&bitmap->behind_writes))
		wake_up(&bitmap->behind_wait);
	pr_debug("dec write-behind count %d/%lu\n",
@@ -2593,15 +2570,14 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
	return ret;
}

static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
			 bool init)
static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap)
		return 0;

	return __bitmap_resize(bitmap, blocks, chunksize, init);
	return __bitmap_resize(bitmap, blocks, chunksize, false);
}

static ssize_t
@@ -2990,12 +2966,19 @@ static struct attribute *md_bitmap_attrs[] = {
	&max_backlog_used.attr,
	NULL
};
const struct attribute_group md_bitmap_group = {

static struct attribute_group md_bitmap_group = {
	.name = "bitmap",
	.attrs = md_bitmap_attrs,
};

static struct bitmap_operations bitmap_ops = {
	.head = {
		.type	= MD_BITMAP,
		.id	= ID_BITMAP,
		.name	= "bitmap",
	},

	.enabled		= bitmap_enabled,
	.create			= bitmap_create,
	.resize			= bitmap_resize,
@@ -3013,6 +2996,9 @@ static struct bitmap_operations bitmap_ops = {

	.start_write		= bitmap_start_write,
	.end_write		= bitmap_end_write,
	.start_discard		= bitmap_start_write,
	.end_discard		= bitmap_end_write,

	.start_sync		= bitmap_start_sync,
	.end_sync		= bitmap_end_sync,
	.cond_end_sync		= bitmap_cond_end_sync,
@@ -3026,9 +3012,22 @@ static struct bitmap_operations bitmap_ops = {
	.copy_from_slot		= bitmap_copy_from_slot,
	.set_pages		= bitmap_set_pages,
	.free			= md_bitmap_free,

	.group			= &md_bitmap_group,
};

void mddev_set_bitmap_ops(struct mddev *mddev)
/*
 * md_bitmap_init() - set up the md bitmap submodule.
 *
 * Allocates the "md_bitmap" workqueue and then registers bitmap_ops via
 * register_md_submodule().
 *
 * Return: -ENOMEM if the workqueue cannot be allocated, otherwise the value
 * returned by register_md_submodule() (assumed to be non-zero on failure,
 * following the usual kernel convention).
 */
int md_bitmap_init(void)
{
	int ret;

	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
				       0);
	if (!md_bitmap_wq)
		return -ENOMEM;

	ret = register_md_submodule(&bitmap_ops.head);
	if (ret)
		/* Do not leak the workqueue when registration fails. */
		destroy_workqueue(md_bitmap_wq);

	return ret;
}

void md_bitmap_exit(void)
{
	mddev->bitmap_ops = &bitmap_ops;
	destroy_workqueue(md_bitmap_wq);
	unregister_md_submodule(&bitmap_ops.head);
}
Loading