Commit 62ed1b58 authored by Li Nan's avatar Li Nan Committed by Yu Kuai
Browse files

md: allow configuring logical block size

Previously, raid array used the maximum logical block size (LBS)
of all member disks. Adding a larger LBS disk at runtime could
unexpectedly increase RAID's LBS, risking corruption of existing
partitions. This can be reproduced by:

```
  # LBS of sd[de] is 512 bytes, sdf is 4096 bytes.
  mdadm -CRq /dev/md0 -l1 -n3 /dev/sd[de] missing --assume-clean

  # LBS is 512
  cat /sys/block/md0/queue/logical_block_size

  # create partition md0p1
  parted -s /dev/md0 mklabel gpt mkpart primary 1MiB 100%
  lsblk | grep md0p1

  # LBS becomes 4096 after adding sdf
  mdadm --add -q /dev/md0 /dev/sdf
  cat /sys/block/md0/queue/logical_block_size

  # partition lost
  partprobe /dev/md0
  lsblk | grep md0p1
```

Simply restricting larger-LBS disks is inflexible. In some scenarios,
only disks with 512 bytes LBS are available currently, but later, disks
with 4KB LBS may be added to the array.

Making LBS configurable is the best way to solve this scenario.
After this patch, the raid will:
  - store LBS in disk metadata
  - add a read-write sysfs 'mdX/logical_block_size'

Future mdadm should support setting LBS via metadata field during RAID
creation and the new sysfs. Though the kernel allows runtime LBS changes,
users should avoid modifying it after creating partitions or filesystems
to prevent compatibility issues.

Only 1.x metadata supports configurable LBS. 0.90 metadata inits all
fields to default values at auto-detect. Supporting 0.90 would require
more extensive changes and no such use case has been observed.

Note that many RAID paths rely on PAGE_SIZE alignment, including for
metadata I/O. A larger LBS than PAGE_SIZE will result in metadata
read/write failures. So this config should be prevented.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-6-linan666@huaweicloud.com


Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
parent 9c47127a
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -238,6 +238,16 @@ All md devices contain:
     the number of devices in a raid4/5/6, or to support external
     metadata formats which mandate such clipping.

  logical_block_size
     Configure the array's logical block size in bytes. This attribute
     is only supported for 1.x metadata. Write the value before starting
     the array. The final array LBS is the maximum of this
     configuration and the LBS of all combined devices. Note that
     LBS cannot exceed PAGE_SIZE until RAID supports folios.
     WARNING: Arrays created on a new kernel cannot be assembled on an
     old kernel due to the padding check. Set the module parameter
     'check_new_feature' to false to bypass it, but data loss may occur.

  reshape_position
     This is either ``none`` or a sector number within the devices of
     the array where ``reshape`` is up to.  If this is set, the three
+1 −0
Original line number Diff line number Diff line
@@ -72,6 +72,7 @@ static int linear_set_limits(struct mddev *mddev)

	md_init_stacking_limits(&lim);
	lim.max_hw_sectors = mddev->chunk_sectors;
	lim.logical_block_size = mddev->logical_block_size;
	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
	lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
	lim.io_min = mddev->chunk_sectors << 9;
+77 −0
Original line number Diff line number Diff line
@@ -1999,6 +1999,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->logical_block_size = le32_to_cpu(sb->logical_block_size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
@@ -2208,6 +2209,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
	sb->logical_block_size = cpu_to_le32(mddev->logical_block_size);
	if (test_bit(FailFast, &rdev->flags))
		sb->devflags |= FailFast1;
	else
@@ -5936,6 +5938,68 @@ static struct md_sysfs_entry md_serialize_policy =
__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
       serialize_policy_store);

/*
 * Set the logical block size of @mddev's request queue to @lbs bytes and
 * record the new value in mddev->logical_block_size.
 *
 * @lbs must be strictly larger than the queue's current logical block size;
 * otherwise -EINVAL is returned and nothing is changed.  If committing the
 * new queue limits fails, that error is returned and
 * mddev->logical_block_size is left untouched.  On success,
 * MD_SB_CHANGE_DEVS is set in mddev->sb_flags.
 *
 * Returns 0 on success or a negative errno.
 */
static int mddev_set_logical_block_size(struct mddev *mddev,
				unsigned int lbs)
{
	unsigned int cur_lbs = queue_logical_block_size(mddev->gendisk->queue);
	struct queue_limits lim;
	int err;

	if (cur_lbs >= lbs) {
		/*
		 * Print the current LBS: it is the bound the message refers
		 * to, not the value the caller asked for.
		 */
		pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n",
		       mdname(mddev), cur_lbs);
		return -EINVAL;
	}

	lim = queue_limits_start_update(mddev->gendisk->queue);
	lim.logical_block_size = lbs;
	pr_info("%s: logical_block_size is changed, data may be lost\n",
		mdname(mddev));
	err = queue_limits_commit_update(mddev->gendisk->queue, &lim);
	if (err)
		return err;

	mddev->logical_block_size = lbs;
	/* New lbs will be written to superblock after array is running */
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	return 0;
}

/*
 * sysfs show handler for 'logical_block_size': formats
 * mddev->logical_block_size as an unsigned decimal followed by a newline
 * into @page and returns the number of characters written.
 */
static ssize_t
lbs_show(struct mddev *mddev, char *page)
{
	unsigned int cur_lbs = mddev->logical_block_size;

	return sprintf(page, "%u\n", cur_lbs);
}

/*
 * sysfs store handler for 'logical_block_size'.
 *
 * Parses @buf as an unsigned decimal and applies it through
 * mddev_set_logical_block_size() while holding the mddev lock.
 *
 * Returns @len on success, or a negative errno:
 *   -EINVAL  metadata major version is 0, or @buf is not a valid number
 *   -EBUSY   the array already has a personality (mddev->pers is set)
 *   other    whatever mddev_lock() or mddev_set_logical_block_size() returns
 */
static ssize_t
lbs_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned int lbs;
	int err;

	/* Only 1.x meta supports configurable LBS */
	if (mddev->major_version == 0)
		return -EINVAL;

	if (mddev->pers)
		return -EBUSY;

	err = kstrtouint(buf, 10, &lbs);
	if (err < 0)
		return -EINVAL;

	/*
	 * If taking the lock fails it is not held, so return directly
	 * instead of falling into an unlock of a lock we do not own.
	 */
	err = mddev_lock(mddev);
	if (err)
		return err;

	err = mddev_set_logical_block_size(mddev, lbs);
	mddev_unlock(mddev);

	return err ?: len;
}

/*
 * sysfs attribute 'logical_block_size' (mode 0644): reads go through
 * lbs_show(), writes go through lbs_store().
 */
static struct md_sysfs_entry md_logical_block_size =
__ATTR(logical_block_size, 0644, lbs_show, lbs_store);

static struct attribute *md_default_attrs[] = {
	&md_level.attr,
@@ -5958,6 +6022,7 @@ static struct attribute *md_default_attrs[] = {
	&md_consistency_policy.attr,
	&md_fail_last_dev.attr,
	&md_serialize_policy.attr,
	&md_logical_block_size.attr,
	NULL,
};

@@ -6088,6 +6153,17 @@ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
			return -EINVAL;
	}

	/*
	 * Until RAID supports folios, the logical_block_size
	 * must not be larger than the page size.
	 */
	if (lim->logical_block_size > PAGE_SIZE) {
		pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n",
			mdname(mddev));
		return -EINVAL;
	}
	mddev->logical_block_size = lim->logical_block_size;

	return 0;
}
EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
@@ -6699,6 +6775,7 @@ static void md_clean(struct mddev *mddev)
	mddev->chunk_sectors = 0;
	mddev->ctime = mddev->utime = 0;
	mddev->layout = 0;
	mddev->logical_block_size = 0;
	mddev->max_disks = 0;
	mddev->events = 0;
	mddev->can_decrease_events = 0;
+1 −0
Original line number Diff line number Diff line
@@ -433,6 +433,7 @@ struct mddev {
	sector_t			array_sectors; /* exported array size */
	int				external_size; /* size managed
							* externally */
	unsigned int			logical_block_size;
	__u64				events;
	/* If the last 'event' was simply a clean->dirty transition, and
	 * we didn't write it to the spares, then it is safe and simple
+1 −0
Original line number Diff line number Diff line
@@ -380,6 +380,7 @@ static int raid0_set_limits(struct mddev *mddev)
	lim.max_hw_sectors = mddev->chunk_sectors;
	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
	lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
	lim.logical_block_size = mddev->logical_block_size;
	lim.io_min = mddev->chunk_sectors << 9;
	lim.io_opt = lim.io_min * mddev->raid_disks;
	lim.chunk_sectors = mddev->chunk_sectors;
Loading