Commit 12c612e1 authored by Jens Axboe
Browse files

Merge tag 'md-6.12-20240829' of...

Merge tag 'md-6.12-20240829' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.12/block

Pull MD updates from Song:

"Major changes in this set are:

 1. md-bitmap refactoring, by Yu Kuai;
 2. raid5 performance optimization, by Artur Paszkiewicz;
 3. Other small fixes, by Yu Kuai and Chen Ni."

* tag 'md-6.12-20240829' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md: (49 commits)
  md/raid5: rename wait_for_overlap to wait_for_reshape
  md/raid5: only add to wq if reshape is in progress
  md/raid5: use wait_on_bit() for R5_Overlap
  md: Remove flush handling
  md/md-bitmap: make in memory structure internal
  md/md-bitmap: merge md_bitmap_enabled() into bitmap_operations
  md/md-bitmap: merge md_bitmap_wait_behind_writes() into bitmap_operations
  md/md-bitmap: merge md_bitmap_free() into bitmap_operations
  md/md-bitmap: merge md_bitmap_set_pages() into struct bitmap_operations
  md/md-bitmap: merge md_bitmap_copy_from_slot() into struct bitmap_operation.
  md/md-bitmap: merge get_bitmap_from_slot() into bitmap_operations
  md/md-bitmap: merge md_bitmap_resize() into bitmap_operations
  md/md-bitmap: pass in mddev directly for md_bitmap_resize()
  md/md-bitmap: merge md_bitmap_daemon_work() into bitmap_operations
  md/md-bitmap: merge bitmap_unplug() into bitmap_operations
  md/md-bitmap: merge md_bitmap_unplug_async() into md_bitmap_unplug()
  md/md-bitmap: merge md_bitmap_sync_with_cluster() into bitmap_operations
  md/md-bitmap: merge md_bitmap_cond_end_sync() into bitmap_operations
  md/md-bitmap: merge md_bitmap_close_sync() into bitmap_operations
  md/md-bitmap: merge md_bitmap_end_sync() into bitmap_operations
  ...
parents 12515809 fb16787b
Loading
Loading
Loading
Loading
+5 −2
Original line number Diff line number Diff line
@@ -3949,7 +3949,9 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
	/* Try loading the bitmap unless "raid0", which does not have one */
	if (!rs_is_raid0(rs) &&
	    !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
		r = md_bitmap_load(&rs->md);
		struct mddev *mddev = &rs->md;

		r = mddev->bitmap_ops->load(mddev);
		if (r)
			DMERR("Failed to load bitmap");
	}
@@ -4066,7 +4068,8 @@ static int raid_preresume(struct dm_target *ti)
	       mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
		int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;

		r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0);
		r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
					      chunksize, false);
		if (r)
			DMERR("Failed to resize bitmap");
	}
+432 −136

File changed.

Preview size limit exceeded, changes collapsed.

+48 −220
Original line number Diff line number Diff line
@@ -7,81 +7,7 @@
#ifndef BITMAP_H
#define BITMAP_H 1

#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
 * with version 3, it is host-endian which is non-portable
 * Version 5 is currently set only for clustered devices
 */
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define	BITMAP_MAJOR_HOSTENDIAN 3

/*
 * in-memory bitmap:
 *
 * Use 16 bit block counters to track pending writes to each "chunk".
 * The 2 high order bits are special-purpose, the first is a flag indicating
 * whether a resync is needed.  The second is a flag indicating whether a
 * resync is active.
 * This means that the counter is actually 14 bits:
 *
 * +--------+--------+------------------------------------------------+
 * | resync | resync |               counter                          |
 * | needed | active |                                                |
 * |  (0-1) |  (0-1) |              (0-16383)                         |
 * +--------+--------+------------------------------------------------+
 *
 * The "resync needed" bit is set when:
 *    a '1' bit is read from storage at startup.
 *    a write request fails on some drives
 *    a resync is aborted on a chunk with 'resync active' set
 * It is cleared (and resync-active set) when a resync starts across all drives
 * of the chunk.
 *
 *
 * The "resync active" bit is set when:
 *    a resync is started on all drives, and resync_needed is set.
 *       resync_needed will be cleared (as long as resync_active wasn't already set).
 * It is cleared when a resync completes.
 *
 * The counter counts pending write requests, plus the on-disk bit.
 * When the counter is '1' and the resync bits are clear, the on-disk
 * bit can be cleared as well, thus setting the counter to 0.
 * When we set a bit, or in the counter (to start a write), if the fields is
 * 0, we first set the disk bit and set the counter to 1.
 *
 * If the counter is 0, the on-disk bit is clear and the stripe is clean
 * Anything that dirties the stripe pushes the counter to 2 (at least)
 * and sets the on-disk bit (lazily).
 * If a periodic sweep find the counter at 2, it is decremented to 1.
 * If the sweep find the counter at 1, the on-disk bit is cleared and the
 * counter goes to zero.
 *
 * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
 * counters as a fallback when "page" memory cannot be allocated:
 *
 * Normal case (page memory allocated):
 *
 *     page pointer (32-bit)
 *
 *     [ ] ------+
 *               |
 *               +-------> [   ][   ]..[   ] (4096 byte page == 2048 counters)
 *                          c1   c2    c2048
 *
 * Hijacked case (page memory allocation failed):
 *
 *     hijacked page pointer (32-bit)
 *
 *     [		  ][		  ] (no page memory allocated)
 *      counter #1 (16-bit) counter #2 (16-bit)
 *
 */

#ifdef __KERNEL__

#define PAGE_BITS (PAGE_SIZE << 3)
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
#define BITMAP_MAGIC 0x6d746962

typedef __u16 bitmap_counter_t;
#define COUNTER_BITS 16
@@ -91,26 +17,6 @@ typedef __u16 bitmap_counter_t;
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)

/* how many counters per page? */
#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)

#define BITMAP_BLOCK_SHIFT 9

#endif

/*
 * bitmap structures:
 */

#define BITMAP_MAGIC 0x6d746962

/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
@@ -152,136 +58,58 @@ typedef struct bitmap_super_s {
 *    devices.  For raid10 it is the size of the array.
 */

#ifdef __KERNEL__
struct md_bitmap_stats {
	u64		events_cleared;
	int		behind_writes;
	bool		behind_wait;

/* the in-memory bitmap is represented by bitmap_pages */
struct bitmap_page {
	/*
	 * map points to the actual memory page
	 */
	char *map;
	/*
	 * in emergencies (when map cannot be alloced), hijack the map
	 * pointer and use it as two counters itself
	 */
	unsigned int hijacked:1;
	/*
	 * If any counter in this page is '1' or '2' - and so could be
	 * cleared then that page is marked as 'pending'
	 */
	unsigned int pending:1;
	/*
	 * count of dirty bits on the page
	 */
	unsigned int  count:30;
};

/* the main bitmap structure - one per mddev */
struct bitmap {

	struct bitmap_counts {
		spinlock_t lock;
		struct bitmap_page *bp;
		unsigned long pages;		/* total number of pages
						 * in the bitmap */
		unsigned long missing_pages;	/* number of pages
						 * not yet allocated */
		unsigned long chunkshift;	/* chunksize = 2^chunkshift
						 * (for bitops) */
		unsigned long chunks;		/* Total number of data
						 * chunks for the array */
	} counts;

	struct mddev *mddev; /* the md device that the bitmap is for */

	__u64	events_cleared;
	int need_sync;

	struct bitmap_storage {
		struct file *file;		/* backing disk file */
		struct page *sb_page;		/* cached copy of the bitmap
						 * file superblock */
		unsigned long sb_index;
		struct page **filemap;		/* list of cache pages for
						 * the file */
		unsigned long *filemap_attr;	/* attributes associated
						 * w/ filemap pages */
		unsigned long file_pages;	/* number of pages in the file*/
		unsigned long bytes;		/* total bytes in the bitmap */
	} storage;

	unsigned long flags;

	int allclean;

	atomic_t behind_writes;
	unsigned long behind_writes_used; /* highest actual value at runtime */

	/*
	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
	 * file, cleaning up bits and flushing out pages to disk as necessary
	 */
	unsigned long daemon_lastrun; /* jiffies of last run */
	unsigned long last_end_sync; /* when we lasted called end_sync to
				      * update bitmap with resync progress */

	atomic_t pending_writes; /* pending writes to the bitmap file */
	wait_queue_head_t write_wait;
	wait_queue_head_t overflow_wait;
	wait_queue_head_t behind_wait;

	struct kernfs_node *sysfs_can_clear;
	int cluster_slot;		/* Slot offset for clustered env */
	unsigned long	missing_pages;
	unsigned long	file_pages;
	unsigned long	sync_size;
	unsigned long	pages;
	struct file	*file;
};

/* the bitmap API */

/* these are used only by md/bitmap */
struct bitmap *md_bitmap_create(struct mddev *mddev, int slot);
int md_bitmap_load(struct mddev *mddev);
void md_bitmap_flush(struct mddev *mddev);
void md_bitmap_destroy(struct mddev *mddev);

void md_bitmap_print_sb(struct bitmap *bitmap);
void md_bitmap_update_sb(struct bitmap *bitmap);
void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap);

int  md_bitmap_setallbits(struct bitmap *bitmap);
void md_bitmap_write_all(struct bitmap *bitmap);

void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);

/* these are exported */
int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
			 unsigned long sectors, int behind);
void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
			unsigned long sectors, int success, int behind);
int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
void md_bitmap_close_sync(struct bitmap *bitmap);
void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
void md_bitmap_sync_with_cluster(struct mddev *mddev,
/*
 * Table of callbacks for an mddev's bitmap.
 *
 * Callers reach these through mddev->bitmap_ops, e.g.
 * mddev->bitmap_ops->load(mddev).
 *
 * Most callbacks take the owning mddev.  The ones declared with a
 * 'void *data' parameter take an opaque bitmap handle instead; the
 * md-cluster callers pass either mddev->bitmap or the value returned by
 * ->get_from_slot().
 */
struct bitmap_operations {
	bool (*enabled)(struct mddev *mddev);
	int (*create)(struct mddev *mddev, int slot);
	/* Callers (dm-raid, md-cluster) treat a non-zero return as failure. */
	int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize,
		      bool init);

	/* dm-raid logs "Failed to load bitmap" when this returns non-zero. */
	int (*load)(struct mddev *mddev);
	void (*destroy)(struct mddev *mddev);
	void (*flush)(struct mddev *mddev);
	void (*write_all)(struct mddev *mddev);
	void (*dirty_bits)(struct mddev *mddev, unsigned long s,
			   unsigned long e);
	/*
	 * NOTE(review): 'sync' presumably selects synchronous vs. asynchronous
	 * unplug - confirm against the implementation.
	 */
	void (*unplug)(struct mddev *mddev, bool sync);
	void (*daemon_work)(struct mddev *mddev);
	void (*wait_behind_writes)(struct mddev *mddev);

	int (*startwrite)(struct mddev *mddev, sector_t offset,
			  unsigned long sectors, bool behind);
	void (*endwrite)(struct mddev *mddev, sector_t offset,
			 unsigned long sectors, bool success, bool behind);
	bool (*start_sync)(struct mddev *mddev, sector_t offset,
			   sector_t *blocks, bool degraded);
	void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
	void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force);
	void (*close_sync)(struct mddev *mddev);

	/* 'data' is an opaque bitmap handle (see the comment above the struct). */
	void (*update_sb)(void *data);
	/*
	 * 'data' is an opaque bitmap handle.  md-cluster reads fields of
	 * *stats (pages, sync_size) only after a zero return and treats a
	 * non-zero return as an error.
	 */
	int (*get_stats)(void *data, struct md_bitmap_stats *stats);

	void (*sync_with_cluster)(struct mddev *mddev,
				  sector_t old_lo, sector_t old_hi,
				  sector_t new_lo, sector_t new_hi);
	/*
	 * md-cluster checks the returned handle with IS_ERR() and releases it
	 * with ->free() when done.
	 */
	void *(*get_from_slot)(struct mddev *mddev, int slot);
	/* md-cluster treats a non-zero return as failure. */
	int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo,
			      sector_t *hi, bool clear_bits);
	/* 'data' is an opaque bitmap handle, as for ->get_stats(). */
	void (*set_pages)(void *data, unsigned long pages);
	/* Releases a handle; md-cluster uses it on ->get_from_slot() results. */
	void (*free)(void *data);
};

void md_bitmap_unplug(struct bitmap *bitmap);
void md_bitmap_unplug_async(struct bitmap *bitmap);
void md_bitmap_daemon_work(struct mddev *mddev);

int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
		     int chunksize, int init);
struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
			     sector_t *lo, sector_t *hi, bool clear_bits);
void md_bitmap_free(struct bitmap *bitmap);
void md_bitmap_wait_behind_writes(struct mddev *mddev);

static inline bool md_bitmap_enabled(struct bitmap *bitmap)
{
	return bitmap && bitmap->storage.filemap &&
	       !test_bit(BITMAP_STALE, &bitmap->flags);
}

#endif
/* the bitmap API */
void mddev_set_bitmap_ops(struct mddev *mddev);

#endif
+53 −38
Original line number Diff line number Diff line
@@ -317,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread)
					str, ret);
			goto clear_bit;
		}
		ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
		ret = mddev->bitmap_ops->copy_from_slot(mddev, slot, &lo, &hi, true);
		if (ret) {
			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
			goto clear_bit;
@@ -497,7 +497,7 @@ static void process_suspend_info(struct mddev *mddev,
	 * we don't want to trigger lots of WARN.
	 */
	if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
		md_bitmap_sync_with_cluster(mddev, cinfo->sync_low,
		mddev->bitmap_ops->sync_with_cluster(mddev, cinfo->sync_low,
						     cinfo->sync_hi, lo, hi);
	cinfo->sync_low = lo;
	cinfo->sync_hi = hi;
@@ -628,8 +628,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
		break;
	case BITMAP_RESIZE:
		if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
			ret = md_bitmap_resize(mddev->bitmap,
					    le64_to_cpu(msg->high), 0, 0);
			ret = mddev->bitmap_ops->resize(mddev,
							le64_to_cpu(msg->high),
							0, false);
		break;
	default:
		ret = -1;
@@ -856,7 +857,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
		}

		/* Read the disk bitmap sb and check if it needs recovery */
		ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
		ret = mddev->bitmap_ops->copy_from_slot(mddev, i, &lo, &hi, false);
		if (ret) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
			lockres_free(bm_lockres);
@@ -1143,13 +1144,16 @@ static int update_bitmap_size(struct mddev *mddev, sector_t size)

static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
{
	struct bitmap_counts *counts;
	char str[64];
	struct dlm_lock_resource *bm_lockres;
	struct bitmap *bitmap = mddev->bitmap;
	unsigned long my_pages = bitmap->counts.pages;
	void *bitmap = mddev->bitmap;
	struct md_bitmap_stats stats;
	unsigned long my_pages;
	int i, rv;

	rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
	if (rv)
		return rv;

	my_pages = stats.pages;
	/*
	 * We need to ensure all the nodes can grow to a larger
	 * bitmap size before make the reshaping.
@@ -1159,17 +1163,22 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
		return rv;

	for (i = 0; i < mddev->bitmap_info.nodes; i++) {
		struct dlm_lock_resource *bm_lockres;
		char str[64];

		if (i == md_cluster_ops->slot_number(mddev))
			continue;

		bitmap = get_bitmap_from_slot(mddev, i);
		bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
		if (IS_ERR(bitmap)) {
			pr_err("can't get bitmap from slot %d\n", i);
			bitmap = NULL;
			goto out;
		}
		counts = &bitmap->counts;

		rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
		if (rv)
			goto out;
		/*
		 * If we can hold the bitmap lock of one node then
		 * the slot is not occupied, update the pages.
@@ -1183,21 +1192,21 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
		bm_lockres->flags |= DLM_LKF_NOQUEUE;
		rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (!rv)
			counts->pages = my_pages;
			mddev->bitmap_ops->set_pages(bitmap, my_pages);
		lockres_free(bm_lockres);

		if (my_pages != counts->pages)
		if (my_pages != stats.pages)
			/*
			 * Let's revert the bitmap size if one node
			 * can't resize bitmap
			 */
			goto out;
		md_bitmap_free(bitmap);
		mddev->bitmap_ops->free(bitmap);
	}

	return 0;
out:
	md_bitmap_free(bitmap);
	mddev->bitmap_ops->free(bitmap);
	update_bitmap_size(mddev, oldsize);
	return -1;
}
@@ -1207,24 +1216,27 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
 */
static int cluster_check_sync_size(struct mddev *mddev)
{
	int i, rv;
	bitmap_super_t *sb;
	unsigned long my_sync_size, sync_size = 0;
	int node_num = mddev->bitmap_info.nodes;
	int current_slot = md_cluster_ops->slot_number(mddev);
	struct bitmap *bitmap = mddev->bitmap;
	char str[64];
	int node_num = mddev->bitmap_info.nodes;
	struct dlm_lock_resource *bm_lockres;
	struct md_bitmap_stats stats;
	void *bitmap = mddev->bitmap;
	unsigned long sync_size = 0;
	unsigned long my_sync_size;
	char str[64];
	int i, rv;

	sb = kmap_atomic(bitmap->storage.sb_page);
	my_sync_size = sb->sync_size;
	kunmap_atomic(sb);
	rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
	if (rv)
		return rv;

	my_sync_size = stats.sync_size;

	for (i = 0; i < node_num; i++) {
		if (i == current_slot)
			continue;

		bitmap = get_bitmap_from_slot(mddev, i);
		bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
		if (IS_ERR(bitmap)) {
			pr_err("can't get bitmap from slot %d\n", i);
			return -1;
@@ -1238,25 +1250,28 @@ static int cluster_check_sync_size(struct mddev *mddev)
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres) {
			pr_err("md-cluster: Cannot initialize %s\n", str);
			md_bitmap_free(bitmap);
			mddev->bitmap_ops->free(bitmap);
			return -1;
		}
		bm_lockres->flags |= DLM_LKF_NOQUEUE;
		rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (!rv)
			md_bitmap_update_sb(bitmap);
			mddev->bitmap_ops->update_sb(bitmap);
		lockres_free(bm_lockres);

		sb = kmap_atomic(bitmap->storage.sb_page);
		if (sync_size == 0)
			sync_size = sb->sync_size;
		else if (sync_size != sb->sync_size) {
			kunmap_atomic(sb);
			md_bitmap_free(bitmap);
		rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
		if (rv) {
			mddev->bitmap_ops->free(bitmap);
			return rv;
		}

		if (sync_size == 0) {
			sync_size = stats.sync_size;
		} else if (sync_size != stats.sync_size) {
			mddev->bitmap_ops->free(bitmap);
			return -1;
		}
		kunmap_atomic(sb);
		md_bitmap_free(bitmap);
		mddev->bitmap_ops->free(bitmap);
	}

	return (my_sync_size == sync_size) ? 0 : -1;
@@ -1585,7 +1600,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
	for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
		if (sn == (cinfo->slot_number - 1))
			continue;
		err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
		err = mddev->bitmap_ops->copy_from_slot(mddev, sn, &lo, &hi, false);
		if (err) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
			goto out;
+115 −179

File changed.

Preview size limit exceeded, changes collapsed.

Loading