Commit 418b3e64 authored by Benjamin Marzinski's avatar Benjamin Marzinski Committed by Yu Kuai
Browse files

md/raid5: Fix UAF on IO across the reshape position

If make_stripe_request() returns STRIPE_WAIT_RESHAPE,
raid5_make_request() will free the cloned bio. But raid5_make_request()
can call make_stripe_request() multiple times, writing to the various
stripes. If that bio got added to the toread or towrite lists of a
stripe disk in an earlier call to make_stripe_request(), then it's not
safe to just free the bio if a later part of it is found to cross the
reshape position. Doing so can lead to a UAF error, when bio_endio()
is called on the bio for the earlier stripes.

Instead, raid5_make_request() needs to wait until all parts of the bio
have called bio_endio(). To do this, bios that cross the reshape
position while the reshape can't make progress are flagged as needing to
wait for all parts to complete. When raid5_make_request() has a bio that
failed make_stripe_request() with STRIPE_WAIT_RESHAPE, it sets
bi->bi_private to a completion struct and waits for completion after
ending the bio.  When the bio_endio() is called for the last time on a
clone bio with bi->bi_private set, it wakes up the waiter. This
guarantees that raid5_make_request() doesn't return until the cloned bio
needing a retry for io across the reshape boundary is safely cleaned up.

There is a simple reproducer available at [1]. Compile the kernel with
KASAN for more useful reporting when the error is triggered (this is not
necessary to see the bug).

[1] https://gist.github.com/bmarzins/e48598824305cf2171289e47d7241fa5



Signed-off-by: default avatarBenjamin Marzinski <bmarzins@redhat.com>
Reviewed-by: default avatarXiao Ni <xni@redhat.com>
Link: https://lore.kernel.org/r/20260408043548.1695157-1-bmarzins@redhat.com


Signed-off-by: default avatarYu Kuai <yukuai@fnnas.com>
parent 0898a817
Loading
Loading
Loading
Loading
+8 −23
Original line number Diff line number Diff line
@@ -9341,9 +9341,11 @@ static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)

static void md_end_clone_io(struct bio *bio)
{
	struct md_io_clone *md_io_clone = bio->bi_private;
	struct md_io_clone *md_io_clone = container_of(bio, struct md_io_clone,
						       bio_clone);
	struct bio *orig_bio = md_io_clone->orig_bio;
	struct mddev *mddev = md_io_clone->mddev;
	struct completion *reshape_completion = bio->bi_private;

	if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
		md_bitmap_end(mddev, md_io_clone);
@@ -9355,6 +9357,9 @@ static void md_end_clone_io(struct bio *bio)
		bio_end_io_acct(orig_bio, md_io_clone->start_time);

	bio_put(bio);
	if (unlikely(reshape_completion))
		complete(reshape_completion);
	else
		bio_endio(orig_bio);
	percpu_ref_put(&mddev->active_io);
}
@@ -9380,7 +9385,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
	}

	clone->bi_end_io = md_end_clone_io;
	clone->bi_private = md_io_clone;
	clone->bi_private = NULL;
	*bio = clone;
}

@@ -9391,26 +9396,6 @@ void md_account_bio(struct mddev *mddev, struct bio **bio)
}
EXPORT_SYMBOL_GPL(md_account_bio);

void md_free_cloned_bio(struct bio *bio)
{
	struct md_io_clone *md_io_clone = bio->bi_private;
	struct bio *orig_bio = md_io_clone->orig_bio;
	struct mddev *mddev = md_io_clone->mddev;

	if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
		md_bitmap_end(mddev, md_io_clone);

	if (bio->bi_status && !orig_bio->bi_status)
		orig_bio->bi_status = bio->bi_status;

	if (md_io_clone->start_time)
		bio_end_io_acct(orig_bio, md_io_clone->start_time);

	bio_put(bio);
	percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL_GPL(md_free_cloned_bio);

/* md_allow_write(mddev)
 * Calling this ensures that the array is marked 'active' so that writes
 * may proceed without blocking.  It is important to call this before
+0 −1
Original line number Diff line number Diff line
@@ -920,7 +920,6 @@ extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
			struct bio *bio, sector_t start, sector_t size);
void md_account_bio(struct mddev *mddev, struct bio **bio);
void md_free_cloned_bio(struct bio *bio);

extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
+6 −1
Original line number Diff line number Diff line
@@ -6217,7 +6217,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)

	mempool_free(ctx, conf->ctx_pool);
	if (res == STRIPE_WAIT_RESHAPE) {
		md_free_cloned_bio(bi);
		DECLARE_COMPLETION_ONSTACK(done);
		WRITE_ONCE(bi->bi_private, &done);

		bio_endio(bi);

		wait_for_completion(&done);
		return false;
	}