Commit 320f9b1c authored by Uday Shankar's avatar Uday Shankar Committed by Jens Axboe
Browse files

selftests: ublk: test that teardown after incomplete recovery completes



Before the fix, teardown of a ublk server that was attempting to recover
a device, but died when it had submitted a nonempty proper subset of the
fetch commands to any queue would loop forever. Add a test to verify
that, after the fix, teardown completes. This is done by:

- Adding a new argument to the fault_inject target that causes it die
  after fetching a nonempty proper subset of the IOs to a queue
- Using that argument in a new test while trying to recover an
  already-created device
- Attempting to delete the ublk device at the end of the test; this
  hangs forever if teardown from the fault-injected ublk server never
  completed.

It was manually verified that the test passes with the fix and hangs
without it.

Signed-off-by: default avatarUday Shankar <ushankar@purestorage.com>
Reviewed-by: default avatarMing Lei <ming.lei@redhat.com>
Link: https://patch.msgid.link/20260405-cancel-v2-2-02d711e643c2@purestorage.com


Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 0842186d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ TEST_PROGS += test_generic_10.sh
TEST_PROGS += test_generic_12.sh
TEST_PROGS += test_generic_13.sh
TEST_PROGS += test_generic_16.sh
TEST_PROGS += test_generic_17.sh

TEST_PROGS += test_batch_01.sh
TEST_PROGS += test_batch_02.sh
+49 −3
Original line number Diff line number Diff line
@@ -10,11 +10,17 @@

#include "kublk.h"

struct fi_opts {
	long long delay_ns;
	bool die_during_fetch;
};

static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
				      struct ublk_dev *dev)
{
	const struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	unsigned long dev_size = 250UL << 30;
	struct fi_opts *opts = NULL;

	if (ctx->auto_zc_fallback) {
		ublk_err("%s: not support auto_zc_fallback\n", __func__);
@@ -35,17 +41,52 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
	};
	ublk_set_integrity_params(ctx, &dev->tgt.params);

	dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000);
	opts = calloc(1, sizeof(*opts));
	if (!opts) {
		ublk_err("%s: couldn't allocate memory for opts\n", __func__);
		return -ENOMEM;
	}

	opts->delay_ns = ctx->fault_inject.delay_us * 1000;
	opts->die_during_fetch = ctx->fault_inject.die_during_fetch;
	dev->private_data = opts;

	return 0;
}

static void ublk_fault_inject_pre_fetch_io(struct ublk_thread *t,
					   struct ublk_queue *q, int tag,
					   bool batch)
{
	struct fi_opts *opts = q->dev->private_data;

	if (!opts->die_during_fetch)
		return;

	/*
	 * Each queue fetches its IOs in increasing order of tags, so
	 * dying just before we're about to fetch tag 1 (regardless of
	 * what queue we're on) guarantees that we've fetched a nonempty
	 * proper subset of the tags on that queue.
	 */
	if (tag == 1) {
		/*
		 * Ensure our commands are actually live in the kernel
		 * before we die.
		 */
		io_uring_submit(&t->ring);
		raise(SIGKILL);
	}
}

static int ublk_fault_inject_queue_io(struct ublk_thread *t,
				      struct ublk_queue *q, int tag)
{
	const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
	struct io_uring_sqe *sqe;
	struct fi_opts *opts = q->dev->private_data;
	struct __kernel_timespec ts = {
		.tv_nsec = (long long)q->dev->private_data,
		.tv_nsec = opts->delay_ns,
	};

	ublk_io_alloc_sqes(t, &sqe, 1);
@@ -77,29 +118,34 @@ static void ublk_fault_inject_cmd_line(struct dev_ctx *ctx, int argc, char *argv
{
	static const struct option longopts[] = {
		{ "delay_us", 	1,	NULL,  0  },
		{ "die_during_fetch", 1, NULL, 0  },
		{ 0, 0, 0, 0 }
	};
	int option_idx, opt;

	ctx->fault_inject.delay_us = 0;
	ctx->fault_inject.die_during_fetch = false;
	while ((opt = getopt_long(argc, argv, "",
				  longopts, &option_idx)) != -1) {
		switch (opt) {
		case 0:
			if (!strcmp(longopts[option_idx].name, "delay_us"))
				ctx->fault_inject.delay_us = strtoll(optarg, NULL, 10);
			if (!strcmp(longopts[option_idx].name, "die_during_fetch"))
				ctx->fault_inject.die_during_fetch = strtoll(optarg, NULL, 10);
		}
	}
}

static void ublk_fault_inject_usage(const struct ublk_tgt_ops *ops)
{
	printf("\tfault_inject: [--delay_us us (default 0)]\n");
	printf("\tfault_inject: [--delay_us us (default 0)] [--die_during_fetch 1]\n");
}

const struct ublk_tgt_ops fault_inject_tgt_ops = {
	.name = "fault_inject",
	.init_tgt = ublk_fault_inject_tgt_init,
	.pre_fetch_io = ublk_fault_inject_pre_fetch_io,
	.queue_io = ublk_fault_inject_queue_io,
	.tgt_io_done = ublk_fault_inject_tgt_io_done,
	.parse_cmd_line = ublk_fault_inject_cmd_line,
+7 −0
Original line number Diff line number Diff line
@@ -796,6 +796,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t)
			q = &t->dev->q[q_id];
			io = &q->ios[tag];
			io->buf_index = j++;
			if (q->tgt_ops->pre_fetch_io)
				q->tgt_ops->pre_fetch_io(t, q, tag, false);
			ublk_queue_io_cmd(t, io);
		}
	} else {
@@ -807,6 +809,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t)
		for (i = 0; i < q->q_depth; i++) {
			io = &q->ios[i];
			io->buf_index = i;
			if (q->tgt_ops->pre_fetch_io)
				q->tgt_ops->pre_fetch_io(t, q, i, false);
			ublk_queue_io_cmd(t, io);
		}
	}
@@ -983,6 +987,9 @@ static void ublk_batch_setup_queues(struct ublk_thread *t)
		if (t->q_map[i] == 0)
			continue;

		if (q->tgt_ops->pre_fetch_io)
			q->tgt_ops->pre_fetch_io(t, q, 0, true);

		ret = ublk_batch_queue_prep_io_cmds(t, q);
		ublk_assert(ret >= 0);
	}
+3 −0
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@ struct stripe_ctx {
struct fault_inject_ctx {
	/* fault_inject */
	unsigned long   delay_us;
	bool die_during_fetch;
};

struct dev_ctx {
@@ -138,6 +139,8 @@ struct ublk_tgt_ops {
	int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *);
	void (*deinit_tgt)(struct ublk_dev *);

	void (*pre_fetch_io)(struct ublk_thread *t, struct ublk_queue *q,
			     int tag, bool batch);
	int (*queue_io)(struct ublk_thread *, struct ublk_queue *, int tag);
	void (*tgt_io_done)(struct ublk_thread *, struct ublk_queue *,
			    const struct io_uring_cqe *);
+35 −0
Original line number Diff line number Diff line
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

ERR_CODE=0

_prep_test "fault_inject" "teardown after incomplete recovery"

# First start and stop a ublk server with device configured for recovery
dev_id=$(_add_ublk_dev -t fault_inject -r 1)
_check_add_dev $TID $?
state=$(__ublk_kill_daemon "${dev_id}" "QUIESCED")
if [ "$state" != "QUIESCED" ]; then
        echo "device isn't quiesced($state) after $action"
        ERR_CODE=255
fi

# Then recover the device, but use --die_during_fetch to have the ublk
# server die while a queue has some (but not all) I/Os fetched
${UBLK_PROG} recover -n "${dev_id}" --foreground -t fault_inject --die_during_fetch 1
RECOVER_RES=$?
# 137 is the result when dying of SIGKILL
if (( RECOVER_RES != 137 )); then
        echo "recover command exited with unexpected code ${RECOVER_RES}!"
        ERR_CODE=255
fi

# Clean up the device. This can only succeed once teardown of the above
# exited ublk server completes. So if teardown never completes, we will
# time out here
_ublk_del_dev "${dev_id}"

_cleanup_test "fault_inject"
_show_result $TID $ERR_CODE