Commit c7bd85a7 authored by Tony Battersby, committed by Martin K. Petersen
Browse files

scsi: qla2xxx: target: Add back SRR support



Background: loading qla2xxx with "ql2xtgt_tape_enable=1" enables
Sequence Level Error Recovery (SLER), which is most commonly used for
tape drives.  With SLER enabled, if there is a recoverable I/O error
during a SCSI command, a Sequence Retransmission Request (SRR) will be
used to retry the I/O at a low-level completely within the driver
without propagating the error to the upper levels of the SCSI stack.

SRR support was removed in 2017 by commit 2c39b5ca ("qla2xxx: Remove
SRR code"). Add it back, new and improved.

The old removed SRR code used sequence numbers to correlate the SRR
CTIOs with SRR immediate notify messages.  I don't see how that would
work reliably with MSI-X interrupts and multiple queues.  So instead use
the exchange address to find the command associated with the immediate
notify (qlt_srr_to_cmd).

The old removed SRR code had a function qlt_check_srr_debug() to
simulate a SRR, but it didn't work for me.  Instead I just used fiber
optic attenuators attached to the FC cable to reduce the strength of the
signal and induce errors.  Unfortunately this only worked for inducing
SRRs on Data-Out (write) commands, so that is all I was able to test.

The code to build a new scatterlist for a SRR with nonzero offset has
been improved to reduce memory requirements and has been well-tested.
However it does not support protection information.

When a single cmd gets multiple SRRs, the old removed SRR code would
restore the data buffer from the values in cmd->se_cmd before processing
the new SRR.  That might be needed if the offset for the new SRR was
lower than the offset for the previous SRR, but I am not sure if that
can happen.  In my testing, when a single cmd gets multiple SRRs, the
SRR offset always increases or stays the same.  But in case it can
decrease, I added the function qlt_restore_orig_sg().  If this is not
supposed to happen then qlt_restore_orig_sg() can be removed to simplify
the code.

I ran into some HBA firmware bugs with QLE269x, QLE27xx, and QLE28xx
firmware 9.05.xx - 9.08.xx where a SRR would cause the HBA to misbehave
badly.  Since SRRs are rare and therefore difficult to test, I figured
it would be worth checking for the buggy firmware and disabling SLER
with a warning instead of letting others run into the same problem on
the rare occasion that they get a SRR.  This turned out to be difficult
because the firmware version isn't known in the normal NVRAM config
routine, so I added a second NVRAM config routine that is called after
the firmware version is known.

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Link: https://patch.msgid.link/654b7181-b79e-40ed-a15b-6d6e441a5d5f@cybernetics.com


Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
parent 04957d8c
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -58,6 +58,7 @@
 * | Target Mode Management	  |	  0xf09b       | 0xf002		|
 * |                              |                    | 0xf046-0xf049  |
 * | Target Mode Task Management  |	  0x1000d      |		|
 * | Target Mode SRR		  |	  0x11038      |		|
 * ----------------------------------------------------------------------
 */

+1 −0
Original line number Diff line number Diff line
@@ -4369,6 +4369,7 @@ qla2x00_setup_chip(scsi_qla_host_t *vha)
						ha->max_npiv_vports =
						    MIN_MULTI_ID_FABRIC - 1;
				}
				qlt_config_nvram_with_fw_version(vha);
				qla2x00_get_resource_cnts(vha);
				qla_init_iocb_limit(vha);

+1043 −1

File changed.

Preview size limit exceeded, changes collapsed.

+81 −0
Original line number Diff line number Diff line
@@ -184,6 +184,7 @@ struct nack_to_isp {
#define NOTIFY_ACK_SRR_REJECT_REASON_UNABLE_TO_PERFORM	0x9

#define NOTIFY_ACK_SRR_FLAGS_REJECT_EXPL_NO_EXPL		0
#define NOTIFY_ACK_SRR_FLAGS_REJECT_EXPL_INVALID_OX_ID_RX_ID	0x17
#define NOTIFY_ACK_SRR_FLAGS_REJECT_EXPL_UNABLE_TO_SUPPLY_DATA	0x2a

#define NOTIFY_ACK_SUCCESS      0x01
@@ -686,6 +687,8 @@ struct qla_tgt_func_tmpl {
	int (*handle_tmr)(struct qla_tgt_mgmt_cmd *, u64, uint16_t,
			uint32_t);
	struct qla_tgt_cmd *(*get_cmd)(struct fc_port *);
	int (*get_cmd_ref)(struct qla_tgt_cmd *cmd);
	void (*put_cmd_ref)(struct qla_tgt_cmd *cmd);
	void (*rel_cmd)(struct qla_tgt_cmd *);
	void (*free_cmd)(struct qla_tgt_cmd *);
	void (*free_mcmd)(struct qla_tgt_mgmt_cmd *);
@@ -823,7 +826,13 @@ struct qla_tgt {
	int notify_ack_expected;
	int abts_resp_expected;
	int modify_lun_expected;

	spinlock_t srr_lock;
	struct list_head srr_list;
	struct work_struct srr_work;

	atomic_t tgt_global_resets_count;

	struct list_head tgt_list_entry;
};

@@ -861,6 +870,7 @@ enum trace_flags {
	TRC_DATA_IN = BIT_18,
	TRC_ABORT = BIT_19,
	TRC_DIF_ERR = BIT_20,
	TRC_SRR_IMM = BIT_21,
};

struct qla_tgt_cmd {
@@ -881,6 +891,10 @@ struct qla_tgt_cmd {

	unsigned int conf_compl_supported:1;
	unsigned int sg_mapped:1;

	/* Call qlt_free_sg() if set. */
	unsigned int free_sg:1;

	unsigned int write_data_transferred:1;

	/* Set if the SCSI status was sent successfully. */
@@ -892,6 +906,9 @@ struct qla_tgt_cmd {
	unsigned int cmd_in_wq:1;
	unsigned int edif:1;

	/* Set if a SRR was rejected. */
	unsigned int srr_failed:1;

	/* Set if the exchange has been terminated. */
	unsigned int sent_term_exchg:1;

@@ -901,6 +918,7 @@ struct qla_tgt_cmd {
	 */
	unsigned int aborted:1;

	struct qla_tgt_srr *srr;
	struct scatterlist *sg;	/* cmd data buffer SG vector */
	int sg_cnt;		/* SG segments count */
	int bufflen;		/* cmd buffer length */
@@ -940,6 +958,14 @@ struct qla_tgt_cmd {
	uint16_t prot_flags;

	unsigned long jiffies_at_term_exchg;

	/*
	 * jiffies64 when qlt_rdy_to_xfer() or qlt_xmit_response() first
	 * called, or 0 when not in those states.  Used to limit the number of
	 * SRR retries.
	 */
	uint64_t jiffies_at_hw_st_entry;

	uint64_t jiffies_at_alloc;
	uint64_t jiffies_at_free;

@@ -1002,6 +1028,45 @@ struct qla_tgt_prm {
	uint16_t tot_dsds;
};

/*
 * SRR (Sequence Retransmission Request) - resend or re-receive some or all
 * data or status to recover from a transient I/O error.
 *
 * Per-SRR tracking state: records which of the two hardware messages for the
 * SRR have arrived, which command the SRR applies to, and whether the SRR has
 * to be rejected or aborted instead of being serviced.
 */
struct qla_tgt_srr {
	/*
	 * Copy of immediate notify SRR message received from hw; valid only if
	 * imm_ntfy_recvd is true.
	 */
	struct imm_ntfy_from_isp imm_ntfy;

	/*
	 * List linkage for this SRR.
	 * NOTE(review): presumably queued on qla_tgt.srr_list under
	 * qla_tgt.srr_lock - confirm against qla_target.c.
	 */
	struct list_head srr_list_entry;

	/* The command affected by this SRR, or NULL if not yet determined. */
	struct qla_tgt_cmd *cmd;

	/* Used to detect if the HBA has been reset since receiving the SRR. */
	uint32_t reset_count;

	/*
	 * The hardware sends two messages for each SRR - an immediate notify
	 * and a CTIO with CTIO_SRR_RECEIVED status.  These keep track of which
	 * messages have been received.  The SRR can be processed once both of
	 * these are true.
	 */
	bool imm_ntfy_recvd;
	bool ctio_recvd;

	/*
	 * This is set to true if the affected command was aborted (cmd may be
	 * set to NULL), in which case the immediate notify exchange also needs
	 * to be aborted.
	 */
	bool aborted;

	/* This is set to true to force the SRR to be rejected. */
	bool reject;
};

/* Check for Switch reserved address */
#define IS_SW_RESV_ADDR(_s_id) \
	((_s_id.b.domain == 0xff) && ((_s_id.b.area & 0xf0) == 0xf0))
@@ -1056,6 +1121,20 @@ static inline uint32_t sid_to_key(const be_id_t s_id)
		s_id.al_pa;
}

/*
 * Release the scatterlist that qlt_set_data_offset() allocated for @cmd.
 * The caller is responsible for checking cmd->free_sg first; call this only
 * when that flag is set.
 */
static inline void qlt_free_sg(struct qla_tgt_cmd *cmd)
{
	/*
	 * cmd->sg may be chained onto the original scatterlist.  Only the
	 * first segment was allocated by qlt_set_data_offset(), so that is
	 * the only part that has to be freed here.
	 */
	const void *allocated_sg = cmd->sg;

	kfree(allocated_sg);
}

/*
 * Exported symbols from qla_target.c LLD logic used by qla2xxx code..
 */
@@ -1064,6 +1143,7 @@ extern void qlt_response_pkt_all_vps(struct scsi_qla_host *, struct rsp_que *,
extern int qlt_rdy_to_xfer(struct qla_tgt_cmd *);
extern int qlt_xmit_response(struct qla_tgt_cmd *, int, uint8_t);
extern int qlt_abort_cmd(struct qla_tgt_cmd *);
void qlt_srr_abort(struct qla_tgt_cmd *cmd, bool reject);
void qlt_send_term_exchange(struct qla_qpair *qpair,
	struct qla_tgt_cmd *cmd, struct atio_from_isp *atio, int ha_locked);
extern void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *);
@@ -1086,6 +1166,7 @@ extern void qlt_81xx_config_nvram_stage2(struct scsi_qla_host *,
	struct init_cb_81xx *);
extern void qlt_81xx_config_nvram_stage1(struct scsi_qla_host *,
	struct nvram_81xx *);
void qlt_config_nvram_with_fw_version(struct scsi_qla_host *vha);
extern void qlt_modify_vp_config(struct scsi_qla_host *,
	struct vp_config_entry_24xx *);
extern void qlt_probe_one_stage1(struct scsi_qla_host *, struct qla_hw_data *);
+15 −0
Original line number Diff line number Diff line
@@ -291,6 +291,16 @@ static struct qla_tgt_cmd *tcm_qla2xxx_get_cmd(struct fc_port *sess)
	return cmd;
}

/*
 * get_cmd_ref callback: ask the target core for a reference on the se_cmd
 * embedded in @cmd by calling target_get_sess_cmd() with its second
 * argument set to true.
 *
 * Returns whatever target_get_sess_cmd() returns.
 * NOTE(review): presumably 0 on success and a negative errno on failure -
 * confirm against the target core.  Release with tcm_qla2xxx_put_cmd_ref().
 */
static int tcm_qla2xxx_get_cmd_ref(struct qla_tgt_cmd *cmd)
{
	struct se_cmd *se_cmd = &cmd->se_cmd;

	return target_get_sess_cmd(se_cmd, true);
}

/*
 * put_cmd_ref callback: hand the se_cmd embedded in @cmd to
 * target_put_sess_cmd().  Counterpart of tcm_qla2xxx_get_cmd_ref(); the
 * return value of target_put_sess_cmd(), if any, is not used.
 */
static void tcm_qla2xxx_put_cmd_ref(struct qla_tgt_cmd *cmd)
{
	struct se_cmd *se_cmd = &cmd->se_cmd;

	target_put_sess_cmd(se_cmd);
}

static void tcm_qla2xxx_rel_cmd(struct qla_tgt_cmd *cmd)
{
	target_free_tag(cmd->sess->se_sess, &cmd->se_cmd);
@@ -531,6 +541,9 @@ static void tcm_qla2xxx_handle_data_work(struct work_struct *work)
		if (cmd->se_cmd.pi_err)
			transport_generic_request_failure(&cmd->se_cmd,
				cmd->se_cmd.pi_err);
		else if (cmd->srr_failed)
			transport_generic_request_failure(&cmd->se_cmd,
				TCM_SNACK_REJECTED);
		else
			transport_generic_request_failure(&cmd->se_cmd,
				TCM_CHECK_CONDITION_ABORT_CMD);
@@ -1526,6 +1539,8 @@ static const struct qla_tgt_func_tmpl tcm_qla2xxx_template = {
	.handle_data		= tcm_qla2xxx_handle_data,
	.handle_tmr		= tcm_qla2xxx_handle_tmr,
	.get_cmd		= tcm_qla2xxx_get_cmd,
	.get_cmd_ref		= tcm_qla2xxx_get_cmd_ref,
	.put_cmd_ref		= tcm_qla2xxx_put_cmd_ref,
	.rel_cmd		= tcm_qla2xxx_rel_cmd,
	.free_cmd		= tcm_qla2xxx_free_cmd,
	.free_mcmd		= tcm_qla2xxx_free_mcmd,