Commit e0171b87 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'add-devlink-health-reporters-for-nix-block'

George Cherian says:

====================
Add devlink health reporters for NIX block

Devlink health reporters are added for the NIX block.

Address Jakub's comment to add devlink support for error reporting.
https://www.spinics.net/lists/netdev/msg670712.html

This series is in continuation to
https://www.spinics.net/lists/netdev/msg707798.html

Added Documentation for the same.
====================

Link: https://lore.kernel.org/r/20210119100120.2614730-1-george.cherian@marvell.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 9e8789c8 d41b3365
Loading
Loading
Loading
Loading
+70 −0
Original line number Diff line number Diff line
@@ -217,3 +217,73 @@ For example::
	 NPA_AF_ERR:
	        NPA Error Interrupt Reg : 4096
	        AQ Doorbell Error


NIX Reporters
-------------
The NIX reporters are responsible for reporting and recovering the following groups of errors:

1. GENERAL events

   - Receive mirror/multicast packet drop due to insufficient buffer.
   - SMQ Flush operation.

2. ERROR events

   - Memory Fault due to WQE read/write from multicast/mirror buffer.
   - Receive multicast/mirror replication list error.
   - Receive packet on an unmapped PF.
   - Fault due to NIX_AQ_INST_S read or NIX_AQ_RES_S write.
   - AQ Doorbell Error.

3. RAS events

   - RAS Error Reporting for NIX Receive Multicast/Mirror Entry Structure.
   - RAS Error Reporting for WQE/Packet Data read from Multicast/Mirror Buffer.
   - RAS Error Reporting for NIX_AQ_INST_S/NIX_AQ_RES_S.

4. RVU events

   - Error due to unmapped slot.

Sample Output::

	~# ./devlink health
	pci/0002:01:00.0:
	  reporter hw_npa_intr
	    state healthy error 0 recover 0 grace_period 0 auto_recover true auto_dump true
	  reporter hw_npa_gen
	    state healthy error 0 recover 0 grace_period 0 auto_recover true auto_dump true
	  reporter hw_npa_err
	    state healthy error 0 recover 0 grace_period 0 auto_recover true auto_dump true
	  reporter hw_npa_ras
	    state healthy error 0 recover 0 grace_period 0 auto_recover true auto_dump true
	  reporter hw_nix_intr
	    state healthy error 1121 recover 1121 last_dump_date 2021-01-19 last_dump_time 05:42:26 grace_period 0 auto_recover true auto_dump true
	  reporter hw_nix_gen
	    state healthy error 949 recover 949 last_dump_date 2021-01-19 last_dump_time 05:42:43 grace_period 0 auto_recover true auto_dump true
	  reporter hw_nix_err
	    state healthy error 1147 recover 1147 last_dump_date 2021-01-19 last_dump_time 05:42:59 grace_period 0 auto_recover true auto_dump true
	  reporter hw_nix_ras
	    state healthy error 409 recover 409 last_dump_date 2021-01-19 last_dump_time 05:43:16 grace_period 0 auto_recover true auto_dump true

Each reporter dumps the following:

 - Error Type
 - Error Register value
 - Reason in words

For example::

	~# devlink health dump show pci/0002:01:00.0 reporter hw_nix_intr
	 NIX_AF_RVU:
	        NIX RVU Interrupt Reg : 1
	        Unmap Slot Error
	~# devlink health dump show pci/0002:01:00.0 reporter hw_nix_gen
	 NIX_AF_GENERAL:
	        NIX General Interrupt Reg : 1
	        Rx multicast pkt drop
	~# devlink health dump show pci/0002:01:00.0 reporter hw_nix_err
	 NIX_AF_ERR:
	        NIX Error Interrupt Reg : 64
	        Rx on unmapped PF_FUNC
+651 −1

File changed.

Preview size limit exceeded, changes collapsed.

+27 −0
Original line number Diff line number Diff line
@@ -41,11 +41,38 @@ struct rvu_npa_health_reporters {
	struct work_struct              ras_work;
};

/* Identifiers for the four NIX AF health event classes: interrupt (INTR),
 * general (GEN), error (ERR) and RAS.
 *
 * The numeric values are spelled out so they stay fixed if enumerators are
 * ever inserted or reordered.
 * NOTE(review): how these identifiers are consumed is not visible in this
 * header - confirm against the users of this enum.
 */
enum nix_af_rvu_health {
	NIX_AF_RVU_INTR	= 0,
	NIX_AF_RVU_GEN	= 1,
	NIX_AF_RVU_ERR	= 2,
	NIX_AF_RVU_RAS	= 3,
};

/* Context carried for NIX AF health events: one 64-bit value per event
 * class (RVU interrupt, general, error, RAS).
 *
 * NOTE(review): presumably each field holds the interrupt register value
 * captured for that class, which the reporter later dumps - confirm
 * against the code that fills this structure.
 */
struct rvu_nix_event_ctx {
	u64 nix_af_rvu_int;	/* RVU interrupt class */
	u64 nix_af_rvu_gen;	/* general class */
	u64 nix_af_rvu_err;	/* error class */
	u64 nix_af_rvu_ras;	/* RAS class */
};

/* NIX health reporter state.
 *
 * Holds a pointer to the shared event context plus, for each of the four
 * event classes (intr, gen, err, ras), a devlink health reporter handle and
 * a work item.
 *
 * NOTE(review): presumably each work item is queued from the matching
 * interrupt handler and reports through the reporter declared just above
 * it - confirm against the code that uses this structure.
 */
struct rvu_nix_health_reporters {
	/* Event context referenced by the reporters below */
	struct rvu_nix_event_ctx *nix_event_ctx;
	/* intr class: reporter handle and its work item */
	struct devlink_health_reporter *rvu_hw_nix_intr_reporter;
	struct work_struct		intr_work;
	/* gen class: reporter handle and its work item */
	struct devlink_health_reporter *rvu_hw_nix_gen_reporter;
	struct work_struct		gen_work;
	/* err class: reporter handle and its work item */
	struct devlink_health_reporter *rvu_hw_nix_err_reporter;
	struct work_struct		err_work;
	/* ras class: reporter handle and its work item */
	struct devlink_health_reporter *rvu_hw_nix_ras_reporter;
	struct work_struct		ras_work;
};

/* Devlink state for an RVU device: the devlink instance, the owning rvu,
 * a workqueue, and the NPA and NIX health reporter containers.
 */
struct rvu_devlink {
	struct devlink *dl;		/* devlink instance */
	struct rvu *rvu;		/* associated RVU device */
	/* NOTE(review): presumably runs the reporters' work items - confirm */
	struct workqueue_struct *devlink_wq;
	struct rvu_npa_health_reporters *rvu_npa_health_reporter;	/* NPA block reporters */
	struct rvu_nix_health_reporters *rvu_nix_health_reporter;	/* NIX block reporters */
};

/* Devlink APIs */
+10 −0
Original line number Diff line number Diff line
@@ -74,6 +74,16 @@ enum npa_af_int_vec_e {
	NPA_AF_INT_VEC_CNT	= 0x5,
};

/* NIX Admin function Interrupt Vector Enumeration
 *
 * Vector indices 0x0-0x4 for the NIX AF interrupts. NIX_AF_INT_VEC_CNT is
 * one past the last index, i.e. the number of vectors listed here.
 * NOTE(review): the values are presumably fixed by hardware - do not
 * renumber without checking the hardware reference.
 */
enum nix_af_int_vec_e {
	NIX_AF_INT_VEC_RVU	= 0x0,
	NIX_AF_INT_VEC_GEN	= 0x1,
	NIX_AF_INT_VEC_AQ_DONE	= 0x2,
	NIX_AF_INT_VEC_AF_ERR	= 0x3,
	NIX_AF_INT_VEC_POISON	= 0x4,
	NIX_AF_INT_VEC_CNT	= 0x5,
};

/**
 * RVU PF Interrupt Vector Enumeration
 */