Commit ad97840f authored by Hawking Zhang, committed by Alex Deucher
Browse files

drm/amdgpu: Introduce funcs for generating cper record



Introduce new functions that generate CPER records for
uncorrectable errors (UE) and for correctable errors (CE).

v2: return -ENOMEM instead of false
v2: check return value of fill section function

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Yang Wang <keivnyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 56316ee9
Loading
Loading
Loading
Loading
+1 −11
Original line number Diff line number Diff line
@@ -30,16 +30,6 @@

typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);

/*
 * Container for a set of ACA banks.
 * NOTE(review): nr_banks is presumably the number of entries linked on
 * @list - confirm against the code that adds banks.
 */
struct aca_banks {
	int nr_banks;
	struct list_head list;	/* head of the bank list */
};

/*
 * Element type of the aca_hwid_mcatypes[] table below.
 * NOTE(review): field meanings are taken from their names (hardware ID and
 * MCA type); the value encoding is not visible here - confirm.
 */
struct aca_hwip {
	int hwid;
	int mcatype;
};

static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = {
	ACA_BANK_HWID(SMU,	0x01,	0x01),
	ACA_BANK_HWID(PCS_XGMI, 0x50,	0x00),
@@ -111,7 +101,7 @@ static struct aca_regs_dump {
	{"STATUS",		ACA_REG_IDX_STATUS},
	{"ADDR",		ACA_REG_IDX_ADDR},
	{"MISC",		ACA_REG_IDX_MISC0},
	{"CONFIG",		ACA_REG_IDX_CONFG},
	{"CONFIG",		ACA_REG_IDX_CONFIG},
	{"IPID",		ACA_REG_IDX_IPID},
	{"SYND",		ACA_REG_IDX_SYND},
	{"DESTAT",		ACA_REG_IDX_DESTAT},
+11 −1
Original line number Diff line number Diff line
@@ -81,7 +81,7 @@ enum aca_reg_idx {
	ACA_REG_IDX_STATUS		= 1,
	ACA_REG_IDX_ADDR		= 2,
	ACA_REG_IDX_MISC0		= 3,
	ACA_REG_IDX_CONFG		= 4,
	ACA_REG_IDX_CONFIG		= 4,
	ACA_REG_IDX_IPID		= 5,
	ACA_REG_IDX_SYND		= 6,
	ACA_REG_IDX_DESTAT		= 8,
@@ -114,6 +114,11 @@ enum aca_smu_type {
	ACA_SMU_TYPE_COUNT,
};

/*
 * Pairs a hardware ID with an MCA type.
 * NOTE(review): field meanings are taken from their names; the value
 * encoding is not visible here - confirm.
 */
struct aca_hwip {
	int hwid;
	int mcatype;
};

struct aca_bank {
	enum aca_error_type aca_err_type;
	enum aca_smu_type smu_err_type;
@@ -125,6 +130,11 @@ struct aca_bank_node {
	struct list_head node;
};

/*
 * Container for a set of ACA banks.
 * NOTE(review): nr_banks is presumably the number of entries linked on
 * @list - confirm against the code that adds banks.
 */
struct aca_banks {
	int nr_banks;
	struct list_head list;	/* head of the bank list */
};

struct aca_bank_info {
	int die_id;
	int socket_id;
+108 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include <linux/slab.h>
#include "amdgpu.h"

static const guid_t MCE			= CPER_NOTIFY_MCE;
@@ -257,6 +258,113 @@ struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
	return hdr;
}

/**
 * amdgpu_cper_generate_ue_record - build a fatal CPER entry from one ACA bank
 * @adev: amdgpu device the record is generated for
 * @bank: ACA bank whose STATUS, ADDR, IPID and SYND registers are recorded
 *
 * Allocates a single-section AMDGPU_CPER_TYPE_FATAL entry, fills its header
 * with CPER_SEV_FATAL and stores the split 64-bit bank registers in the
 * fatal section.
 *
 * Return: 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * non-zero result of amdgpu_cper_entry_fill_fatal_section().
 */
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	struct cper_hdr *fatal;
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
		return -ENOMEM;
	}

	/* Each 64-bit ACA register is stored as a lo/hi pair of 32-bit words */
	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret)
		goto out;

	/*TODO: commit the cper entry to cper ring */

out:
	/*
	 * The entry is only a staging buffer, so release it on every path
	 * instead of leaking it.
	 * NOTE(review): assumes amdgpu_cper_alloc_entry() hands back
	 * kmalloc-family memory owned by the caller - confirm.
	 */
	kfree(fatal);
	return ret;
}

/*
 * Translate an ACA error type into the CPER severity used for its section:
 * CE -> non-fatal corrected, DEFERRED -> non-fatal uncorrected, UE -> fatal.
 * Any other value is logged and also reported as fatal.
 */
static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								enum aca_error_type aca_err_type)
{
	if (aca_err_type == ACA_ERROR_TYPE_CE)
		return CPER_SEV_NON_FATAL_CORRECTED;

	if (aca_err_type == ACA_ERROR_TYPE_DEFERRED)
		return CPER_SEV_NON_FATAL_UNCORRECTED;

	/* UE and unrecognized types share the fatal result; only the latter is logged */
	if (aca_err_type != ACA_ERROR_TYPE_UE)
		dev_err(adev->dev, "Unknown ACA error type!\n");

	return CPER_SEV_FATAL;
}

/**
 * amdgpu_cper_generate_ce_records - build one runtime CPER entry from a bank list
 * @adev: amdgpu device the record is generated for
 * @banks: list of ACA banks; one section is filled per bank
 * @bank_count: number of sections the entry is allocated with
 *
 * Allocates an AMDGPU_CPER_TYPE_RUNTIME entry with @bank_count sections. The
 * header severity is CPER_SEV_NON_FATAL_CORRECTED unless at least one bank is
 * ACA_ERROR_TYPE_DEFERRED, in which case it is raised to
 * CPER_SEV_NON_FATAL_UNCORRECTED. Each section carries the severity derived
 * from its own bank's error type.
 *
 * Banks beyond @bank_count are dropped with a warning, because the entry has
 * no section allocated for them.
 *
 * Return: 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * non-zero result of amdgpu_cper_entry_fill_runtime_section().
 */
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret = 0;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE in cper record */
	list_for_each_entry(node, &banks->list, node) {
		/* Never index past the sections the entry was allocated with */
		if (i >= bank_count) {
			dev_warn(adev->dev,
				 "aca bank list exceeds %u cper sections, dropping extra banks\n",
				 bank_count);
			break;
		}

		bank = &node->bank;
		/* Each 64-bit ACA register is stored as a lo/hi pair of 32-bit words */
		reg_data[CPER_ACA_REG_CTL_LO]    = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI]    = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO]  = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI]  = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
				reg_data, CPER_ACA_REG_COUNT);
		if (ret)
			goto out;
	}

	/*TODO: commit the cper entry to cper ring */

out:
	/*
	 * The entry is only a staging buffer, so release it on every path
	 * instead of leaking it.
	 * NOTE(review): assumes amdgpu_cper_alloc_entry() hands back
	 * kmalloc-family memory owned by the caller - confirm.
	 */
	kfree(corrected);
	return ret;
}

int amdgpu_cper_init(struct amdgpu_device *adev)
{
	mutex_init(&adev->cper.cper_lock);
+8 −1
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@
#define __AMDGPU_CPER_H__

#include "amd_cper.h"
#include "amdgpu_aca.h"

#define CPER_MAX_ALLOWED_COUNT		0x1000
#define HDR_LEN				(sizeof(struct cper_hdr))
@@ -84,7 +85,13 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
					 enum amdgpu_cper_type type,
					 uint16_t section_count);

/*
 * Generate a fatal CPER entry from a single ACA bank.
 * UEs must be encoded into separate CPER entries: one UE per CPER entry.
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * non-zero result of filling the fatal section.
 */
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank);
/*
 * Generate one runtime CPER entry from a list of ACA banks.
 * CEs and DEs are combined into a single CPER entry, one section per bank;
 * the entry is allocated with bank_count sections.
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * non-zero result of filling a runtime section.
 */
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count);
int amdgpu_cper_init(struct amdgpu_device *adev);
int amdgpu_cper_fini(struct amdgpu_device *adev);