Commit abc3b5d2 authored by Yang Wang, committed by Alex Deucher
Browse files

drm/amdgpu: add new aca_smu_type support



Add a new type to distinguish between the ACA error type and the SMU MCA type.

e.g.:
ACA_ERROR_TYPE_DEFERRED does not match any valid SMU MCA bank
channel, so add a new type 'aca_smu_type' to distinguish the ACA error
type from the SMU MCA type.

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent a43dbeab
Loading
Loading
Loading
Loading
+45 −24
Original line number Diff line number Diff line
@@ -28,7 +28,7 @@

#define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype}

typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data);
typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);

struct aca_banks {
	int nr_banks;
@@ -86,7 +86,7 @@ static void aca_banks_release(struct aca_banks *banks)
	}
}

static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_error_type type, u32 *count)
static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count)
{
	struct amdgpu_aca *aca = &adev->aca;
	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
@@ -127,7 +127,7 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st
			 idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
}

static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_error_type type,
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
				       int start, int count,
				       struct aca_banks *banks)
{
@@ -143,13 +143,12 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro
		return -EOPNOTSUPP;

	switch (type) {
	case ACA_ERROR_TYPE_UE:
	case ACA_SMU_TYPE_UE:
		max_count = smu_funcs->max_ue_bank_count;
		break;
	case ACA_ERROR_TYPE_CE:
	case ACA_SMU_TYPE_CE:
		max_count = smu_funcs->max_ce_bank_count;
		break;
	case ACA_ERROR_TYPE_DEFERRED:
	default:
		return -EINVAL;
	}
@@ -164,6 +163,8 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro
		if (ret)
			return ret;

		bank.type = type;

		aca_smu_bank_dump(adev, i, count, &bank);

		ret = aca_banks_add_bank(banks, &bank);
@@ -195,7 +196,7 @@ static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type t
	return hwip->hwid == hwid && hwip->mcatype == mcatype;
}

static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type)
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
{
	const struct aca_bank_ops *bank_ops = handle->bank_ops;

@@ -297,7 +298,7 @@ static int aca_log_errors(struct aca_handle *handle, enum aca_error_type type,
}

static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank,
				    enum aca_error_type type, struct aca_bank_report *report)
				    enum aca_smu_type type, struct aca_bank_report *report)
{
	const struct aca_bank_ops *bank_ops = handle->bank_ops;

@@ -313,12 +314,24 @@ static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *
}

static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank,
				      enum aca_error_type type, void *data)
				      enum aca_smu_type smu_type, void *data)
{
	struct aca_bank_report report;
	enum aca_error_type type;
	int ret;

	ret = aca_generate_bank_report(handle, bank, type, &report);
	switch (smu_type) {
	case ACA_SMU_TYPE_UE:
		type = ACA_ERROR_TYPE_UE;
		break;
	case ACA_SMU_TYPE_CE:
		type = ACA_ERROR_TYPE_CE;
		break;
	default:
		return -EINVAL;
	}

	ret = aca_generate_bank_report(handle, bank, smu_type, &report);
	if (ret)
		return ret;

@@ -333,7 +346,7 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank
}

static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
			     enum aca_error_type type, bank_handler_t handler, void *data)
			     enum aca_smu_type type, bank_handler_t handler, void *data)
{
	struct aca_handle *handle;
	int ret;
@@ -354,7 +367,7 @@ static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *ba
}

static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
			      enum aca_error_type type, bank_handler_t handler, void *data)
			      enum aca_smu_type type, bank_handler_t handler, void *data)
{
	struct aca_bank_node *node;
	struct aca_bank *bank;
@@ -378,7 +391,7 @@ static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *
	return 0;
}

static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type,
static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
			    bank_handler_t handler, void *data)
{
	struct amdgpu_aca *aca = &adev->aca;
@@ -389,10 +402,6 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type
	if (list_empty(&aca->mgr.list))
		return 0;

	/* NOTE: pmfw is only support UE and CE */
	if (type == ACA_ERROR_TYPE_DEFERRED)
		type = ACA_ERROR_TYPE_CE;

	ret = aca_smu_get_valid_aca_count(adev, type, &count);
	if (ret)
		return ret;
@@ -479,10 +488,22 @@ static int aca_log_aca_error(struct aca_handle *handle, enum aca_error_type type
static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type,
				struct ras_err_data *err_data)
{
	enum aca_smu_type smu_type;
	int ret;

	switch (type) {
	case ACA_ERROR_TYPE_UE:
		smu_type = ACA_SMU_TYPE_UE;
		break;
	case ACA_ERROR_TYPE_CE:
		smu_type = ACA_SMU_TYPE_CE;
		break;
	default:
		return -EINVAL;
	}

	/* udpate aca bank to aca source error_cache first */
	ret = aca_banks_update(adev, type, handler_aca_log_bank_error, NULL);
	ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, NULL);
	if (ret)
		return ret;

@@ -784,7 +805,7 @@ static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)
	return 0;
}

static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_error_type type, int idx)
static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx)
{
	struct aca_bank_info info;
	int i, ret;
@@ -793,7 +814,7 @@ static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_e
	if (ret)
		return;

	seq_printf(m, "aca entry[%d].type: %s\n", idx, type ==  ACA_ERROR_TYPE_UE ? "UE" : "CE");
	seq_printf(m, "aca entry[%d].type: %s\n", idx, type ==  ACA_SMU_TYPE_UE ? "UE" : "CE");
	seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
		   idx, info.socket_id, info.die_id, info.hwid, info.mcatype);

@@ -807,7 +828,7 @@ struct aca_dump_context {
};

static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank,
				 enum aca_error_type type, void *data)
				 enum aca_smu_type type, void *data)
{
	struct aca_dump_context *ctx = (struct aca_dump_context *)data;

@@ -816,7 +837,7 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban
	return handler_aca_log_bank_error(handle, bank, type, NULL);
}

static int aca_dump_show(struct seq_file *m, enum aca_error_type type)
static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
	struct aca_dump_context context = {
@@ -829,7 +850,7 @@ static int aca_dump_show(struct seq_file *m, enum aca_error_type type)

static int aca_dump_ce_show(struct seq_file *m, void *unused)
{
	return aca_dump_show(m, ACA_ERROR_TYPE_CE);
	return aca_dump_show(m, ACA_SMU_TYPE_CE);
}

static int aca_dump_ce_open(struct inode *inode, struct file *file)
@@ -847,7 +868,7 @@ static const struct file_operations aca_ce_dump_debug_fops = {

static int aca_dump_ue_show(struct seq_file *m, void *unused)
{
	return aca_dump_show(m, ACA_ERROR_TYPE_UE);
	return aca_dump_show(m, ACA_SMU_TYPE_UE);
}

static int aca_dump_ue_open(struct inode *inode, struct file *file)
+11 −4
Original line number Diff line number Diff line
@@ -99,7 +99,14 @@ enum aca_error_type {
	ACA_ERROR_TYPE_COUNT
};

enum aca_smu_type {
	ACA_SMU_TYPE_UE = 0,
	ACA_SMU_TYPE_CE,
	ACA_SMU_TYPE_COUNT,
};

/* One ACA bank: the SMU type it was fetched as plus its register values. */
struct aca_bank {
	/* SMU bank type; stored by aca_smu_get_valid_aca_banks() after the fetch */
	enum aca_smu_type type;
	/* Register values, indexed by ACA_REG_IDX_* (e.g. ACA_REG_IDX_MISC0) */
	u64 regs[ACA_MAX_REGS_COUNT];
};

@@ -157,9 +164,9 @@ struct aca_handle {
};

struct aca_bank_ops {
	int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
	int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
					struct aca_bank_report *report, void *data);
	bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
	bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
				  void *data);
};

@@ -167,8 +174,8 @@ struct aca_smu_funcs {
	int max_ue_bank_count;
	int max_ce_bank_count;
	int (*set_debug_mode)(struct amdgpu_device *adev, bool enable);
	int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_error_type type, u32 *count);
	int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_error_type type, int idx, struct aca_bank *bank);
	int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count);
	int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank);
};

struct amdgpu_aca {
+13 −5
Original line number Diff line number Diff line
@@ -1035,12 +1035,12 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
	return 0;
}

static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
						struct aca_bank_report *report, void *data)
{
	struct amdgpu_device *adev = handle->adev;
	const char *error_str;
	u64 status;
	u64 status, count;
	int ret, ext_error_code;

	ret = aca_bank_info_decode(bank, &report->info);
@@ -1055,9 +1055,17 @@ static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struc
	if (error_str)
		dev_info(adev->dev, "%s detected\n", error_str);

	if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
	    (type == ACA_ERROR_TYPE_CE && ext_error_code == 6))
		report->count[type] = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
	count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
	switch (type) {
	case ACA_SMU_TYPE_UE:
		report->count[ACA_ERROR_TYPE_UE] = ext_error_code == 0 ? count : 0ULL;
		break;
	case ACA_SMU_TYPE_CE:
		report->count[ACA_ERROR_TYPE_CE] = ext_error_code == 6 ? count : 0ULL;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}
+20 −17
Original line number Diff line number Diff line
@@ -681,19 +681,13 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_3_gfx_funcs = {
};

static int gfx_v9_4_3_aca_bank_generate_report(struct aca_handle *handle,
					       struct aca_bank *bank, enum aca_error_type type,
					       struct aca_bank *bank, enum aca_smu_type type,
					       struct aca_bank_report *report, void *data)
{
	u64 status, misc0;
	u64 misc0;
	u32 instlo;
	int ret;

	status = bank->regs[ACA_REG_IDX_STATUS];
	if ((type == ACA_ERROR_TYPE_UE &&
	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
	    (type == ACA_ERROR_TYPE_CE &&
	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {

	ret = aca_bank_info_decode(bank, &report->info);
	if (ret)
		return ret;
@@ -704,14 +698,23 @@ static int gfx_v9_4_3_aca_bank_generate_report(struct aca_handle *handle,
	report->info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;

	misc0 = bank->regs[ACA_REG_IDX_MISC0];
		report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);

	switch (type) {
	case ACA_SMU_TYPE_UE:
		report->count[ACA_ERROR_TYPE_UE] = 1ULL;
		break;
	case ACA_SMU_TYPE_CE:
		report->count[ACA_ERROR_TYPE_CE] = ACA_REG__MISC0__ERRCNT(misc0);
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
					 enum aca_error_type type, void *data)
					 enum aca_smu_type type, void *data)
{
	u32 instlo;

+18 −15
Original line number Diff line number Diff line
@@ -707,24 +707,27 @@ static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = {
};

static int mmhub_v1_8_aca_bank_generate_report(struct aca_handle *handle,
					       struct aca_bank *bank, enum aca_error_type type,
					       struct aca_bank *bank, enum aca_smu_type type,
					       struct aca_bank_report *report, void *data)
{
	u64 status, misc0;
	u64 misc0;
	int ret;

	status = bank->regs[ACA_REG_IDX_STATUS];
	if ((type == ACA_ERROR_TYPE_UE &&
	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
	    (type == ACA_ERROR_TYPE_CE &&
	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {

	ret = aca_bank_info_decode(bank, &report->info);
	if (ret)
		return ret;

	misc0 = bank->regs[ACA_REG_IDX_MISC0];
		report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);

	switch (type) {
	case ACA_SMU_TYPE_UE:
		report->count[ACA_ERROR_TYPE_UE] = 1ULL;
		break;
	case ACA_SMU_TYPE_CE:
		report->count[ACA_ERROR_TYPE_CE] = ACA_REG__MISC0__ERRCNT(misc0);
		break;
	default:
		return -EINVAL;
	}

	return 0;
@@ -741,7 +744,7 @@ static int mmhub_v1_8_err_codes[] = {
};

static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
					 enum aca_error_type type, void *data)
					 enum aca_smu_type type, void *data)
{
	u32 instlo;

Loading