Commit 5b1270be authored by Yang Wang's avatar Yang Wang Committed by Alex Deucher
Browse files

drm/amdgpu: add ras_err_info to identify RAS error source



introduced "ras_err_info" to better identify a RAS ERROR source.

NOTE:
For legacy chips, keep the original RAS error print format.

v1:
RAS errors may come from different dies during a RAS error query,
therefore, need a new data structure to identify the source of RAS ERROR.

v2:
- use new data structure 'amdgpu_smuio_mcm_config_info' instead of
  ras_err_id (in v1 patch)
- refine ras error dump function name
- refine ras error dump log format

Signed-off-by: default avatarYang Wang <kevinyang.wang@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 6a1c31c7
Loading
Loading
Loading
Loading
+249 −45
Original line number Diff line number Diff line
@@ -152,8 +152,9 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)

static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
{
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct ras_err_data err_data;
	struct eeprom_table_record err_rec;
	int ret;

	if ((address >= adev->gmc.mc_vram_size) ||
	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
@@ -170,6 +171,10 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
		return 0;
	}

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
	err_data.err_addr = &err_rec;
	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);
@@ -180,6 +185,8 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

	amdgpu_ras_error_data_fini(&err_data);

	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
	dev_warn(adev->dev, "Clear EEPROM:\n");
	dev_warn(adev->dev, "    echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
@@ -1015,46 +1022,57 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
	}
}

/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
				  struct ras_query_if *info)
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
					      struct ras_query_if *query_if,
					      struct ras_err_data *err_data,
					      bool is_ue)
{
	struct amdgpu_ras_block_object *block_obj = NULL;
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
	const char *blk_name = get_ras_block_str(&query_if->head);
	struct amdgpu_smuio_mcm_config_info *mcm_info;
	struct ras_err_node *err_node;
	struct ras_err_info *err_info;

	if (!obj)
		return -EINVAL;
	if (is_ue)
		dev_info(adev->dev, "%ld uncorrectable hardware errors detected in %s block\n",
			 ras_mgr->err_data.ue_count, blk_name);
	else
		dev_info(adev->dev, "%ld correctable hardware errors detected in %s block\n",
			 ras_mgr->err_data.ue_count, blk_name);

	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
		amdgpu_ras_get_ecc_info(adev, &err_data);
	} else {
		block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
		if (!block_obj || !block_obj->hw_ops)   {
			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
				     get_ras_block_str(&info->head));
			return -EINVAL;
	for_each_ras_error(err_node, err_data) {
		err_info = &err_node->err_info;
		mcm_info = &err_info->mcm_info;
		if (is_ue && err_info->ue_count) {
			dev_info(adev->dev, "socket: %d, die: %d "
				 "%lld uncorrectable hardware errors detected in %s block\n",
				 mcm_info->socket_id,
				 mcm_info->die_id,
				 err_info->ue_count,
				 blk_name);
		} else if (!is_ue && err_info->ce_count) {
			dev_info(adev->dev, "socket: %d, die: %d "
				 "%lld correctable hardware errors detected in %s block\n",
				 mcm_info->socket_id,
				 mcm_info->die_id,
				 err_info->ue_count,
				 blk_name);
		}

		if (block_obj->hw_ops->query_ras_error_count)
			block_obj->hw_ops->query_ras_error_count(adev, &err_data);

		if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
		    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
		    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
				if (block_obj->hw_ops->query_ras_error_status)
					block_obj->hw_ops->query_ras_error_status(adev);
	}
}

	obj->err_data.ue_count += err_data.ue_count;
	obj->err_data.ce_count += err_data.ce_count;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
					     struct ras_query_if *query_if,
					     struct ras_err_data *err_data)
{
	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
	const char *blk_name = get_ras_block_str(&query_if->head);

	if (err_data.ce_count) {
		if (!adev->aid_mask &&
	if (err_data->ce_count) {
		if (!list_empty(&err_data->err_node_list)) {
			amdgpu_ras_error_print_error_data(adev, query_if,
							  err_data, false);
		} else if (!adev->aid_mask &&
			   adev->smuio.funcs &&
			   adev->smuio.funcs->get_socket_id &&
			   adev->smuio.funcs->get_die_id) {
@@ -1064,18 +1082,22 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
				 "action is needed.\n",
				 adev->smuio.funcs->get_socket_id(adev),
				 adev->smuio.funcs->get_die_id(adev),
					obj->err_data.ce_count,
					get_ras_block_str(&info->head));
				 ras_mgr->err_data.ce_count,
				 blk_name);
		} else {
			dev_info(adev->dev, "%ld correctable hardware errors "
				 "detected in %s block, no user "
				 "action is needed.\n",
					obj->err_data.ce_count,
					get_ras_block_str(&info->head));
				 ras_mgr->err_data.ce_count,
				 blk_name);
		}
	}
	if (err_data.ue_count) {
		if (!adev->aid_mask &&

	if (err_data->ue_count) {
		if (!list_empty(&err_data->err_node_list)) {
			amdgpu_ras_error_print_error_data(adev, query_if,
							  err_data, true);
		} else if (!adev->aid_mask &&
			   adev->smuio.funcs &&
			   adev->smuio.funcs->get_socket_id &&
			   adev->smuio.funcs->get_die_id) {
@@ -1084,17 +1106,68 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
				 "detected in %s block\n",
				 adev->smuio.funcs->get_socket_id(adev),
				 adev->smuio.funcs->get_die_id(adev),
					obj->err_data.ue_count,
					get_ras_block_str(&info->head));
				 ras_mgr->err_data.ue_count,
				 blk_name);
		} else {
			dev_info(adev->dev, "%ld uncorrectable hardware errors "
				 "detected in %s block\n",
					obj->err_data.ue_count,
				 ras_mgr->err_data.ue_count,
				 blk_name);
		}
	}

}

/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
				  struct ras_query_if *info)
{
	struct amdgpu_ras_block_object *block_obj = NULL;
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data;
	int ret;

	if (!obj)
		return -EINVAL;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
		amdgpu_ras_get_ecc_info(adev, &err_data);
	} else {
		block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
		if (!block_obj || !block_obj->hw_ops)   {
			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
				     get_ras_block_str(&info->head));
			ret = -EINVAL;
			goto out_fini_err_data;
		}

		if (block_obj->hw_ops->query_ras_error_count)
			block_obj->hw_ops->query_ras_error_count(adev, &err_data);

		if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
		    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
		    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
				if (block_obj->hw_ops->query_ras_error_status)
					block_obj->hw_ops->query_ras_error_status(adev);
			}
	}

	return 0;
	obj->err_data.ue_count += err_data.ue_count;
	obj->err_data.ce_count += err_data.ce_count;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	amdgpu_ras_error_generate_report(adev, info, &err_data);

out_fini_err_data:
	amdgpu_ras_error_data_fini(&err_data);

	return ret;
}

int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
@@ -1744,12 +1817,16 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
				struct amdgpu_iv_entry *entry)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct ras_err_data err_data;
	int ret;

	if (!data->cb)
		return;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return;

	/* Let IP handle its data, maybe we need get the output
	 * from the callback to update the error type/count, etc
	 */
@@ -1766,6 +1843,8 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
		obj->err_data.ue_count += err_data.ue_count;
		obj->err_data.ce_count += err_data.ce_count;
	}

	amdgpu_ras_error_data_fini(&err_data);
}

static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
@@ -3383,3 +3462,128 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
		WREG32(err_status_hi_offset, 0);
	}
}

int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
{
	memset(err_data, 0, sizeof(*err_data));

	INIT_LIST_HEAD(&err_data->err_node_list);

	return 0;
}

static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
{
	if (!err_node)
		return;

	list_del(&err_node->node);
	kvfree(err_node);
}

void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
{
	struct ras_err_node *err_node, *tmp;

	list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) {
		amdgpu_ras_error_node_release(err_node);
		list_del(&err_node->node);
	}
}

static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
							     struct amdgpu_smuio_mcm_config_info *mcm_info)
{
	struct ras_err_node *err_node;
	struct amdgpu_smuio_mcm_config_info *ref_id;

	if (!err_data || !mcm_info)
		return NULL;

	for_each_ras_error(err_node, err_data) {
		ref_id = &err_node->err_info.mcm_info;
		if ((mcm_info->socket_id >= 0 && mcm_info->socket_id != ref_id->socket_id) ||
		    (mcm_info->die_id >= 0 && mcm_info->die_id != ref_id->die_id))
			continue;

		return err_node;
	}

	return NULL;
}

static struct ras_err_node *amdgpu_ras_error_node_new(void)
{
	struct ras_err_node *err_node;

	err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL);
	if (!err_node)
		return NULL;

	INIT_LIST_HEAD(&err_node->node);

	return err_node;
}

static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
						      struct amdgpu_smuio_mcm_config_info *mcm_info)
{
	struct ras_err_node *err_node;

	err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info);
	if (err_node)
		return &err_node->err_info;

	err_node = amdgpu_ras_error_node_new();
	if (!err_node)
		return NULL;

	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));

	err_data->err_list_count++;
	list_add_tail(&err_node->node, &err_data->err_node_list);

	return &err_node->err_info;
}

int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
					struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
{
	struct ras_err_info *err_info;

	if (!err_data || !mcm_info)
		return -EINVAL;

	if (!count)
		return 0;

	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
	if (!err_info)
		return -EINVAL;

	err_info->ue_count += count;
	err_data->ue_count += count;

	return 0;
}

int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
					struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
{
	struct ras_err_info *err_info;

	if (!err_data || !mcm_info)
		return -EINVAL;

	if (!count)
		return 0;

	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
	if (!err_info)
		return -EINVAL;

	err_info->ce_count += count;
	err_data->ce_count += count;

	return 0;
}
+25 −0
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include "ta_ras_if.h"
#include "amdgpu_ras_eeprom.h"
#include "amdgpu_smuio.h"

struct amdgpu_iv_entry;

@@ -443,13 +444,29 @@ struct ras_fs_data {
	char debugfs_name[32];
};

struct ras_err_info {
	struct amdgpu_smuio_mcm_config_info mcm_info;
	u64 ce_count;
	u64 ue_count;
};

struct ras_err_node {
	struct list_head node;
	struct ras_err_info err_info;
};

struct ras_err_data {
	unsigned long ue_count;
	unsigned long ce_count;
	unsigned long err_addr_cnt;
	struct eeprom_table_record *err_addr;
	u32 err_list_count;
	struct list_head err_node_list;
};

#define for_each_ras_error(err_node, err_data) \
	list_for_each_entry(err_node, &(err_data)->err_node_list, node)

struct ras_err_handler_data {
	/* point to bad page records array */
	struct eeprom_table_record *bps;
@@ -773,4 +790,12 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
					   uint32_t reg_list_size,
					   uint32_t instance);

int amdgpu_ras_error_data_init(struct ras_err_data *err_data);
void amdgpu_ras_error_data_fini(struct ras_err_data *err_data);
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
					struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
					struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);

#endif
+5 −0
Original line number Diff line number Diff line
@@ -30,6 +30,11 @@ enum amdgpu_pkg_type {
	AMDGPU_PKG_TYPE_UNKNOWN,
};

struct amdgpu_smuio_mcm_config_info {
	int socket_id;
	int die_id;
};

struct amdgpu_smuio_funcs {
	u32 (*get_rom_index_offset)(struct amdgpu_device *adev);
	u32 (*get_rom_data_offset)(struct amdgpu_device *adev);
+21 −6
Original line number Diff line number Diff line
@@ -45,8 +45,12 @@ static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
{
	struct ras_err_data err_data = {0, 0, 0, NULL};
	int ret = AMDGPU_RAS_FAIL;
	struct ras_err_data err_data;
	int ret;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	err_data.err_addr =
		kcalloc(adev->umc.max_ras_err_cnt_per_query,
@@ -54,7 +58,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
	if (!err_data.err_addr) {
		dev_warn(adev->dev,
			"Failed to alloc memory for umc error record in MCA notifier!\n");
		return AMDGPU_RAS_FAIL;
		ret = AMDGPU_RAS_FAIL;
		goto out_fini_err_data;
	}

	/*
@@ -63,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
					ch_inst, umc_inst);
	if (ret)
		goto out;
		goto out_free_err_addr;

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
@@ -71,8 +76,12 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

out:
out_free_err_addr:
	kfree(err_data.err_addr);

out_fini_err_data:
	amdgpu_ras_error_data_fini(&err_data);

	return ret;
}

@@ -182,18 +191,24 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
	}

	if (!amdgpu_sriov_vf(adev)) {
		struct ras_err_data err_data = {0, 0, 0, NULL};
		struct ras_err_data err_data;
		struct ras_common_if head = {
			.block = AMDGPU_RAS_BLOCK__UMC,
		};
		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);

		ret = amdgpu_ras_error_data_init(&err_data);
		if (ret)
			return ret;

		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);

		if (ret == AMDGPU_RAS_SUCCESS && obj) {
			obj->err_data.ue_count += err_data.ue_count;
			obj->err_data.ce_count += err_data.ce_count;
		}

		amdgpu_ras_error_data_fini(&err_data);
	} else {
		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
			adev->virt.ops->ras_poison_handler(adev);
+6 −1
Original line number Diff line number Diff line
@@ -365,9 +365,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
{
	uint32_t bif_doorbell_intr_cntl;
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct ras_err_data err_data;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (amdgpu_ras_error_data_init(&err_data))
		return;

	if (adev->asic_type == CHIP_ALDEBARAN)
		bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL_ALDE);
	else
@@ -418,6 +421,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
		 */
		amdgpu_ras_reset_gpu(adev);
	}

	amdgpu_ras_error_data_fini(&err_data);
}

static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
Loading