Commit e82f9aac authored by Gangliang Xie's avatar Gangliang Xie Committed by Alex Deucher
Browse files

drm/amd/ras: add check func for pmfw eeprom



add check func for pmfw eeprom

Signed-off-by: default avatarGangliang Xie <ganglxie@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent b2d13a41
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -137,6 +137,7 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
		break;
	case RAS_EVENT_ID__DEVICE_RMA:
		ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
		if (!ras_fw_eeprom_supported(ras_core))
			ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
		break;
	case RAS_EVENT_ID__RESET_GPU:
+7 −0
Original line number Diff line number Diff line
@@ -50,6 +50,13 @@
#define GPU_RESET_CAUSE_FATAL   (RAS_CORE_RESET_GPU | 0x0002)
#define GPU_RESET_CAUSE_RMA     (RAS_CORE_RESET_GPU | 0x0004)

enum ras_gpu_health_status {
	RAS_GPU_HEALTH_NONE = 0,
	RAS_GPU_HEALTH_USABLE = 1,
	RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
	RAS_GPU_IN_BAD_STATUS = 3,
};

enum ras_core_fw_feature_flags {
	RAS_CORE_FW_FEATURE_BIT__RAS_EEPROM = BIT_ULL(0),
};
+4 −1
Original line number Diff line number Diff line
@@ -403,6 +403,9 @@ int ras_core_hw_init(struct ras_core_context *ras_core)
		goto init_err6;
	}

	if (ras_fw_eeprom_supported(ras_core))
		ret = ras_fw_eeprom_check_storage_status(ras_core);
	else
		ret = ras_eeprom_check_storage_status(ras_core);
	if (ret)
		goto init_err6;
+0 −7
Original line number Diff line number Diff line
@@ -57,13 +57,6 @@ do { \
	(RECORD)->retired_row_pfn = tmp; \
} while (0)

enum ras_gpu_health_status {
	RAS_GPU_HEALTH_NONE = 0,
	RAS_GPU_HEALTH_USABLE = 1,
	RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
	RAS_GPU_IN_BAD_STATUS = 3,
};

enum ras_eeprom_err_type {
	RAS_EEPROM_ERR_NA,
	RAS_EEPROM_ERR_RECOVERABLE,
+51 −0
Original line number Diff line number Diff line
@@ -453,3 +453,54 @@ int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core)

	return 0;
}

int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core)
{
	struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
	int bad_page_count;

	bad_page_count = ras_umc_get_badpage_count(ras_core);

	if ((control->record_threshold_count < bad_page_count) &&
	    (control->record_threshold_config != 0)) {
		RAS_DEV_ERR(ras_core->dev, "RAS records:%d exceed threshold:%d",
				bad_page_count, control->record_threshold_count);
		if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) ||
			(control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) {
			RAS_DEV_WARN(ras_core->dev,
			"Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
		} else {
			ras_core->is_rma = true;
			RAS_DEV_ERR(ras_core->dev,
			"User defined threshold is set, runtime service will be halt when threshold is reached\n");
		}
		return 0;
	}

	RAS_DEV_INFO(ras_core->dev,
			"Found existing EEPROM table with %d records\n",
			bad_page_count);
	/* Warn if we are at 90% of the threshold or above
	 */
	if (10 * bad_page_count >= 9 * control->record_threshold_count)
		RAS_DEV_WARN(ras_core->dev,
			"RAS records:%u exceeds 90%% of threshold:%d\n",
			bad_page_count,
			control->record_threshold_count);

	return 0;
}

enum ras_gpu_health_status
	ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core)
{
	struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;

	if (!control->record_threshold_config)
		return RAS_GPU_HEALTH_NONE;

	if (ras_core->is_rma)
		return RAS_GPU_RETIRED__ECC_REACH_THRESHOLD;

	return RAS_GPU_HEALTH_USABLE;
}
Loading