Commit b95fa494, authored by Tao Zhou, committed by Alex Deucher
Browse files

drm/amdgpu: add RAS is_rma flag



Set the flag to true if the number of bad pages reaches the threshold.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 15c2990e
Loading
Loading
Loading
Loading
+4 −5
Original line number Diff line number Diff line
@@ -2926,7 +2926,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data;
	u32  max_eeprom_records_count = 0;
	bool exc_err_limit = false;
	int ret;

	if (!con || amdgpu_sriov_vf(adev))
@@ -2963,12 +2962,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
	 */
	if (adev->gmc.xgmi.pending_reset)
		return 0;
	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
	/*
	 * This calling fails when exc_err_limit is true or
	 * This calling fails when is_rma is true or
	 * ret != 0.
	 */
	if (exc_err_limit || ret)
	if (con->is_rma || ret)
		goto free;

	if (con->eeprom_control.ras_num_recs) {
@@ -3016,7 +3015,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
	 * Except error threshold exceeding case, other failure cases in this
	 * function would not fail amdgpu driver init.
	 */
	if (!exc_err_limit)
	if (!con->is_rma)
		ret = 0;
	else
		ret = -EINVAL;
+1 −0
Original line number Diff line number Diff line
@@ -522,6 +522,7 @@ struct amdgpu_ras {
	bool update_channel_flag;
	/* Record status of smu mca debug mode */
	bool is_aca_debug_mode;
	bool is_rma;

	/* Record special requirements of gpu reset caller */
	uint32_t  gpu_reset_flags;
+6 −4
Original line number Diff line number Diff line
@@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
			control->tbl_rai.health_percent = 0;
		}

		if (amdgpu_bad_page_threshold != -1)
			ras->is_rma = true;

		/* ignore the -ENOTSUPP return value */
		amdgpu_dpm_send_rma_reason(adev);
	}
@@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control)
	return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
}

int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
			   bool *exceed_err_limit)
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
@@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int res;

	*exceed_err_limit = false;
	ras->is_rma = false;

	if (!__is_ras_eeprom_supported(adev))
		return 0;
@@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
				dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
				res = 0;
			} else {
				*exceed_err_limit = true;
				ras->is_rma = true;
				dev_err(adev->dev,
					"RAS records:%d exceed threshold:%d, "
					"GPU will not be initialized. Replace this GPU or increase the threshold",
+1 −2
Original line number Diff line number Diff line
@@ -129,8 +129,7 @@ struct eeprom_table_record {
	unsigned char mcumc_id;
} __packed;

int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
			   bool *exceed_err_limit);
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);

int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);