Commit c154a96b, authored by Tao Zhou, committed by Alex Deucher
Browse files

drm/amdgpu: load RAS bad page from PMFW in page retirement



In the legacy path, bad pages are queried from the MCA registers. Switch to
getting them from PMFW when PMFW manages the EEPROM data.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 2a084f4a
Loading
Loading
Loading
Loading
+7 −1
Original line number Diff line number Diff line
@@ -3300,7 +3300,13 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
	mutex_lock(&con->recovery_lock);
	control = &con->eeprom_control;
	data = con->eh_data;
	unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs;
	if (amdgpu_ras_smu_eeprom_supported(adev))
		unit_num = control->ras_num_recs -
			control->ras_num_recs_old;
	else
		unit_num = data->count / adev->umc.retire_unit -
			control->ras_num_recs;

	save_count = con->bad_page_num - control->ras_num_bad_pages;
	mutex_unlock(&con->recovery_lock);

+83 −54
Original line number Diff line number Diff line
@@ -96,19 +96,35 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = &con->eeprom_control;
	unsigned int error_query_mode;
	int ret = 0;
	unsigned long err_count;

	amdgpu_ras_get_error_query_mode(adev, &error_query_mode);

	err_data->err_addr =
		kcalloc(adev->umc.max_ras_err_cnt_per_query,
			sizeof(struct eeprom_table_record), GFP_KERNEL);

	/* still call query_ras_error_address to clear error status
	 * even NOMEM error is encountered
	 */
	if (!err_data->err_addr)
		dev_warn(adev->dev,
			"Failed to alloc memory for umc error address record!\n");
	else
		err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;

	mutex_lock(&con->page_retirement_lock);
	if (!amdgpu_ras_smu_eeprom_supported(adev)) {
		ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
		if (ret == -EOPNOTSUPP &&
		    error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
			if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
			    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
				adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev,
								ras_error_status);

			if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
			    adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
@@ -121,21 +137,24 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
				 * even NOMEM error is encountered
				 */
				if (!err_data->err_addr)
				dev_warn(adev->dev, "Failed to alloc memory for "
						"umc error address record!\n");
					dev_warn(adev->dev,
						"Failed to alloc memory for umc error address record!\n");
				else
				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
					err_data->err_addr_len =
						adev->umc.max_ras_err_cnt_per_query;

				/* umc query_ras_error_address is also responsible for clearing
				 * error status
				 */
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
				adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev,
								ras_error_status);
			}
		} else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
		    (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
			if (adev->umc.ras &&
			    adev->umc.ras->ecc_info_query_ras_error_count)
		    adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);
				adev->umc.ras->ecc_info_query_ras_error_count(adev,
								ras_error_status);

			if (adev->umc.ras &&
			    adev->umc.ras->ecc_info_query_ras_error_address &&
@@ -148,15 +167,25 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
				 * even NOMEM error is encountered
				 */
				if (!err_data->err_addr)
				dev_warn(adev->dev, "Failed to alloc memory for "
						"umc error address record!\n");
					dev_warn(adev->dev,
						"Failed to alloc memory for umc error address record!\n");
				else
				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
					err_data->err_addr_len =
						adev->umc.max_ras_err_cnt_per_query;

				/* umc query_ras_error_address is also responsible for clearing
				 * error status
				 */
			adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
				adev->umc.ras->ecc_info_query_ras_error_address(adev,
								ras_error_status);
			}
		}
	} else {
		if (!amdgpu_ras_eeprom_update_record_num(control)) {
			err_data->err_addr_cnt = err_data->de_count =
				control->ras_num_recs -	control->ras_num_recs_old;
			amdgpu_ras_eeprom_read_idx(control, err_data->err_addr,
				control->ras_num_recs_old, err_data->de_count);
		}
	}

@@ -166,7 +195,7 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
		if ((amdgpu_bad_page_threshold != 0) &&
			err_data->err_addr_cnt) {
			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						err_data->err_addr_cnt, false);
				err_data->err_addr_cnt, amdgpu_ras_smu_eeprom_supported(adev));
			amdgpu_ras_save_bad_pages(adev, &err_count);

			amdgpu_dpm_send_hbm_bad_pages_num(adev,