Commit 0153d276 authored by ganglxie's avatar ganglxie Committed by Alex Deucher
Browse files

drm/amdgpu: Refine bad page adding



Bad page adding can be simplified by using the NPS info.

Signed-off-by: ganglxie <ganglxie@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e6aae1db
Loading
Loading
Loading
Loading
+104 −92
Original line number Diff line number Diff line
@@ -2799,20 +2799,100 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
		return  -EINVAL;
}

/*
 * Append up to @count records from @bps to the RAS error-handler data
 * (con->eh_data) of @adev.
 *
 * A record is skipped when amdgpu_ras_check_bad_page_unlock() returns
 * non-zero for its byte address (presumably the page is already tracked -
 * confirm against that helper). For every other record the page is passed
 * to amdgpu_ras_reserve_page() and the record is copied to the tail of
 * data->bps.
 *
 * NOTE(review): con and con->eh_data are dereferenced without a NULL check,
 * and the "_unlock" helper suggests the caller already holds the recovery
 * lock - verify at the call sites.
 *
 * Return: 0 on success, -ENOMEM if the record array could not be enlarged.
 */
static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
					struct eeprom_table_record *bps, int count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;
	struct eeprom_table_record *rec;
	int idx;

	for (idx = 0; idx < count; idx++) {
		rec = &bps[idx];

		/* retired_page is a page number; the check takes a byte address */
		if (amdgpu_ras_check_bad_page_unlock(con,
				rec->retired_page << AMDGPU_GPU_PAGE_SHIFT))
			continue;

		/* Out of slots: ask for more room before storing the record */
		if (!data->space_left) {
			if (amdgpu_ras_realloc_eh_data_space(adev, data, 256))
				return -ENOMEM;
		}

		amdgpu_ras_reserve_page(adev, rec->retired_page);

		memcpy(data->bps + data->count, rec, sizeof(*rec));
		data->space_left--;
		data->count++;
	}

	return 0;
}

static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
				struct eeprom_table_record *bps, struct ras_err_data *err_data,
				enum amdgpu_memory_partition nps)
{
	int i = 0;
	enum amdgpu_memory_partition save_nps;

	save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;

	for (i = 0; i < adev->umc.retire_unit; i++)
		bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);

	if (save_nps) {
		if (save_nps == nps) {
			if (amdgpu_umc_pages_in_a_row(adev, err_data,
					bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
				return -EINVAL;
		} else {
			if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
				return -EINVAL;
		}
	} else {
		if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
			if (nps == AMDGPU_NPS1_PARTITION_MODE)
				memcpy(err_data->err_addr, bps,
					sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
			else
				return -EOPNOTSUPP;
		}
	}

	return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit);
}

static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
				struct eeprom_table_record *bps, struct ras_err_data *err_data,
				enum amdgpu_memory_partition nps)
{
	enum amdgpu_memory_partition save_nps;

	save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
	bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);

	if (save_nps == nps) {
		if (amdgpu_umc_pages_in_a_row(adev, err_data,
				bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
			return -EINVAL;
	} else {
		if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
			return -EINVAL;
	}
	return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
									adev->umc.retire_unit);
}

/* It deals with VRAM only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int pages, bool from_rom)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct ras_err_data err_data;
	struct eeprom_table_record *err_rec;
	struct amdgpu_ras_eeprom_control *control =
			&adev->psp.ras_context.ras->eeprom_control;
	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
	int ret = 0;
	uint32_t i, j, loop_cnt = 1;
	bool find_pages_per_pa = false;
	uint32_t i;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;
@@ -2823,114 +2903,46 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
				sizeof(struct eeprom_table_record), GFP_KERNEL);
		if (!err_data.err_addr) {
			dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
			ret = -ENOMEM;
			goto out;
			return -ENOMEM;
		}

		err_rec = err_data.err_addr;
		loop_cnt = adev->umc.retire_unit;
		if (adev->gmc.gmc_funcs->query_mem_partition_mode)
			nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
	}

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data) {
		/* Returning 0 as the absence of eh_data is acceptable */
		goto free;
	}

	if (from_rom) {
		for (i = 0; i < pages; i++) {
		if (from_rom &&
		    control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
			if (!find_pages_per_pa) {
				if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
					if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
						/* may use old RAS TA, use PA to find pages in
						 * one row
						 */
						if (amdgpu_umc_pages_in_a_row(adev, &err_data,
									      bps[i].retired_page <<
									      AMDGPU_GPU_PAGE_SHIFT)) {
							ret = -EINVAL;
			if (control->ras_num_recs - i >= adev->umc.retire_unit) {
				if ((bps[i].address == bps[i + 1].address) &&
				    (bps[i].mem_channel == bps[i + 1].mem_channel)) {
					//deal with retire_unit records a time
					ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
									&bps[i], &err_data, nps);
					if (ret)
						goto free;
					i += (adev->umc.retire_unit - 1);
				} else {
							find_pages_per_pa = true;
						}
					} else {
						/* unsupported cases */
						ret = -EOPNOTSUPP;
						goto free;
					}
					break;
				}
			} else {
				if (amdgpu_umc_pages_in_a_row(adev, &err_data,
						bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
					ret = -EINVAL;
					goto free;
				}
				break;
			}
		} else {
			if (from_rom && !find_pages_per_pa) {
				if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
					/* bad page in any NPS mode in eeprom */
					if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
						ret = -EINVAL;
						goto free;
		}
				} else {
					/* legacy bad page in eeprom, generated only in
					 * NPS1 mode
					 */
					if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
						/* old RAS TA or ASICs which don't support to
						 * convert addrss via mca address
						 */
						if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
							find_pages_per_pa = true;
							err_rec = &bps[i];
							loop_cnt = 1;
						} else {
							/* non-nps1 mode, old RAS TA
							 * can't support it
							 */
							ret = -EOPNOTSUPP;
		for (; i < pages; i++) {
			ret = __amdgpu_ras_convert_rec_from_rom(adev,
				&bps[i], &err_data, nps);
			if (ret)
				goto free;
		}
					}
				}

				if (!find_pages_per_pa)
					i += (adev->umc.retire_unit - 1);
	} else {
				err_rec = &bps[i];
			}
		}

		for (j = 0; j < loop_cnt; j++) {
			if (amdgpu_ras_check_bad_page_unlock(con,
				err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
				continue;

			if (!data->space_left &&
			    amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
				ret = -ENOMEM;
				goto free;
			}

			amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);

			memcpy(&data->bps[data->count], &(err_rec[j]),
					sizeof(struct eeprom_table_record));
			data->count++;
			data->space_left--;
		}
		ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
	}

free:
	if (from_rom)
		kfree(err_data.err_addr);
out:
	mutex_unlock(&con->recovery_lock);

	return ret;