Commit 0eecff79 authored by Tao Zhou, committed by Alex Deucher
Browse files

drm/amdgpu: do RAS MCA2PA conversion in device init phase



With the introduction of NPS mode, the memory physical address (PA)
that corresponds to an MCA address varies per NPS mode. We need to rely
on the MCA address and convert it into a PA according to the NPS mode.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 772df3df
Loading
Loading
Loading
Loading
+82 −12
Original line number Diff line number Diff line
@@ -2717,40 +2717,110 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
	return 0;
}

/**
 * amdgpu_ras_mca2pa - run the MCA-to-PA conversion for one EEPROM record
 * @adev: amdgpu device
 * @bps: EEPROM record supplying the MCA error address and memory channel
 * @err_data: RAS error data handed to the conversion callback; its
 *            err_addr_cnt is reset to 0 and its err_addr_len is set to
 *            adev->umc.retire_unit before the call
 *
 * Builds a TA RAS address query from @bps and the socket id (0 when the
 * SMUIO get_socket_id callback is not available) and passes it to the UMC
 * RAS convert_ras_err_addr callback.
 *
 * Return: the value returned by convert_ras_err_addr, or -EINVAL when that
 * callback is not installed (nothing was converted, so success must not be
 * reported).
 */
static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
			struct eeprom_table_record *bps,
			struct ras_err_data *err_data)
{
	struct ta_ras_query_address_input addr_in;
	uint32_t socket = 0;

	if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
		socket = adev->smuio.funcs->get_socket_id(adev);

	/* reinit err_data so results of a previous conversion are discarded */
	err_data->err_addr_cnt = 0;
	err_data->err_addr_len = adev->umc.retire_unit;

	memset(&addr_in, 0, sizeof(addr_in));
	addr_in.ma.err_addr = bps->address;
	addr_in.ma.socket_id = socket;
	addr_in.ma.ch_inst = bps->mem_channel;
	/* tell RAS TA the node instance is not used */
	addr_in.ma.node_inst = TA_RAS_INV_NODE;

	/*
	 * Without a conversion callback err_data stays empty; report that as
	 * an error instead of a silent success so callers never consume
	 * unconverted records.
	 */
	if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
		return -EINVAL;

	return adev->umc.ras->convert_ras_err_addr(adev, err_data,
			&addr_in, NULL, false);
}

/* It deals with VRAM only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct ras_err_data err_data;
	struct eeprom_table_record *err_rec;
	int ret = 0;
	uint32_t i;
	uint32_t i, j, loop_cnt = 1;
	bool is_mca_add = true;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) {
		is_mca_add = false;
	} else {
		if ((pages > 1) &&
		    (bps[0].address == bps[1].address) &&
		    (bps[0].mem_channel == bps[1].mem_channel))
			is_mca_add = false;
	}

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (is_mca_add) {
		err_data.err_addr =
			kcalloc(adev->umc.retire_unit,
				sizeof(struct eeprom_table_record), GFP_KERNEL);
		if (!err_data.err_addr) {
			dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
			ret = -ENOMEM;
			goto out;
		}

		loop_cnt = adev->umc.retire_unit;
	}

	for (i = 0; i < pages; i++) {
		if (is_mca_add) {
			if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data))
				goto free;

			err_rec = err_data.err_addr;
		} else {
			err_rec = &bps[i];
		}

		for (j = 0; j < loop_cnt; j++) {
			if (amdgpu_ras_check_bad_page_unlock(con,
			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
				err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
				continue;

			if (!data->space_left &&
			    amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
				ret = -ENOMEM;
			goto out;
				goto free;
			}

		amdgpu_ras_reserve_page(adev, bps[i].retired_page);
			amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);

		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
			memcpy(&data->bps[data->count], &(err_rec[j]),
					sizeof(struct eeprom_table_record));
			data->count++;
			data->space_left--;
		}
	}

free:
	if (is_mca_add)
		kfree(err_data.err_addr);
out:
	mutex_unlock(&con->recovery_lock);