Commit a6b5a7a0 authored by YiPeng Chai, committed by Alex Deucher
Browse files

drm/amdgpu: query bad page info of ras module



Query the bad page info of the RAS module.

V2:
  Update code to reuse bad page output code.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 62902b88
Loading
Loading
Loading
Loading
+98 −44
Original line number Diff line number Diff line
@@ -1782,7 +1782,9 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);
		struct ras_badpage *bps, uint32_t count, uint32_t start);
static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage *bps, uint32_t count, uint32_t start);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
@@ -1840,19 +1842,50 @@ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;
	int bps_count = 0, i, status;
	uint64_t address;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
	bps_count = end - start;
	bps = kmalloc_array(bps_count, sizeof(*bps), GFP_KERNEL);
	if (!bps)
		return 0;

	memset(bps, 0, sizeof(*bps) * bps_count);

	if (amdgpu_uniras_enabled(adev))
		bps_count = amdgpu_uniras_badpages_read(adev, bps, bps_count, start);
	else
		bps_count = amdgpu_ras_badpages_read(adev, bps, bps_count, start);

	if (bps_count <= 0) {
		kfree(bps);
		return 0;
	}

	for (i = 0; i < bps_count; i++) {
		address = ((uint64_t)bps[i].bp) << AMDGPU_GPU_PAGE_SHIFT;
		if (amdgpu_ras_check_critical_address(adev, address))
			continue;

		bps[i].size = AMDGPU_GPU_PAGE_SIZE;

		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
					address);
		if (status == -EBUSY)
			bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
		else if (status == -ENOENT)
			bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
		else
			bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));
				bps[i].bp,
				bps[i].size,
				amdgpu_ras_badpage_flags_str(bps[i].flags));
	}

	kfree(bps);

@@ -2645,62 +2678,83 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
	}
}

/* recovery begin */

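/* return the number of bad page entries copied into bps,
 * or -EINVAL on invalid arguments.
 * bps is a buffer of count entries allocated and freed by the caller.
 */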
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
		struct ras_badpage *bps, uint32_t count, uint32_t start)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0, status;
	int r = 0;
	uint32_t i;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		ret = -EINVAL;
		goto out;
	if (start < data->count) {
		for (i = start; i < data->count; i++) {
			if (!data->bps[i].ts)
				continue;

			bps[r].bp = data->bps[i].retired_page;
			r++;
			if (r >= count)
				break;
		}
	}
	mutex_unlock(&con->recovery_lock);

	*bps = kmalloc_array(data->count, sizeof(struct ras_badpage), GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	return r;
}

	for (; i < data->count; i++) {
		if (!data->bps[i].ts)
			continue;
static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage *bps, uint32_t count, uint32_t start)
{
	struct ras_cmd_bad_pages_info_req cmd_input;
	struct ras_cmd_bad_pages_info_rsp *output;
	uint32_t group, start_group, end_group;
	uint32_t pos, pos_in_group;
	int r = 0, i;

		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].retired_page,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
		};
	if (!bps || !count)
		return -EINVAL;

	output = kmalloc(sizeof(*output), GFP_KERNEL);
	if (!output)
		return -ENOMEM;

	memset(&cmd_input, 0, sizeof(cmd_input));

		if (amdgpu_ras_check_critical_address(adev,
			data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
	start_group = start / RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
	end_group = (start + count + RAS_CMD_MAX_BAD_PAGES_PER_GROUP - 1) /
				RAS_CMD_MAX_BAD_PAGES_PER_GROUP;

	pos = start;
	for (group = start_group; group < end_group; group++) {
		memset(output, 0, sizeof(*output));
		cmd_input.group_index = group;
		if (amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BAD_PAGES,
			&cmd_input, sizeof(cmd_input), output, sizeof(*output)))
			goto out;

		if (pos >= output->bp_total_cnt)
			goto out;

		pos_in_group = pos - group * RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
		for (i = pos_in_group; i < output->bp_in_group; i++, pos++) {
			if (!output->records[i].ts)
				continue;

		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
				data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
		if (status == -EBUSY)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
		else if (status == -ENOENT)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
			bps[r].bp = output->records[i].retired_page;
			r++;
			if (r >= count)
				goto out;
		}
	}

	*count = con->bad_page_num;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
	kfree(output);
	return r;
}

static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,