Commit 11dcf72e authored by YiPeng Chai's avatar YiPeng Chai Committed by Alex Deucher
Browse files

drm/amd/ras: Support high-frequency querying sriov ras block error count



Support high-frequency querying sriov ras block error count:
1. Create shared memory and fill it with the RAS_CMD__GET_ALL_BLOCK_ECC_STATUS
   ras command.
2. The RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command and the shared
   memory are registered in the sriov host ras auto-update list
   via the RAS_CMD__SET_CMD_AUTO_UPDATE command.
3. Once the sriov host detects a ras error, it automatically executes
   the RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command and writes the result to
   shared memory.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent fcfa8dbb
Loading
Loading
Loading
Loading
+112 −0
Original line number Diff line number Diff line
@@ -235,9 +235,90 @@ static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
	return RAS_CMD__SUCCESS;
}

/*
 * Write a RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command header at the start of
 * the shared blocks-ECC buffer.
 *
 * The CPU mapping of the buffer is treated as a struct ras_cmd_ctx; only
 * cmd_id, input_size and output_buf_size are written here.
 *
 * @adev:     device handle (not used by this helper)
 * @blks_ecc: descriptor of the shared buffer
 *
 * Return: 0 on success, -EINVAL when the descriptor is NULL, the buffer is
 * not allocated or not mapped, or the buffer is too small to hold the
 * command header.
 */
static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
			struct vram_blocks_ecc *blks_ecc)
{
	struct ras_cmd_ctx *rcmd;

	if (!blks_ecc || !blks_ecc->bo || !blks_ecc->cpu_addr)
		return -EINVAL;

	/*
	 * The output size below is computed with an unsigned subtraction;
	 * reject a buffer smaller than the header so it cannot wrap around
	 * and advertise a bogus, huge output area.
	 */
	if (blks_ecc->size < sizeof(*rcmd))
		return -EINVAL;

	rcmd = (struct ras_cmd_ctx *)blks_ecc->cpu_addr;

	rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
	rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
	/* Everything after the header is available for the command output. */
	rcmd->output_buf_size = blks_ecc->size - sizeof(*rcmd);

	return 0;
}

/*
 * Issue a RAS_CMD__SET_CMD_AUTO_UPDATE request through
 * amdgpu_ras_mgr_handle_ras_cmd().
 *
 * @adev:     device handle
 * @cmd_id:   id of the command the request refers to
 * @gpa_addr: address of the buffer associated with that command
 * @len:      length of that buffer in bytes
 * @reg:      true sends mode 1, false sends mode 0
 *
 * Return: the value returned by amdgpu_ras_mgr_handle_ras_cmd(). The
 * response payload is not inspected.
 */
static int __set_cmd_auto_update(struct amdgpu_device *adev,
			enum ras_cmd_id cmd_id, uint64_t gpa_addr, uint32_t len, bool reg)
{
	struct ras_cmd_auto_update_rsp reply = {0};
	struct ras_cmd_auto_update_req request = {0};

	/* Fields not assigned below (device handle, reserved) stay zero. */
	request.len = len;
	request.addr = gpa_addr;
	request.cmd_id = cmd_id;
	request.mode = reg ? 1 : 0;

	return amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__SET_CMD_AUTO_UPDATE,
		&request, sizeof(request), &reply, sizeof(reply));
}

static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
				struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_device *adev = ras_core->dev;
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
	struct ras_cmd_ctx *blks_ecc_cmd_ctx;
	struct ras_cmd_blocks_ecc_rsp *blks_ecc_rsp;
	struct ras_cmd_block_ecc_info_req *input_data =
			(struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
	struct ras_cmd_block_ecc_info_rsp *output_data =
			(struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
	int ret = 0;

	if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	if (input_data->block_id >= MAX_RAS_BLOCK_NUM)
		return RAS_CMD__ERROR_INVALID_INPUT_DATA;

	if (__fill_get_blocks_ecc_cmd(adev, blks_ecc))
		return RAS_CMD__ERROR_GENERIC;

	if (!virt_ras->blocks_ecc.auto_update_actived) {
		ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
				blks_ecc->mc_addr - adev->gmc.vram_start,
				blks_ecc->size, true);
		if (ret)
			return ret;

		blks_ecc->auto_update_actived = true;
	}

	blks_ecc_cmd_ctx = blks_ecc->cpu_addr;
	blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;

	output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count;
	output_data->ue_count = blks_ecc_rsp->blocks[input_data->block_id].ue_count;
	output_data->de_count = blks_ecc_rsp->blocks[input_data->block_id].de_count;

	cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
	return RAS_CMD__SUCCESS;
}

/*
 * Table pairing each RAS command id handled by this file with its handler
 * function. NOTE(review): presumably searched by
 * amdgpu_virt_ras_handle_cmd(); the lookup code is not visible here.
 */
static struct ras_cmd_func_map amdgpu_virt_ras_cmd_maps[] = {
	{RAS_CMD__GET_CPER_SNAPSHOT, amdgpu_virt_ras_get_cper_snapshot},
	{RAS_CMD__GET_CPER_RECORD, amdgpu_virt_ras_get_cper_records},
	{RAS_CMD__GET_BLOCK_ECC_STATUS, amdgpu_virt_ras_get_block_ecc},
};

int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
@@ -294,10 +375,41 @@ int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev)

/*
 * Allocate the shared blocks-ECC buffer: one page of kernel-owned VRAM whose
 * buffer object, GPU address and CPU mapping are stored in
 * virt_ras->blocks_ecc.
 *
 * The descriptor is zeroed first, so auto_update_actived starts out false.
 *
 * @adev: device handle
 *
 * Return: 0 on success, otherwise the negative error code reported by
 * amdgpu_bo_create_kernel().
 */
int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
	int ret;

	memset(blks_ecc, 0, sizeof(*blks_ecc));
	blks_ecc->size = PAGE_SIZE;
	ret = amdgpu_bo_create_kernel(adev, blks_ecc->size,
			PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
			&blks_ecc->bo, &blks_ecc->mc_addr,
			(void **)&blks_ecc->cpu_addr);
	/* Hand the real failure reason to the caller instead of masking it. */
	if (ret)
		return ret;

	return 0;
}

/*
 * Tear down the shared blocks-ECC buffer.
 *
 * When a buffer object exists, a RAS_CMD__SET_CMD_AUTO_UPDATE request with
 * mode 0 is sent for it (its result is ignored), the buffer contents are
 * cleared, the buffer object is freed and the descriptor is reset to zero.
 * Nothing is done when no buffer object exists.
 *
 * @adev: device handle
 *
 * Return: always 0.
 */
int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
			(struct amdgpu_virt_ras_cmd *)mgr->virt_ras_cmd;
	struct vram_blocks_ecc *shared = &virt_ras->blocks_ecc;

	/* Nothing was allocated, so there is nothing to undo. */
	if (!shared->bo)
		return 0;

	/* Withdraw the buffer before its memory is cleared and released. */
	__set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
		shared->mc_addr - adev->gmc.vram_start,
		shared->size, false);

	memset(shared->cpu_addr, 0, shared->size);
	amdgpu_bo_free_kernel(&shared->bo,
		&shared->mc_addr, &shared->cpu_addr);

	/* Also clears size and auto_update_actived. */
	memset(shared, 0, sizeof(*shared));

	return 0;
}
+9 −0
Original line number Diff line number Diff line
@@ -30,8 +30,17 @@ struct remote_batch_trace_mgr {
	struct ras_cmd_batch_trace_record_rsp  batch_trace;
};

/*
 * Descriptor of the buffer used to exchange the ECC status of all RAS
 * blocks with the sriov host.
 */
struct vram_blocks_ecc {
	/* Buffer object backing the shared memory; NULL when not allocated. */
	struct amdgpu_bo *bo;
	/* GPU address of the buffer. */
	uint64_t mc_addr;
	/* CPU mapping of the buffer; accessed as a struct ras_cmd_ctx. */
	void *cpu_addr;
	/* Size of the buffer in bytes. */
	uint32_t size;
	/* True once the buffer has been registered for automatic updates. */
	bool auto_update_actived;
};

/* Per-device state kept by the virtualization (sriov) RAS command code. */
struct amdgpu_virt_ras_cmd {
	/* Batch trace manager state. */
	struct remote_batch_trace_mgr batch_mgr;
	/* Shared buffer holding the ECC status of all RAS blocks. */
	struct vram_blocks_ecc blocks_ecc;
};

int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev);
+33 −0
Original line number Diff line number Diff line
@@ -75,6 +75,8 @@ enum ras_cmd_id {
	RAS_CMD__GET_CPER_RECORD,
	RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
	RAS_CMD__GET_BATCH_TRACE_RECORD,
	RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
	RAS_CMD__SET_CMD_AUTO_UPDATE,
	RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END,
};

@@ -411,6 +413,37 @@ struct ras_cmd_batch_trace_record_rsp {
	struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM];
};

/*
 * Input payload of RAS_CMD__SET_CMD_AUTO_UPDATE: asks for a command to be
 * added to, or removed from, the automatic update list together with the
 * buffer that receives its result.
 */
struct ras_cmd_auto_update_req {
	struct ras_cmd_dev_handle dev;
	/* 1 to register the command for automatic update, 0 to unregister it. */
	uint32_t mode;
	/* Id of the command to be executed automatically. */
	uint32_t cmd_id;
	/* Address of the buffer associated with the command. */
	uint64_t addr;
	/* Length of that buffer in bytes. */
	uint32_t len;
	uint32_t reserved[5];
};

/* Output payload of RAS_CMD__SET_CMD_AUTO_UPDATE. */
struct ras_cmd_auto_update_rsp {
	/* NOTE(review): presumably a payload format version; confirm with the host side. */
	uint32_t version;
	uint32_t reserved[4];
};

/*
 * Input payload of RAS_CMD__GET_ALL_BLOCK_ECC_STATUS; it carries only the
 * device handle.
 */
struct ras_cmd_blocks_ecc_req {
	struct ras_cmd_dev_handle dev;
};

/*
 * Error counters of one RAS block.
 * NOTE(review): ce/ue/de presumably stand for correctable, uncorrectable and
 * deferred errors; confirm against the RAS documentation.
 */
struct ras_cmd_block_ecc {
	uint32_t ce_count;
	uint32_t ue_count;
	uint32_t de_count;
};

/* Number of entries in the blocks[] array below; valid block ids are below this value. */
#define MAX_RAS_BLOCK_NUM  20
/*
 * Output payload of RAS_CMD__GET_ALL_BLOCK_ECC_STATUS: one set of error
 * counters per RAS block, indexed by block id.
 */
struct ras_cmd_blocks_ecc_rsp {
	/* NOTE(review): presumably a payload format version; confirm with the host side. */
	uint32_t version;
	uint32_t reserved[5];
	struct ras_cmd_block_ecc blocks[MAX_RAS_BLOCK_NUM];
};

#pragma pack(pop)

int ras_cmd_init(struct ras_core_context *ras_core);