Commit 25854131 authored by Lizhi Hou
Browse files

accel/amdxdna: Support retrieving hardware context debug information



The firmware implements the GET_APP_HEALTH command to collect debug
information for a specific hardware context.

When a command times out, the driver issues this command to collect the
relevant debug information. User space tools can also retrieve this
information through the hardware context query IOCTL.

Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Link: https://patch.msgid.link/20260317044906.1513133-1-lizhi.hou@amd.com
parent a4697496
Loading
Loading
Loading
Loading
+77 −8
Original line number Diff line number Diff line
@@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");

#define HWCTX_MAX_TIMEOUT	60000 /* milliseconds */

/*
 * AIE2 context health data attached to a timed-out command.
 *
 * aie2_set_cmd_timeout() allocates one of these, fills it from the firmware
 * struct app_health_report captured for the job, and passes it to
 * amdxdna_cmd_set_error() together with ERT_CMD_STATE_TIMEOUT.
 */
struct aie2_ctx_health {
	/* Common header; version and npu_gen are set by aie2_set_cmd_timeout() */
	struct amdxdna_ctx_health header;
	/* Copied from app_health_report.txn_op_id */
	u32 txn_op_idx;
	/* Copied from app_health_report.ctx_pc */
	u32 ctx_pc;
	/* Copied from app_health_report.fatal_info.fatal_type */
	u32 fatal_error_type;
	/* Copied from app_health_report.fatal_info.exception_type */
	u32 fatal_error_exception_type;
	/* Copied from app_health_report.fatal_info.exception_pc */
	u32 fatal_error_exception_pc;
	/* Copied from app_health_report.fatal_info.app_module */
	u32 fatal_error_app_module;
};

static void aie2_job_release(struct kref *ref)
{
	struct amdxdna_sched_job *job;
@@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
	wake_up(&job->hwctx->priv->job_free_wq);
	if (job->out_fence)
		dma_fence_put(job->out_fence);
	kfree(job->aie2_job_health);
	kfree(job);
}

@@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
	aie2_job_put(job);
}

static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
{
	struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
	struct amdxdna_dev *xdna = job->hwctx->client->xdna;
	struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
	struct app_health_report *report = job->aie2_job_health;
	u32 fail_cmd_idx = 0;

	if (!report)
		goto set_timeout;

	XDNA_ERR(xdna, "Firmware timeout state capture:");
	XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
	XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
	XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
	XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
	XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
	XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
	XDNA_ERR(xdna, "\tFatal error type: 0x%x", report->fatal_info.fatal_type);
	XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", report->fatal_info.exception_type);
	XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", report->fatal_info.exception_pc);
	XDNA_ERR(xdna, "\tFatal error app module: 0x%x", report->fatal_info.app_module);
	XDNA_ERR(xdna, "\tFatal error task ID: %d", report->fatal_info.task_index);
	XDNA_ERR(xdna, "\tTimed out sub command ID: %d", report->run_list_id);

	fail_cmd_idx = report->run_list_id;
	aie2_health = kzalloc_obj(*aie2_health);
	if (!aie2_health)
		goto set_timeout;

	aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
	aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
	aie2_health->txn_op_idx = report->txn_op_id;
	aie2_health->ctx_pc = report->ctx_pc;
	aie2_health->fatal_error_type = report->fatal_info.fatal_type;
	aie2_health->fatal_error_exception_type = report->fatal_info.exception_type;
	aie2_health->fatal_error_exception_pc = report->fatal_info.exception_pc;
	aie2_health->fatal_error_app_module = report->fatal_info.app_module;

set_timeout:
	amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT,
			      aie2_health, sizeof(*aie2_health));
}

static int
aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
{
@@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
	cmd_abo = job->cmd_bo;

	if (unlikely(job->job_timeout)) {
		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
		aie2_set_cmd_timeout(job);
		ret = -EINVAL;
		goto out;
	}

	if (unlikely(!data) || unlikely(size != sizeof(u32))) {
		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
		ret = -EINVAL;
		goto out;
	}
@@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
	if (status == AIE2_STATUS_SUCCESS)
		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
	else
		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, NULL, 0);

out:
	aie2_sched_notify(job);
@@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
	struct amdxdna_sched_job *job = handle;
	struct amdxdna_gem_obj *cmd_abo;
	struct amdxdna_dev *xdna;
	u32 fail_cmd_idx = 0;
	u32 fail_cmd_status;
	u32 fail_cmd_idx;
	u32 cmd_status;
	int ret = 0;

	cmd_abo = job->cmd_bo;

	if (unlikely(job->job_timeout)) {
		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
		aie2_set_cmd_timeout(job);
		ret = -EINVAL;
		goto out;
	}

	if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
		ret = -EINVAL;
		goto out;
	}
@@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
		 fail_cmd_idx, fail_cmd_status);

	if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT);
		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT, NULL, 0);
		ret = -EINVAL;
	} else {
		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR);
		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR, NULL, 0);
	}

out:
@@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
{
	struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
	struct amdxdna_hwctx *hwctx = job->hwctx;
	struct app_health_report *report;
	struct amdxdna_dev *xdna;
	int ret;

	xdna = hwctx->client->xdna;
	trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
	job->job_timeout = true;

	mutex_lock(&xdna->dev_lock);
	report = kzalloc_obj(*report);
	if (!report)
		goto reset_hwctx;

	ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report);
	if (ret)
		kfree(report);
	else
		job->aie2_job_health = report;

reset_hwctx:
	aie2_hwctx_stop(xdna, hwctx, sched_job);

	aie2_hwctx_restart(xdna, hwctx);
+41 −0
Original line number Diff line number Diff line
@@ -1185,3 +1185,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *

	return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
}

/*
 * Query firmware for the health report of one hardware context.
 *
 * @ndev:       device handle used for the feature check, the message buffer
 *              and the management mailbox call
 * @context_id: firmware context ID placed in the request
 * @report:     caller-owned destination; written only when the call succeeds
 *
 * A message buffer sized for one struct app_health_report is allocated, its
 * DMA address and size are sent to firmware in a MSG_OP_GET_APP_HEALTH
 * request, and on success the buffer contents are copied into @report.
 *
 * Return: 0 on success, -EOPNOTSUPP when AIE2_APP_HEALTH is not enabled for
 * @ndev, the PTR_ERR() of a failed buffer allocation, or the error returned
 * by aie2_send_mgmt_msg_wait().
 */
int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
			  struct app_health_report *report)
{
	DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
	struct amdxdna_dev *xdna = ndev->xdna;
	u32 msg_buf_size = sizeof(*report);
	struct app_health_report *fw_report;
	dma_addr_t fw_report_dma;
	int ret;

	if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
		XDNA_DBG(xdna, "App health feature not supported");
		return -EOPNOTSUPP;
	}

	/* The allocator takes the size by pointer, so it may adjust it */
	fw_report = aie2_alloc_msg_buffer(ndev, &msg_buf_size, &fw_report_dma);
	if (IS_ERR(fw_report)) {
		XDNA_ERR(xdna, "Failed to allocate buffer for app health");
		return PTR_ERR(fw_report);
	}

	req.context_id = context_id;
	req.buf_size = msg_buf_size;
	req.buf_addr = fw_report_dma;

	/* Flush CPU caches for the report area before handing it to firmware */
	drm_clflush_virt_range(fw_report, sizeof(*report));
	ret = aie2_send_mgmt_msg_wait(ndev, &msg);
	if (ret)
		XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status);
	else
		/* Copy the report to caller's buffer */
		memcpy(report, fw_report, sizeof(*report));

	aie2_free_msg_buffer(ndev, msg_buf_size, fw_report, fw_report_dma);
	return ret;
}
+52 −0
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@ enum aie2_msg_opcode {
	MSG_OP_SET_RUNTIME_CONFIG          = 0x10A,
	MSG_OP_GET_RUNTIME_CONFIG          = 0x10B,
	MSG_OP_REGISTER_ASYNC_EVENT_MSG    = 0x10C,
	MSG_OP_GET_APP_HEALTH              = 0x114,
	MSG_OP_MAX_DRV_OPCODE,
	MSG_OP_GET_PROTOCOL_VERSION        = 0x301,
	MSG_OP_MAX_OPCODE
@@ -451,4 +452,55 @@ struct config_debug_bo_req {
struct config_debug_bo_resp {
	enum aie2_msg_status	status;
} __packed;

/*
 * Fatal error details embedded in struct app_health_report as fatal_info.
 * The contents are produced by firmware; the driver only reads them.
 */
struct fatal_error_info {
	__u32 fatal_type;         /* Fatal error type */
	__u32 exception_type;     /* Only valid if fatal_type is a specific value */
	__u32 exception_argument; /* Argument based on exception type */
	__u32 exception_pc;       /* Program Counter at the time of the exception */
	__u32 app_module;         /* Error module name */
	__u32 task_index;         /* Index of the task in which the error occurred */
	/* Not read by the driver code shown here; presumably reserved by firmware - confirm */
	__u32 reserved[128];
};

/*
 * Per-context health report returned for MSG_OP_GET_APP_HEALTH.
 *
 * aie2_query_app_health() allocates a message buffer of this size, passes its
 * DMA address to firmware, and copies the buffer into the caller's report on
 * success.
 */
struct app_health_report {
	/* Logged by the driver as "Version: major.minor" */
	__u16 major;
	__u16 minor;
	/* Logged by the driver as "Report size" */
	__u32 size;
	/* Logged by the driver as "Context ID" */
	__u32 context_id;
	/*
	 * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT
	 * application. Before execution begins or after successful completion, the value is set
	 * to UINT_MAX. If execution halts prematurely due to an error, this field retains the
	 * opcode's PC value.
	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
	 * Proper interpretation requires familiarity with the implementation details.
	 */
	__u32 dpu_pc;
	/*
	 * Index of the last initiated TXN opcode.
	 * Before execution starts or after successful completion, the value is set to UINT_MAX.
	 * If execution halts prematurely due to an error, this field retains the opcode's ID.
	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
	 * Proper interpretation requires familiarity with the implementation details.
	 */
	__u32 txn_op_id;
	/* The PC of the context at the time of the report */
	__u32 ctx_pc;
	/* Fatal error details, see struct fatal_error_info */
	struct fatal_error_info		fatal_info;
	/*
	 * Index of the most recently executed run list entry.
	 * aie2_set_cmd_timeout() uses it as the failing sub-command index.
	 */
	__u32 run_list_id;
};

/*
 * Request payload for MSG_OP_GET_APP_HEALTH, filled in by
 * aie2_query_app_health().
 */
struct get_app_health_req {
	/* Context whose health is queried (the caller's context_id argument) */
	__u32 context_id;
	/* Size of the message buffer, as returned by aie2_alloc_msg_buffer() */
	__u32 buf_size;
	/* DMA address of the message buffer that receives the report */
	__u64 buf_addr;
} __packed;

/* Response payload for MSG_OP_GET_APP_HEALTH. */
struct get_app_health_resp {
	/* Logged by aie2_query_app_health() when the request fails */
	enum aie2_msg_status status;
	/*
	 * Not read by the driver code shown here; presumably the buffer size
	 * firmware needs for a full report - confirm against firmware docs.
	 */
	__u32 required_buffer_size;
	__u32 reserved[7];
} __packed;
#endif /* _AIE2_MSG_PRIV_H_ */
+14 −0
Original line number Diff line number Diff line
@@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
	struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
	struct amdxdna_drm_get_array *array_args = arg;
	struct amdxdna_drm_hwctx_entry __user *buf;
	struct app_health_report report;
	struct amdxdna_dev_hdl *ndev;
	u32 size;
	int ret;

	if (!array_args->num_element)
		return -EINVAL;
@@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
	tmp->latency = hwctx->qos.latency;
	tmp->frame_exec_time = hwctx->qos.frame_exec_time;
	tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
	ndev = hwctx->client->xdna->dev_handle;
	ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
	if (!ret) {
		/* Fill in app health report fields */
		tmp->txn_op_idx = report.txn_op_id;
		tmp->ctx_pc = report.ctx_pc;
		tmp->fatal_error_type = report.fatal_info.fatal_type;
		tmp->fatal_error_exception_type = report.fatal_info.exception_type;
		tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
		tmp->fatal_error_app_module = report.fatal_info.app_module;
	}

	buf = u64_to_user_ptr(array_args->buffer);
	size = min(sizeof(*tmp), array_args->element_size);
+5 −0
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@
#include <linux/limits.h>
#include <linux/semaphore.h>

#include "aie2_msg_priv.h"
#include "amdxdna_mailbox.h"

#define AIE2_INTERVAL	20000	/* us */
@@ -261,6 +262,7 @@ enum aie2_fw_feature {
	AIE2_NPU_COMMAND,
	AIE2_PREEMPT,
	AIE2_TEMPORAL_ONLY,
	AIE2_APP_HEALTH,
	AIE2_FEATURE_MAX
};

@@ -271,6 +273,7 @@ struct aie2_fw_feature_tbl {
	u32 min_minor;
};

#define AIE2_ALL_FEATURES	GENMASK_ULL(AIE2_FEATURE_MAX - 1, AIE2_NPU_COMMAND)
#define AIE2_FEATURE_ON(ndev, feature)	test_bit(feature, &(ndev)->feature_mask)

struct amdxdna_dev_priv {
@@ -341,6 +344,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *ver
int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata);
int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
				struct amdxdna_fw_ver *fw_ver);
int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
			  struct app_health_report *report);
int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
Loading