Commit ade00a6c authored by Karol Wachowski
Browse files

accel/ivpu: Perform engine reset instead of device recovery on TDR



Replace full device recovery on TDR timeout with per-context abort,
allowing individual context handling instead of resetting the entire
device.

Extend ivpu_jsm_reset_engine() to return the list of contexts impacted
by the engine reset and use that information to abort only the affected
contexts.

Only check for potentially faulty contexts when the engine reset was not
triggered by an MMU fault or a job completion error status. This prevents
misidentifying non-guilty contexts that happened to be running at the
time of the fault.

When the engine reset was triggered by a job completion timeout and it
reported no impacted contexts, trigger full device recovery, as there is
no way to identify the guilty one.

Add engine reset counter to debugfs for engine resets bookkeeping
for debugging/testing purposes.

Reviewed-by: Lizhi Hou <lizhi.hou@amd.com>
Signed-off-by: Karol Wachowski <karol.wachowski@linux.intel.com>
Link: https://patch.msgid.link/20260318093927.4080303-1-karol.wachowski@linux.intel.com
parent d51f2179
Loading
Loading
Loading
Loading
+12 −2
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 * Copyright (C) 2020-2026 Intel Corporation
 */

#include <linux/debugfs.h>
@@ -127,6 +127,14 @@ static int firewall_irq_counter_show(struct seq_file *s, void *v)
	return 0;
}

/*
 * debugfs show callback for "engine_reset_counter": prints the current value
 * of vdev->pm->engine_reset_counter as a decimal number followed by a newline.
 * Always returns 0.
 */
static int engine_reset_counter_show(struct seq_file *s, void *v)
{
	struct ivpu_device *vdev = seq_to_ivpu(s);
	int resets = atomic_read(&vdev->pm->engine_reset_counter);

	seq_printf(s, "%d\n", resets);

	return 0;
}

static const struct drm_debugfs_info vdev_debugfs_list[] = {
	{"bo_list", bo_list_show, 0},
	{"fw_name", fw_name_show, 0},
@@ -137,6 +145,7 @@ static const struct drm_debugfs_info vdev_debugfs_list[] = {
	{"reset_counter", reset_counter_show, 0},
	{"reset_pending", reset_pending_show, 0},
	{"firewall_irq_counter", firewall_irq_counter_show, 0},
	{"engine_reset_counter", engine_reset_counter_show, 0},
};

static int dvfs_mode_get(void *data, u64 *dvfs_mode)
@@ -352,8 +361,9 @@ static const struct file_operations ivpu_force_recovery_fops = {
/*
 * Setter used by ivpu_reset_engine_fops: requests a reset of the engine whose
 * index is given by @val (truncated to u32).
 *
 * ivpu_jsm_reset_engine() requires a response buffer; the response is not
 * inspected here.
 *
 * Return: the result of ivpu_jsm_reset_engine().
 */
static int ivpu_reset_engine_fn(void *data, u64 val)
{
	struct ivpu_device *vdev = data;
	struct vpu_jsm_msg resp;

	return ivpu_jsm_reset_engine(vdev, (u32)val, &resp);
}

DEFINE_DEBUGFS_ATTRIBUTE(ivpu_reset_engine_fops, NULL, ivpu_reset_engine_fn, "0x%02llx\n");
+1 −0
Original line number Diff line number Diff line
@@ -665,6 +665,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
	vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
	atomic64_set(&vdev->unique_id_counter, 0);
	atomic_set(&vdev->job_timeout_counter, 0);
	atomic_set(&vdev->faults_detected, 0);
	xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
	xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
	xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
+2 −1
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2020-2025 Intel Corporation
 * Copyright (C) 2020-2026 Intel Corporation
 */

#ifndef __IVPU_DRV_H__
@@ -168,6 +168,7 @@ struct ivpu_device {
	struct xarray submitted_jobs_xa;
	struct ivpu_ipc_consumer job_done_consumer;
	atomic_t job_timeout_counter;
	atomic_t faults_detected;

	atomic64_t unique_id_counter;

+48 −2
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2025 Intel Corporation
 * Copyright (C) 2020-2026 Intel Corporation
 */

#include <drm/drm_file.h>
@@ -607,6 +607,7 @@ bool ivpu_job_handle_engine_error(struct ivpu_device *vdev, u32 job_id, u32 job_
		 * status and ensure both are handled in the same way
		 */
		job->file_priv->has_mmu_faults = true;
		atomic_set(&vdev->faults_detected, 1);
		queue_work(system_percpu_wq, &vdev->context_abort_work);
		return true;
	}
@@ -1115,6 +1116,51 @@ void ivpu_job_done_consumer_fini(struct ivpu_device *vdev)
	ivpu_ipc_consumer_del(vdev, &vdev->job_done_consumer);
}

/*
 * Reset engine 0 and, unless faults were already flagged, mark the contexts
 * listed in the engine reset response as faulty by setting has_mmu_faults on
 * their file_priv.
 *
 * Return: 0 on success, the error returned by ivpu_jsm_reset_engine() if the
 * reset request failed, or -EIO if the response does not carry a usable list
 * of impacted contexts (device recovery is triggered in that case).
 */
static int reset_engine_and_mark_faulty_contexts(struct ivpu_device *vdev)
{
	u32 num_impacted_contexts;
	struct vpu_jsm_msg resp;
	int ret;
	u32 i;

	ret = ivpu_jsm_reset_engine(vdev, 0, &resp);
	if (ret)
		return ret;

	/*
	 * If faults are detected, ignore guilty contexts from engine reset as NPU may not be stuck
	 * and could return currently running good context and faulty contexts are already marked.
	 * The cmpxchg also clears the flag so the next reset starts from a clean state.
	 */
	if (atomic_cmpxchg(&vdev->faults_detected, 1, 0) == 1)
		return 0;

	num_impacted_contexts = resp.payload.engine_reset_done.num_impacted_contexts;

	ivpu_warn_ratelimited(vdev, "Engine reset performed, impacted contexts: %u\n",
			      num_impacted_contexts);

	/*
	 * Accept 1..VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS entries. in_range() takes
	 * (val, start, len), so passing MAX - 1 as the length rejected a completely
	 * filled list and escalated it to full recovery; compare the bounds explicitly.
	 * NOTE(review): assumes impacted_contexts[] holds
	 * VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS entries - confirm against the JSM API header.
	 */
	if (!num_impacted_contexts ||
	    num_impacted_contexts > VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS) {
		ivpu_pm_trigger_recovery(vdev, "Cannot determine guilty contexts");
		return -EIO;
	}

	/* No faults detected, NPU likely got stuck. Mark returned contexts as guilty */
	guard(mutex)(&vdev->context_list_lock);

	for (i = 0; i < num_impacted_contexts; i++) {
		u32 ssid = resp.payload.engine_reset_done.impacted_contexts[i].host_ssid;
		struct ivpu_file_priv *file_priv = xa_load(&vdev->context_xa, ssid);

		/* A reported SSID with no entry in context_xa is skipped */
		if (file_priv) {
			mutex_lock(&file_priv->lock);
			file_priv->has_mmu_faults = true;
			mutex_unlock(&file_priv->lock);
		}
	}

	return 0;
}

void ivpu_context_abort_work_fn(struct work_struct *work)
{
	struct ivpu_device *vdev = container_of(work, struct ivpu_device, context_abort_work);
@@ -1127,7 +1173,7 @@ void ivpu_context_abort_work_fn(struct work_struct *work)
		return;

	if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW)
		if (ivpu_jsm_reset_engine(vdev, 0))
		if (reset_engine_and_mark_faulty_contexts(vdev))
			goto runtime_put;

	mutex_lock(&vdev->context_list_lock);
+15 −4
Original line number Diff line number Diff line
@@ -151,10 +151,9 @@ int ivpu_jsm_get_heartbeat(struct ivpu_device *vdev, u32 engine, u64 *heartbeat)
	return ret;
}

int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine, struct vpu_jsm_msg *resp)
{
	struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_ENGINE_RESET };
	struct vpu_jsm_msg resp;
	int ret;

	if (engine != VPU_ENGINE_COMPUTE)
@@ -162,14 +161,17 @@ int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)

	req.payload.engine_reset.engine_idx = engine;

	ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, &resp,
	ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, resp,
				    VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to reset engine %d: %d\n", engine, ret);
		ivpu_pm_trigger_recovery(vdev, "Engine reset failed");
		return ret;
	}

	return ret;
	atomic_inc(&vdev->pm->engine_reset_counter);

	return 0;
}

int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id)
@@ -554,6 +556,15 @@ int ivpu_jsm_dct_disable(struct ivpu_device *vdev)
}

/*
 * Send a VPU_JSM_MSG_STATE_DUMP request on VPU_IPC_CHAN_ASYNC_CMD, expecting
 * VPU_JSM_MSG_STATE_DUMP_RSP, with vdev->timeout.jsm passed as the timeout.
 * The response message is received into a local buffer and discarded.
 *
 * Return: the result of ivpu_ipc_send_receive_internal().
 */
int ivpu_jsm_state_dump(struct ivpu_device *vdev)
{
	struct vpu_jsm_msg resp;
	struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP };
	int ret;

	ret = ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_STATE_DUMP_RSP, &resp,
					     VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);

	return ret;
}

int ivpu_jsm_state_dump_no_reply(struct ivpu_device *vdev)
{
	struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP };

Loading