Unverified Commit 8ed9aaae authored by Rodrigo Vivi
Browse files

drm/xe: Force wedged state and block GT reset upon any GPU hang



In many validation situations, when debugging GPU hangs,
it is useful to preserve the GT state from the moment
the timeout occurred.

This patch introduces a module parameter that can be used
in situations like this.

If the xe.wedged module parameter is set to 2, Xe will be declared
wedged on every single execution timeout (a.k.a. GPU hang), right
after the devcoredump snapshot capture, without attempting any
kind of GT reset and entirely blocking any further execution.

v2: Really block gt_reset from guc side. (Lucas)
    s/wedged/busted (Lucas)

v3: - s/busted/wedged
    - Really use global_flags (Dafna)
    - More robust timeout handling when wedging it.

v4: A really robust clean exit done by Matt Brost.
    No more kernel warns on unbind.

v5: Simplify error message (Lucas)

Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Dafna Hirschfeld <dhirschfeld@habana.ai>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
Cc: Himanshu Somaiya <himanshu.somaiya@intel.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240423221817.1285081-3-rodrigo.vivi@intel.com


Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
parent 69281867
Loading
Loading
Loading
Loading
+29 −0
Original line number Diff line number Diff line
@@ -764,3 +764,32 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
{
	return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
}

/**
 * xe_device_declare_wedged - Declare device wedged
 * @xe: xe device instance
 *
 * Nothing happens when xe_modparam.wedged_mode is 0. Otherwise xe->wedged is
 * atomically set to 1, and only the call that finds the previous value at 0
 * sets xe->needs_flr_on_fini and prints the error message; later calls
 * return silently.
 *
 * Wedged is a final state: it can only be cleared with a module re-probe
 * (unbind + bind). While wedged, every IOCTL is blocked so the GT cannot be
 * used. This is normally reached after a critical error such as a gt reset
 * failure or a guc loading failure.
 *
 * If xe.wedged module parameter is set to 2, this function is also called on
 * every single execution timeout (a.k.a. GPU hang), right after the
 * devcoredump snapshot capture. No GT reset is attempted in that mode, so the
 * state of the issue is preserved for further debugging.
 */
void xe_device_declare_wedged(struct xe_device *xe)
{
	/* Mode 0: never declare the device wedged. */
	if (!xe_modparam.wedged_mode)
		return;

	/* A previous call already flipped the flag; nothing left to do. */
	if (atomic_xchg(&xe->wedged, 1))
		return;

	xe->needs_flr_on_fini = true;
	drm_err(&xe->drm,
		"CRITICAL: Xe has declared device %s as wedged.\n"
		"IOCTLs and executions are blocked. Only a rebind may clear the failure\n"
		"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
		dev_name(xe->drm.dev));
}
+1 −14
Original line number Diff line number Diff line
@@ -172,19 +172,6 @@ static inline bool xe_device_wedged(struct xe_device *xe)
	return atomic_read(&xe->wedged);
}

/*
 * xe_device_declare_wedged - Declare device wedged (header-inline variant)
 * @xe: xe device instance
 *
 * Atomically sets xe->wedged to 1. Only the call that observes the previous
 * value as 0 sets xe->needs_flr_on_fini and logs the error message, which
 * spells out the unbind/bind commands for this device.
 *
 * NOTE(review): the hunk header reports 14 removed lines here and the
 * prototype right after this body carries the same name, so this looks like
 * the old inline definition being replaced by an out-of-line one - confirm.
 */
static inline void xe_device_declare_wedged(struct xe_device *xe)
{
	/* atomic_xchg() returns the old value: act only on the 0 -> 1 change. */
	if (!atomic_xchg(&xe->wedged, 1)) {
		xe->needs_flr_on_fini = true;
		drm_err(&xe->drm,
			"CRITICAL: Xe has declared device %s as wedged.\n"
			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
			"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
			"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
			dev_name(xe->drm.dev));
	}
}
void xe_device_declare_wedged(struct xe_device *xe);

#endif
+9 −0
Original line number Diff line number Diff line
@@ -26,6 +26,15 @@ void xe_exec_queue_fini(struct xe_exec_queue *q);
void xe_exec_queue_destroy(struct kref *ref);
void xe_exec_queue_assign_name(struct xe_exec_queue *q, u32 instance);

/**
 * xe_exec_queue_get_unless_zero - Conditionally take an exec queue reference
 * @q: exec queue whose refcount is tried
 *
 * Forwards to kref_get_unless_zero() on @q->refcount.
 *
 * Return: @q when kref_get_unless_zero() reports success, NULL otherwise.
 */
static inline struct xe_exec_queue *
xe_exec_queue_get_unless_zero(struct xe_exec_queue *q)
{
	return kref_get_unless_zero(&q->refcount) ? q : NULL;
}

struct xe_exec_queue *xe_exec_queue_lookup(struct xe_file *xef, u32 id);

static inline struct xe_exec_queue *xe_exec_queue_get(struct xe_exec_queue *q)
+1 −1
Original line number Diff line number Diff line
@@ -245,7 +245,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
			return seqno;

		xe_gt_tlb_invalidation_wait(gt, seqno);
	} else if (xe_device_uc_enabled(xe)) {
	} else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) {
		xe_gt_WARN_ON(gt, xe_force_wake_get(gt_to_fw(gt), XE_FW_GT));
		if (xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20) {
			xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC1,
+8 −1
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mmio.h"
#include "xe_module.h"
#include "xe_platform_types.h"
#include "xe_wa.h"

@@ -440,11 +441,17 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)

/*
 * guc_policies_init - Fill in the global policies of the ADS blob
 * @ads: GuC ADS instance whose blob is written
 *
 * Writes the DPC promote time, the maximum number of work items and the
 * global flags, then marks the policies as valid.
 *
 * The global flags are written exactly once. They are 0 unless
 * xe_modparam.wedged_mode is 2, in which case
 * GLOBAL_POLICY_DISABLE_ENGINE_RESET is set (presumably so the GuC does not
 * reset engines itself while the hang state is being preserved - confirm
 * against the GuC interface definition).
 */
static void guc_policies_init(struct xe_guc_ads *ads)
{
	u32 global_flags = 0;

	ads_blob_write(ads, policies.dpc_promote_time,
		       GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US);
	ads_blob_write(ads, policies.max_num_work_items,
		       GLOBAL_POLICY_MAX_NUM_WI);

	if (xe_modparam.wedged_mode == 2)
		global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;

	/* Single write of the final value; no unconditional 0 beforehand. */
	ads_blob_write(ads, policies.global_flags, global_flags);
	ads_blob_write(ads, policies.is_valid, 1);
}

Loading