Unverified Commit c9474b72 authored by Matthew Brost's avatar Matthew Brost Committed by Rodrigo Vivi
Browse files

drm/xe: Wedge the entire device



Wedge the entire device, not just GT which may have triggered the wedge.
To implement this, cleanup the layering so xe_device_declare_wedged()
calls into the lower layers (GT) to ensure entire device is wedged.

While we are here, also signal any pending GT TLB invalidations upon
wedging device.

Lastly, short circuit reset wait if device is wedged.

v2:
 - Short circuit reset wait if device is wedged (Local testing)

Fixes: 8ed9aaae ("drm/xe: Force wedged state and block GT reset upon any GPU hang")
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: default avatarMatthew Brost <matthew.brost@intel.com>
Reviewed-by: default avatarJonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240716063902.1390130-1-matthew.brost@intel.com


(cherry picked from commit 7dbe8af1)
Signed-off-by: default avatarRodrigo Vivi <rodrigo.vivi@intel.com>
parent bf07ca96
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -870,6 +870,9 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
 */
void xe_device_declare_wedged(struct xe_device *xe)
{
	struct xe_gt *gt;
	u8 id;

	if (xe->wedged.mode == 0) {
		drm_dbg(&xe->drm, "Wedged mode is forcibly disabled\n");
		return;
@@ -883,4 +886,7 @@ void xe_device_declare_wedged(struct xe_device *xe)
			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
			dev_name(xe->drm.dev));
	}

	for_each_gt(gt, xe, id)
		xe_gt_declare_wedged(gt);
}
+15 −0
Original line number Diff line number Diff line
@@ -904,3 +904,18 @@ struct xe_hw_engine *xe_gt_any_hw_engine(struct xe_gt *gt)

	return NULL;
}

/**
 * xe_gt_declare_wedged() - Declare GT wedged
 * @gt: the GT object
 *
 * Wedge the GT which stops all submission, saves desired debug state, and
 * cleans up anything which could timeout.
 */
void xe_gt_declare_wedged(struct xe_gt *gt)
{
	xe_gt_assert(gt, gt_to_xe(gt)->wedged.mode);

	xe_uc_declare_wedged(&gt->uc);
	xe_gt_tlb_invalidation_reset(gt);
}
+1 −0
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ struct xe_gt *xe_gt_alloc(struct xe_tile *tile);
int xe_gt_init_hwconfig(struct xe_gt *gt);
int xe_gt_init_early(struct xe_gt *gt);
int xe_gt_init(struct xe_gt *gt);
void xe_gt_declare_wedged(struct xe_gt *gt);
int xe_gt_record_default_lrcs(struct xe_gt *gt);

/**
+16 −0
Original line number Diff line number Diff line
@@ -1178,3 +1178,19 @@ void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p)
	xe_guc_ct_print(&guc->ct, p, false);
	xe_guc_submit_print(guc, p);
}

/**
 * xe_guc_declare_wedged() - Declare GuC wedged
 * @guc: the GuC object
 *
 * Wedge the GuC which stops all submission, saves desired debug state, and
 * cleans up anything which could timeout.
 */
void xe_guc_declare_wedged(struct xe_guc *guc)
{
	xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);

	xe_guc_reset_prepare(guc);
	xe_guc_ct_stop(&guc->ct);
	xe_guc_submit_wedge(guc);
}
+1 −0
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ void xe_guc_reset_wait(struct xe_guc *guc);
void xe_guc_stop_prepare(struct xe_guc *guc);
void xe_guc_stop(struct xe_guc *guc);
int xe_guc_start(struct xe_guc *guc);
void xe_guc_declare_wedged(struct xe_guc *guc);

static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
{
Loading