Unverified Commit 6b8ef44c authored by Rodrigo Vivi's avatar Rodrigo Vivi
Browse files

drm/xe: Introduce the wedged_mode debugfs



So, the wedged mode can be selected per device at runtime,
before the tests or before reproducing the issue.

v2: - s/busted/wedged
    - some locking consistency

v3: - remove mutex
    - toggle guc reset policy on any mode change

Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240423221817.1285081-4-rodrigo.vivi@intel.com


Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
parent 8ed9aaae
Loading
Loading
Loading
Loading
+55 −0
Original line number Diff line number Diff line
@@ -12,6 +12,8 @@
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_gt_debugfs.h"
#include "xe_gt_printk.h"
#include "xe_guc_ads.h"
#include "xe_pm.h"
#include "xe_sriov.h"
#include "xe_step.h"
@@ -117,6 +119,56 @@ static const struct file_operations forcewake_all_fops = {
	.release = forcewake_release,
};

/*
 * debugfs read handler: format the device's current wedged.mode as a
 * decimal number followed by a newline and copy it out to userspace.
 *
 * The xe device is taken from the inode's private pointer. Returns whatever
 * simple_read_from_buffer() returns for the formatted text.
 */
static ssize_t wedged_mode_show(struct file *f, char __user *ubuf,
				size_t size, loff_t *pos)
{
	char text[32];
	struct xe_device *xe = file_inode(f)->i_private;
	int text_len;

	text_len = scnprintf(text, sizeof(text), "%d\n", xe->wedged.mode);

	return simple_read_from_buffer(ubuf, size, pos, text, text_len);
}

/*
 * debugfs write handler: parse an unsigned integer from userspace and make
 * it the device's wedged.mode, then ask the GuC of every GT to re-evaluate
 * its engine reset policy against the new mode.
 *
 * Accepted values are 0..2; anything larger is rejected with -EINVAL.
 * Parsing errors from kstrtouint_from_user() are returned unchanged.
 *
 * Returns @size when the write was consumed (including the case where the
 * requested mode is already active), or a negative error code. On a GuC
 * policy update failure -EIO is returned; note that wedged.mode has already
 * been updated at that point and is left as written.
 */
static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
			       size_t size, loff_t *pos)
{
	struct xe_device *xe = file_inode(f)->i_private;
	struct xe_gt *gt;
	u32 wedged_mode;
	ssize_t ret;
	u8 id;

	ret = kstrtouint_from_user(ubuf, size, 0, &wedged_mode);
	if (ret)
		return ret;

	if (wedged_mode > 2)
		return -EINVAL;

	/*
	 * Nothing to change, but the input was valid and fully consumed.
	 * Report @size rather than 0: a zero-byte result from a write handler
	 * tells the caller that no data was accepted, and callers that loop
	 * until everything is written would never make progress.
	 */
	if (xe->wedged.mode == wedged_mode)
		return size;

	xe->wedged.mode = wedged_mode;

	/* The reset policy is toggled on any mode change, for every GT. */
	for_each_gt(gt, xe, id) {
		ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads);
		if (ret) {
			xe_gt_err(gt, "Failed to update GuC ADS scheduler policy. GuC may still cause engine reset even with wedged_mode=2\n");
			return -EIO;
		}
	}

	return size;
}

/* File operations pairing the wedged_mode read and write handlers. */
static const struct file_operations wedged_mode_fops = {
	.write = wedged_mode_set,
	.read = wedged_mode_show,
	.owner = THIS_MODULE,
};

void xe_debugfs_register(struct xe_device *xe)
{
	struct ttm_device *bdev = &xe->ttm;
@@ -134,6 +186,9 @@ void xe_debugfs_register(struct xe_device *xe)
	debugfs_create_file("forcewake_all", 0400, root, xe,
			    &forcewake_all_fops);

	debugfs_create_file("wedged_mode", 0400, root, xe,
			    &wedged_mode_fops);

	for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
		man = ttm_manager_type(bdev, mem_type);

+7 −3
Original line number Diff line number Diff line
@@ -506,6 +506,8 @@ int xe_device_probe_early(struct xe_device *xe)
	if (err)
		return err;

	xe->wedged.mode = xe_modparam.wedged_mode;

	return 0;
}

@@ -769,7 +771,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
 * xe_device_declare_wedged - Declare device wedged
 * @xe: xe device instance
 *
 * This is a final state that can only be cleared with a module
 * This is a final state that can only be cleared with a module
 * re-probe (unbind + bind).
 * In this state every IOCTL will be blocked so the GT cannot be used.
 * In general it will be called upon any critical error such as gt reset
@@ -781,10 +783,12 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
 */
void xe_device_declare_wedged(struct xe_device *xe)
{
	if (xe_modparam.wedged_mode == 0)
	if (xe->wedged.mode == 0) {
		drm_dbg(&xe->drm, "Wedged mode is forcebly disabled\n");
		return;
	}

	if (!atomic_xchg(&xe->wedged, 1)) {
	if (!atomic_xchg(&xe->wedged.flag, 1)) {
		xe->needs_flr_on_fini = true;
		drm_err(&xe->drm,
			"CRITICAL: Xe has declared device %s as wedged.\n"
+1 −1
Original line number Diff line number Diff line
@@ -169,7 +169,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);

/*
 * Report whether the device has been declared wedged.
 *
 * Reads the atomic wedged.flag, which xe_device_declare_wedged() sets to 1.
 * The flag lives inside the wedged struct, so it must be read through
 * &xe->wedged.flag rather than &xe->wedged.
 */
static inline bool xe_device_wedged(struct xe_device *xe)
{
	return atomic_read(&xe->wedged.flag);
}

void xe_device_declare_wedged(struct xe_device *xe);
+7 −2
Original line number Diff line number Diff line
@@ -459,8 +459,13 @@ struct xe_device {
	/** @needs_flr_on_fini: requests function-reset on fini */
	bool needs_flr_on_fini;

	/** @wedged: Xe device faced a critical error and is now blocked. */
	atomic_t wedged;
	/** @wedged: Struct to control Wedged States and mode */
	struct {
		/** @wedged.flag: Xe device faced a critical error and is now blocked. */
		atomic_t flag;
		/** @wedged.mode: Mode controlled by kernel parameter and debugfs */
		int mode;
	} wedged;

	/* private: */

+58 −2
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@

#include <generated/xe_wa_oob.h>

#include "abi/guc_actions_abi.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_gt_regs.h"
#include "regs/xe_guc_regs.h"
@@ -16,11 +17,11 @@
#include "xe_gt.h"
#include "xe_gt_ccs_mode.h"
#include "xe_guc.h"
#include "xe_guc_ct.h"
#include "xe_hw_engine.h"
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mmio.h"
#include "xe_module.h"
#include "xe_platform_types.h"
#include "xe_wa.h"

@@ -441,6 +442,7 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)

static void guc_policies_init(struct xe_guc_ads *ads)
{
	struct xe_device *xe = ads_to_xe(ads);
	u32 global_flags = 0;

	ads_blob_write(ads, policies.dpc_promote_time,
@@ -448,7 +450,7 @@ static void guc_policies_init(struct xe_guc_ads *ads)
	ads_blob_write(ads, policies.max_num_work_items,
		       GLOBAL_POLICY_MAX_NUM_WI);

	if (xe_modparam.wedged_mode == 2)
	if (xe->wedged.mode == 2)
		global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;

	ads_blob_write(ads, policies.global_flags, global_flags);
@@ -806,3 +808,57 @@ void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads)
{
	guc_populate_golden_lrc(ads);
}

/*
 * Send a XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE request over the GuC CT
 * channel, carrying @policy_offset as its only payload dword.
 *
 * Returns the result of xe_guc_ct_send().
 */
static int guc_ads_action_update_policies(struct xe_guc_ads *ads, u32 policy_offset)
{
	u32 request[] = {
		XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE,
		policy_offset,
	};

	return xe_guc_ct_send(&ads_to_guc(ads)->ct, request,
			      ARRAY_SIZE(request), 0, 0);
}

/**
 * xe_guc_ads_scheduler_policy_toggle_reset - Toggle reset policy
 * @ads: Additional data structures object
 *
 * This function updates the GuC's engine reset policy based on wedged.mode:
 * GLOBAL_POLICY_DISABLE_ENGINE_RESET is set when wedged.mode is 2 and
 * cleared otherwise. The remaining policy values are taken from the ADS
 * blob, copied into a newly created buffer object, and the GGTT address of
 * that object is sent to the GuC as a global scheduler policy change.
 *
 * Return: 0 on success, and negative error code otherwise.
 */
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads)
{
	struct xe_device *xe = ads_to_xe(ads);
	struct xe_gt *gt = ads_to_gt(ads);
	struct xe_tile *tile = gt_to_tile(gt);
	struct guc_policies *policies;
	struct xe_bo *bo;
	int ret = 0;

	/*
	 * Zeroed allocation: the whole struct is copied into the BO handed to
	 * the GuC, so no field may be left holding stale heap contents.
	 */
	policies = kzalloc(sizeof(*policies), GFP_KERNEL);
	if (!policies)
		return -ENOMEM;

	policies->dpc_promote_time = ads_blob_read(ads, policies.dpc_promote_time);
	policies->max_num_work_items = ads_blob_read(ads, policies.max_num_work_items);
	/*
	 * Start from the flags stored in the ADS blob so that the read-modify-
	 * write below operates on a defined value and only touches the engine
	 * reset bit.
	 */
	policies->global_flags = ads_blob_read(ads, policies.global_flags);
	policies->is_valid = 1;
	if (xe->wedged.mode == 2)
		policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
	else
		policies->global_flags &= ~GLOBAL_POLICY_DISABLE_ENGINE_RESET;

	/*
	 * NOTE(review): a new managed BO is created on every call and nothing
	 * here releases it - confirm its lifetime is acceptable for repeated
	 * mode changes.
	 */
	bo = xe_managed_bo_create_from_data(xe, tile, policies, sizeof(*policies),
					    XE_BO_FLAG_VRAM_IF_DGFX(tile) |
					    XE_BO_FLAG_GGTT);
	if (IS_ERR(bo)) {
		ret = PTR_ERR(bo);
		goto out;
	}

	ret = guc_ads_action_update_policies(ads, xe_bo_ggtt_addr(bo));
out:
	kfree(policies);
	return ret;
}
Loading