Unverified Commit 0a2a873d authored by Riana Tauro, committed by Rodrigo Vivi
Browse files

drm/xe: Add support to handle hardware errors



Gfx device reports two classes of errors: uncorrectable and
correctable. Depending on the severity, uncorrectable errors are further
classified as Non-Fatal or Fatal.

Correctable and Non-Fatal errors: These errors are reported as MSI. Bits in
the Master Interrupt Register indicate the class of the error.
The source of the error is then read from the Device Error Source
Register.

Fatal errors: These are reported as PCIe errors.
When a PCIe error is asserted, the OS will perform an SBR (Secondary
Bus Reset), which causes the driver to reload. The error registers are
sticky and their values are maintained through SBR.

Add basic support to handle these errors.

Bspec: 50875, 53073, 53074, 53075, 53076

v2: Format commit message (Umesh)
v3: fix documentation (Stuart)

Cc: Stuart Summers <stuart.summers@intel.com>
Co-developed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Link: https://lore.kernel.org/r/20250826063419.3022216-9-riana.tauro@intel.com


Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
parent f646c9f9
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -82,6 +82,7 @@ xe-y += xe_bb.o \
	xe_hw_engine.o \
	xe_hw_engine_class_sysfs.o \
	xe_hw_engine_group.o \
	xe_hw_error.o \
	xe_hw_fence.o \
	xe_irq.o \
	xe_lrc.o \
+15 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: MIT */
/*
 * Copyright © 2025 Intel Corporation
 */

#ifndef _XE_HW_ERROR_REGS_H_
#define _XE_HW_ERROR_REGS_H_

/*
 * Device error status registers, one per error class.
 *
 * DEV_ERR_STAT_REG(x) picks the register for error class index x, derived
 * from the CORRECTABLE and NONFATAL offsets below.
 * NOTE(review): presumably _PICK_EVEN() maps index 0 to the CORRECTABLE
 * offset, index 1 to the NONFATAL offset and keeps the same stride for
 * higher indices - confirm against the _PICK_EVEN() definition and Bspec.
 */
#define DEV_ERR_STAT_NONFATAL			0x100178
#define DEV_ERR_STAT_CORRECTABLE		0x10017c
#define DEV_ERR_STAT_REG(x)			XE_REG(_PICK_EVEN((x), \
								  DEV_ERR_STAT_CORRECTABLE, \
								  DEV_ERR_STAT_NONFATAL))

#endif
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#define GFX_MSTR_IRQ				XE_REG(0x190010, XE_REG_OPTION_VF)
#define   MASTER_IRQ				REG_BIT(31)
#define   GU_MISC_IRQ				REG_BIT(29)
/* Per-class hardware error bit: bit 26 + x, where x is the error class index */
#define   ERROR_IRQ(x)				REG_BIT(26 + (x))
#define   DISPLAY_IRQ				REG_BIT(16)
#define   I2C_IRQ				REG_BIT(12)
#define   GT_DW_IRQ(x)				REG_BIT(x)
+109 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include "regs/xe_hw_error_regs.h"
#include "regs/xe_irq_regs.h"

#include "xe_device.h"
#include "xe_hw_error.h"
#include "xe_mmio.h"

/*
 * Error classes reported by hardware.
 *
 * The numeric values are not arbitrary: they are used as the index passed to
 * ERROR_IRQ() and DEV_ERR_STAT_REG(), so they must stay in sync with the
 * register layout. HARDWARE_ERROR_MAX is the number of classes, not a class.
 */
enum hardware_error {
	HARDWARE_ERROR_CORRECTABLE = 0,
	HARDWARE_ERROR_NONFATAL = 1,
	HARDWARE_ERROR_FATAL = 2,
	HARDWARE_ERROR_MAX,
};

/*
 * Return the upper-case name of an error class for use in log messages.
 * Any value outside the known classes yields "UNKNOWN".
 */
static const char *hw_error_to_str(const enum hardware_error hw_err)
{
	static const char * const class_names[HARDWARE_ERROR_MAX] = {
		[HARDWARE_ERROR_CORRECTABLE] = "CORRECTABLE",
		[HARDWARE_ERROR_NONFATAL] = "NONFATAL",
		[HARDWARE_ERROR_FATAL] = "FATAL",
	};

	/* Unsigned compare also rejects negative values */
	if ((unsigned int)hw_err >= HARDWARE_ERROR_MAX)
		return "UNKNOWN";

	return class_names[hw_err];
}

/*
 * Service one error class on one tile.
 *
 * Reads the device error status register selected by @hw_err while holding
 * xe->irq.lock. A zero value is logged as an error; a non-zero value is
 * written back to the same register.
 * Only acts on Battlemage; other platforms return without touching hardware.
 */
static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
{
	struct xe_device *xe = tile_to_xe(tile);
	const char *class_name;
	unsigned long irqflags;
	u32 status;

	if (xe->info.platform != XE_BATTLEMAGE)
		return;

	class_name = hw_error_to_str(hw_err);

	spin_lock_irqsave(&xe->irq.lock, irqflags);

	status = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err));
	if (status) {
		/* TODO: Process errors per source */

		/*
		 * NOTE(review): writing back the bits that were read presumably
		 * clears them (write-1-to-clear) - confirm against Bspec.
		 */
		xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), status);
	} else {
		/* Interrupt fired for this class but no source bit is set */
		drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported DEV_ERR_STAT_%s blank!\n",
				    tile->id, class_name);
	}

	spin_unlock_irqrestore(&xe->irq.lock, irqflags);
}

/**
 * xe_hw_error_irq_handler - irq handling for hw errors
 * @tile: tile instance
 * @master_ctl: value read from master interrupt register
 *
 * Xe platforms add three error bits to the master interrupt register to support error handling.
 * These three bits are used to convey the class of error FATAL, NONFATAL, or CORRECTABLE.
 * To process the interrupt, determine the source of error by reading the Device Error Source
 * Register that corresponds to the class of error being serviced.
 */
void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
{
	enum hardware_error hw_err;

	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
		if (master_ctl & ERROR_IRQ(hw_err))
			hw_error_source_handler(tile, hw_err);
}

/*
 * Service hardware errors that are already pending when the driver loads.
 *
 * For each tile, the master interrupt register is read, any error class bits
 * found in it are handled, and the value read is written back to the register.
 */
static void process_hw_errors(struct xe_device *xe)
{
	struct xe_tile *tile;
	u8 id;

	for_each_tile(tile, xe, id) {
		const u32 pending = xe_mmio_read32(&tile->mmio, GFX_MSTR_IRQ);

		xe_hw_error_irq_handler(tile, pending);

		/* NOTE(review): write-back presumably acks the handled bits - confirm */
		xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ, pending);
	}
}

/**
 * xe_hw_error_init - Initialize hw errors
 * @xe: xe device instance
 *
 * Check for, and service, errors that were raised during boot before the
 * driver was loaded. Does nothing unless IS_DGFX() is true for the device
 * and the device is not an SR-IOV VF.
 */
void xe_hw_error_init(struct xe_device *xe)
{
	if (IS_DGFX(xe) && !IS_SRIOV_VF(xe))
		process_hw_errors(xe);
}
+15 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: MIT */
/*
 * Copyright © 2025 Intel Corporation
 */
#ifndef XE_HW_ERROR_H_
#define XE_HW_ERROR_H_

#include <linux/types.h>

struct xe_tile;
struct xe_device;

/* Handle the hardware error class bits set in @master_ctl for @tile */
void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
/* Process hardware errors already pending at driver load */
void xe_hw_error_init(struct xe_device *xe);
#endif
Loading