Unverified Commit f4e9fc96 authored by Riana Tauro, committed by Rodrigo Vivi
Browse files

drm/xe/xe_survivability: Redesign survivability mode



Redesign survivability mode to have only one value per file.

1) Retain the survivability_mode sysfs to indicate the type

	cat /sys/bus/pci/devices/0000\:03\:00.0/survivability_mode
	(Boot / Runtime)

2) Add survivability_info directory to expose boot breadcrumbs.
Entries in the survivability_info directory are only visible when
the corresponding boot breadcrumb registers are populated.

	/sys/bus/pci/devices/0000:03:00.0/survivability_info
	├── aux_info0
	├── aux_info1
	├── aux_info2
	├── aux_info3
	├── aux_info4
	├── capability_info
	├── postcode_trace
	└── postcode_trace_overflow

Capability Info:

	Provides data about boot status and has bits that
	indicate the support for the other breadcrumbs

Postcode Trace / Postcode Trace Overflow :

	Each postcode is represented as an 8-bit value and represents
	a boot failure event. When a new failure event is logged by Pcode
	the existing postcodes are shifted left. These entries provide a
	history of 8 postcodes.

Auxiliary Info:

	Some failures have additional debug information.

Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patch.msgid.link/20251208084539.3652902-5-riana.tauro@intel.com


Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
parent a00e305f
Loading
Loading
Loading
Loading
+140 −82
Original line number Diff line number Diff line
@@ -19,8 +19,6 @@
#include "xe_pcode_api.h"
#include "xe_vsec.h"

/* Number of scratch register slots allocated for and scanned in the survivability info table */
#define MAX_SCRATCH_MMIO 8

/**
 * DOC: Survivability Mode
 *
@@ -48,19 +46,38 @@
 *
 * Refer :ref:`xe_configfs` for more details on how to use configfs
 *
 * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
 * debug information::
 * Survivability mode is indicated by the below admin-only readable sysfs entry. It
 * provides information about the type of survivability mode (Boot/Runtime).
 *
 * .. code-block:: shell
 *
 *	# cat /sys/bus/pci/devices/<device>/survivability_mode
 *	  Boot
 *
 *
 * Any additional debug information if present will be visible under the directory
 * ``survivability_info``::
 *
 *	/sys/bus/pci/devices/<device>/survivability_info/
 *	├── aux_info0
 *	├── aux_info1
 *	├── aux_info2
 *	├── aux_info3
 *	├── aux_info4
 *	├── capability_info
 *	├── fdo_mode
 *	├── postcode_trace
 *	└── postcode_trace_overflow
 *
 * This directory has the following attributes
 *
 *	/sys/bus/pci/devices/<device>/survivability_mode
 * - ``capability_info`` : Indicates Boot status and support for additional information
 *
 * Capability Information:
 *	Provides boot status
 * Postcode Information:
 *	Provides information about the failure
 * Overflow Information
 *	Provides history of previous failures
 * Auxiliary Information
 *	Certain failures may have information in addition to postcode information
 * - ``postcode_trace``, ``postcode_trace_overflow`` : Each postcode is an 8-bit value and
 *   represents a boot failure event. When a new failure event is logged by PCODE the
 *   existing postcodes are shifted left. These entries provide a history of 8 postcodes.
 *
 * - ``aux_info<n>`` : Some failures have additional debug information
 *
 * Runtime Survivability
 * =====================
@@ -68,60 +85,76 @@
 * Certain runtime firmware errors can cause the device to enter a wedged state
 * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation.
 * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and
 * is indicated by the presence of survivability mode sysfs::
 * is indicated by the presence of survivability mode sysfs.
 * Survivability mode sysfs provides information about the type of survivability mode.
 *
 *	/sys/bus/pci/devices/<device>/survivability_mode
 * .. code-block:: shell
 *
 * Survivability mode sysfs provides information about the type of survivability mode.
 *	# cat /sys/bus/pci/devices/<device>/survivability_mode
 *	  Runtime
 *
 * When such errors occur, userspace is notified with the drm device wedged uevent and runtime
 * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd
 * to restore device to normal operation.
 */

/*
 * Human-readable label for each scratch register, indexed by enum scratch_reg.
 * Used as the prefix when the register values are logged.
 */
static const char * const reg_map[] = {
	[CAPABILITY_INFO]         = "Capability Info",
	[POSTCODE_TRACE]          = "Postcode trace",
	[POSTCODE_TRACE_OVERFLOW] = "Postcode trace overflow",
	[AUX_INFO0]               = "Auxiliary Info 0",
	[AUX_INFO1]               = "Auxiliary Info 1",
	[AUX_INFO2]               = "Auxiliary Info 2",
	[AUX_INFO3]               = "Auxiliary Info 3",
	[AUX_INFO4]               = "Auxiliary Info 4",
};

/**
 * struct xe_survivability_attribute - device attribute bound to one scratch register
 * @attr: embedded device attribute; the wrapper is recovered from it with container_of()
 * @index: slot of the survivability info array whose value the attribute reports
 */
struct xe_survivability_attribute {
	struct device_attribute attr;
	u8 index;
};

/*
 * Map a device attribute back to the xe_survivability_attribute that embeds it
 * as its @attr member.
 */
static struct xe_survivability_attribute *
dev_attr_to_survivability_attr(struct device_attribute *attr)
{
	struct xe_survivability_attribute *sa;

	sa = container_of(attr, struct xe_survivability_attribute, attr);

	return sa;
}

/* Extract the AUXINFO_HISTORY_OFFSET field from an auxiliary info register value. */
static u32 aux_history_offset(u32 reg_value)
{
	u32 offset = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);

	return offset;
}

/*
 * Read scratch register PCODE_SCRATCH(@id) through @mmio and record the value
 * in slot @id of @info.
 */
static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
				   int id, char *name)
static void set_survivability_info(struct xe_mmio *mmio, u32  *info, int id)
{
	strscpy(info[id].name, name, sizeof(info[id].name));
	info[id].reg = PCODE_SCRATCH(id).raw;
	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
	info[id] = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
}

/*
 * Fill survivability->info from the scratch registers, read through the root
 * tile MMIO. Capability info is read first; its HISTORY_TRACKING,
 * OVERFLOW_SUPPORT and AUXINFO_SUPPORT bits then gate reading the postcode
 * trace, the postcode trace overflow and the chain of auxiliary info registers.
 */
static void populate_survivability_info(struct xe_device *xe)
{
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info = survivability->info;
	u32 *info = survivability->info;
	struct xe_mmio *mmio;
	u32 id = 0, reg_value;
	char name[NAME_MAX];
	int index;

	mmio = xe_root_tile_mmio(xe);
	set_survivability_info(mmio, info, id, "Capability Info");
	reg_value = info[id].value;
	set_survivability_info(mmio, info, CAPABILITY_INFO);
	reg_value = info[CAPABILITY_INFO];

	if (reg_value & HISTORY_TRACKING) {
		id++;
		set_survivability_info(mmio, info, id, "Postcode Info");
		set_survivability_info(mmio, info, POSTCODE_TRACE);

		if (reg_value & OVERFLOW_SUPPORT) {
			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
			set_survivability_info(mmio, info, id, "Overflow Info");
		}
		if (reg_value & OVERFLOW_SUPPORT)
			set_survivability_info(mmio, info, POSTCODE_TRACE_OVERFLOW);
	}

	if (reg_value & AUXINFO_SUPPORT) {
		/* First auxiliary register is named by the capability register itself. */
		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);

		/*
		 * Follow the chain: each auxiliary register value carries the
		 * offset of the next one (see aux_history_offset()).
		 *
		 * NOTE(review): the walk ends only when the offset read back
		 * falls outside the valid id range; a register whose history
		 * offset names itself or an already visited slot would keep
		 * this loop spinning. Confirm firmware guarantees a terminating
		 * chain, or bound the loop by the number of aux registers.
		 */
		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
		     id = aux_history_offset(reg_value)) {
			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
			set_survivability_info(mmio, info, id, name);
		for (index = 0; id >= AUX_INFO0 && id < MAX_SCRATCH_REG; index++) {
			set_survivability_info(mmio, info, id);
			id = aux_history_offset(info[id]);
		}
	}
}
@@ -130,15 +163,14 @@ static void log_survivability_info(struct pci_dev *pdev)
{
	struct xe_device *xe = pdev_to_xe_device(pdev);
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info = survivability->info;
	u32 *info = survivability->info;
	int id;

	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
		 survivability->boot_status);
	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
		if (info[id].reg)
			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
				 info[id].reg, info[id].value);
	for (id = 0; id < MAX_SCRATCH_REG; id++) {
		if (info[id])
			dev_info(&pdev->dev, "%s: 0x%x\n", reg_map[id], info[id]);
	}
}

@@ -156,25 +188,38 @@ static ssize_t survivability_mode_show(struct device *dev,
	struct pci_dev *pdev = to_pci_dev(dev);
	struct xe_device *xe = pdev_to_xe_device(pdev);
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info = survivability->info;
	int index = 0, count = 0;

	count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n",
			       survivability->type ? "Runtime" : "Boot");
	return sysfs_emit(buff, "%s\n", survivability->type ? "Runtime" : "Boot");
}

	if (!check_boot_failure(xe))
		return count;
static DEVICE_ATTR_ADMIN_RO(survivability_mode);

	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
		if (info[index].reg)
			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
					       info[index].reg, info[index].value);
static ssize_t survivability_info_show(struct device *dev,
				       struct device_attribute *attr, char *buff)
{
	struct xe_survivability_attribute *sa = dev_attr_to_survivability_attr(attr);
	struct pci_dev *pdev = to_pci_dev(dev);
	struct xe_device *xe = pdev_to_xe_device(pdev);
	struct xe_survivability *survivability = &xe->survivability;
	u32 *info = survivability->info;

	return sysfs_emit(buff, "0x%x\n", info[sa->index]);
}

	return count;
/*
 * Define an owner-read-only (0400) survivability attribute object attr_<name>
 * whose show callback is survivability_info_show() and which reports slot
 * @_index of the survivability info array.
 *
 * The object is declared static so that it gets internal linkage: as far as
 * this file shows it is only referenced from the local attribute table, and a
 * non-static definition would leak attr_<name> into the global namespace.
 */
#define SURVIVABILITY_ATTR_RO(name, _index)					\
	static struct xe_survivability_attribute attr_##name = {		\
		.attr =  __ATTR(name, 0400, survivability_info_show, NULL),	\
		.index = _index,						\
	}

static DEVICE_ATTR_ADMIN_RO(survivability_mode);
/*
 * One read-only attribute per scratch register. The second argument is the
 * enum scratch_reg slot whose stored value survivability_info_show() prints.
 */
SURVIVABILITY_ATTR_RO(capability_info, CAPABILITY_INFO);
SURVIVABILITY_ATTR_RO(postcode_trace, POSTCODE_TRACE);
SURVIVABILITY_ATTR_RO(postcode_trace_overflow, POSTCODE_TRACE_OVERFLOW);
SURVIVABILITY_ATTR_RO(aux_info0, AUX_INFO0);
SURVIVABILITY_ATTR_RO(aux_info1, AUX_INFO1);
SURVIVABILITY_ATTR_RO(aux_info2, AUX_INFO2);
SURVIVABILITY_ATTR_RO(aux_info3, AUX_INFO3);
SURVIVABILITY_ATTR_RO(aux_info4, AUX_INFO4);

static void xe_survivability_mode_fini(void *arg)
{
@@ -182,17 +227,48 @@ static void xe_survivability_mode_fini(void *arg)
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	struct device *dev = &pdev->dev;

	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
	device_remove_file(dev, &dev_attr_survivability_mode);
}

/*
 * Visibility callback for the survivability_info group: an attribute is
 * exposed with mode 0400 only when the info slot at position @idx holds a
 * non-zero value, otherwise it is hidden (mode 0).
 */
static umode_t survivability_info_attrs_visible(struct kobject *kobj, struct attribute *attr,
						int idx)
{
	struct xe_device *xe = kdev_to_xe_device(kobj_to_dev(kobj));

	return xe->survivability.info[idx] ? 0400 : 0;
}

/*
 * Attributes are ordered according to enum scratch_reg.
 * The order matters: survivability_info_attrs_visible() uses the position it
 * is given directly as the index into the survivability info array.
 */
static struct attribute *survivability_info_attrs[] = {
	&attr_capability_info.attr.attr,
	&attr_postcode_trace.attr.attr,
	&attr_postcode_trace_overflow.attr.attr,
	&attr_aux_info0.attr.attr,
	&attr_aux_info1.attr.attr,
	&attr_aux_info2.attr.attr,
	&attr_aux_info3.attr.attr,
	&attr_aux_info4.attr.attr,
	NULL,
};

/*
 * Named attribute group "survivability_info"; per-attribute visibility is
 * decided by survivability_info_attrs_visible().
 */
static const struct attribute_group survivability_info_group = {
	.name = "survivability_info",
	.attrs = survivability_info_attrs,
	.is_visible = survivability_info_attrs_visible,
};

static int create_survivability_sysfs(struct pci_dev *pdev)
{
	struct device *dev = &pdev->dev;
	struct xe_device *xe = pdev_to_xe_device(pdev);
	int ret;

	/* create survivability mode sysfs */
	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
	ret = device_create_file(dev, &dev_attr_survivability_mode);
	if (ret) {
		dev_warn(dev, "Failed to create survivability sysfs files\n");
		return ret;
@@ -203,6 +279,12 @@ static int create_survivability_sysfs(struct pci_dev *pdev)
	if (ret)
		return ret;

	if (check_boot_failure(xe)) {
		ret = devm_device_add_group(dev, &survivability_info_group);
		if (ret)
			return ret;
	}

	return 0;
}

@@ -239,25 +321,6 @@ static int enable_boot_survivability_mode(struct pci_dev *pdev)
	return ret;
}

/*
 * Allocate the device-managed survivability info table (MAX_SCRATCH_MMIO
 * zeroed entries) and populate it from the scratch registers.
 *
 * Return: 0 on success, -ENOMEM if the table cannot be allocated.
 */
static int init_survivability_mode(struct xe_device *xe)
{
	struct xe_survivability *surv = &xe->survivability;
	struct xe_survivability_info *table;

	surv->size = MAX_SCRATCH_MMIO;

	table = devm_kcalloc(xe->drm.dev, surv->size, sizeof(*table),
			     GFP_KERNEL);
	if (table) {
		surv->info = table;
		populate_survivability_info(xe);
		return 0;
	}

	return -ENOMEM;
}

/**
 * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled
 * @xe: xe device instance
@@ -325,9 +388,7 @@ int xe_survivability_mode_runtime_enable(struct xe_device *xe)
		return -EINVAL;
	}

	ret = init_survivability_mode(xe);
	if (ret)
		return ret;
	populate_survivability_info(xe);

	ret = create_survivability_sysfs(pdev);
	if (ret)
@@ -356,14 +417,11 @@ int xe_survivability_mode_boot_enable(struct xe_device *xe)
{
	struct xe_survivability *survivability = &xe->survivability;
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	int ret;

	if (!xe_survivability_mode_is_requested(xe))
		return 0;

	ret = init_survivability_mode(xe);
	if (ret)
		return ret;
	populate_survivability_info(xe);

	/* Log breadcrumbs but do not enter survivability mode for Critical boot errors */
	if (survivability->boot_status == CRITICAL_FAILURE) {
+14 −8
Original line number Diff line number Diff line
@@ -9,23 +9,29 @@
#include <linux/limits.h>
#include <linux/types.h>

/**
 * enum scratch_reg - slots of the survivability info array
 *
 * Each value is both the index into &xe_survivability.info and the id passed
 * to PCODE_SCRATCH() when the register is read.
 *
 * @CAPABILITY_INFO: capability information register
 * @POSTCODE_TRACE: postcode trace register
 * @POSTCODE_TRACE_OVERFLOW: postcode trace overflow register
 * @AUX_INFO0: auxiliary info register 0
 * @AUX_INFO1: auxiliary info register 1
 * @AUX_INFO2: auxiliary info register 2
 * @AUX_INFO3: auxiliary info register 3
 * @AUX_INFO4: auxiliary info register 4
 * @MAX_SCRATCH_REG: number of slots; not a register itself
 */
enum scratch_reg {
	CAPABILITY_INFO,
	POSTCODE_TRACE,
	POSTCODE_TRACE_OVERFLOW,
	AUX_INFO0,
	AUX_INFO1,
	AUX_INFO2,
	AUX_INFO3,
	AUX_INFO4,
	MAX_SCRATCH_REG,
};

/**
 * enum xe_survivability_type - kind of survivability mode
 * @XE_SURVIVABILITY_TYPE_BOOT: boot survivability; reported as "Boot" by the
 *	survivability_mode sysfs entry
 * @XE_SURVIVABILITY_TYPE_RUNTIME: runtime survivability; reported as "Runtime"
 *	by the survivability_mode sysfs entry
 */
enum xe_survivability_type {
	XE_SURVIVABILITY_TYPE_BOOT,
	XE_SURVIVABILITY_TYPE_RUNTIME,
};

/**
 * struct xe_survivability_info - one captured scratch register
 * @name: human-readable label of the register
 * @reg: raw register offset of the scratch register
 * @value: value read from the register
 */
struct xe_survivability_info {
	char name[NAME_MAX];
	u32 reg;
	u32 value;
};

/**
 * struct xe_survivability: Contains survivability mode information
 */
struct xe_survivability {
	/** @info: struct that holds survivability info from scratch registers */
	struct xe_survivability_info *info;
	/** @info: survivability debug info */
	u32 info[MAX_SCRATCH_REG];

	/** @size: number of scratch registers */
	u32 size;