Unverified Commit 5e940312 authored by Riana Tauro's avatar Riana Tauro Committed by Rodrigo Vivi
Browse files

drm/xe: Add functions and sysfs for boot survivability



Boot Survivability is a software based workflow for recovering a system
in a failed boot state. Here system recoverability is concerned with
recovering the firmware responsible for boot.

This is implemented by loading the driver with bare minimum (no drm card)
to allow the firmware to be flashed through mei-gsc and collect telemetry.
The driver's probe flow is modified such that it enters survivability mode
when pcode initialization is incomplete and boot status denotes a failure.
In this mode, the drm card is not exposed and the presence of the
survivability_mode entry in PCI sysfs is used to indicate survivability
mode and provide additional information required for debug.

This patch adds initialization functions and exposes admin
readable sysfs entries.

The new sysfs will have the below layout

	/sys/bus/.../bdf
             	     ├── survivability_mode

v2: reorder headers
    fix doc
    remove survivability info and use mode to display information
    use separate function for logging survivability information
    for critical error (Rodrigo)

v3: use for loop
    use dev logs instead of drm
    use helper function for aux history(Rodrigo)
    remove unnecessary error check of greater than max_scratch
    as we are reading only 3 bit

v4: fix checkpatch warnings
    fix space (Rodrigo)
    rename register

Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Acked-by: Ashwin Kumar Kulkarni <ashwin.kumar.kulkarni@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250128095632.1294722-2-riana.tauro@intel.com


Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
parent cb1f868c
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -96,6 +96,7 @@ xe-y += xe_bb.o \
	xe_sa.o \
	xe_sched_job.o \
	xe_step.o \
	xe_survivability_mode.o \
	xe_sync.o \
	xe_tile.o \
	xe_tile_sysfs.o \
+4 −0
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include "xe_pt_types.h"
#include "xe_sriov_types.h"
#include "xe_step_types.h"
#include "xe_survivability_mode_types.h"

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
#define TEST_VM_OPS_ERROR
@@ -331,6 +332,9 @@ struct xe_device {
		u8 skip_pcode:1;
	} info;

	/** @survivability: survivability information for device */
	struct xe_survivability survivability;

	/** @irq: device interrupt state */
	struct {
		/** @irq.lock: lock for processing irq's on this device */
+14 −0
Original line number Diff line number Diff line
@@ -49,6 +49,20 @@
/* Domain IDs (param2) */
#define     PCODE_MBOX_DOMAIN_HBM		0x2

/* Pcode scratch register x: consecutive registers 4 bytes apart, starting at 0x138320 */
#define PCODE_SCRATCH(x)		XE_REG(0x138320 + ((x) * 4))
/* PCODE_SCRATCH0 */
/* Bits 17:15 - index of the scratch register holding the first auxiliary info */
#define   AUXINFO_REG_OFFSET		REG_GENMASK(17, 15)
/* Bits 14:12 - index of the scratch register holding the overflow info */
#define   OVERFLOW_REG_OFFSET		REG_GENMASK(14, 12)
/* Bit 11 - set when history tracking (postcode info) is available */
#define   HISTORY_TRACKING		REG_BIT(11)
/* Bit 10 - set when overflow info is available */
#define   OVERFLOW_SUPPORT		REG_BIT(10)
/* Bit 9 - set when auxiliary info is available */
#define   AUXINFO_SUPPORT		REG_BIT(9)
/* Bits 3:1 - boot status; the values below are the failure codes */
#define   BOOT_STATUS			REG_GENMASK(3, 1)
#define      CRITICAL_FAILURE		4
#define      NON_CRITICAL_FAILURE	7

/* Auxiliary info bits */
/* Bits 31:29 - index of the next scratch register in the auxiliary history chain */
#define   AUXINFO_HISTORY_OFFSET	REG_GENMASK(31, 29)

struct pcode_err_decode {
	int errno;
	const char *str;
+215 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include "xe_survivability_mode.h"
#include "xe_survivability_mode_types.h"

#include <linux/kobject.h>
#include <linux/pci.h>
#include <linux/sysfs.h>

#include "xe_device.h"
#include "xe_gt.h"
#include "xe_mmio.h"
#include "xe_pcode_api.h"

/*
 * Number of slots in the survivability info table. Slots are indexed by
 * pcode scratch register id.
 */
#define MAX_SCRATCH_MMIO 8

/**
 * DOC: Xe Boot Survivability
 *
 * Boot Survivability is a software based workflow for recovering a system in a failed boot state.
 * Here, system recoverability is concerned with recovering the firmware responsible for boot.
 *
 * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
 * to be flashed through mei and collect telemetry. The driver's probe flow is modified
 * such that it enters survivability mode when pcode initialization is incomplete and boot status
 * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating
 * survivability mode and provides additional information required for debug.
 *
 * KMD exposes below admin-only readable sysfs in survivability mode
 *
 * device/survivability_mode: The presence of this file indicates that the card is in survivability
 *			      mode. Also, provides additional information on why the driver entered
 *			      survivability mode.
 *
 *			      Capability Information - Provides boot status
 *			      Postcode Information   - Provides information about the failure
 *			      Overflow Information   - Provides history of previous failures
 *			      Auxiliary Information  - Certain failures may have information in
 *						       addition to postcode information
 */

/* Extract the AUXINFO_HISTORY_OFFSET field from an auxiliary info register value. */
static u32 aux_history_offset(u32 reg_value)
{
	u32 next = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);

	return next;
}

/*
 * Record one pcode scratch register in the survivability info table.
 *
 * @mmio: MMIO handle the register is read through
 * @info: survivability info table, indexed by scratch register id
 * @id:   scratch register index; also the slot of @info that gets filled
 * @name: label for the entry; copied into the slot (truncated to fit), so the
 *        caller's buffer does not need to outlive this call
 *
 * The slot stores the label, the register's raw offset encoding and the value
 * read from the register at the time of the call.
 */
static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
				   int id, const char *name)
{
	strscpy(info[id].name, name, sizeof(info[id].name));
	info[id].reg = PCODE_SCRATCH(id).raw;
	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
}

static void populate_survivability_info(struct xe_device *xe)
{
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info = survivability->info;
	struct xe_mmio *mmio;
	u32 id = 0, reg_value;
	char name[NAME_MAX];
	int index;

	mmio = xe_root_tile_mmio(xe);
	set_survivability_info(mmio, info, id, "Capability Info");
	reg_value = info[id].value;

	if (reg_value & HISTORY_TRACKING) {
		id++;
		set_survivability_info(mmio, info, id, "Postcode Info");

		if (reg_value & OVERFLOW_SUPPORT) {
			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
			set_survivability_info(mmio, info, id, "Overflow Info");
		}
	}

	if (reg_value & AUXINFO_SUPPORT) {
		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);

		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
		     id = aux_history_offset(reg_value)) {
			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
			set_survivability_info(mmio, info, id, name);
		}
	}
}

static void log_survivability_info(struct pci_dev *pdev)
{
	struct xe_device *xe = pdev_to_xe_device(pdev);
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info = survivability->info;
	int id;

	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
		 survivability->boot_status);
	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
		if (info[id].reg)
			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
				 info[id].reg, info[id].value);
	}
}

/*
 * sysfs show callback for the survivability_mode attribute.
 *
 * Emits one "<name>: <register> - <value>" line into @buff for every populated
 * entry of the survivability info table and returns the number of bytes
 * written.
 */
static ssize_t survivability_mode_show(struct device *dev,
				       struct device_attribute *attr, char *buff)
{
	struct xe_device *xe = pdev_to_xe_device(to_pci_dev(dev));
	struct xe_survivability_info *table = xe->survivability.info;
	int len = 0;
	int i;

	for (i = 0; i < MAX_SCRATCH_MMIO; i++) {
		struct xe_survivability_info *entry = &table[i];

		/* slots that were never filled keep a zero register offset */
		if (!entry->reg)
			continue;

		len += sysfs_emit_at(buff, len, "%s: 0x%x - 0x%x\n",
				     entry->name, entry->reg, entry->value);
	}

	return len;
}

/*
 * Defines dev_attr_survivability_mode, the attribute that is created and
 * removed as the survivability_mode sysfs file.
 * NOTE(review): per the DEVICE_ATTR_ADMIN_RO convention this should be a
 * root-only readable attribute backed by survivability_mode_show() - confirm
 * against include/linux/device.h.
 */
static DEVICE_ATTR_ADMIN_RO(survivability_mode);

/*
 * Mark the device as being in survivability mode and expose the
 * survivability_mode sysfs file on the PCI device. A failure to create the
 * file is only warned about; the mode flag stays set.
 */
static void enable_survivability_mode(struct pci_dev *pdev)
{
	struct xe_device *xe = pdev_to_xe_device(pdev);
	struct device *dev = &pdev->dev;

	/* flag the device as running in survivability mode */
	xe->survivability.mode = true;
	dev_info(dev, "In Survivability Mode\n");

	/* expose the survivability information through sysfs */
	if (sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr))
		dev_warn(dev, "Failed to create survivability sysfs files\n");
}

/**
 * xe_survivability_mode_required - checks if survivability mode is required
 * @xe: xe device instance
 *
 * This function reads the boot status from Pcode
 *
 * Return: true if boot status indicates failure, false otherwise
 */
bool xe_survivability_mode_required(struct xe_device *xe)
{
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
	u32 data;

	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);

	return (survivability->boot_status == NON_CRITICAL_FAILURE ||
		survivability->boot_status == CRITICAL_FAILURE);
}

/**
 * xe_survivability_mode_remove - remove survivability mode
 * @xe: xe device instance
 *
 * Removes the survivability_mode sysfs file, frees the survivability info
 * table and clears the PCI driver data.
 */
void xe_survivability_mode_remove(struct xe_device *xe)
{
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);

	/* the sysfs file reads the info table, so take it down before freeing */
	sysfs_remove_file(&pdev->dev.kobj, &dev_attr_survivability_mode.attr);
	kfree(xe->survivability.info);
	pci_set_drvdata(pdev, NULL);
}

/**
 * xe_survivability_mode_init - Initialize the survivability mode
 * @xe: xe device instance
 *
 * Initializes survivability information and enables survivability mode
 */
void xe_survivability_mode_init(struct xe_device *xe)
{
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info;
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);

	survivability->size = MAX_SCRATCH_MMIO;

	info = kcalloc(survivability->size, sizeof(*info), GFP_KERNEL);
	if (!info)
		return;

	survivability->info = info;

	populate_survivability_info(xe);

	/* Only log debug information and exit if it is a critical failure */
	if (survivability->boot_status == CRITICAL_FAILURE) {
		log_survivability_info(pdev);
		kfree(survivability->info);
		return;
	}

	enable_survivability_mode(pdev);
}
+17 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: MIT */
/*
 * Copyright © 2025 Intel Corporation
 */

#ifndef _XE_SURVIVABILITY_MODE_H_
#define _XE_SURVIVABILITY_MODE_H_

#include <linux/types.h>

struct xe_device;

/* Collect survivability info; enable survivability mode unless the failure is critical */
void xe_survivability_mode_init(struct xe_device *xe);
/* Remove the survivability_mode sysfs file and free the survivability info */
void xe_survivability_mode_remove(struct xe_device *xe);
/* Read the boot status from pcode; true if it denotes a (non-)critical failure */
bool xe_survivability_mode_required(struct xe_device *xe);

#endif /* _XE_SURVIVABILITY_MODE_H_ */
Loading