Commit c11a50b1 authored by Jeff Hugo
Browse files

accel/qaic: Add Reliability, Availability, Serviceability (RAS)



AIC100 devices generate Reliability, Availability, Serviceability events
via MHI QAIC_STATUS channel. Support such events and print a structured
log with details of the events, and if the event describes an uncorrected
error, reset the device to put it back into service. As these events may
not all be reported via other mechanisms like AER, maintain counts of
the number of errors observed for each type.

Signed-off-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>
Reviewed-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Reviewed-by: Troy Hanson <quic_thanson@quicinc.com>
Reviewed-by: Maciej Falkowski <maciej.falkowski@linux.intel.com>
Link: https://lore.kernel.org/r/20250516160634.1408309-1-jeff.hugo@oss.qualcomm.com
parent df1c3093
Loading
Loading
Loading
Loading
+18 −0
Original line number Diff line number Diff line
What:		/sys/bus/pci/drivers/qaic/XXXX:XX:XX.X/ce_count
Date:		May 2025
KernelVersion:	6.17
Contact:	dri-devel@lists.freedesktop.org
Description:	Number of correctable errors received from the device since the driver was loaded.

What:		/sys/bus/pci/drivers/qaic/XXXX:XX:XX.X/ue_count
Date:		May 2025
KernelVersion:	6.17
Contact:	dri-devel@lists.freedesktop.org
Description:	Number of uncorrectable errors received from the device since the driver was loaded.

What:		/sys/bus/pci/drivers/qaic/XXXX:XX:XX.X/ue_nonfatal_count
Date:		May 2025
KernelVersion:	6.17
Contact:	dri-devel@lists.freedesktop.org
Description:	Number of uncorrectable non-fatal errors received from the device since the
		driver was loaded.
+1 −0
Original line number Diff line number Diff line
@@ -19952,6 +19952,7 @@ L: linux-arm-msm@vger.kernel.org
L:	dri-devel@lists.freedesktop.org
S:	Supported
T:	git https://gitlab.freedesktop.org/drm/misc/kernel.git
F:	Documentation/ABI/testing/sysfs-driver-qaic
F:	Documentation/accel/qaic/
F:	drivers/accel/qaic/
F:	include/uapi/drm/qaic_accel.h
+1 −0
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ qaic-y := \
	qaic_control.o \
	qaic_data.o \
	qaic_drv.o \
	qaic_ras.o \
	qaic_timesync.o \
	sahara.o

+8 −0
Original line number Diff line number Diff line
@@ -167,6 +167,14 @@ struct qaic_device {
	struct workqueue_struct *bootlog_wq;
	/* Synchronizes access of pages in MHI bootlog device */
	struct mutex            bootlog_mutex;
	/* MHI RAS channel device */
	struct mhi_device	*ras_ch;
	/* Correctable error count */
	unsigned int		ce_count;
	/* Un-correctable error count */
	unsigned int		ue_count;
	/* Un-correctable non-fatal error count */
	unsigned int		ue_nf_count;
};

struct qaic_drm_device {
+6 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#include "mhi_controller.h"
#include "qaic.h"
#include "qaic_debugfs.h"
#include "qaic_ras.h"
#include "qaic_timesync.h"
#include "sahara.h"

@@ -695,6 +696,10 @@ static int __init qaic_init(void)
	if (ret)
		pr_debug("qaic: qaic_bootlog_register failed %d\n", ret);

	ret = qaic_ras_register();
	if (ret)
		pr_debug("qaic: qaic_ras_register failed %d\n", ret);

	return 0;

free_mhi:
@@ -722,6 +727,7 @@ static void __exit qaic_exit(void)
	 * reinitializing the link_up state after the cleanup is done.
	 */
	link_up = true;
	qaic_ras_unregister();
	qaic_bootlog_unregister();
	qaic_timesync_deinit();
	sahara_unregister();
Loading