Commit 8e937866 authored by Rafael J. Wysocki's avatar Rafael J. Wysocki
Browse files

Merge branch 'acpi-apei'

Merge ACPI APEI updates for 7.1-rc1:

 - Add devm_ghes_register_vendor_record_notifier(), use it in the PCI
   hisi driver, and add an NVIDIA vendor CPER record handler (Kai-Heng
   Feng)

* acpi-apei:
  ACPI: APEI: GHES: Add NVIDIA vendor CPER record handler
  PCI: hisi: Use devm_ghes_register_vendor_record_notifier()
  ACPI: APEI: GHES: Add devm_ghes_register_vendor_record_notifier()
parents 2fb9ec38 d7610855
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -18919,6 +18919,12 @@ S: Maintained
F:	drivers/video/fbdev/nvidia/
F:	drivers/video/fbdev/riva/
NVIDIA GHES VENDOR CPER RECORD HANDLER
M:	Kai-Heng Feng <kaihengf@nvidia.com>
L:	linux-acpi@vger.kernel.org
S:	Maintained
F:	drivers/acpi/apei/ghes-nvidia.c
NVIDIA VRS RTC DRIVER
M:	Shubhi Garg <shgarg@nvidia.com>
L:	linux-tegra@vger.kernel.org
+14 −0
Original line number Diff line number Diff line
@@ -74,6 +74,20 @@ config ACPI_APEI_EINJ_CXL

	  If unsure say 'n'

config ACPI_APEI_GHES_NVIDIA
	tristate "NVIDIA GHES vendor record handler"
	depends on ACPI_APEI_GHES
	help
	  Support for decoding NVIDIA-specific CPER sections delivered via
	  the APEI GHES vendor record notifier chain. Registers a handler
	  for the NVIDIA section GUID and logs error signatures, severity,
	  socket, and diagnostic register address-value pairs.

	  Enable on NVIDIA server platforms (e.g. DGX, HGX) that expose
	  ACPI device NVDA2012 in their firmware tables.

	  If unsure, say N.

config ACPI_APEI_ERST_DEBUG
	tristate "APEI Error Record Serialization Table (ERST) Debug Support"
	depends on ACPI_APEI
+1 −0
Original line number Diff line number Diff line
@@ -10,5 +10,6 @@ obj-$(CONFIG_ACPI_APEI_EINJ) += einj.o
einj-y				:= einj-core.o
einj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o
obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o
obj-$(CONFIG_ACPI_APEI_GHES_NVIDIA) += ghes-nvidia.o

apei-y := apei-base.o hest.o erst.o bert.o
+149 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
/*
 * NVIDIA GHES vendor record handler
 *
 * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/types.h>
#include <linux/uuid.h>
#include <acpi/ghes.h>

/*
 * CPER section type GUID handled by this driver. nvidia_ghes_notify()
 * compares it against the section_type of every vendor record it is
 * handed and ignores records that do not match.
 */
static const guid_t nvidia_sec_guid =
	GUID_INIT(0x6d5244f2, 0x2712, 0x11ec,
		  0xbe, 0xa7, 0xcb, 0x3f, 0xdb, 0x95, 0xc7, 0x86);

/*
 * Payload of an NVIDIA vendor CPER section, as interpreted by
 * nvidia_ghes_notify() and nvidia_ghes_print_error().
 *
 * Multi-byte fields are little-endian and must be converted with
 * le16_to_cpu()/le64_to_cpu() before use. The fixed header is followed
 * by number_regs address/value pairs.
 */
struct cper_sec_nvidia {
	char	signature[16];	/* printed with "%.16s", so NUL termination is not assumed */
	__le16	error_type;
	__le16	error_instance;
	u8	severity;
	u8	socket;
	u8	number_regs;	/* number of entries in regs[] */
	u8	reserved;
	__le64	instance_base;
	struct {
		__le64	addr;
		__le64	val;
	} regs[] __counted_by(number_regs);
};

/*
 * Per-device driver state. The notifier callback recovers this structure
 * from its notifier_block argument via container_of().
 */
struct nvidia_ghes_private {
	struct notifier_block	nb;	/* registered on the GHES vendor record chain */
	struct device		*dev;	/* device used as the prefix for log messages */
};

/*
 * Dump one NVIDIA CPER section to the kernel log.
 *
 * @dev:               device the messages are attributed to
 * @nvidia_err:        section payload; the caller has already verified that
 *                     the fixed-size header is present
 * @error_data_length: number of payload bytes actually available
 * @fatal:             log at KERN_ERR when true, at KERN_INFO otherwise
 *
 * The header fields are always printed. The register pairs are printed only
 * if number_regs of them fit inside @error_data_length; otherwise an error
 * is logged and the register array is left untouched.
 */
static void nvidia_ghes_print_error(struct device *dev,
				    const struct cper_sec_nvidia *nvidia_err,
				    size_t error_data_length, bool fatal)
{
	const u8 nr_regs = nvidia_err->number_regs;
	const char *lvl;
	size_t needed;
	int idx;

	if (fatal)
		lvl = KERN_ERR;
	else
		lvl = KERN_INFO;

	/* Fixed-size header: safe to read unconditionally. */
	dev_printk(lvl, dev, "signature: %.16s\n", nvidia_err->signature);
	dev_printk(lvl, dev, "error_type: %u\n",
		   le16_to_cpu(nvidia_err->error_type));
	dev_printk(lvl, dev, "error_instance: %u\n",
		   le16_to_cpu(nvidia_err->error_instance));
	dev_printk(lvl, dev, "severity: %u\n", nvidia_err->severity);
	dev_printk(lvl, dev, "socket: %u\n", nvidia_err->socket);
	dev_printk(lvl, dev, "number_regs: %u\n", nr_regs);
	dev_printk(lvl, dev, "instance_base: 0x%016llx\n",
		   le64_to_cpu(nvidia_err->instance_base));

	if (!nr_regs)
		return;

	/*
	 * The register count comes from the record itself, so make sure the
	 * header plus that many addr/val pairs really fits in the section
	 * before walking the array.
	 */
	needed = struct_size(nvidia_err, regs, nr_regs);
	if (needed > error_data_length) {
		dev_err(dev, "Invalid number_regs %u (section size %zu, need %zu)\n",
			nr_regs, error_data_length, needed);
		return;
	}

	for (idx = 0; idx < nr_regs; idx++) {
		dev_printk(lvl, dev, "register[%d]: address=0x%016llx value=0x%016llx\n",
			   idx, le64_to_cpu(nvidia_err->regs[idx].addr),
			   le64_to_cpu(nvidia_err->regs[idx].val));
	}
}

static int nvidia_ghes_notify(struct notifier_block *nb,
			      unsigned long event, void *data)
{
	struct acpi_hest_generic_data *gdata = data;
	struct nvidia_ghes_private *priv;
	const struct cper_sec_nvidia *nvidia_err;
	guid_t sec_guid;

	import_guid(&sec_guid, gdata->section_type);
	if (!guid_equal(&sec_guid, &nvidia_sec_guid))
		return NOTIFY_DONE;

	priv = container_of(nb, struct nvidia_ghes_private, nb);

	if (acpi_hest_get_error_length(gdata) < sizeof(*nvidia_err)) {
		dev_err(priv->dev, "Section too small (%d < %zu)\n",
			acpi_hest_get_error_length(gdata), sizeof(*nvidia_err));
		return NOTIFY_OK;
	}

	nvidia_err = acpi_hest_get_payload(gdata);

	if (event >= GHES_SEV_RECOVERABLE)
		dev_err(priv->dev, "NVIDIA CPER section, error_data_length: %u\n",
			acpi_hest_get_error_length(gdata));
	else
		dev_info(priv->dev, "NVIDIA CPER section, error_data_length: %u\n",
			 acpi_hest_get_error_length(gdata));

	nvidia_ghes_print_error(priv->dev, nvidia_err, acpi_hest_get_error_length(gdata),
				event >= GHES_SEV_RECOVERABLE);

	return NOTIFY_OK;
}

static int nvidia_ghes_probe(struct platform_device *pdev)
{
	struct nvidia_ghes_private *priv;
	int ret;

	priv = devm_kmalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	*priv = (struct nvidia_ghes_private) {
		.nb.notifier_call = nvidia_ghes_notify,
		.dev = &pdev->dev,
	};

	ret = devm_ghes_register_vendor_record_notifier(&pdev->dev, &priv->nb);
	if (ret)
		return dev_err_probe(&pdev->dev, ret,
				     "Failed to register NVIDIA GHES vendor record notifier\n");

	return 0;
}

/* ACPI IDs this driver binds to; the empty entry terminates the table. */
static const struct acpi_device_id nvidia_ghes_acpi_match[] = {
	{ "NVDA2012" },
	{ }
};
MODULE_DEVICE_TABLE(acpi, nvidia_ghes_acpi_match);

/*
 * Platform driver matched through the ACPI ID table above. No .remove
 * callback is set; everything acquired in probe is obtained through
 * devm_* helpers.
 */
static struct platform_driver nvidia_ghes_driver = {
	.driver = {
		.name = "nvidia-ghes",
		.acpi_match_table = nvidia_ghes_acpi_match,
	},
	.probe = nvidia_ghes_probe,
};
module_platform_driver(nvidia_ghes_driver);

MODULE_AUTHOR("Kai-Heng Feng <kaihengf@nvidia.com>");
MODULE_DESCRIPTION("NVIDIA GHES vendor CPER record handler");
MODULE_LICENSE("GPL");
+18 −0
Original line number Diff line number Diff line
@@ -689,6 +689,24 @@ void ghes_unregister_vendor_record_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(ghes_unregister_vendor_record_notifier);

/*
 * devres action used by devm_ghes_register_vendor_record_notifier():
 * @nb is the notifier block that was registered and must now be removed.
 */
static void ghes_vendor_record_notifier_destroy(void *nb)
{
	struct notifier_block *block = nb;

	ghes_unregister_vendor_record_notifier(block);
}

/**
 * devm_ghes_register_vendor_record_notifier - device-managed registration of
 *	a GHES vendor record notifier
 * @dev: device the registration is tied to
 * @nb: notifier block to register
 *
 * Registers @nb with ghes_register_vendor_record_notifier() and adds a
 * devres action on @dev that calls ghes_unregister_vendor_record_notifier()
 * for it, so callers do not need an explicit unregister.
 *
 * Return: 0 on success, otherwise the error returned by the registration
 * or by adding the devres action.
 */
int devm_ghes_register_vendor_record_notifier(struct device *dev,
					      struct notifier_block *nb)
{
	int err = ghes_register_vendor_record_notifier(nb);

	if (err)
		return err;

	err = devm_add_action_or_reset(dev, ghes_vendor_record_notifier_destroy,
				       nb);
	return err;
}
EXPORT_SYMBOL_GPL(devm_ghes_register_vendor_record_notifier);

static void ghes_vendor_record_work_func(struct work_struct *work)
{
	struct ghes_vendor_record_entry *entry;
Loading