Commit dfa5dc3a authored by Rafael J. Wysocki's avatar Rafael J. Wysocki
Browse files

Merge branch 'acpi-apei'

Merge ACPI APEI support updates for 6.20-rc1/7.0-rc1:

 - Make read-only array non_mmio_desc[] static const (Colin Ian King)

 - Prevent the APEI GHES support code on ARM from accessing memory out
   of bounds or going past the ARM processor CPER record buffer (Mauro
   Carvalho Chehab)

 - Prevent cper_print_fw_err() from dumping the entire memory on systems
   with defective firmware (Mauro Carvalho Chehab)

 - Improve ghes_notify_nmi() status check to avoid unnecessary overhead
   in the NMI handler by carrying out all of the requisite preparations
   at NMI registration time (Tony Luck)

 - Refactor the GHES driver by extracting common functionality into
   reusable helper functions to reduce code duplication and improve
   the ghes_notify_sea() status check in analogy with the previous
   ghes_notify_nmi() status check improvement (Shuai Xue)

 - Make ELOG and GHES log and trace consistently and support the CPER
   CXL protocol analogously (Fabio De Francesco)

 - Disable KASAN instrumentation in the APEI GHES driver when compile
   testing with clang < 18 (Nathan Chancellor)

 - Let ghes_edac be the preferred driver to load on __ZX__ and _BYO_
   systems by extending the platform detection list in the APEI GHES
   driver (Tony W Wang-oc)

* acpi-apei:
  ACPI: APEI: GHES: Add ghes_edac support for __ZX__ and _BYO_ systems
  ACPI: APEI: GHES: Disable KASAN instrumentation when compile testing with clang < 18
  ACPI: extlog: Trace CPER CXL Protocol Error Section
  ACPI: APEI: GHES: Add helper to copy CPER CXL protocol error info to work struct
  ACPI: APEI: GHES: Add helper for CPER CXL protocol errors checks
  ACPI: extlog: Trace CPER PCI Express Error Section
  ACPI: extlog: Trace CPER Non-standard Section Body
  ACPI: APEI: GHES: Improve ghes_notify_sea() status check
  ACPI: APEI: GHES: Extract helper functions for error status handling
  ACPI: APEI: GHES: Improve ghes_notify_nmi() status check
  EFI/CPER: don't dump the entire memory region
  APEI/GHES: ensure that won't go past CPER allocated record
  EFI/CPER: don't go past the ARM processor CPER record buffer
  APEI/GHES: ARM processor Error: don't go past allocated memory
  ACPI: APEI: EINJ: make read-only array non_mmio_desc static const
parents 2b0181a5 57d5287b
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -494,6 +494,8 @@ config ACPI_EXTLOG
	tristate "Extended Error Log support"
	depends on X86_MCE && X86_LOCAL_APIC && EDAC
	select UEFI_CPER
	select ACPI_APEI
	select ACPI_APEI_GHES
	help
	  Certain usages such as Predictive Failure Analysis (PFA) require
	  more information about the error than what can be described in
+64 −0
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
#include <linux/ratelimit.h>
#include <linux/edac.h>
#include <linux/ras.h>
#include <cxl/event.h>
#include <acpi/ghes.h>
#include <asm/cpu.h>
#include <asm/mce.h>
@@ -132,6 +133,53 @@ static int print_extlog_rcd(const char *pfx,
	return 1;
}

/*
 * extlog_print_pcie - report a CPER PCI Express error section through AER
 * @pcie_err: payload of the CPER PCIe error section
 * @severity: CPER severity of the section; converted with
 *            cper_severity_to_aer() before being passed on
 *
 * Looks up the PCI device named by the section's device ID and hands the
 * embedded AER capability registers to pci_print_aer(). Nothing is printed
 * when the section does not mark both the device ID and the AER info as
 * valid, or when no matching PCI device is found.
 *
 * The body is compiled only when CONFIG_ACPI_APEI_PCIEAER is enabled.
 * Kconfig symbols are visible to C code only with the CONFIG_ prefix, so
 * testing the bare name ACPI_APEI_PCIEAER would always compile this
 * function to an empty stub.
 */
static void extlog_print_pcie(struct cper_sec_pcie *pcie_err,
			      int severity)
{
#ifdef CONFIG_ACPI_APEI_PCIEAER
	struct aer_capability_regs *aer;
	struct pci_dev *pdev;
	unsigned int devfn;
	unsigned int bus;
	int aer_severity;
	int domain;

	/* Both the device ID and the AER register dump must be present. */
	if (!(pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
	      pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO))
		return;

	aer_severity = cper_severity_to_aer(severity);
	aer = (struct aer_capability_regs *)pcie_err->aer_info;
	domain = pcie_err->device_id.segment;
	bus = pcie_err->device_id.bus;
	devfn = PCI_DEVFN(pcie_err->device_id.device,
			  pcie_err->device_id.function);

	/* Takes a reference on the device; dropped by pci_dev_put() below. */
	pdev = pci_get_domain_bus_and_slot(domain, bus, devfn);
	if (!pdev)
		return;

	pci_print_aer(pdev, aer_severity, aer);
	pci_dev_put(pdev);
#endif
}

/*
 * extlog_cxl_cper_handle_prot_err - forward a CPER CXL protocol error section
 * @prot_err: payload of the CPER CXL protocol error section
 * @severity: CPER severity of the section
 *
 * Validates the section, copies it into a work-data structure and passes
 * that structure to cxl_cper_handle_prot_err(). A non-zero return from
 * either the validation or the setup helper is treated as failure and the
 * section is dropped.
 *
 * The body is compiled only when CONFIG_ACPI_APEI_PCIEAER is enabled.
 * Kconfig symbols are visible to C code only with the CONFIG_ prefix, so
 * testing the bare name ACPI_APEI_PCIEAER would always compile this
 * function to an empty stub.
 *
 * NOTE(review): the cxl_cper_* helpers are presumably the ones built into
 * ghes_helpers.o under CONFIG_ACPI_APEI_PCIEAER - confirm.
 */
static void
extlog_cxl_cper_handle_prot_err(struct cxl_cper_sec_prot_err *prot_err,
				int severity)
{
#ifdef CONFIG_ACPI_APEI_PCIEAER
	struct cxl_cper_prot_err_work_data wd;

	if (cxl_cper_sec_prot_err_valid(prot_err))
		return;

	if (cxl_cper_setup_prot_err_work_data(&wd, prot_err, severity))
		return;

	cxl_cper_handle_prot_err(&wd);
#endif
}

static int extlog_print(struct notifier_block *nb, unsigned long val,
			void *data)
{
@@ -183,6 +231,22 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
			if (gdata->error_data_length >= sizeof(*mem))
				trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
						       (u8)gdata->error_severity);
		} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
			struct cxl_cper_sec_prot_err *prot_err =
				acpi_hest_get_payload(gdata);

			extlog_cxl_cper_handle_prot_err(prot_err,
							gdata->error_severity);
		} else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
			struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);

			extlog_print_pcie(pcie_err, gdata->error_severity);
		} else {
			void *err = acpi_hest_get_payload(gdata);

			log_non_standard_event(sec_type, fru_id, fru_text,
					       gdata->error_severity, err,
					       gdata->error_data_length);
		}
	}

+5 −0
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_ACPI_APEI)		+= apei.o
obj-$(CONFIG_ACPI_APEI_GHES)	+= ghes.o
# clang versions prior to 18 may blow out the stack with KASAN
ifeq ($(CONFIG_COMPILE_TEST)_$(CONFIG_CC_IS_CLANG)_$(call clang-min-version, 180000),y_y_)
KASAN_SANITIZE_ghes.o := n
endif
obj-$(CONFIG_ACPI_APEI_PCIEAER)	+= ghes_helpers.o
obj-$(CONFIG_ACPI_APEI_EINJ)	+= einj.o
einj-y				:= einj-core.o
einj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o
+1 −1
Original line number Diff line number Diff line
@@ -679,7 +679,7 @@ static bool is_allowed_range(u64 base_addr, u64 size)
	 * region intersects with known resource. So do an allow list check for
	 * IORES_DESCs that definitely or most likely not MMIO.
	 */
	int non_mmio_desc[] = {
	static const int non_mmio_desc[] = {
		IORES_DESC_CRASH_KERNEL,
		IORES_DESC_ACPI_TABLES,
		IORES_DESC_ACPI_NV_STORAGE,
+138 −49
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#include <linux/cper.h>
#include <linux/cleanup.h>
#include <linux/platform_device.h>
#include <linux/minmax.h>
#include <linux/mutex.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
@@ -294,6 +295,7 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
		error_block_length = GHES_ESTATUS_MAX_SIZE;
	}
	ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
	ghes->estatus_length = error_block_length;
	if (!ghes->estatus) {
		rc = -ENOMEM;
		goto err_unmap_status_addr;
@@ -365,13 +367,15 @@ static int __ghes_check_estatus(struct ghes *ghes,
				struct acpi_hest_generic_status *estatus)
{
	u32 len = cper_estatus_len(estatus);
	u32 max_len = min(ghes->generic->error_block_length,
			  ghes->estatus_length);

	if (len < sizeof(*estatus)) {
		pr_warn_ratelimited(FW_WARN GHES_PFX "Truncated error status block!\n");
		return -EIO;
	}

	if (len > ghes->generic->error_block_length) {
	if (!len || len > max_len) {
		pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid error status block length!\n");
		return -EIO;
	}
@@ -552,21 +556,45 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
{
	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
	int flags = sync ? MF_ACTION_REQUIRED : 0;
	int length = gdata->error_data_length;
	char error_type[120];
	bool queued = false;
	int sec_sev, i;
	char *p;

	sec_sev = ghes_severity(gdata->error_severity);
	if (length >= sizeof(*err)) {
		log_arm_hw_error(err, sec_sev);
	} else {
		pr_warn(FW_BUG "arm error length: %d\n", length);
		pr_warn(FW_BUG "length is too small\n");
		pr_warn(FW_BUG "firmware-generated error record is incorrect\n");
		return false;
	}

	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
		return false;

	p = (char *)(err + 1);
	length -= sizeof(err);

	for (i = 0; i < err->err_info_num; i++) {
		struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
		bool is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
		bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
		struct cper_arm_err_info *err_info;
		bool is_cache, has_pa;

		/* Ensure we have enough data for the error info header */
		if (length < sizeof(*err_info))
			break;

		err_info = (struct cper_arm_err_info *)p;

		/* Validate the claimed length before using it */
		length -= err_info->length;
		if (length < 0)
			break;

		is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
		has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);

		/*
		 * The field (err_info->error_info & BIT(26)) is fixed to set to
@@ -711,53 +739,17 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
{
#ifdef CONFIG_ACPI_APEI_PCIEAER
	struct cxl_cper_prot_err_work_data wd;
	u8 *dvsec_start, *cap_start;

	if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) {
		pr_err_ratelimited("CXL CPER invalid agent type\n");
	if (cxl_cper_sec_prot_err_valid(prot_err))
		return;
	}

	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
		pr_err_ratelimited("CXL CPER invalid protocol error log\n");
		return;
	}

	if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) {
		pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n",
				   prot_err->err_len);
		return;
	}

	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
		pr_warn(FW_WARN "CXL CPER no device serial number\n");

	guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);

	if (!cxl_cper_prot_err_work)
		return;

	switch (prot_err->agent_type) {
	case RCD:
	case DEVICE:
	case LD:
	case FMLD:
	case RP:
	case DSP:
	case USP:
		memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err));

		dvsec_start = (u8 *)(prot_err + 1);
		cap_start = dvsec_start + prot_err->dvsec_len;

		memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap));
		wd.severity = cper_severity_to_aer(severity);
		break;
	default:
		pr_err_ratelimited("CXL CPER invalid agent type: %d\n",
				   prot_err->agent_type);
	if (cxl_cper_setup_prot_err_work_data(&wd, prot_err, severity))
		return;
	}

	if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
		pr_err_ratelimited("CXL CPER kfifo overflow\n");
@@ -1406,6 +1398,71 @@ static int ghes_in_nmi_spool_from_list(struct list_head *rcu_list,
	return ret;
}

/**
 * ghes_has_active_errors - Tell whether any listed error source is signalling
 * @ghes_list: RCU-protected list of GHES entries to scan
 *
 * Walks @ghes_list inside an RCU read-side critical section and reads the
 * mapped error status word of every entry that has a mapping. The scan
 * stops at the first entry whose status word is non-zero.
 *
 * Return: true if some entry reports a non-zero status, false otherwise.
 */
static bool __maybe_unused ghes_has_active_errors(struct list_head *ghes_list)
{
	struct ghes *entry;
	bool active = false;

	rcu_read_lock();
	list_for_each_entry_rcu(entry, ghes_list, list) {
		/* Entries without a mapped status word cannot be polled. */
		if (!entry->error_status_vaddr)
			continue;

		if (readl(entry->error_status_vaddr)) {
			active = true;
			break;
		}
	}
	rcu_read_unlock();

	return active;
}

/**
 * ghes_map_error_status - Map the block status word of an error source
 * @ghes: GHES instance whose error status block is to be mapped
 *
 * Obtains the physical address of the error status block with apei_read()
 * on the generic source's error_status_address, then maps just enough of
 * it to cover the block_status field and stores the mapping in
 * @ghes->error_status_vaddr.
 *
 * Return: 0 on success, the apei_read() error code if the address cannot
 * be read, or -EINVAL if the mapping fails.
 */
static int __maybe_unused ghes_map_error_status(struct ghes *ghes)
{
	u64 status_paddr;
	int err;

	err = apei_read(&status_paddr, &ghes->generic->error_status_address);
	if (err)
		return err;

	ghes->error_status_vaddr =
		acpi_os_ioremap(status_paddr,
				sizeof(ghes->estatus->block_status));

	return ghes->error_status_vaddr ? 0 : -EINVAL;
}

/**
 * ghes_unmap_error_status - Drop the mapping of the block status word
 * @ghes: GHES instance whose status mapping is to be released
 *
 * Does nothing when no mapping is recorded. Otherwise the mapping is
 * released with iounmap() and the stored pointer is cleared, so a second
 * call is a no-op.
 */
static void __maybe_unused ghes_unmap_error_status(struct ghes *ghes)
{
	if (!ghes->error_status_vaddr)
		return;

	iounmap(ghes->error_status_vaddr);
	ghes->error_status_vaddr = NULL;
}

#ifdef CONFIG_ACPI_APEI_SEA
static LIST_HEAD(ghes_sea);

@@ -1418,6 +1475,9 @@ int ghes_notify_sea(void)
	static DEFINE_RAW_SPINLOCK(ghes_notify_lock_sea);
	int rv;

	if (!ghes_has_active_errors(&ghes_sea))
		return -ENOENT;

	raw_spin_lock(&ghes_notify_lock_sea);
	rv = ghes_in_nmi_spool_from_list(&ghes_sea, FIX_APEI_GHES_SEA);
	raw_spin_unlock(&ghes_notify_lock_sea);
@@ -1425,11 +1485,19 @@ int ghes_notify_sea(void)
	return rv;
}

static void ghes_sea_add(struct ghes *ghes)
static int ghes_sea_add(struct ghes *ghes)
{
	int rc;

	rc = ghes_map_error_status(ghes);
	if (rc)
		return rc;

	mutex_lock(&ghes_list_mutex);
	list_add_rcu(&ghes->list, &ghes_sea);
	mutex_unlock(&ghes_list_mutex);

	return 0;
}

static void ghes_sea_remove(struct ghes *ghes)
@@ -1437,10 +1505,11 @@ static void ghes_sea_remove(struct ghes *ghes)
	mutex_lock(&ghes_list_mutex);
	list_del_rcu(&ghes->list);
	mutex_unlock(&ghes_list_mutex);
	ghes_unmap_error_status(ghes);
	synchronize_rcu();
}
#else /* CONFIG_ACPI_APEI_SEA */
static inline void ghes_sea_add(struct ghes *ghes) { }
static inline int ghes_sea_add(struct ghes *ghes) { return -EINVAL; }
static inline void ghes_sea_remove(struct ghes *ghes) { }
#endif /* CONFIG_ACPI_APEI_SEA */

@@ -1458,6 +1527,9 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
	static DEFINE_RAW_SPINLOCK(ghes_notify_lock_nmi);
	int ret = NMI_DONE;

	if (!ghes_has_active_errors(&ghes_nmi))
		return ret;

	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
		return ret;

@@ -1470,13 +1542,21 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
	return ret;
}

static void ghes_nmi_add(struct ghes *ghes)
static int ghes_nmi_add(struct ghes *ghes)
{
	int rc;

	rc = ghes_map_error_status(ghes);
	if (rc)
		return rc;

	mutex_lock(&ghes_list_mutex);
	if (list_empty(&ghes_nmi))
		register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes");
	list_add_rcu(&ghes->list, &ghes_nmi);
	mutex_unlock(&ghes_list_mutex);

	return 0;
}

static void ghes_nmi_remove(struct ghes *ghes)
@@ -1486,6 +1566,9 @@ static void ghes_nmi_remove(struct ghes *ghes)
	if (list_empty(&ghes_nmi))
		unregister_nmi_handler(NMI_LOCAL, "ghes");
	mutex_unlock(&ghes_list_mutex);

	ghes_unmap_error_status(ghes);

	/*
	 * To synchronize with NMI handler, ghes can only be
	 * freed after NMI handler finishes.
@@ -1493,7 +1576,7 @@ static void ghes_nmi_remove(struct ghes *ghes)
	synchronize_rcu();
}
#else /* CONFIG_HAVE_ACPI_APEI_NMI */
static inline void ghes_nmi_add(struct ghes *ghes) { }
static inline int ghes_nmi_add(struct ghes *ghes) { return -EINVAL; }
static inline void ghes_nmi_remove(struct ghes *ghes) { }
#endif /* CONFIG_HAVE_ACPI_APEI_NMI */

@@ -1658,10 +1741,14 @@ static int ghes_probe(struct platform_device *ghes_dev)
		break;

	case ACPI_HEST_NOTIFY_SEA:
		ghes_sea_add(ghes);
		rc = ghes_sea_add(ghes);
		if (rc)
			goto err;
		break;
	case ACPI_HEST_NOTIFY_NMI:
		ghes_nmi_add(ghes);
		rc = ghes_nmi_add(ghes);
		if (rc)
			goto err;
		break;
	case ACPI_HEST_NOTIFY_SOFTWARE_DELEGATED:
		rc = apei_sdei_register_ghes(ghes);
@@ -1810,6 +1897,8 @@ void __init acpi_ghes_init(void)
 */
/*
 * NOTE(review): each entry appears to be { OEM ID, OEM table ID, OEM
 * revision, ACPI table signature, revision predicate } - confirm against
 * the definition of struct acpi_platform_list.
 */
static struct acpi_platform_list plat_list[] = {
	{"HPE   ", "Server  ", 0, ACPI_SIG_FADT, all_versions},
	/* __ZX__ and _BYO_ platforms, matched from revision 3 upwards */
	{"__ZX__", "EDK2    ", 3, ACPI_SIG_FADT, greater_than_or_equal},
	{"_BYO_ ", "BYOSOFT ", 3, ACPI_SIG_FADT, greater_than_or_equal},
	{ } /* End */
};

Loading