Commit a52b6a2c authored by Davidlohr Bueso's avatar Davidlohr Bueso Committed by Dave Jiang
Browse files

cxl/pci: Support Global Persistent Flush (GPF)



Add support for GPF flows. It is found that the CXL specification
around this is a bit too involved from the driver side. And while
this should really all be handled by the hardware, this patch takes
things with a grain of salt.

Upon respective port enumeration, both phase timeouts are set to
a max of 20 seconds, which is the NMI watchdog default for lockup
detection. The premise is that the kernel does not have enough
information to set anything better than a max across the board
and hope devices finish their GPF flows within the platform energy
budget.

Timeout detection is based on dirty shutdown semantics. The driver
will mark the shutdown state as dirty, expecting the device to clear
it upon a successful GPF event. The admin may consult the device
health information and check the dirty shutdown counter to see if
there was a problem with data integrity.

[ davej: Explicitly set return to 0 in update_gpf_port_dvsec() ]
[ davej: Add spec reference for 'struct cxl_mbox_set_shutdown_state_in' ]
[ davej: Fix 0-day reported issue ]

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://patch.msgid.link/20250124233533.910535-1-dave@stgolabs.net


Signed-off-by: Dave Jiang <dave.jiang@intel.com>
parent 2014c95a
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -130,7 +130,7 @@ Mailbox commands
* [0] Switch CCI
* [3] Timestamp
* [1] PMEM labels
* [0] PMEM GPF / Dirty Shutdown
* [1] PMEM GPF / Dirty Shutdown
* [0] Scan Media

PMU
+2 −0
Original line number Diff line number Diff line
@@ -115,4 +115,6 @@ bool cxl_need_node_perf_attrs_update(int nid);
int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
					struct access_coordinate *c);

/*
 * Program the GPF phase 1 and phase 2 timeouts of the PCI device behind
 * @dport_dev, caching the Port GPF DVSEC offset in @port. Non-PCI
 * devices are skipped with a return of 0.
 */
int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port);

#endif /* __CXL_CORE_H__ */
+18 −0
Original line number Diff line number Diff line
@@ -1308,6 +1308,24 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
}
EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL");

int cxl_dirty_shutdown_state(struct cxl_memdev_state *mds)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_mbox_cmd mbox_cmd;
	struct cxl_mbox_set_shutdown_state_in in = {
		.state = 1
	};

	mbox_cmd = (struct cxl_mbox_cmd) {
		.opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE,
		.size_in = sizeof(in),
		.payload_in = &in,
	};

	return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
}
EXPORT_SYMBOL_NS_GPL(cxl_dirty_shutdown_state, "CXL");

int cxl_set_timestamp(struct cxl_memdev_state *mds)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+87 −0
Original line number Diff line number Diff line
@@ -1054,3 +1054,90 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)

	return 0;
}

/*
 * Set max timeout such that platforms will optimize GPF flow to avoid
 * the implied worst-case scenario delays. On a sane platform, all
 * devices should always complete GPF within the energy budget of
 * the GPF flow. The kernel does not have enough information to pick
 * anything better than "maximize timeouts and hope it works".
 *
 * A misbehaving device could block forward progress of GPF for all
 * the other devices, exhausting the energy budget of the platform.
 * However, the spec seems to assume that moving on from slow to respond
 * devices is a virtue. It is not possible to know whether, in actuality,
 * the slow to respond device is *the* most critical device in the
 * system to wait for.
 */
#define GPF_TIMEOUT_BASE_MAX 2 /* count of scale units: 2 * 10 seconds = 20 seconds */
#define GPF_TIMEOUT_SCALE_MAX 7 /* scale encoding for a 10 second unit */

/*
 * Program one GPF phase timeout in a Port GPF DVSEC.
 *
 * @pdev:  PCI device whose config space holds the Port GPF DVSEC
 * @dvsec: config-space offset of that DVSEC
 * @phase: GPF phase to program, 1 or 2
 *
 * The phase control word is read and, unless it already carries
 * GPF_TIMEOUT_BASE_MAX / GPF_TIMEOUT_SCALE_MAX, written back with those
 * values in the timeout base and scale fields.
 *
 * Return: 0 on success or when the register already holds the wanted
 * timeout, -EINVAL for an unknown @phase, otherwise the non-zero result
 * of the failing config-space access.
 *
 * NOTE(review): the config accessor results are returned unmodified;
 * confirm whether callers expect them translated to a negative errno.
 */
static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase)
{
	u64 base, scale;
	int rc, offset;
	u16 ctrl;

	switch (phase) {
	case 1:
		offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET;
		base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK;
		scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK;
		break;
	case 2:
		offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET;
		base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK;
		scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK;
		break;
	default:
		return -EINVAL;
	}

	rc = pci_read_config_word(pdev, dvsec + offset, &ctrl);
	if (rc)
		return rc;

	/* Nothing to do if the maximum timeout is already programmed. */
	if (FIELD_GET(base, ctrl) == GPF_TIMEOUT_BASE_MAX &&
	    FIELD_GET(scale, ctrl) == GPF_TIMEOUT_SCALE_MAX)
		return 0;

	/*
	 * Read-modify-write: clear only the timeout base and scale fields
	 * and keep every other bit of the control word as it was read, so
	 * bits outside those two fields are not zeroed by this update.
	 */
	ctrl &= ~(base | scale);
	ctrl |= FIELD_PREP(base, GPF_TIMEOUT_BASE_MAX);
	ctrl |= FIELD_PREP(scale, GPF_TIMEOUT_SCALE_MAX);

	rc = pci_write_config_word(pdev, dvsec + offset, ctrl);
	if (!rc)
		pci_dbg(pdev, "Port GPF phase %d timeout: %d0 secs\n",
			phase, GPF_TIMEOUT_BASE_MAX);

	return rc;
}

/*
 * cxl_gpf_port_setup - program the GPF phase timeouts of a downstream port
 * @dport_dev: device backing the downstream port
 * @port: CXL port; caches the Port GPF DVSEC offset in ->gpf_dvsec
 *
 * Non-PCI devices are skipped. The Port GPF DVSEC is looked up on first
 * use and its offset cached in @port, then the phase 1 and phase 2
 * timeouts are programmed. Programming is best effort: a failure is
 * logged but does not change the return value.
 *
 * Return: 0 if @dport_dev is not a PCI device or once programming has
 * been attempted, -EINVAL if @port is NULL or the Port GPF DVSEC is not
 * present.
 */
int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port)
{
	struct pci_dev *pdev;
	int rc;

	if (!dev_is_pci(dport_dev))
		return 0;

	if (!port)
		return -EINVAL;

	/*
	 * to_pci_dev() is a container_of() style conversion of a device
	 * already known to be PCI, so the result needs no NULL check.
	 */
	pdev = to_pci_dev(dport_dev);

	if (!port->gpf_dvsec) {
		int dvsec;

		dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
						  CXL_DVSEC_PORT_GPF);
		if (!dvsec) {
			pci_warn(pdev, "Port GPF DVSEC not present\n");
			return -EINVAL;
		}

		port->gpf_dvsec = dvsec;
	}

	/*
	 * Keep the phase arguments as literal constants (no loop) so the
	 * helper's per-phase masks stay resolvable at compile time.
	 */
	rc = update_gpf_port_dvsec(pdev, port->gpf_dvsec, 1);
	if (rc)
		pci_warn(pdev, "Port GPF phase 1 timeout not set: %d\n", rc);

	rc = update_gpf_port_dvsec(pdev, port->gpf_dvsec, 2);
	if (rc)
		pci_warn(pdev, "Port GPF phase 2 timeout not set: %d\n", rc);

	return 0;
}
+2 −0
Original line number Diff line number Diff line
@@ -1672,6 +1672,8 @@ int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd)
			if (rc && rc != -EBUSY)
				return rc;

			cxl_gpf_port_setup(dport_dev, port);

			/* Any more ports to add between this one and the root? */
			if (!dev_is_cxl_root_child(&port->dev))
				continue;
Loading