Commit aa775edb authored by Simona Vetter
Browse files

Merge tag 'drm-habanalabs-next-2024-02-26' of...

Merge tag 'drm-habanalabs-next-2024-02-26' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next

This tag contains habanalabs driver and accel changes for v6.9.

The notable changes are:

- New features and improvements:
  - Configure interrupt affinity according to NUMA nodes for the MSI-X interrupts that are
    assigned to the userspace application which acquires the device.
  - Move the HBM MMU page tables to reside inside the HBM to minimize latency when doing
    page-walks.
  - Improve the device reset mechanism when consecutive heartbeat failures occur (firmware
    fails to acknowledge the heartbeat message).
  - Also check extended errors in the PCIe addr_dec interrupt information.
  - Rate limit the error messages that can be printed to dmesg log by userspace actions.

- Firmware related fixes:
  - Handle requests from firmware to reserve device memory.

- Bug fixes and code cleanups:
  - Constify the struct device_type usage in accel (accel_sysfs_device_minor).
  - Fix the PCI health check by reading uncached register.
  - Fix reporting of drain events.
  - Fix debugfs files permissions.
  - Fix calculation of DRAM BAR base address.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
From: Oded Gabbay <ogabbay@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/ZdxJprop0EniVQtf@ogabbay-vm-u22.habana-labs.com
parents 19b232b9 576d7cc5
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ static struct idr accel_minors_idr;

static struct dentry *accel_debugfs_root;

static struct device_type accel_sysfs_device_minor = {
static const struct device_type accel_sysfs_device_minor = {
	.name = "accel_minor"
};

+1 −2
Original line number Diff line number Diff line
@@ -1360,9 +1360,8 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
			return -EINVAL;
		}

	if (!hl_device_operational(hdev, &status)) {
	if (!hl_device_operational(hdev, &status))
		return -EBUSY;
	}

	if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
			!hdev->supports_staged_submission) {
+9 −9
Original line number Diff line number Diff line
@@ -484,7 +484,7 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf,
	struct hl_debugfs_entry *entry = s->private;
	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
	struct hl_device *hdev = dev_entry->hdev;
	char kbuf[MMU_KBUF_SIZE];
	char kbuf[MMU_KBUF_SIZE] = {0};
	char *c;
	ssize_t rc;

@@ -546,7 +546,7 @@ static ssize_t mmu_ack_error_value_write(struct file *file,
	struct hl_debugfs_entry *entry = s->private;
	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
	struct hl_device *hdev = dev_entry->hdev;
	char kbuf[MMU_KBUF_SIZE];
	char kbuf[MMU_KBUF_SIZE] = {0};
	ssize_t rc;

	if (count > sizeof(kbuf) - 1)
@@ -1643,19 +1643,19 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
				&hl_data64b_fops);

	debugfs_create_file("set_power_state",
				0200,
				0644,
				root,
				dev_entry,
				&hl_power_fops);

	debugfs_create_file("device",
				0200,
				0644,
				root,
				dev_entry,
				&hl_device_fops);

	debugfs_create_file("clk_gate",
				0200,
				0644,
				root,
				dev_entry,
				&hl_clk_gate_fops);
@@ -1667,13 +1667,13 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
				&hl_stop_on_err_fops);

	debugfs_create_file("dump_security_violations",
				0644,
				0400,
				root,
				dev_entry,
				&hl_security_violations_fops);

	debugfs_create_file("dump_razwi_events",
				0644,
				0400,
				root,
				dev_entry,
				&hl_razwi_check_fops);
@@ -1706,7 +1706,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
				&hdev->reset_info.skip_reset_on_timeout);

	debugfs_create_file("state_dump",
				0600,
				0644,
				root,
				dev_entry,
				&hl_state_dump_fops);
@@ -1724,7 +1724,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent

	for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
		debugfs_create_file(hl_debugfs_list[i].name,
					0444,
					0644,
					root,
					entry,
					&hl_debugfs_fops);
+45 −10
Original line number Diff line number Diff line
@@ -55,7 +55,8 @@ static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_regi
	if (is_power_of_2(prop->dram_pci_bar_size))
		bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
	else
		bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) *
		bar_base_addr = region->region_base +
				div64_u64((addr - region->region_base), prop->dram_pci_bar_size) *
				prop->dram_pci_bar_size;

	old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);
@@ -1034,14 +1035,14 @@ static void device_early_fini(struct hl_device *hdev)

/*
 * is_pci_link_healthy() - probe the PCI link by reading the device's config space.
 * @hdev: habanalabs device structure.
 *
 * Returns false when hdev->pdev is NULL. Otherwise returns the result of
 * comparing an ID word read from PCI config space with its expected value.
 *
 * NOTE(review): this listing appears to be a diff rendered without its +/-
 * markers, so two variants of the check are shown together. Presumably the
 * vendor_id lines are the pre-change version and the device_id lines are the
 * post-change version - confirm against the actual tree before relying on
 * either one.
 */
static bool is_pci_link_healthy(struct hl_device *hdev)
{
	/* NOTE(review): only one of these two locals is expected to survive in the real file. */
	u16 vendor_id;
	u16 device_id;

	/* Without a PCI device there is no link to probe. */
	if (!hdev->pdev)
		return false;

	/* Variant 1 reads the vendor ID word, variant 2 reads the device ID word. */
	pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
	pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id);

	/*
	 * Variant 1 compares against the constant PCI_VENDOR_ID_HABANALABS; variant 2
	 * compares against the device ID stored in hdev->pdev. As written, the
	 * second return can never execute.
	 */
	return (vendor_id == PCI_VENDOR_ID_HABANALABS);
	return (device_id == hdev->pdev->device);
}

static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
@@ -1768,14 +1769,16 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
		hdev->device_cpu_disabled = false;
		hdev->reset_info.hard_reset_pending = false;

		if (hdev->reset_info.reset_trigger_repeated &&
				(hdev->reset_info.prev_reset_trigger ==
						HL_DRV_RESET_FW_FATAL_ERR)) {
			/* if there 2 back to back resets from FW,
			 * ensure driver puts the driver in a unusable state
		/*
		 * Put the device in an unusable state if there are 2 back to back resets due to
		 * fatal errors.
		 */
		if (hdev->reset_info.reset_trigger_repeated &&
				(hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR ||
						hdev->reset_info.prev_reset_trigger ==
								HL_DRV_RESET_HEARTBEAT)) {
			dev_crit(hdev->dev,
				"%s Consecutive FW fatal errors received, stopping hard reset\n",
				"%s Consecutive fatal errors, stopping hard reset\n",
				dev_name(&(hdev)->pdev->dev));
			rc = -EIO;
			goto out_err;
@@ -2801,3 +2804,35 @@ void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
	atomic_set(&captured_err_info->cs_timeout.write_enable, 1);
	captured_err_info->undef_opcode.write_enable = true;
}

/*
 * hl_init_cpu_for_irq() - populate the device's IRQ affinity mask.
 * @hdev: habanalabs device structure.
 *
 * When CONFIG_NUMA is enabled, intersect the CPUs of the device's NUMA node
 * with the online CPUs and, for every CPU in that intersection, set the first
 * CPU of its sibling mask in hdev->irq_affinity_mask.
 *
 * The mask is left untouched when the device's NUMA node is negative, when no
 * CPU of that node is online (an error is logged in that case), or when
 * CONFIG_NUMA is not set.
 */
void hl_init_cpu_for_irq(struct hl_device *hdev)
{
#ifdef CONFIG_NUMA
	/* Scratch mask with static storage: online CPUs of the device's node. */
	static struct cpumask node_online_cpus;
	struct cpumask *affinity = &hdev->irq_affinity_mask;
	int node = hdev->pdev->dev.numa_node;
	int cpu;

	if (node < 0)
		return;

	if (!cpumask_and(&node_online_cpus, cpumask_of_node(node), cpu_online_mask)) {
		dev_err(hdev->dev, "No available affinities in current numa node\n");
		return;
	}

	/* Collapse HT siblings: record only the first CPU of each sibling mask. */
	for_each_cpu(cpu, &node_online_cpus) {
		unsigned int first_sibling = cpumask_first(topology_sibling_cpumask(cpu));

		cpumask_set_cpu(first_sibling, affinity);
	}
#endif
}

/*
 * hl_set_irq_affinity() - apply the device's IRQ affinity mask to an interrupt.
 * @hdev: habanalabs device structure.
 * @irq: interrupt number to configure.
 *
 * Does nothing (apart from a debug message) when hdev->irq_affinity_mask is
 * empty. Otherwise the mask is passed to irq_set_affinity_and_hint(); a
 * failure is logged and not propagated to the caller.
 */
void hl_set_irq_affinity(struct hl_device *hdev, int irq)
{
	const struct cpumask *mask = &hdev->irq_affinity_mask;
	int rc;

	if (cpumask_empty(mask)) {
		dev_dbg(hdev->dev, "affinity mask is empty\n");
		return;
	}

	rc = irq_set_affinity_and_hint(irq, mask);
	if (rc)
		dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq);
}
+11 −14
Original line number Diff line number Diff line
@@ -501,7 +501,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
						0, &result);

	if (rc)
		dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
		dev_err(hdev->dev, "failed to unmask event %d", event_type);

	return rc;
}
@@ -540,7 +540,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
						total_pkt_size, 0, &result);

	if (rc)
		dev_err(hdev->dev, "failed to unmask IRQ array\n");
		dev_err(hdev->dev, "failed to unmask event array\n");

	kfree(pkt);

@@ -2718,18 +2718,20 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
	}

	if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
		struct lkd_fw_binning_info *binning_info;

		rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader,
							sizeof(struct lkd_msg_comms));
	rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms));
	if (rc)
		goto protocol_err;

	if (hdev->asic_prop.support_dynamic_resereved_fw_size)
		hdev->asic_prop.reserved_fw_mem_size =
			le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M;

	if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
		struct lkd_fw_binning_info *binning_info;

		/* read preboot version */
		rc = hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT,
				fw_loader->dynamic_loader.comm_desc.cur_fw_ver);

		if (rc)
			return rc;

@@ -2756,11 +2758,6 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
				hdev->decoder_binning, hdev->rotator_binning);
		}

		if (hdev->asic_prop.support_dynamic_resereved_fw_size) {
			hdev->asic_prop.reserved_fw_mem_size =
				le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb);
		}

		return 0;
	}

@@ -2795,7 +2792,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
	hdev->asic_funcs->init_cpu_scrambler_dram(hdev);

	if (!(hdev->fw_components & FW_TYPE_LINUX)) {
		dev_info(hdev->dev, "Skip loading Linux F/W\n");
		dev_dbg(hdev->dev, "Skip loading Linux F/W\n");
		return 0;
	}

Loading