Commit 92242716 authored by Dave Airlie
Browse files

Merge tag 'drm-habanalabs-next-2023-12-19' of...

Merge tag 'drm-habanalabs-next-2023-12-19' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next

This tag contains habanalabs driver changes for v6.8.

The notable changes are:

- uAPI changes:
  - Add sysfs entry to allow users to identify a device minor id with its
    debugfs path
  - Add sysfs entry to expose the device's module id as given to us from
    the f/w
  - Add signed device information retrieval through the INFO ioctl

- New features and improvements:
  - Update documentation of debugfs paths
  - Add support for Gaudi2C device (new PCI revision number)
  - Add pcie reset prepare/done hooks

- Firmware related fixes and changes:
  - Print the version numbers of three instances of the Infineon second stage
  - Assume hard-reset is done by f/w upon PCIe AXI drain

- Bug fixes and code cleanups:
  - Fix information leak in sec_attest_info()
  - Avoid overriding existing undefined opcode data in Gaudi2
  - Multiple Queue Manager (QMAN) fixes for Gaudi2
  - Set hard reset flag if graceful reset is skipped
  - Remove 'get temperature' debug print
  - Fix the new Event Queue heartbeat mechanism

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Oded Gabbay <ogabbay@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/ZYFpihZscr/fsRRd@ogabbay-vm-u22.habana-labs.com
parents dc83fb6e a9f07790
Loading
Loading
Loading
Loading
+36 −36
Original line number Diff line number Diff line
What:           /sys/kernel/debug/accel/<n>/addr
What:           /sys/kernel/debug/accel/<parent_device>/addr
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
@@ -8,34 +8,34 @@ Description: Sets the device address to be used for read or write through
                only when the IOMMU is disabled.
                The acceptable value is a string that starts with "0x"

What:           /sys/kernel/debug/accel/<n>/clk_gate
What:           /sys/kernel/debug/accel/<parent_device>/clk_gate
Date:           May 2020
KernelVersion:  5.8
Contact:        ogabbay@kernel.org
Description:    This setting is now deprecated as clock gating is handled solely by the f/w

What:           /sys/kernel/debug/accel/<n>/command_buffers
What:           /sys/kernel/debug/accel/<parent_device>/command_buffers
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
Description:    Displays a list with information about the currently allocated
                command buffers

What:           /sys/kernel/debug/accel/<n>/command_submission
What:           /sys/kernel/debug/accel/<parent_device>/command_submission
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
Description:    Displays a list with information about the currently active
                command submissions

What:           /sys/kernel/debug/accel/<n>/command_submission_jobs
What:           /sys/kernel/debug/accel/<parent_device>/command_submission_jobs
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
Description:    Displays a list with detailed information about each JOB (CB) of
                each active command submission

What:           /sys/kernel/debug/accel/<n>/data32
What:           /sys/kernel/debug/accel/<parent_device>/data32
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
@@ -50,7 +50,7 @@ Description: Allows the root user to read or write directly through the
                If the IOMMU is disabled, it also allows the root user to read
                or write from the host a device VA of a host mapped memory

What:           /sys/kernel/debug/accel/<n>/data64
What:           /sys/kernel/debug/accel/<parent_device>/data64
Date:           Jan 2020
KernelVersion:  5.6
Contact:        ogabbay@kernel.org
@@ -65,7 +65,7 @@ Description: Allows the root user to read or write 64 bit data directly
                If the IOMMU is disabled, it also allows the root user to read
                or write from the host a device VA of a host mapped memory

What:           /sys/kernel/debug/accel/<n>/data_dma
What:           /sys/kernel/debug/accel/<parent_device>/data_dma
Date:           Apr 2021
KernelVersion:  5.13
Contact:        ogabbay@kernel.org
@@ -83,7 +83,7 @@ Description: Allows the root user to read from the device's internal
                workloads.
                Only supported on GAUDI at this stage.

What:           /sys/kernel/debug/accel/<n>/device
What:           /sys/kernel/debug/accel/<parent_device>/device
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
@@ -91,14 +91,14 @@ Description: Enables the root user to set the device to specific state.
                Valid values are "disable", "enable", "suspend", "resume".
                User can read this property to see the valid values

What:           /sys/kernel/debug/accel/<n>/device_release_watchdog_timeout
What:           /sys/kernel/debug/accel/<parent_device>/device_release_watchdog_timeout
Date:           Oct 2022
KernelVersion:  6.2
Contact:        ttayar@habana.ai
Description:    The watchdog timeout value in seconds for a device release upon
                certain error cases, after which the device is reset.

What:           /sys/kernel/debug/accel/<n>/dma_size
What:           /sys/kernel/debug/accel/<parent_device>/dma_size
Date:           Apr 2021
KernelVersion:  5.13
Contact:        ogabbay@kernel.org
@@ -108,7 +108,7 @@ Description: Specify the size of the DMA transaction when using DMA to read
                When the write is finished, the user can read the "data_dma"
                blob

What:           /sys/kernel/debug/accel/<n>/dump_razwi_events
What:           /sys/kernel/debug/accel/<parent_device>/dump_razwi_events
Date:           Aug 2022
KernelVersion:  5.20
Contact:        fkassabri@habana.ai
@@ -117,7 +117,7 @@ Description: Dumps all razwi events to dmesg if exist.
                the routine will clear the status register.
                Usage: cat dump_razwi_events

What:           /sys/kernel/debug/accel/<n>/dump_security_violations
What:           /sys/kernel/debug/accel/<parent_device>/dump_security_violations
Date:           Jan 2021
KernelVersion:  5.12
Contact:        ogabbay@kernel.org
@@ -125,14 +125,14 @@ Description: Dumps all security violations to dmesg. This will also ack
                all security violations, meaning those violations will not be
                dumped next time user calls this API

What:           /sys/kernel/debug/accel/<n>/engines
What:           /sys/kernel/debug/accel/<parent_device>/engines
Date:           Jul 2019
KernelVersion:  5.3
Contact:        ogabbay@kernel.org
Description:    Displays the status registers values of the device engines and
                their derived idle status

What:           /sys/kernel/debug/accel/<n>/i2c_addr
What:           /sys/kernel/debug/accel/<parent_device>/i2c_addr
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
@@ -140,7 +140,7 @@ Description: Sets I2C device address for I2C transaction that is generated
                by the device's CPU, Not available when device is loaded with secured
                firmware

What:           /sys/kernel/debug/accel/<n>/i2c_bus
What:           /sys/kernel/debug/accel/<parent_device>/i2c_bus
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
@@ -148,7 +148,7 @@ Description: Sets I2C bus address for I2C transaction that is generated by
                the device's CPU, Not available when device is loaded with secured
                firmware

What:           /sys/kernel/debug/accel/<n>/i2c_data
What:           /sys/kernel/debug/accel/<parent_device>/i2c_data
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
@@ -157,7 +157,7 @@ Description: Triggers an I2C transaction that is generated by the device's
                reading from the file generates a read transaction, Not available
                when device is loaded with secured firmware

What:           /sys/kernel/debug/accel/<n>/i2c_len
What:           /sys/kernel/debug/accel/<parent_device>/i2c_len
Date:           Dec 2021
KernelVersion:  5.17
Contact:        obitton@habana.ai
@@ -165,7 +165,7 @@ Description: Sets I2C length in bytes for I2C transaction that is generated b
                the device's CPU, Not available when device is loaded with secured
                firmware

What:           /sys/kernel/debug/accel/<n>/i2c_reg
What:           /sys/kernel/debug/accel/<parent_device>/i2c_reg
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
@@ -173,35 +173,35 @@ Description: Sets I2C register id for I2C transaction that is generated by
                the device's CPU, Not available when device is loaded with secured
                firmware

What:           /sys/kernel/debug/accel/<n>/led0
What:           /sys/kernel/debug/accel/<parent_device>/led0
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
Description:    Sets the state of the first S/W led on the device, Not available
                when device is loaded with secured firmware

What:           /sys/kernel/debug/accel/<n>/led1
What:           /sys/kernel/debug/accel/<parent_device>/led1
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
Description:    Sets the state of the second S/W led on the device, Not available
                when device is loaded with secured firmware

What:           /sys/kernel/debug/accel/<n>/led2
What:           /sys/kernel/debug/accel/<parent_device>/led2
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
Description:    Sets the state of the third S/W led on the device, Not available
                when device is loaded with secured firmware

What:           /sys/kernel/debug/accel/<n>/memory_scrub
What:           /sys/kernel/debug/accel/<parent_device>/memory_scrub
Date:           May 2022
KernelVersion:  5.19
Contact:        dhirschfeld@habana.ai
Description:    Allows the root user to scrub the dram memory. The scrubbing
                value can be set using the debugfs file memory_scrub_val.

What:           /sys/kernel/debug/accel/<n>/memory_scrub_val
What:           /sys/kernel/debug/accel/<parent_device>/memory_scrub_val
Date:           May 2022
KernelVersion:  5.19
Contact:        dhirschfeld@habana.ai
@@ -209,7 +209,7 @@ Description: The value to which the dram will be set to when the user
                scrubs the dram using 'memory_scrub' debugfs file and
                the scrubbing value when using module param 'memory_scrub'

What:           /sys/kernel/debug/accel/<n>/mmu
What:           /sys/kernel/debug/accel/<parent_device>/mmu
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
@@ -219,7 +219,7 @@ Description: Displays the hop values and physical address for a given ASID
                e.g. to display info about VA 0x1000 for ASID 1 you need to do:
                echo "1 0x1000" > /sys/kernel/debug/accel/0/mmu

What:           /sys/kernel/debug/accel/<n>/mmu_error
What:           /sys/kernel/debug/accel/<parent_device>/mmu_error
Date:           Mar 2021
KernelVersion:  5.12
Contact:        fkassabri@habana.ai
@@ -229,7 +229,7 @@ Description: Check and display page fault or access violation mmu errors for
                echo "0x200" > /sys/kernel/debug/accel/0/mmu_error
                cat /sys/kernel/debug/accel/0/mmu_error

What:           /sys/kernel/debug/accel/<n>/monitor_dump
What:           /sys/kernel/debug/accel/<parent_device>/monitor_dump
Date:           Mar 2022
KernelVersion:  5.19
Contact:        osharabi@habana.ai
@@ -243,7 +243,7 @@ Description: Allows the root user to dump monitors status from the device's
                This interface doesn't support concurrency in the same device.
                Only supported on GAUDI.

What:           /sys/kernel/debug/accel/<n>/monitor_dump_trig
What:           /sys/kernel/debug/accel/<parent_device>/monitor_dump_trig
Date:           Mar 2022
KernelVersion:  5.19
Contact:        osharabi@habana.ai
@@ -253,14 +253,14 @@ Description: Triggers dump of monitor data. The value to trigger the operatio
                When the write is finished, the user can read the "monitor_dump"
                blob

What:           /sys/kernel/debug/accel/<n>/set_power_state
What:           /sys/kernel/debug/accel/<parent_device>/set_power_state
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
Description:    Sets the PCI power state. Valid values are "1" for D0 and "2"
                for D3Hot

What:           /sys/kernel/debug/accel/<n>/skip_reset_on_timeout
What:           /sys/kernel/debug/accel/<parent_device>/skip_reset_on_timeout
Date:           Jun 2021
KernelVersion:  5.13
Contact:        ynudelman@habana.ai
@@ -268,7 +268,7 @@ Description: Sets the skip reset on timeout option for the device. Value of
                "0" means device will be reset in case some CS has timed out,
                otherwise it will not be reset.

What:           /sys/kernel/debug/accel/<n>/state_dump
What:           /sys/kernel/debug/accel/<parent_device>/state_dump
Date:           Oct 2021
KernelVersion:  5.15
Contact:        ynudelman@habana.ai
@@ -279,7 +279,7 @@ Description: Gets the state dump occurring on a CS timeout or failure.
                Writing an integer X discards X state dumps, so that the
                next read would return X+1-st newest state dump.

What:           /sys/kernel/debug/accel/<n>/stop_on_err
What:           /sys/kernel/debug/accel/<parent_device>/stop_on_err
Date:           Mar 2020
KernelVersion:  5.6
Contact:        ogabbay@kernel.org
@@ -287,13 +287,13 @@ Description: Sets the stop-on_error option for the device engines. Value of
                "0" is for disable, otherwise enable.
                Relevant only for GOYA and GAUDI.

What:           /sys/kernel/debug/accel/<n>/timeout_locked
What:           /sys/kernel/debug/accel/<parent_device>/timeout_locked
Date:           Sep 2021
KernelVersion:  5.16
Contact:        obitton@habana.ai
Description:    Sets the command submission timeout value in seconds.

What:           /sys/kernel/debug/accel/<n>/userptr
What:           /sys/kernel/debug/accel/<parent_device>/userptr
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
@@ -301,7 +301,7 @@ Description: Displays a list with information about the current user
                pointers (user virtual addresses) that are pinned and mapped
                to DMA addresses

What:           /sys/kernel/debug/accel/<n>/userptr_lookup
What:           /sys/kernel/debug/accel/<parent_device>/userptr_lookup
Date:           Oct 2021
KernelVersion:  5.15
Contact:        ogabbay@kernel.org
@@ -309,7 +309,7 @@ Description: Allows to search for specific user pointers (user virtual
                addresses) that are pinned and mapped to DMA addresses, and see
                their resolution to the specific dma address.

What:           /sys/kernel/debug/accel/<n>/vm
What:           /sys/kernel/debug/accel/<parent_device>/vm
Date:           Jan 2019
KernelVersion:  5.1
Contact:        ogabbay@kernel.org
+12 −0
Original line number Diff line number Diff line
@@ -149,6 +149,18 @@ Contact: ogabbay@kernel.org
Description:    Displays the current clock frequency, in Hz, of the MME compute
                engine. This property is valid only for the Goya ASIC family

What:           /sys/class/accel/accel<n>/device/module_id
Date:           Nov 2023
KernelVersion:  6.8
Contact:        ogabbay@kernel.org
Description:    Displays the device's module id

What:           /sys/class/accel/accel<n>/device/parent_device
Date:           Nov 2023
KernelVersion:  6.8
Contact:        ttayar@habana.ai
Description:    Displays the name of the parent device of the accel device

What:           /sys/class/accel/accel<n>/device/pci_addr
Date:           Jan 2019
KernelVersion:  5.1
+15 −10
Original line number Diff line number Diff line
@@ -853,6 +853,9 @@ static int device_early_init(struct hl_device *hdev)
		gaudi2_set_asic_funcs(hdev);
		strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name));
		break;
	case ASIC_GAUDI2C:
		gaudi2_set_asic_funcs(hdev);
		strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name));
		break;
	default:
		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
@@ -1041,18 +1044,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
	return (vendor_id == PCI_VENDOR_ID_HABANALABS);
}

static void hl_device_eq_heartbeat(struct hl_device *hdev)
static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
{
	u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
	struct asic_fixed_properties *prop = &hdev->asic_prop;

	if (!prop->cpucp_info.eq_health_check_supported)
		return;
		return 0;

	if (hdev->eq_heartbeat_received)
	if (hdev->eq_heartbeat_received) {
		hdev->eq_heartbeat_received = false;
	else
		hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
	} else {
		dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
		return -EIO;
	}

	return 0;
}

static void hl_device_heartbeat(struct work_struct *work)
@@ -1069,10 +1075,9 @@ static void hl_device_heartbeat(struct work_struct *work)
	/*
	 * For EQ health check need to check if driver received the heartbeat eq event
	 * in order to validate the eq is working.
	 * Only if both the EQ is healthy and we managed to send the next heartbeat reschedule.
	 */
	hl_device_eq_heartbeat(hdev);

	if (!hdev->asic_funcs->send_heartbeat(hdev))
	if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev)))
		goto reschedule;

	if (hl_device_operational(hdev, NULL))
@@ -2035,7 +2040,7 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
	if (ctx)
		hl_ctx_put(ctx);

	return hl_device_reset(hdev, flags);
	return hl_device_reset(hdev, flags | HL_DRV_RESET_HARD);
}

static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask)
+40 −83
Original line number Diff line number Diff line
@@ -646,39 +646,27 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
	return rc;
}

static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
								u32 sts_val)
static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, u32 sts_val)
{
	bool err_exists = false;

	if (!(err_val & CPU_BOOT_ERR0_ENABLED))
		return false;

	if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) {
		dev_err(hdev->dev,
			"Device boot error - DRAM initialization failed\n");
		err_exists = true;
	}
	if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
		dev_err(hdev->dev, "Device boot error - DRAM initialization failed\n");

	if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) {
	if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
		dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) {
		dev_err(hdev->dev,
			"Device boot error - Thermal Sensor initialization failed\n");
		err_exists = true;
	}
	if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
		dev_err(hdev->dev, "Device boot error - Thermal Sensor initialization failed\n");

	if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
		if (hdev->bmc_enable) {
			dev_err(hdev->dev,
				"Device boot error - Skipped waiting for BMC\n");
			err_exists = true;
			dev_err(hdev->dev, "Device boot error - Skipped waiting for BMC\n");
		} else {
			dev_info(hdev->dev,
				"Device boot message - Skipped waiting for BMC\n");
			dev_info(hdev->dev, "Device boot message - Skipped waiting for BMC\n");
			/* This is an info so we don't want it to disable the
			 * device
			 */
@@ -686,48 +674,29 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
		}
	}

	if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) {
		dev_err(hdev->dev,
			"Device boot error - Serdes data from BMC not available\n");
		err_exists = true;
	}
	if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
		dev_err(hdev->dev, "Device boot error - Serdes data from BMC not available\n");

	if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) {
		dev_err(hdev->dev,
			"Device boot error - NIC F/W initialization failed\n");
		err_exists = true;
	}
	if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
		dev_err(hdev->dev, "Device boot error - NIC F/W initialization failed\n");

	if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) {
		dev_err(hdev->dev,
			"Device boot warning - security not ready\n");
		err_exists = true;
	}
	if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
		dev_err(hdev->dev, "Device boot warning - security not ready\n");

	if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) {
	if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
		dev_err(hdev->dev, "Device boot error - security failure\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) {
	if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
		dev_err(hdev->dev, "Device boot error - eFuse failure\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) {
	if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL)
		dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_PLL_FAIL) {
	if (err_val & CPU_BOOT_ERR0_PLL_FAIL)
		dev_err(hdev->dev, "Device boot error - PLL failure\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL) {
	if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL)
		dev_err(hdev->dev, "Device boot error - Failed to set threshold for temperature sensor\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) {
		/* Ignore this bit, don't prevent driver loading */
@@ -735,52 +704,32 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
		err_val &= ~CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL;
	}

	if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) {
	if (err_val & CPU_BOOT_ERR0_BINNING_FAIL)
		dev_err(hdev->dev, "Device boot error - binning failure\n");
		err_exists = true;
	}

	if (sts_val & CPU_BOOT_DEV_STS0_ENABLED)
		dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);

	if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
		dev_err(hdev->dev, "Device boot warning - Skipped DRAM initialization\n");

	if (err_val & CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
		dev_err(hdev->dev, "Device boot error - ARC memory scrub failed\n");

	/* All warnings should go here in order not to reach the unknown error validation */
	if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) {
		dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n");
		err_exists = true;
	}

	/* All warnings should go here in order not to reach the unknown error validation */
	if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
		dev_warn(hdev->dev,
			"Device boot warning - Skipped DRAM initialization\n");
		/* This is a warning so we don't want it to disable the
		 * device
		 */
		err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
	}
	if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL)
		dev_warn(hdev->dev, "Device boot warning - Failed to load preboot primary image\n");

	if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) {
		dev_warn(hdev->dev,
			"Device boot warning - Failed to load preboot primary image\n");
		/* This is a warning so we don't want it to disable the
		 * device as we have a secondary preboot image
		 */
		err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL;
	}

	if (err_val & CPU_BOOT_ERR0_TPM_FAIL) {
		dev_warn(hdev->dev,
			"Device boot warning - TPM failure\n");
		/* This is a warning so we don't want it to disable the
		 * device
		 */
		err_val &= ~CPU_BOOT_ERR0_TPM_FAIL;
	}
	if (err_val & CPU_BOOT_ERR0_TPM_FAIL)
		dev_warn(hdev->dev, "Device boot warning - TPM failure\n");

	if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
		dev_err(hdev->dev,
			"Device boot error - unknown ERR0 error 0x%08x\n", err_val);
	if (err_val & CPU_BOOT_ERR_FATAL_MASK)
		err_exists = true;
	}

	/* return error only if it's in the predefined mask */
	if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) &
@@ -3295,6 +3244,14 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in
					HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
}

/*
 * hl_fw_get_dev_info_signed - request the signed device information.
 *
 * @hdev: pointer to the habanalabs device structure.
 * @dev_info_signed: caller-provided structure handed to the request as the
 *                   data buffer.
 * @nonce: nonce value passed through to the request unchanged.
 *
 * Thin wrapper that issues a CPUCP_PACKET_INFO_SIGNED_GET request through
 * hl_fw_get_sec_attest_data(), using the size of the signed device info
 * structure and the HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC timeout.
 *
 * Return: whatever hl_fw_get_sec_attest_data() returns.
 */
int hl_fw_get_dev_info_signed(struct hl_device *hdev,
			      struct cpucp_dev_info_signed *dev_info_signed, u32 nonce)
{
	return hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_INFO_SIGNED_GET,
					 dev_info_signed, sizeof(*dev_info_signed),
					 nonce, HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
}

int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode,
						dma_addr_t buff, u32 *size)
{
+15 −0
Original line number Diff line number Diff line
@@ -1262,6 +1262,7 @@ struct hl_dec {
 * @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000).
 * @ASIC_GAUDI2: Gaudi2 device.
 * @ASIC_GAUDI2B: Gaudi2B device.
 * @ASIC_GAUDI2C: Gaudi2C device.
 */
enum hl_asic_type {
	ASIC_INVALID,
@@ -1270,6 +1271,7 @@ enum hl_asic_type {
	ASIC_GAUDI_SEC,
	ASIC_GAUDI2,
	ASIC_GAUDI2B,
	ASIC_GAUDI2C,
};

struct hl_cs_parser;
@@ -3519,6 +3521,9 @@ struct hl_device {
	u8				heartbeat;
};

/*
 * Name used to identify the device: the PCI device name when hdev->pdev is
 * set, otherwise the placeholder string "NA-DEVICE" (presumably the simulator
 * case, where no PCI device exists - TODO confirm).
 */
#define HL_DEV_NAME(hdev)	\
		((hdev)->pdev ? dev_name(&(hdev)->pdev->dev) : "NA-DEVICE")

/**
 * struct hl_cs_encaps_sig_handle - encapsulated signals handle structure
@@ -3594,6 +3599,14 @@ static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major
	return false;
}

/*
 * hl_is_fw_sw_ver_equal_or_greater - compare the device's f/w s/w version
 * against a given major.minor version.
 *
 * @hdev: pointer to the habanalabs device structure.
 * @fw_sw_major: major version to compare against.
 * @fw_sw_minor: minor version to compare against.
 *
 * Return: true if the version held in hdev (fw_sw_major_ver.fw_sw_minor_ver)
 * is equal to or newer than the given one, false otherwise.
 */
static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major,
							u32 fw_sw_minor)
{
	/* Different major versions decide the comparison on their own */
	if (hdev->fw_sw_major_ver != fw_sw_major)
		return hdev->fw_sw_major_ver > fw_sw_major;

	/* Same major version: the minor version is the tie-breaker */
	return hdev->fw_sw_minor_ver >= fw_sw_minor;
}

/*
 * Kernel module functions that can be accessed by entire module
 */
@@ -3954,6 +3967,8 @@ long hl_fw_get_max_power(struct hl_device *hdev);
void hl_fw_set_max_power(struct hl_device *hdev);
int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info,
				u32 nonce);
int hl_fw_get_dev_info_signed(struct hl_device *hdev,
			      struct cpucp_dev_info_signed *dev_info_signed, u32 nonce);
int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value);
int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value);
int hl_set_power(struct hl_device *hdev, int sensor_index, u32 attr, long value);
Loading