Commit 422ef171 authored by Tal Cohen's avatar Tal Cohen Committed by Greg Kroah-Hartman
Browse files

habanalabs: add support for notification via eventfd



The driver can now send notification events to a user process
through an event file descriptor (eventfd) registered by the user.
The driver uses this notification mechanism to inform the
user that an event has occurred.
A user thread can wait until a notification is received from
the driver.

The driver stores the events that occurred until the user reads them
using HL_INFO_GET_EVENTS, a new opcode in the INFO ioctl.

Gaudi specific implementation includes sending a notification
on a TPC assertion event that is received from f/w.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent f2daa2d9
Loading
Loading
Loading
Loading
+52 −0
Original line number Diff line number Diff line
@@ -285,6 +285,14 @@ static void hpriv_release(struct kref *ref)

	hdev->compute_ctx_in_release = 0;

	/* release the eventfd */
	if (hpriv->notifier_event.eventfd) {
		eventfd_ctx_put(hpriv->notifier_event.eventfd);
		hpriv->notifier_event.eventfd = 0;
	}

	mutex_destroy(&hpriv->notifier_event.lock);

	kfree(hpriv);
}

@@ -355,6 +363,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
	list_del(&hpriv->dev_node);
	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
out:
	/* release the eventfd */
	if (hpriv->notifier_event.eventfd) {
		eventfd_ctx_put(hpriv->notifier_event.eventfd);
		hpriv->notifier_event.eventfd = 0;
	}

	mutex_destroy(&hpriv->notifier_event.lock);
	put_pid(hpriv->taskpid);

	kfree(hpriv);
@@ -1506,6 +1521,43 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
	return rc;
}

/*
 * hl_notifier_event_send - record an event and signal a single notifier
 *
 * @notifier_event: notifier state of one file private data
 * @event: bit(s) of the occurred event to add to the pending events mask
 *
 * The event is always accumulated in the events mask; the eventfd is signaled
 * only if one is currently registered. Both steps run under the notifier lock.
 */
static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event)
{
	struct eventfd_ctx *ctx;

	mutex_lock(&notifier_event->lock);

	/* latch the event even when nobody registered an eventfd */
	notifier_event->events_mask |= event;

	ctx = notifier_event->eventfd;
	if (ctx)
		eventfd_signal(ctx, 1);

	mutex_unlock(&notifier_event->lock);
}

/*
 * hl_notifier_event_send_all - notify all user processes via eventfd
 *
 * @hdev: pointer to habanalabs device structure
 * @event: the occurred event
 *
 * Walks every file private data on the device list and then on the control
 * device list, each under its own list lock, and sends @event to its notifier.
 * This function does not return a value.
 */
void hl_notifier_event_send_all(struct hl_device *hdev, u64 event)
{
	struct hl_fpriv	*hpriv;

	mutex_lock(&hdev->fpriv_list_lock);

	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
		hl_notifier_event_send(&hpriv->notifier_event, event);

	mutex_unlock(&hdev->fpriv_list_lock);

	/* control device */
	mutex_lock(&hdev->fpriv_ctrl_list_lock);

	list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node)
		hl_notifier_event_send(&hpriv->notifier_event, event);

	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
}

/*
 * hl_device_init - main initialization function for habanalabs device
 *
+28 −12
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
#include <linux/hashtable.h>
#include <linux/debugfs.h>
#include <linux/rwsem.h>
#include <linux/eventfd.h>
#include <linux/bitfield.h>
#include <linux/genalloc.h>
#include <linux/sched/signal.h>
@@ -1932,6 +1933,18 @@ struct hl_debug_params {
	bool enable;
};

/**
 * struct hl_notifier_event - per-file state for notifying the user via eventfd
 * @eventfd: eventfd context used to signal the user process; zero while no
 *           eventfd is registered
 * @lock: mutex serializing access to @eventfd and @events_mask
 * @events_mask: bitmask of events that occurred and were not yet read by the
 *               user; new events are OR-ed into it
 */
struct hl_notifier_event {
	struct eventfd_ctx	*eventfd;
	struct mutex		lock;
	u64			events_mask;
};

/*
 * FILE PRIVATE STRUCTURE
 */
@@ -1943,8 +1956,8 @@ struct hl_debug_params {
 * @taskpid: current process ID.
 * @ctx: current executing context. TODO: remove for multiple ctx per process
 * @ctx_mgr: context manager to handle multiple context for this FD.
 * @cb_mgr: command buffer manager to handle multiple buffers for this FD.
 * @mem_mgr: manager descriptor for memory exportable via mmap
 * @notifier_event: notifier eventfd towards user process
 * @debugfs_list: list of relevant ASIC debugfs.
 * @dev_node: node in the device list of file private data
 * @refcount: number of related contexts.
@@ -1957,6 +1970,7 @@ struct hl_fpriv {
	struct hl_ctx			*ctx;
	struct hl_ctx_mgr		ctx_mgr;
	struct hl_mem_mgr		mem_mgr;
	struct hl_notifier_event	notifier_event;
	struct list_head		debugfs_list;
	struct list_head		dev_node;
	struct kref			refcount;
@@ -2676,8 +2690,8 @@ struct hl_reset_info {
 * @state_dump_specs: constants and dictionaries needed to dump system state.
 * @multi_cs_completion: array of multi-CS completion.
 * @clk_throttling: holds information about current/previous clock throttling events
 * @reset_info: holds current device reset information.
 * @last_error: holds information about last session in which CS timeout or razwi error occurred.
 * @reset_info: holds current device reset information.
 * @stream_master_qid_arr: pointer to array with QIDs of master streams.
 * @fw_major_version: major version of current loaded preboot
 * @dram_used_mem: current DRAM memory consumption.
@@ -3071,6 +3085,8 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization);
int hl_build_hwmon_channel_info(struct hl_device *hdev,
		struct cpucp_sensor *sensors_arr);

void hl_notifier_event_send_all(struct hl_device *hdev, u64 event);

int hl_sysfs_init(struct hl_device *hdev);
void hl_sysfs_fini(struct hl_device *hdev);

+9 −0
Original line number Diff line number Diff line
@@ -134,6 +134,10 @@ int hl_device_open(struct inode *inode, struct file *filp)
	hpriv->hdev = hdev;
	filp->private_data = hpriv;
	hpriv->filp = filp;
	hpriv->notifier_event.events_mask = 0;
	hpriv->notifier_event.eventfd = 0;

	mutex_init(&hpriv->notifier_event.lock);
	mutex_init(&hpriv->restore_phase_mutex);
	kref_init(&hpriv->refcount);
	nonseekable_open(inode, filp);
@@ -208,6 +212,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
	filp->private_data = NULL;
	mutex_destroy(&hpriv->restore_phase_mutex);
	mutex_destroy(&hpriv->notifier_event.lock);
	put_pid(hpriv->taskpid);

	kfree(hpriv);
@@ -241,6 +246,10 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
	hpriv->hdev = hdev;
	filp->private_data = hpriv;
	hpriv->filp = filp;
	hpriv->notifier_event.events_mask = 0;
	hpriv->notifier_event.eventfd = 0;

	mutex_init(&hpriv->notifier_event.lock);
	nonseekable_open(inode, filp);

	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
+65 −0
Original line number Diff line number Diff line
@@ -116,6 +116,25 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate,
	return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0;
}

static int events_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
	int rc;
	u32 max_size = args->return_size;
	u64 events_mask;
	void __user *out = (void __user *) (uintptr_t) args->return_pointer;

	if ((max_size < sizeof(u64)) || (!out))
		return -EINVAL;

	mutex_lock(&hpriv->notifier_event.lock);
	events_mask = hpriv->notifier_event.events_mask;
	hpriv->notifier_event.events_mask = 0;
	mutex_unlock(&hpriv->notifier_event.lock);

	rc = copy_to_user(out, &events_mask, sizeof(u64));
	return rc;
}

static int dram_usage_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
	struct hl_device *hdev = hpriv->hdev;
@@ -614,6 +633,43 @@ static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_
	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
}

/*
 * eventfd_register - attach a user eventfd to this file's notifier
 *
 * @hpriv: file private data that will own the eventfd reference
 * @args: INFO ioctl arguments; args->eventfd is the user's eventfd descriptor
 *
 * Return: 0 on success, -EINVAL if an eventfd is already registered on this
 * file, or the error returned by eventfd_ctx_fdget().
 */
static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
	struct eventfd_ctx *ctx;
	int rc = 0;

	mutex_lock(&hpriv->notifier_event.lock);

	/* only a single eventfd may be registered at a time */
	if (hpriv->notifier_event.eventfd) {
		rc = -EINVAL;
		goto out;
	}

	/*
	 * Resolve into a local first so that an error pointer is never stored
	 * in the shared notifier field; on failure the field stays NULL.
	 */
	ctx = eventfd_ctx_fdget(args->eventfd);
	if (IS_ERR(ctx)) {
		rc = PTR_ERR(ctx);
		goto out;
	}

	hpriv->notifier_event.eventfd = ctx;

out:
	mutex_unlock(&hpriv->notifier_event.lock);
	return rc;
}

static int eventfd_unregister(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
	mutex_lock(&hpriv->notifier_event.lock);
	if (!hpriv->notifier_event.eventfd) {
		mutex_unlock(&hpriv->notifier_event.lock);
		return -EINVAL;
	}

	eventfd_ctx_put(hpriv->notifier_event.eventfd);
	hpriv->notifier_event.eventfd = 0;
	mutex_unlock(&hpriv->notifier_event.lock);
	return 0;
}

static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
				struct device *dev)
{
@@ -667,6 +723,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
	case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES:
		return dev_mem_alloc_page_sizes_info(hpriv, args);

	case HL_INFO_GET_EVENTS:
		return events_info(hpriv, args);

	default:
		break;
	}
@@ -717,6 +776,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
	case HL_INFO_DRAM_PENDING_ROWS:
		return dram_pending_rows_info(hpriv, args);

	case HL_INFO_REGISTER_EVENTFD:
		return eventfd_register(hpriv, args);

	case HL_INFO_UNREGISTER_EVENTFD:
		return eventfd_unregister(hpriv, args);

	default:
		dev_err(dev, "Invalid request %d\n", args->op);
		rc = -EINVAL;
+13 −1
Original line number Diff line number Diff line
@@ -7879,7 +7879,6 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
	case GAUDI_EVENT_MMU_PAGE_FAULT:
	case GAUDI_EVENT_MMU_WR_PERM:
	case GAUDI_EVENT_RAZWI_OR_ADC:
	case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
	case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM:
	case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM:
		fallthrough;
@@ -7899,6 +7898,19 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
		hl_fw_unmask_irq(hdev, event_type);
		break;

	case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
		gaudi_print_irq_info(hdev, event_type, true);
		gaudi_handle_qman_err(hdev, event_type);
		hl_fw_unmask_irq(hdev, event_type);

		/* In TPC QM event, notify on TPC assertion. While there isn't
		 * a specific event for assertion yet, the FW generates QM event.
		 * The SW upper layer will inspect an internal mapped area to indicate
		 * if the event is a tpc assertion or tpc QM.
		 */
		hl_notifier_event_send_all(hdev, HL_NOTIFIER_EVENT_TPC_ASSERT);
		break;

	case GAUDI_EVENT_RAZWI_OR_ADC_SW:
		gaudi_print_irq_info(hdev, event_type, true);
		goto reset_device;
Loading