Commit 72d66255 authored by Ohad Sharabi, committed by Oded Gabbay
Browse files

habanalabs: modify multi-CS to wait on stream masters



During the integration, the multi-CS requirements were refined:
- The multi CS call shall wait on "per-ASIC" predefined stream masters
  instead of a set of streams.
- Stream masters are a set of QIDs used by the upper SW layers (synapse)
  for completion (must be an external/HW queue).

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 1f6bdee7
Loading
Loading
Loading
Loading
+33 −17
Original line number Diff line number Diff line
@@ -487,14 +487,15 @@ static void force_complete_multi_cs(struct hl_device *hdev)
 *
 * @hdev: pointer to habanalabs device structure
 * @cs: CS structure
 *
 * The function signals waiting entity that its waiting stream has common
 * stream with the completed CS.
 * The function signals a waiting entity that has overlapping stream masters
 * with the completed CS.
 * For example:
 * - a completed CS worked on streams 0 and 1, multi CS completion
 *   is actively waiting on stream 3. don't send signal as no common stream
 * - a completed CS worked on streams 0 and 1, multi CS completion
 *   is actively waiting on streams 1 and 3. send signal as stream 1 is common
 * - a completed CS worked on stream master QID 4, multi CS completion
 *   is actively waiting on stream master QIDs 3, 5. don't send signal as no
 *   common stream master QID
 * - a completed CS worked on stream master QID 4, multi CS completion
 *   is actively waiting on stream master QIDs 3, 4. send signal as stream
 *   master QID 4 is common
 */
static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
{
@@ -518,10 +519,11 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
		 * complete if:
		 * 1. still waiting for completion
		 * 2. the completed CS has at least one overlapping stream
		 *    with the streams in the completion
		 *    master with the stream masters in the completion
		 */
		if (mcs_compl->used &&
				(fence->stream_map & mcs_compl->stream_map)) {
				(fence->stream_master_qid_map &
					mcs_compl->stream_master_qid_map)) {
			/* extract the timestamp only of first completed CS */
			if (!mcs_compl->timestamp)
				mcs_compl->timestamp =
@@ -1228,6 +1230,17 @@ static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
	return 0;
}

/*
 * get_stream_master_qid_mask - translate a queue ID to its stream master bit
 *
 * @hdev: pointer to habanalabs device structure
 * @qid: queue ID to look up in the device's stream master QID array
 *
 * @return BIT(index) of the first entry of hdev->stream_master_qid_arr that
 *         equals @qid, or 0 when none of the first
 *         hdev->stream_master_qid_arr_size entries match
 */
static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
{
	u32 mask = 0;
	int idx;

	for (idx = 0; idx < hdev->stream_master_qid_arr_size; idx++) {
		if (hdev->stream_master_qid_arr[idx] != qid)
			continue;

		/* the array position, not the QID value, selects the bit */
		mask = BIT(idx);
		break;
	}

	return mask;
}

static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
				u32 num_chunks, u64 *cs_seq, u32 flags,
				u32 encaps_signals_handle, u32 timeout)
@@ -1241,7 +1254,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
	struct hl_cs *cs;
	struct hl_cb *cb;
	u64 user_sequence;
	u8 stream_map = 0;
	u8 stream_master_qid_map = 0;
	int rc, i;

	cntr = &hdev->aggregated_cs_counters;
@@ -1310,7 +1323,9 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
			 * queues of this CS
			 */
			if (hdev->supports_wait_for_multi_cs)
				stream_map |= BIT((chunk->queue_index % 4));
				stream_master_qid_map |=
					get_stream_master_qid_mask(hdev,
							chunk->queue_index);
		}

		job = hl_cs_allocate_job(hdev, queue_type,
@@ -1378,7 +1393,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
	 * fence object for multi-CS completion
	 */
	if (hdev->supports_wait_for_multi_cs)
		cs->fence->stream_map = stream_map;
		cs->fence->stream_master_qid_map = stream_master_qid_map;

	rc = hl_hw_queue_schedule_cs(cs);
	if (rc) {
@@ -2332,7 +2347,7 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
			break;
		}

		mcs_data->stream_map |= fence->stream_map;
		mcs_data->stream_master_qid_map |= fence->stream_master_qid_map;

		if (status == CS_WAIT_STATUS_BUSY)
			continue;
@@ -2394,7 +2409,8 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 * hl_wait_multi_cs_completion_init - init completion structure
 *
 * @hdev: pointer to habanalabs device structure
 * @stream_map: stream map, set bit indicates stream to wait on
 * @stream_master_bitmap: stream master QIDs map, set bit indicates stream
 *                        master QID to wait on
 *
 * @return valid completion struct pointer on success, otherwise error pointer
 *
@@ -2404,7 +2420,7 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 */
static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
							struct hl_device *hdev,
							u8 stream_map)
							u8 stream_master_bitmap)
{
	struct multi_cs_completion *mcs_compl;
	int i;
@@ -2416,7 +2432,7 @@ static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
		if (!mcs_compl->used) {
			mcs_compl->used = 1;
			mcs_compl->timestamp = 0;
			mcs_compl->stream_map = stream_map;
			mcs_compl->stream_master_qid_map = stream_master_bitmap;
			reinit_completion(&mcs_compl->completion);
			spin_unlock(&mcs_compl->lock);
			break;
@@ -2464,7 +2480,7 @@ static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data)
	long completion_rc;

	mcs_compl = hl_wait_multi_cs_completion_init(hdev,
							mcs_data->stream_map);
					mcs_data->stream_master_qid_map);
	if (IS_ERR(mcs_compl))
		return PTR_ERR(mcs_compl);

+13 −9
Original line number Diff line number Diff line
@@ -592,18 +592,18 @@ struct asic_fixed_properties {
 * @completion: fence is implemented using completion
 * @refcount: refcount for this fence
 * @cs_sequence: sequence of the corresponding command submission
 * @stream_master_qid_map: stream masters QID bitmap representing all stream
 *                         master QIDs that the multi CS is waiting on
 * @error: mark this fence with error
 * @timestamp: timestamp upon completion
 * @stream_map: streams bitmap to represent all streams that multi cs is
 *              waiting on
 */
struct hl_fence {
	/* fence is implemented on top of this completion */
	struct completion	completion;
	struct kref		refcount;
	/* sequence of the corresponding command submission */
	u64			cs_sequence;
	/*
	 * Built in cs_ioctl_default() by OR-ing get_stream_master_qid_mask()
	 * for the CS queue indices; complete_multi_cs() ANDs it against the
	 * map of each waiting multi-CS completion.
	 */
	u32			stream_master_qid_map;
	int			error;
	/* timestamp upon completion */
	ktime_t			timestamp;
	/*
	 * NOTE(review): the visible users of stream_map are all switched to
	 * stream_master_qid_map in this change - confirm whether this member
	 * is still needed.
	 */
	u8			stream_map;
};

/**
@@ -1160,6 +1160,7 @@ struct fw_load_mgr {
 * @state_dump_init: initialize constants required for state dump
 * @get_sob_addr: get SOB base address offset.
 * @set_pci_memory_regions: setting properties of PCI memory regions
 * @get_stream_master_qid_arr: get pointer to stream masters QID array
 */
struct hl_asic_funcs {
	int (*early_init)(struct hl_device *hdev);
@@ -1289,6 +1290,7 @@ struct hl_asic_funcs {
	void (*state_dump_init)(struct hl_device *hdev);
	u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id);
	void (*set_pci_memory_regions)(struct hl_device *hdev);
	u32* (*get_stream_master_qid_arr)(void);
};


@@ -2263,16 +2265,16 @@ struct hl_mmu_funcs {
 * @completion: completion of any of the CS in the list
 * @lock: spinlock for the completion structure
 * @timestamp: timestamp for the multi-CS completion
 * @used: 1 if in use, otherwise 0
 * @stream_map: bitmap of all HW/external queues streams on which the multi-CS
 * @stream_master_qid_map: bitmap of all stream masters on which the multi-CS
 *                        is waiting
 * @used: 1 if in use, otherwise 0
 */
struct multi_cs_completion {
	/* re-armed with reinit_completion() each time the entry is claimed */
	struct completion	completion;
	/* spinlock for the completion structure */
	spinlock_t		lock;
	/*
	 * Reset to 0 when the entry is claimed; complete_multi_cs() fills it
	 * only while it is still 0, so it reflects the first completed CS.
	 */
	s64			timestamp;
	/*
	 * Stream master bitmap passed to hl_wait_multi_cs_completion_init();
	 * a completed CS matches this entry when its fence map shares a bit.
	 */
	u32			stream_master_qid_map;
	/* 1 if in use, otherwise 0 */
	u8			used;
	/*
	 * NOTE(review): the visible users of stream_map are all switched to
	 * stream_master_qid_map in this change - confirm whether this member
	 * is still needed.
	 */
	u8			stream_map;
};

/**
@@ -2284,9 +2286,9 @@ struct multi_cs_completion {
 * @timestamp: timestamp of first completed CS
 * @wait_status: wait for CS status
 * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
 * @stream_master_qid_map: bitmap of all stream master QIDs on which the
 *                         multi-CS is waiting
 * @arr_len: fence_arr and seq_arr array length
 * @stream_map: bitmap of all HW/external queues streams on which the multi-CS
 *              is waiting
 * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
 * @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
 */
@@ -2298,8 +2300,8 @@ struct multi_cs_data {
	s64		timestamp;
	long		wait_status;
	u32		completion_bitmap;
	u32		stream_master_qid_map;
	u8		arr_len;
	u8		stream_map;
	u8		gone_cs;
	u8		update_ts;
};
@@ -2520,6 +2522,7 @@ struct hl_device {

	struct multi_cs_completion	multi_cs_completion[
							MULTI_CS_MAX_USER_CTX];
	u32				*stream_master_qid_arr;
	atomic64_t			dram_used_mem;
	u64				timeout_jiffies;
	u64				max_power;
@@ -2570,6 +2573,7 @@ struct hl_device {
	u8				skip_reset_on_timeout;
	u8				device_cpu_is_halted;
	u8				supports_wait_for_multi_cs;
	u8				stream_master_qid_arr_size;

	/* Parameters for bring-up */
	u64				nic_ports_mask;
+2 −1
Original line number Diff line number Diff line
@@ -721,7 +721,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)

		/* update stream map of the first CS */
		if (hdev->supports_wait_for_multi_cs)
			staged_cs->fence->stream_map |= cs->fence->stream_map;
			staged_cs->fence->stream_master_qid_map |=
					cs->fence->stream_master_qid_map;
	}

	list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
+21 −1
Original line number Diff line number Diff line
@@ -110,6 +110,17 @@

#define MONITOR_SOB_STRING_SIZE		256

/*
 * Gaudi stream master QIDs (the DMA_0_x and DMA_1_x queue IDs).
 *
 * The table is handed out through gaudi_get_stream_master_qid_arr(). The
 * index of each entry is spelled out because the common code turns the
 * position of a matching QID into BIT(index) of the stream master bitmap.
 */
static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = {
	[0] = GAUDI_QUEUE_ID_DMA_0_0,
	[1] = GAUDI_QUEUE_ID_DMA_0_1,
	[2] = GAUDI_QUEUE_ID_DMA_0_2,
	[3] = GAUDI_QUEUE_ID_DMA_0_3,
	[4] = GAUDI_QUEUE_ID_DMA_1_0,
	[5] = GAUDI_QUEUE_ID_DMA_1_1,
	[6] = GAUDI_QUEUE_ID_DMA_1_2,
	[7] = GAUDI_QUEUE_ID_DMA_1_3,
};

static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
		"gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
		"gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
@@ -1870,6 +1881,9 @@ static int gaudi_sw_init(struct hl_device *hdev)
	hdev->supports_wait_for_multi_cs = true;

	hdev->asic_funcs->set_pci_memory_regions(hdev);
	hdev->stream_master_qid_arr =
				hdev->asic_funcs->get_stream_master_qid_arr();
	hdev->stream_master_qid_arr_size = GAUDI_STREAM_MASTER_ARR_SIZE;

	return 0;

@@ -9352,6 +9366,11 @@ static void gaudi_state_dump_init(struct hl_device *hdev)
	sds->funcs = gaudi_state_dump_funcs;
}

/*
 * gaudi_get_stream_master_qid_arr - get the Gaudi stream master QID array
 *
 * @return pointer to the first entry of the gaudi_stream_master table
 */
static u32 *gaudi_get_stream_master_qid_arr(void)
{
	u32 *qid_arr = gaudi_stream_master;

	return qid_arr;
}

static const struct hl_asic_funcs gaudi_funcs = {
	.early_init = gaudi_early_init,
	.early_fini = gaudi_early_fini,
@@ -9440,7 +9459,8 @@ static const struct hl_asic_funcs gaudi_funcs = {
	.init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
	.state_dump_init = gaudi_state_dump_init,
	.get_sob_addr = gaudi_get_sob_addr,
	.set_pci_memory_regions = gaudi_set_pci_memory_regions
	.set_pci_memory_regions = gaudi_set_pci_memory_regions,
	.get_stream_master_qid_arr = gaudi_get_stream_master_qid_arr
};

/**
+2 −0
Original line number Diff line number Diff line
@@ -36,6 +36,8 @@
#define NUMBER_OF_INTERRUPTS		(NUMBER_OF_CMPLT_QUEUES + \
						NUMBER_OF_CPU_HW_QUEUES)

#define GAUDI_STREAM_MASTER_ARR_SIZE	8

#if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES)
#error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES"
#endif
Loading