Commit f728c17f authored by Farah Kassabri's avatar Farah Kassabri Committed by Oded Gabbay
Browse files

accel/habanalabs/gaudi2: move HMMU page tables to device memory



Currently the HMMU page tables reside in the host memory,
which will cause host access from the device for every page walk.
This can affect PCIe bandwidth in certain scenarios.

To prevent that problem, HMMU page tables will be moved to the device
memory so the miss transaction will read the hops from there instead of
going to the host.

Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 246d8b6c
Loading
Loading
Loading
Loading
+26 −0
Original line number Diff line number Diff line
@@ -443,18 +443,22 @@ enum hl_collective_mode {
 *                  a CB handle can be provided for jobs on this queue.
 *                  Otherwise, a CB address must be provided.
 * @collective_mode: collective mode of current queue
 * @q_dram_bd_address: PQ dram address, used when the PQ needs to reside in DRAM.
 * @driver_only: true if only the driver is allowed to send a job to this queue,
 *               false otherwise.
 * @binned: True if the queue is binned out and should not be used
 * @supports_sync_stream: True if queue supports sync stream
 * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram
 */
struct hw_queue_properties {
	enum hl_queue_type		type;
	enum queue_cb_alloc_flags	cb_alloc_flags;
	enum hl_collective_mode		collective_mode;
	/* DRAM address of the PQ; copied into the queue only when dram_bd is set */
	u64				q_dram_bd_address;
	/* The u8 members below are used as boolean flags */
	u8				driver_only;
	u8				binned;
	u8				supports_sync_stream;
	u8				dram_bd;
};

/**
@@ -1052,6 +1056,8 @@ struct hl_encaps_signals_mgr {
 * @collective_mode: collective mode of current queue
 * @kernel_address: holds the queue's kernel virtual address.
 * @bus_address: holds the queue's DMA address.
 * @pq_dram_address: holds the dram address when the PQ is allocated, used when dram_bd is true in
 *                   queue properties.
 * @pi: holds the queue's pi value.
 * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci).
 * @hw_queue_id: the id of the H/W queue.
@@ -1061,6 +1067,7 @@ struct hl_encaps_signals_mgr {
 * @valid: is the queue valid (we have array of 32 queues, not all of them
 *         exist).
 * @supports_sync_stream: True if queue supports sync stream
 * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram
 */
struct hl_hw_queue {
	struct hl_cs_job			**shadow_queue;
@@ -1069,6 +1076,7 @@ struct hl_hw_queue {
	enum hl_collective_mode			collective_mode;
	void					*kernel_address;
	dma_addr_t				bus_address;
	u64					pq_dram_address;
	u32					pi;
	atomic_t				ci;
	u32					hw_queue_id;
@@ -1077,6 +1085,7 @@ struct hl_hw_queue {
	u16					int_queue_len;
	u8					valid;
	u8					supports_sync_stream;
	u8					dram_bd;
};

/**
@@ -3889,6 +3898,7 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_
							struct hl_hr_mmu_funcs *hr_func);
int hl_mmu_if_set_funcs(struct hl_device *hdev);
void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr);
int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
@@ -3896,6 +3906,22 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr);
u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr);
bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr);
struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr);
void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr);
void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info);
u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx);
u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx);
void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val);
void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val);
void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr);
u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr);
int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr);
u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop);
u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx);
void hl_mmu_dr_flush(struct hl_ctx *ctx);
int hl_mmu_dr_init(struct hl_device *hdev);
void hl_mmu_dr_fini(struct hl_device *hdev);

int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
				void __iomem *dst, u32 src_offset, u32 size);
+17 −0
Original line number Diff line number Diff line
@@ -84,6 +84,8 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
		u32 ctl, u32 len, u64 ptr)
{
	struct hl_bd *bd;
	u64 addr;
	int i;

	bd = q->kernel_address;
	bd += hl_pi_2_offset(q->pi);
@@ -91,7 +93,16 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
	bd->len = cpu_to_le32(len);
	bd->ptr = cpu_to_le64(ptr);

	if (q->dram_bd)
		for (i = 0 ; i < 2 ; i++) {
			addr = q->pq_dram_address +
			((hl_pi_2_offset(q->pi) * sizeof(struct hl_bd))	+ (i * sizeof(u64)));
			hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM,	addr,
						(u64 *)(bd) + i, DEBUGFS_WRITE64);
		}

	q->pi = hl_queue_inc_ptr(q->pi);

	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
}

@@ -1087,12 +1098,18 @@ int hl_hw_queues_create(struct hl_device *hdev)
		q->supports_sync_stream =
				asic->hw_queues_props[i].supports_sync_stream;
		q->collective_mode = asic->hw_queues_props[i].collective_mode;
		q->dram_bd = asic->hw_queues_props[i].dram_bd;

		rc = queue_init(hdev, q, i);
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize queue %d\n", i);
			goto release_queues;
		}

		/* Set DRAM PQ address for the queue if it should be at DRAM */
		if (q->dram_bd)
			q->pq_dram_address = asic->hw_queues_props[i].q_dram_bd_address;
	}

	return 0;
+1 −1
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0-only
HL_COMMON_MMU_FILES := common/mmu/mmu.o common/mmu/mmu_v1.o \
			common/mmu/mmu_v2_hr.o
			common/mmu/mmu_v2.o common/mmu/mmu_v2_hr.o
+221 −2
Original line number Diff line number Diff line
@@ -585,6 +585,8 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,

int hl_mmu_if_set_funcs(struct hl_device *hdev)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;

	if (hdev->mmu_disable)
		return 0;

@@ -597,7 +599,8 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev)
	case ASIC_GAUDI2:
	case ASIC_GAUDI2B:
	case ASIC_GAUDI2C:
		/* MMUs in Gaudi2 are always host resident */
		hl_mmu_v2_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]);
		if (prop->pmmu.host_resident)
			hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]);
		break;
	default:
@@ -1209,3 +1212,219 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_
	return 0;
}

/**
 * hl_mmu_dr_get_pgt_info() - find the tracking node of a hop by its shadow address.
 * @ctx: context whose mmu_shadow_hash is searched.
 * @hop_addr: shadow address of the hop; also used as the hash key.
 *
 * Return: the node whose shadow_addr equals @hop_addr, or NULL when the bucket
 *         holds no such node.
 */
struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
{
	struct pgt_info *entry;

	hash_for_each_possible(ctx->mmu_shadow_hash, entry, node, (unsigned long) hop_addr) {
		/* Several hops may share a bucket, so compare the full address */
		if (entry->shadow_addr == hop_addr)
			return entry;
	}

	return NULL;
}

/**
 * hl_mmu_dr_free_hop() - release the hop that is tracked under a shadow address.
 * @ctx: context that owns the hop.
 * @hop_addr: shadow address of the hop to release.
 *
 * Looks the hop up in the context's shadow hash and frees it through
 * hl_mmu_dr_free_pgt_node(). An address that is not tracked is ignored.
 */
void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr)
{
	struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr);

	/*
	 * The lookup yields NULL for an untracked address, and
	 * hl_mmu_dr_free_pgt_node() dereferences its argument unconditionally,
	 * so bail out instead of crashing the kernel.
	 */
	if (!pgt_info)
		return;

	hl_mmu_dr_free_pgt_node(ctx, pgt_info);
}

void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info)
{
	struct hl_device *hdev = ctx->hdev;

	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr,
			hdev->asic_prop.mmu_hop_table_size);
	hash_del(&pgt_info->node);
	kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
	kfree(pgt_info);
}

u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx)
{
	return ctx->hdev->asic_prop.mmu_pgt_addr +
			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}

u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx)
{
	return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 +
			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}

/**
 * hl_mmu_dr_get_phys_addr() - translate a shadow address to its device address.
 * @ctx: context that owns the hop containing @shadow_addr.
 * @shadow_addr: address inside a shadow hop table.
 *
 * Splits @shadow_addr into the base of its hop table and the offset inside it,
 * maps the base to the matching device table and re-applies the offset.
 * The masking assumes hop tables are aligned to a power-of-two table size -
 * TODO confirm against the allocator.
 *
 * A hop other than hop0 must already be tracked in the shadow hash: the lookup
 * result is dereferenced without a NULL check.
 *
 * Return: device address that corresponds to @shadow_addr.
 */
u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
{
	u64 offset_mask = ctx->hdev->asic_prop.mmu_hop_table_size - 1;
	u64 shadow_base = shadow_addr & ~offset_mask;
	u64 phys_base;

	if (shadow_base == hl_mmu_dr_get_hop0_addr(ctx))
		phys_base = hl_mmu_dr_get_phys_hop0_addr(ctx);
	else
		phys_base = hl_mmu_dr_get_pgt_info(ctx, shadow_base)->phys_addr;

	return phys_base + (shadow_addr & offset_mask);
}

/**
 * hl_mmu_dr_write_pte() - write a PTE whose value is itself a shadow address.
 * @ctx: context that owns the page tables.
 * @shadow_pte_addr: shadow address of the PTE to update.
 * @val: shadow-space value to store.
 *
 * The device receives the device-address translation of @val, while the shadow
 * copy keeps @val untranslated.
 */
void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
{
	struct hl_device *hdev = ctx->hdev;
	u64 dev_val = hl_mmu_dr_get_phys_addr(ctx, val);
	u64 dev_pte_addr = hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr);
	u64 *shadow_pte = (u64 *) (uintptr_t) shadow_pte_addr;

	hdev->asic_funcs->write_pte(hdev, dev_pte_addr, dev_val);

	*shadow_pte = val;
}

/**
 * hl_mmu_dr_write_final_pte() - write a PTE value as-is to device and shadow.
 * @ctx: context that owns the page tables.
 * @shadow_pte_addr: shadow address of the PTE to update.
 * @val: value to store; unlike hl_mmu_dr_write_pte() it is not translated.
 */
void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
{
	struct hl_device *hdev = ctx->hdev;
	u64 dev_pte_addr = hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr);
	u64 *shadow_pte = (u64 *) (uintptr_t) shadow_pte_addr;

	hdev->asic_funcs->write_pte(hdev, dev_pte_addr, val);
	*shadow_pte = val;
}

/**
 * hl_mmu_dr_clear_pte() - zero a PTE on the device and in its shadow copy.
 * @ctx: context that owns the page tables.
 * @pte_addr: shadow address of the PTE to clear.
 */
void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr)
{
	const u64 cleared_pte = 0;

	hl_mmu_dr_write_final_pte(ctx, pte_addr, cleared_pte);
}

/**
 * hl_mmu_dr_get_pte() - take a PTE reference on a hop.
 * @ctx: context that owns the hop.
 * @hop_addr: shadow address of the hop; it must already be tracked in the
 *            shadow hash because the lookup result is not NULL-checked.
 *
 * Increments the hop's num_of_ptes counter.
 */
void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr)
{
	struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr);

	pgt_info->num_of_ptes++;
}

/**
 * hl_mmu_dr_put_pte() - drop a PTE reference from a hop.
 * @ctx: context that owns the hop.
 * @hop_addr: shadow address of the hop; it must already be tracked in the
 *            shadow hash because the lookup result is not NULL-checked.
 *
 * Decrements the hop's num_of_ptes counter and frees the hop once the counter
 * reaches zero.
 *
 * Return: number of PTEs still held by the hop; 0 means the hop was freed.
 */
int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr)
{
	struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr);
	int remaining;

	/*
	 * Keep the new count in a local: hl_mmu_dr_free_pgt_node() frees
	 * pgt_info, so the count cannot be read from it afterwards.
	 */
	remaining = --pgt_info->num_of_ptes;
	if (remaining)
		return remaining;

	hl_mmu_dr_free_pgt_node(ctx, pgt_info);

	return 0;
}

/**
 * hl_mmu_dr_alloc_hop() - allocate a new hop and start tracking it.
 * @ctx: context that will own the hop.
 *
 * Allocates a tracking node, a device-side table from the page-table pool and
 * a zeroed shadow table of the same size, then adds the node to the context's
 * shadow hash keyed by the shadow address. The new hop starts with zero PTEs.
 *
 * Return: shadow address of the new hop, or ULLONG_MAX if any allocation fails.
 */
u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct pgt_info *node;
	u64 dev_hop, shadow_hop;

	node = kmalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return ULLONG_MAX;

	/* The pool hands out 0 when it is exhausted */
	dev_hop = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_hop_table_size);
	if (!dev_hop) {
		dev_err(hdev->dev, "failed to allocate page\n");
		goto free_node;
	}

	shadow_hop = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size, GFP_KERNEL);
	if (!shadow_hop)
		goto free_dev_hop;

	node->ctx = ctx;
	node->phys_addr = dev_hop;
	node->shadow_addr = shadow_hop;
	node->num_of_ptes = 0;
	hash_add(ctx->mmu_shadow_hash, &node->node, shadow_hop);

	return shadow_hop;

free_dev_hop:
	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, dev_hop, prop->mmu_hop_table_size);
free_node:
	kfree(node);

	return ULLONG_MAX;
}

/**
 * hl_mmu_dr_get_alloc_next_hop_addr() - get the next hop, allocating it if absent.
 * @ctx: context that owns the page tables.
 * @curr_pte: PTE value from which the next hop address is extracted.
 * @is_new_hop: set to true/false only when an allocation was attempted,
 *              reflecting whether it succeeded. It is left untouched when the
 *              hop already exists, so the caller must initialize it.
 *
 * Return: address of the existing or newly allocated hop, or ULLONG_MAX when
 *         the hop was missing and could not be allocated.
 */
u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop)
{
	u64 next_hop = hl_mmu_get_next_hop_addr(ctx, curr_pte);

	if (next_hop != ULLONG_MAX)
		return next_hop;

	next_hop = hl_mmu_dr_alloc_hop(ctx);
	*is_new_hop = (next_hop != ULLONG_MAX);

	return next_hop;
}

void hl_mmu_dr_flush(struct hl_ctx *ctx)
{
	/* flush all writes from all cores to reach PCI */
	mb();
	ctx->hdev->asic_funcs->read_pte(ctx->hdev, hl_mmu_dr_get_phys_hop0_addr(ctx));
}

int hl_mmu_dr_init(struct hl_device *hdev)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	int rc;

	hdev->mmu_priv.dr.mmu_pgt_pool =
			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);

	if (!hdev->mmu_priv.dr.mmu_pgt_pool) {
		dev_err(hdev->dev, "Failed to create page gen pool\n");
		return -ENOMEM;
	}

	rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr +
			prop->mmu_hop0_tables_total_size,
			prop->dmmu.pgt_size - prop->mmu_hop0_tables_total_size,
			-1);
	if (rc) {
		dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
		goto err_pool_add;
	}

	hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid,
						prop->mmu_hop_table_size, GFP_KERNEL);
	if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
		rc = -ENOMEM;
		goto err_pool_add;
	}

	/* MMU H/W init will be done in device hw_init() */

	return 0;

err_pool_add:
	gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);

	return rc;
}

/**
 * hl_mmu_dr_fini() - tear down the software state of the device-resident MMU.
 * @hdev: habanalabs device structure.
 *
 * Frees the shadow hop0 tables and destroys the page-table pool. Does nothing
 * when the shadow hop0 tables are not allocated, so repeated calls are safe.
 */
void hl_mmu_dr_fini(struct hl_device *hdev)
{
	/* MMU H/W fini was already done in device hw_fini() */

	if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
		kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0);
		gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);

		/*
		 * Clear the pointer so that a second call without an
		 * intervening init returns early instead of freeing twice.
		 * This can happen, for example, if the hard reset code fails
		 * at certain points.
		 */
		hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL;
	}
}
+44 −308

File changed.

Preview size limit exceeded, changes collapsed.

Loading