Commit e7ae83da authored by Antonino Maniscalco, committed by Rob Clark
Browse files

drm/msm/a6xx: Implement preemption for a7xx targets



This patch implements the preemption feature for A6xx targets, which allows
the GPU to switch to a higher-priority ringbuffer if one is ready. A6XX
hardware supports multiple levels of preemption granularity, ranging from
coarse-grained (ringbuffer level) to finer-grained levels such as draw-call
or bin-boundary preemption. This patch enables the basic preemption level,
with finer-grained preemption support to follow.

Reviewed-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
Tested-by: Rob Clark <robdclark@gmail.com>
Tested-by: Neil Armstrong <neil.armstrong@linaro.org> # on SM8650-QRD
Tested-by: Neil Armstrong <neil.armstrong@linaro.org> # on SM8550-QRD
Tested-by: Neil Armstrong <neil.armstrong@linaro.org> # on SM8450-HDK
Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
Signed-off-by: Antonino Maniscalco <antomani103@gmail.com>
Patchwork: https://patchwork.freedesktop.org/patch/618021/


Signed-off-by: Rob Clark <robdclark@chromium.org>
parent 91389b4e
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@ adreno-y := \
	adreno/a6xx_gpu.o \
	adreno/a6xx_gmu.o \
	adreno/a6xx_hfi.o \
	adreno/a6xx_preempt.o \

adreno-$(CONFIG_DEBUG_FS) += adreno/a5xx_debugfs.o \

+182 −11
Original line number Diff line number Diff line
@@ -68,6 +68,8 @@ static void update_shadow_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)

static void a6xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
{
	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
	uint32_t wptr;
	unsigned long flags;

@@ -81,12 +83,17 @@ static void a6xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
	/* Make sure to wrap wptr if we need to */
	wptr = get_wptr(ring);

	spin_unlock_irqrestore(&ring->preempt_lock, flags);

	/* Make sure everything is posted before making a decision */
	mb();

	/* Update HW if this is the current ring and we are not in preempt */
	if (!a6xx_in_preempt(a6xx_gpu)) {
		if (a6xx_gpu->cur_ring == ring)
			gpu_write(gpu, REG_A6XX_CP_RB_WPTR, wptr);
		else
			ring->restore_wptr = true;
	} else {
		ring->restore_wptr = true;
	}

	spin_unlock_irqrestore(&ring->preempt_lock, flags);
}

static void get_stats_counter(struct msm_ringbuffer *ring, u32 counter,
@@ -138,12 +145,14 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,

	/*
	 * Write the new TTBR0 to the memstore. This is good for debugging.
	 * Needed for preemption
	 */
	OUT_PKT7(ring, CP_MEM_WRITE, 4);
	OUT_PKT7(ring, CP_MEM_WRITE, 5);
	OUT_RING(ring, CP_MEM_WRITE_0_ADDR_LO(lower_32_bits(memptr)));
	OUT_RING(ring, CP_MEM_WRITE_1_ADDR_HI(upper_32_bits(memptr)));
	OUT_RING(ring, lower_32_bits(ttbr));
	OUT_RING(ring, (asid << 16) | upper_32_bits(ttbr));
	OUT_RING(ring, upper_32_bits(ttbr));
	OUT_RING(ring, ctx->seqno);

	/*
	 * Sync both threads after switching pagetables and enable BR only
@@ -268,6 +277,34 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
	a6xx_flush(gpu, ring);
}

/*
 * Emit a CP_SET_PSEUDO_REG packet into @ring carrying four
 * (id, addr_lo, addr_hi) triplets: SMMU_INFO, NON_SECURE_SAVE_ADDR,
 * NON_PRIV_SAVE_ADDR and COUNTER.
 *
 * Only the non-secure save address is populated, using this ring's entry in
 * a6xx_gpu->preempt_iova[]; the remaining three addresses are written as 0.
 * @queue is accepted for the caller's convenience but is not read here.
 */
static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
		struct a6xx_gpu *a6xx_gpu, struct msm_gpu_submitqueue *queue)
{
	const uint64_t save_iova = a6xx_gpu->preempt_iova[ring->id];

	/* 4 pseudo registers x (id + lo + hi) = 12 payload dwords */
	OUT_PKT7(ring, CP_SET_PSEUDO_REG, 12);

	/* SMMU info is not saved by the CP; the kernel writes that record */
	OUT_RING(ring, SMMU_INFO);
	OUT_RING(ring, 0);
	OUT_RING(ring, 0);

	/* Privileged, non-secure save buffer for this ring */
	OUT_RING(ring, NON_SECURE_SAVE_ADDR);
	OUT_RING(ring, lower_32_bits(save_iova));
	OUT_RING(ring, upper_32_bits(save_iova));

	/* User context save buffer; appears to be unused by the firmware */
	OUT_RING(ring, NON_PRIV_SAVE_ADDR);
	OUT_RING(ring, 0);
	OUT_RING(ring, 0);

	/* Counter storage; a zero address seems to be enough to disable it */
	OUT_RING(ring, COUNTER);
	OUT_RING(ring, 0);
	OUT_RING(ring, 0);
}

static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
{
	unsigned int index = submit->seqno % MSM_GPU_SUBMIT_STATS_COUNT;
@@ -285,6 +322,13 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)

	a6xx_set_pagetable(a6xx_gpu, ring, submit->queue->ctx);

	/*
	 * If preemption is enabled, then set the pseudo register for the save
	 * sequence
	 */
	if (gpu->nr_rings > 1)
		a6xx_emit_set_pseudo_reg(ring, a6xx_gpu, submit->queue);

	get_stats_counter(ring, REG_A7XX_RBBM_PERFCTR_CP(0),
		rbmemptr_stats(ring, index, cpcycles_start));
	get_stats_counter(ring, REG_A6XX_CP_ALWAYS_ON_COUNTER,
@@ -376,6 +420,8 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
	OUT_RING(ring, upper_32_bits(rbmemptr(ring, bv_fence)));
	OUT_RING(ring, submit->seqno);

	a6xx_gpu->last_seqno[ring->id] = submit->seqno;

	/* write the ringbuffer timestamp */
	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
	OUT_RING(ring, CACHE_CLEAN | CP_EVENT_WRITE_0_IRQ | BIT(27));
@@ -389,10 +435,32 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
	OUT_PKT7(ring, CP_SET_MARKER, 1);
	OUT_RING(ring, 0x100); /* IFPC enable */

	/* If preemption is enabled */
	if (gpu->nr_rings > 1) {
		/* Yield the floor on command completion */
		OUT_PKT7(ring, CP_CONTEXT_SWITCH_YIELD, 4);

		/*
		 * If dword[2:1] are non zero, they specify an address for
		 * the CP to write the value of dword[3] to on preemption
		 * complete. Write 0 to skip the write
		 */
		OUT_RING(ring, 0x00);
		OUT_RING(ring, 0x00);
		/* Data value - not used if the address above is 0 */
		OUT_RING(ring, 0x01);
		/* generate interrupt on preemption completion */
		OUT_RING(ring, 0x00);
	}


	trace_msm_gpu_submit_flush(submit,
		gpu_read64(gpu, REG_A6XX_CP_ALWAYS_ON_COUNTER));

	a6xx_flush(gpu, ring);

	/* Check to see if we need to start preemption */
	a6xx_preempt_trigger(gpu);
}

static void a6xx_set_hwcg(struct msm_gpu *gpu, bool state)
@@ -599,6 +667,77 @@ static void a6xx_set_ubwc_config(struct msm_gpu *gpu)
		  adreno_gpu->ubwc_config.macrotile_mode);
}

/*
 * Fill the power-up register list buffer (a6xx_gpu->pwrup_reglist_ptr) with
 * the target's pwrup_reglist.
 *
 * The buffer starts with a struct cpu_gpu_lock header followed by the
 * register data. The overall list is made of three parts:
 *   1. static IFPC-only registers
 *   2. static IFPC + preemption registers
 *   3. dynamic IFPC + preemption registers (e.g. perfcounter selects)
 *
 * The two static parts are stored as (offset, value) pairs and their sizes,
 * counted in pairs, go in ifpc_list_len and preemption_list_len. Because
 * some perfcounter registers are virtualized with concurrent binning, the CP
 * needs the pipe id to program the aperture when restoring them, so the
 * dynamic part is stored as (aperture << 12, address, data) triplets whose
 * count goes in dynamic_list_len.
 *
 * This function only emits part 2; the IFPC-only and dynamic lists are left
 * empty.
 */
static void a7xx_patch_pwrup_reglist(struct msm_gpu *gpu)
{
	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
	const struct adreno_reglist_list *reglist =
		adreno_gpu->info->a6xx->pwrup_reglist;
	struct cpu_gpu_lock *lock = a6xx_gpu->pwrup_reglist_ptr;
	u32 *pairs = (u32 *)&lock->regs[0];
	int i;

	/* Reset the lock words and publish the static list sizes */
	lock->turn = 0;
	lock->cpu_req = 0;
	lock->gpu_req = 0;
	lock->ifpc_list_len = 0;
	lock->preemption_list_len = reglist->count;

	/* Record each register's offset together with its current value */
	for (i = 0; i < reglist->count; i++) {
		pairs[2 * i] = reglist->regs[i];
		pairs[2 * i + 1] = gpu_read(gpu, reglist->regs[i]);
	}

	/* No dynamic (triplet) entries are emitted */
	lock->dynamic_list_len = 0;
}

/*
 * Queue an initial yield on ring 0 so the CP has its pseudo registers set up
 * for preemption.
 *
 * Does nothing and returns 0 when the GPU has a single ring. Otherwise turns
 * CP protected mode off, emits the pseudo register packet and a
 * CP_CONTEXT_SWITCH_YIELD with an all-zero payload, flushes the ring and
 * waits for it to idle.
 *
 * Returns 0 on success or -EINVAL if the ring did not go idle.
 */
static int a7xx_preempt_start(struct msm_gpu *gpu)
{
	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
	struct msm_ringbuffer *ring = gpu->rb[0];

	/* Preemption needs at least two rings */
	if (gpu->nr_rings < 2)
		return 0;

	/* Turn CP protection off */
	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
	OUT_RING(ring, 0);

	a6xx_emit_set_pseudo_reg(ring, a6xx_gpu, NULL);

	/*
	 * Yield the floor on command completion: zero write-back address
	 * (two dwords), zero data value, and a zero final dword (the
	 * interrupt-on-completion control).
	 */
	OUT_PKT7(ring, CP_CONTEXT_SWITCH_YIELD, 4);
	OUT_RING(ring, 0x00);
	OUT_RING(ring, 0x00);
	OUT_RING(ring, 0x00);
	OUT_RING(ring, 0x00);

	a6xx_flush(gpu, ring);

	if (!a6xx_idle(gpu, ring))
		return -EINVAL;

	return 0;
}

static int a6xx_cp_init(struct msm_gpu *gpu)
{
	struct msm_ringbuffer *ring = gpu->rb[0];
@@ -630,6 +769,8 @@ static int a6xx_cp_init(struct msm_gpu *gpu)

static int a7xx_cp_init(struct msm_gpu *gpu)
{
	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
	struct msm_ringbuffer *ring = gpu->rb[0];
	u32 mask;

@@ -667,11 +808,11 @@ static int a7xx_cp_init(struct msm_gpu *gpu)

	/* *Don't* send a power up reg list for concurrent binning (TODO) */
	/* Lo address */
	OUT_RING(ring, 0x00000000);
	OUT_RING(ring, lower_32_bits(a6xx_gpu->pwrup_reglist_iova));
	/* Hi address */
	OUT_RING(ring, 0x00000000);
	OUT_RING(ring, upper_32_bits(a6xx_gpu->pwrup_reglist_iova));
	/* BIT(31) set => read the regs from the list */
	OUT_RING(ring, 0x00000000);
	OUT_RING(ring, BIT(31));

	a6xx_flush(gpu, ring);
	return a6xx_idle(gpu, ring) ? 0 : -EINVAL;
@@ -795,6 +936,16 @@ static int a6xx_ucode_load(struct msm_gpu *gpu)
		msm_gem_object_set_name(a6xx_gpu->shadow_bo, "shadow");
	}

	a6xx_gpu->pwrup_reglist_ptr = msm_gem_kernel_new(gpu->dev, PAGE_SIZE,
							 MSM_BO_WC  | MSM_BO_MAP_PRIV,
							 gpu->aspace, &a6xx_gpu->pwrup_reglist_bo,
							 &a6xx_gpu->pwrup_reglist_iova);

	if (IS_ERR(a6xx_gpu->pwrup_reglist_ptr))
		return PTR_ERR(a6xx_gpu->pwrup_reglist_ptr);

	msm_gem_object_set_name(a6xx_gpu->pwrup_reglist_bo, "pwrup_reglist");

	return 0;
}

@@ -1125,6 +1276,8 @@ static int hw_init(struct msm_gpu *gpu)
	if (a6xx_gpu->shadow_bo) {
		gpu_write64(gpu, REG_A6XX_CP_RB_RPTR_ADDR,
			shadowptr(a6xx_gpu, gpu->rb[0]));
		for (unsigned int i = 0; i < gpu->nr_rings; i++)
			a6xx_gpu->shadow[i] = 0;
	}

	/* ..which means "always" on A7xx, also for BV shadow */
@@ -1133,6 +1286,8 @@ static int hw_init(struct msm_gpu *gpu)
			    rbmemptr(gpu->rb[0], bv_rptr));
	}

	a6xx_preempt_hw_init(gpu);

	/* Always come up on rb 0 */
	a6xx_gpu->cur_ring = gpu->rb[0];

@@ -1142,6 +1297,11 @@ static int hw_init(struct msm_gpu *gpu)
	/* Enable the SQE_to start the CP engine */
	gpu_write(gpu, REG_A6XX_CP_SQE_CNTL, 1);

	if (adreno_is_a7xx(adreno_gpu) && !a6xx_gpu->pwrup_reglist_emitted) {
		a7xx_patch_pwrup_reglist(gpu);
		a6xx_gpu->pwrup_reglist_emitted = true;
	}

	ret = adreno_is_a7xx(adreno_gpu) ? a7xx_cp_init(gpu) : a6xx_cp_init(gpu);
	if (ret)
		goto out;
@@ -1179,6 +1339,10 @@ static int hw_init(struct msm_gpu *gpu)
out:
	if (adreno_has_gmu_wrapper(adreno_gpu))
		return ret;

	/* Last step - yield the ringbuffer */
	a7xx_preempt_start(gpu);

	/*
	 * Tell the GMU that we are done touching the GPU and it can start power
	 * management
@@ -1556,8 +1720,13 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
	if (status & A6XX_RBBM_INT_0_MASK_SWFUSEVIOLATION)
		a7xx_sw_fuse_violation_irq(gpu);

	if (status & A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS)
	if (status & A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS) {
		msm_gpu_retire(gpu);
		a6xx_preempt_trigger(gpu);
	}

	if (status & A6XX_RBBM_INT_0_MASK_CP_SW)
		a6xx_preempt_irq(gpu);

	return IRQ_HANDLED;
}
@@ -2330,6 +2499,8 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev)
				a6xx_fault_handler);

	a6xx_calc_ubwc_config(adreno_gpu);
	/* Set up the preemption specific bits and pieces for each ringbuffer */
	a6xx_preempt_init(gpu);

	return gpu;
}
+162 −0
Original line number Diff line number Diff line
@@ -12,6 +12,24 @@

extern bool hang_debug;

/*
 * struct cpu_gpu_lock - header placed at the start of the power-up register
 * list buffer, followed by the register data itself.
 *
 * a7xx_patch_pwrup_reglist() zeroes gpu_req, cpu_req and turn, fills in the
 * list lengths and then writes 32-bit (offset, value) pairs starting at
 * regs[0].
 *
 * NOTE(review): gpu_req/cpu_req/turn look like a CPU<->GPU handshake; the
 * visible code only ever clears them - confirm the protocol against the
 * firmware interface.
 */
struct cpu_gpu_lock {
	uint32_t gpu_req;
	uint32_t cpu_req;
	uint32_t turn;
	/* Two views of the same 32-bit word describing the list(s) in regs[] */
	union {
		/* NOTE(review): not referenced by the visible code - presumably an older layout */
		struct {
			uint16_t list_length;
			uint16_t list_offset;
		};
		/* Lengths of the three sub-lists; see a7xx_patch_pwrup_reglist() */
		struct {
			/* number of pairs in the static IFPC-only list */
			uint8_t ifpc_list_len;
			/* number of pairs in the static IFPC + preemption list */
			uint8_t preemption_list_len;
			/* number of triplets in the dynamic list */
			uint16_t dynamic_list_len;
		};
	};
	/* Register data; written as u32 pairs, so each u64 slot holds one pair */
	uint64_t regs[62];
};

/**
 * struct a6xx_info - a6xx specific information from device table
 *
@@ -35,6 +53,23 @@ struct a6xx_gpu {
	uint64_t sqe_iova;

	struct msm_ringbuffer *cur_ring;
	struct msm_ringbuffer *next_ring;

	struct drm_gem_object *preempt_bo[MSM_GPU_MAX_RINGS];
	void *preempt[MSM_GPU_MAX_RINGS];
	uint64_t preempt_iova[MSM_GPU_MAX_RINGS];
	struct drm_gem_object *preempt_smmu_bo[MSM_GPU_MAX_RINGS];
	void *preempt_smmu[MSM_GPU_MAX_RINGS];
	uint64_t preempt_smmu_iova[MSM_GPU_MAX_RINGS];
	uint32_t last_seqno[MSM_GPU_MAX_RINGS];

	atomic_t preempt_state;
	spinlock_t eval_lock;
	struct timer_list preempt_timer;

	unsigned int preempt_level;
	bool uses_gmem;
	bool skip_save_restore;

	struct a6xx_gmu gmu;

@@ -42,6 +77,11 @@ struct a6xx_gpu {
	uint64_t shadow_iova;
	uint32_t *shadow;

	struct drm_gem_object *pwrup_reglist_bo;
	void *pwrup_reglist_ptr;
	uint64_t pwrup_reglist_iova;
	bool pwrup_reglist_emitted;

	bool has_whereami;

	void __iomem *llc_mmio;
@@ -53,6 +93,100 @@ struct a6xx_gpu {

#define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)

/*
 * In order to do lockless preemption we use a simple state machine to progress
 * through the process.
 *
 * PREEMPT_NONE - no preemption in progress.  Next state START.
 * PREEMPT_START - The trigger is evaluating if preemption is possible. Next
 * states: TRIGGERED, NONE
 * PREEMPT_FINISH - An intermediate state before moving back to NONE. Next
 * state: NONE.
 * PREEMPT_TRIGGERED: A preemption has been executed on the hardware. Next
 * states: FAULTED, PENDING
 * PREEMPT_FAULTED: A preemption timed out (never completed). This will trigger
 * recovery.  Next state: N/A
 * PREEMPT_PENDING: Preemption complete interrupt fired - the callback is
 * checking the success of the operation. Next state: FAULTED, NONE.
 */

/* States of the preemption state machine stored in a6xx_gpu::preempt_state */
enum a6xx_preempt_state {
	/* No preemption in progress */
	PREEMPT_NONE = 0,
	/* The trigger is evaluating whether preemption is possible */
	PREEMPT_START,
	/* Intermediate state before moving back to NONE */
	PREEMPT_FINISH,
	/* A preemption has been executed on the hardware */
	PREEMPT_TRIGGERED,
	/* A preemption timed out (never completed); triggers recovery */
	PREEMPT_FAULTED,
	/* Completion interrupt fired; the callback is checking the result */
	PREEMPT_PENDING,
};

/*
 * struct a6xx_preempt_record is a shared buffer between the microcode and the
 * CPU to store the state for preemption. The record itself is much larger
 * (2112k) but most of that is used by the CP for storage.
 *
 * NOTE(review): PREEMPT_RECORD_SIZE() in this file falls back to 4192K when
 * the target does not specify a size, so the 2112k figure above may be stale
 * - confirm.
 *
 * There is a preemption record assigned per ringbuffer. When the CPU triggers a
 * preemption, it fills out the record with the useful information (wptr, ring
 * base, etc) and the microcode uses that information to set up the CP following
 * the preemption.  When a ring is switched out, the CP will save the ringbuffer
 * state back to the record. In this way, once the records are properly set up
 * the CPU can quickly switch back and forth between ringbuffers by only
 * updating a few registers (often only the wptr).
 *
 * These are the CPU aware registers in the record:
 * @magic: Must always be 0xAE399D6EUL (A6XX_PREEMPT_RECORD_MAGIC)
 * @info: Type of the record - written 0 by the CPU, updated by the CP
 * @errno: preemption error record
 * @data: Data field in YIELD and SET_MARKER packets, Written and used by CP
 * @cntl: Value of RB_CNTL written by CPU, save/restored by CP
 * @rptr: Value of RB_RPTR written by CPU, save/restored by CP
 * @wptr: Value of RB_WPTR written by CPU, save/restored by CP
 * @_pad: Reserved/padding
 * @rptr_addr: Value of RB_RPTR_ADDR_LO|HI written by CPU, save/restored by CP
 * @rbase: Value of RB_BASE written by CPU, save/restored by CP
 * @counter: GPU address of the storage area for the preemption counters
 * @bv_rptr_addr: Value of BV_RB_RPTR_ADDR_LO|HI written by CPU, save/restored by CP
 */
struct a6xx_preempt_record {
	u32 magic;
	u32 info;
	u32 errno;
	u32 data;
	u32 cntl;
	u32 rptr;
	u32 wptr;
	u32 _pad;
	u64 rptr_addr;
	u64 rbase;
	u64 counter;
	u64 bv_rptr_addr;
};

#define A6XX_PREEMPT_RECORD_MAGIC 0xAE399D6EUL

#define PREEMPT_SMMU_INFO_SIZE 4096

/*
 * Size of one preemption record for @adreno_gpu: the target's
 * info->preempt_record_size when that is non-zero, otherwise a 4192K default.
 *
 * The argument is parenthesized so that any pointer expression (e.g. "&gpu"
 * or "base + 1") expands correctly. It is evaluated more than once, so it
 * must be free of side effects.
 */
#define PREEMPT_RECORD_SIZE(adreno_gpu) \
	(((adreno_gpu)->info->preempt_record_size) == 0 ? \
	 4192 * SZ_1K : ((adreno_gpu)->info->preempt_record_size))

/*
 * The preemption counter block is a storage area for the value of the
 * preemption counters that are saved immediately before context switch. We
 * append it on to the end of the allocation for the preemption record.
 */
#define A6XX_PREEMPT_COUNTER_SIZE (16 * 4)

/*
 * struct a7xx_cp_smmu_info - SMMU state record associated with preemption.
 *
 * NOTE(review): presumably this is the per-ring record backing
 * a6xx_gpu::preempt_smmu[] that the kernel fills in itself instead of letting
 * the CP save it - its users are not visible here, confirm against
 * a6xx_preempt.c.
 */
struct a7xx_cp_smmu_info {
	/* NOTE(review): expected to hold GEN7_CP_SMMU_INFO_MAGIC - confirm */
	u32 magic;
	/* padding; keeps ttbr0 at an 8-byte offset */
	u32 _pad4;
	/* NOTE(review): field meanings below are inferred from their names - confirm */
	u64 ttbr0;
	u32 asid;
	u32 context_idr;
	u32 context_bank;
};

#define GEN7_CP_SMMU_INFO_MAGIC 0x241350d5UL

/*
 * Given a register and a count, return a value to program into
 * REG_CP_PROTECT_REG(n) - this will block both reads and writes for
@@ -110,6 +244,34 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node);
int a6xx_gmu_wrapper_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node);
void a6xx_gmu_remove(struct a6xx_gpu *a6xx_gpu);

void a6xx_preempt_init(struct msm_gpu *gpu);
void a6xx_preempt_hw_init(struct msm_gpu *gpu);
void a6xx_preempt_trigger(struct msm_gpu *gpu);
void a6xx_preempt_irq(struct msm_gpu *gpu);
void a6xx_preempt_fini(struct msm_gpu *gpu);
int a6xx_preempt_submitqueue_setup(struct msm_gpu *gpu,
		struct msm_gpu_submitqueue *queue);
void a6xx_preempt_submitqueue_close(struct msm_gpu *gpu,
		struct msm_gpu_submitqueue *queue);

/*
 * Report whether a preemption is currently in flight, i.e. preempt_state is
 * anything other than PREEMPT_NONE or PREEMPT_FINISH.
 */
static inline bool a6xx_in_preempt(struct a6xx_gpu *a6xx_gpu)
{
	int state;

	/* Order the preempt_state read against reads issued before it ... */
	smp_rmb();

	state = atomic_read(&a6xx_gpu->preempt_state);

	/* ... and against reads issued after it. */
	smp_rmb();

	return state != PREEMPT_NONE && state != PREEMPT_FINISH;
}

void a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp,
		       bool suspended);
unsigned long a6xx_gmu_get_freq(struct msm_gpu *gpu);
+393 −0

File added.

Preview size limit exceeded, changes collapsed.

+7 −0
Original line number Diff line number Diff line
@@ -36,6 +36,7 @@ struct msm_rbmemptrs {

	volatile struct msm_gpu_submit_stats stats[MSM_GPU_SUBMIT_STATS_COUNT];
	volatile u64 ttbr0;
	volatile u32 context_idr;
};

struct msm_cp_state {
@@ -101,6 +102,12 @@ struct msm_ringbuffer {
	 */
	spinlock_t preempt_lock;

	/*
	 * Whether we skipped writing wptr and it needs to be updated in the
	 * future when the ring becomes current.
	 */
	bool restore_wptr;

	/**
	 * cur_ctx_seqno:
	 *