drm/msm/a6xx: Implement preemption for a7xx targets (e7ae83da) · Commits · git / linux-net

drivers/gpu/drm/msm/Makefile

+1 −0

Original line number	Diff line number	Diff line
		@@ -23,6 +23,7 @@ adreno-y := \
		adreno/a6xx_gpu.o \
		adreno/a6xx_gmu.o \
		adreno/a6xx_hfi.o \
		adreno/a6xx_preempt.o \

		adreno-$(CONFIG_DEBUG_FS) += adreno/a5xx_debugfs.o \

drivers/gpu/drm/msm/adreno/a6xx_gpu.c

+182 −11

Original line number	Diff line number	Diff line
		@@ -68,6 +68,8 @@ static void update_shadow_rptr(struct msm_gpu gpu, struct msm_ringbuffer ring)

		static void a6xx_flush(struct msm_gpu gpu, struct msm_ringbuffer ring)
		{
		struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
		struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
		uint32_t wptr;
		unsigned long flags;

		@@ -81,12 +83,17 @@ static void a6xx_flush(struct msm_gpu gpu, struct msm_ringbuffer ring)
		/* Make sure to wrap wptr if we need to */
		wptr = get_wptr(ring);

		spin_unlock_irqrestore(&ring->preempt_lock, flags);

		/* Make sure everything is posted before making a decision */
		mb();

		/* Update HW if this is the current ring and we are not in preempt */
		if (!a6xx_in_preempt(a6xx_gpu)) {
		if (a6xx_gpu->cur_ring == ring)
		gpu_write(gpu, REG_A6XX_CP_RB_WPTR, wptr);
		else
		ring->restore_wptr = true;
		} else {
		ring->restore_wptr = true;
		}

		spin_unlock_irqrestore(&ring->preempt_lock, flags);
		}

		static void get_stats_counter(struct msm_ringbuffer *ring, u32 counter,
		@@ -138,12 +145,14 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,

		/*
		* Write the new TTBR0 to the memstore. This is good for debugging.
		* Needed for preemption
		*/
		OUT_PKT7(ring, CP_MEM_WRITE, 4);
		OUT_PKT7(ring, CP_MEM_WRITE, 5);
		OUT_RING(ring, CP_MEM_WRITE_0_ADDR_LO(lower_32_bits(memptr)));
		OUT_RING(ring, CP_MEM_WRITE_1_ADDR_HI(upper_32_bits(memptr)));
		OUT_RING(ring, lower_32_bits(ttbr));
		OUT_RING(ring, (asid << 16) \| upper_32_bits(ttbr));
		OUT_RING(ring, upper_32_bits(ttbr));
		OUT_RING(ring, ctx->seqno);

		/*
		* Sync both threads after switching pagetables and enable BR only
		@@ -268,6 +277,34 @@ static void a6xx_submit(struct msm_gpu gpu, struct msm_gem_submit submit)
		a6xx_flush(gpu, ring);
		}

		static void a6xx_emit_set_pseudo_reg(struct msm_ringbuffer *ring,
		struct a6xx_gpu a6xx_gpu, struct msm_gpu_submitqueue queue)
		{
		OUT_PKT7(ring, CP_SET_PSEUDO_REG, 12);

		OUT_RING(ring, SMMU_INFO);
		/* don't save SMMU, we write the record from the kernel instead */
		OUT_RING(ring, 0);
		OUT_RING(ring, 0);

		/* privileged and non secure buffer save */
		OUT_RING(ring, NON_SECURE_SAVE_ADDR);
		OUT_RING(ring, lower_32_bits(
		a6xx_gpu->preempt_iova[ring->id]));
		OUT_RING(ring, upper_32_bits(
		a6xx_gpu->preempt_iova[ring->id]));

		/* user context buffer save, seems to be unnused by fw */
		OUT_RING(ring, NON_PRIV_SAVE_ADDR);
		OUT_RING(ring, 0);
		OUT_RING(ring, 0);

		OUT_RING(ring, COUNTER);
		/* seems OK to set to 0 to disable it */
		OUT_RING(ring, 0);
		OUT_RING(ring, 0);
		}

		static void a7xx_submit(struct msm_gpu gpu, struct msm_gem_submit submit)
		{
		unsigned int index = submit->seqno % MSM_GPU_SUBMIT_STATS_COUNT;
		@@ -285,6 +322,13 @@ static void a7xx_submit(struct msm_gpu gpu, struct msm_gem_submit submit)

		a6xx_set_pagetable(a6xx_gpu, ring, submit->queue->ctx);

		/*
		* If preemption is enabled, then set the pseudo register for the save
		* sequence
		*/
		if (gpu->nr_rings > 1)
		a6xx_emit_set_pseudo_reg(ring, a6xx_gpu, submit->queue);

		get_stats_counter(ring, REG_A7XX_RBBM_PERFCTR_CP(0),
		rbmemptr_stats(ring, index, cpcycles_start));
		get_stats_counter(ring, REG_A6XX_CP_ALWAYS_ON_COUNTER,
		@@ -376,6 +420,8 @@ static void a7xx_submit(struct msm_gpu gpu, struct msm_gem_submit submit)
		OUT_RING(ring, upper_32_bits(rbmemptr(ring, bv_fence)));
		OUT_RING(ring, submit->seqno);

		a6xx_gpu->last_seqno[ring->id] = submit->seqno;

		/* write the ringbuffer timestamp */
		OUT_PKT7(ring, CP_EVENT_WRITE, 4);
		OUT_RING(ring, CACHE_CLEAN \| CP_EVENT_WRITE_0_IRQ \| BIT(27));
		@@ -389,10 +435,32 @@ static void a7xx_submit(struct msm_gpu gpu, struct msm_gem_submit submit)
		OUT_PKT7(ring, CP_SET_MARKER, 1);
		OUT_RING(ring, 0x100); /* IFPC enable */

		/* If preemption is enabled */
		if (gpu->nr_rings > 1) {
		/* Yield the floor on command completion */
		OUT_PKT7(ring, CP_CONTEXT_SWITCH_YIELD, 4);

		/*
		* If dword[2:1] are non zero, they specify an address for
		* the CP to write the value of dword[3] to on preemption
		* complete. Write 0 to skip the write
		*/
		OUT_RING(ring, 0x00);
		OUT_RING(ring, 0x00);
		/* Data value - not used if the address above is 0 */
		OUT_RING(ring, 0x01);
		/* generate interrupt on preemption completion */
		OUT_RING(ring, 0x00);
		}


		trace_msm_gpu_submit_flush(submit,
		gpu_read64(gpu, REG_A6XX_CP_ALWAYS_ON_COUNTER));

		a6xx_flush(gpu, ring);

		/* Check to see if we need to start preemption */
		a6xx_preempt_trigger(gpu);
		}

		static void a6xx_set_hwcg(struct msm_gpu *gpu, bool state)
		@@ -599,6 +667,77 @@ static void a6xx_set_ubwc_config(struct msm_gpu *gpu)
		adreno_gpu->ubwc_config.macrotile_mode);
		}

		static void a7xx_patch_pwrup_reglist(struct msm_gpu *gpu)
		{
		struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
		struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
		const struct adreno_reglist_list *reglist;
		void *ptr = a6xx_gpu->pwrup_reglist_ptr;
		struct cpu_gpu_lock *lock = ptr;
		u32 dest = (u32 )&lock->regs[0];
		int i;

		reglist = adreno_gpu->info->a6xx->pwrup_reglist;

		lock->gpu_req = lock->cpu_req = lock->turn = 0;
		lock->ifpc_list_len = 0;
		lock->preemption_list_len = reglist->count;

		/*
		* For each entry in each of the lists, write the offset and the current
		* register value into the GPU buffer
		*/
		for (i = 0; i < reglist->count; i++) {
		*dest++ = reglist->regs[i];
		*dest++ = gpu_read(gpu, reglist->regs[i]);
		}

		/*
		* The overall register list is composed of
		* 1. Static IFPC-only registers
		* 2. Static IFPC + preemption registers
		* 3. Dynamic IFPC + preemption registers (ex: perfcounter selects)
		*
		* The first two lists are static. Size of these lists are stored as
		* number of pairs in ifpc_list_len and preemption_list_len
		* respectively. With concurrent binning, Some of the perfcounter
		* registers being virtualized, CP needs to know the pipe id to program
		* the aperture inorder to restore the same. Thus, third list is a
		* dynamic list with triplets as
		* (<aperture, shifted 12 bits> <address> <data>), and the length is
		* stored as number for triplets in dynamic_list_len.
		*/
		lock->dynamic_list_len = 0;
		}

		/*
		 * Queue the initial preemption setup on ring 0 and wait for it to drain.
		 *
		 * Does nothing and returns 0 when the GPU has at most one ring.
		 * Otherwise it turns CP protection off, emits the pseudo register setup
		 * and a CP_CONTEXT_SWITCH_YIELD, flushes the ring and waits for idle.
		 *
		 * Returns 0 on success, or -EINVAL if a6xx_idle() reports failure.
		 */
		static int a7xx_preempt_start(struct msm_gpu *gpu)
		{
			struct adreno_gpu *adreno = to_adreno_gpu(gpu);
			struct a6xx_gpu *a6xx = to_a6xx_gpu(adreno);
			struct msm_ringbuffer *rb0 = gpu->rb[0];

			/* A single ring means there is nothing to preempt to */
			if (gpu->nr_rings < 2)
				return 0;

			/* Turn CP protection off */
			OUT_PKT7(rb0, CP_SET_PROTECTED_MODE, 1);
			OUT_RING(rb0, 0);

			a6xx_emit_set_pseudo_reg(rb0, a6xx, NULL);

			/*
			 * Yield the floor on command completion. The first three payload
			 * dwords are zero; the fourth is the "generate interrupt on
			 * preemption completion" dword, also written as zero.
			 */
			OUT_PKT7(rb0, CP_CONTEXT_SWITCH_YIELD, 4);
			OUT_RING(rb0, 0x00);
			OUT_RING(rb0, 0x00);
			OUT_RING(rb0, 0x00);
			OUT_RING(rb0, 0x00);

			a6xx_flush(gpu, rb0);

			if (!a6xx_idle(gpu, rb0))
				return -EINVAL;

			return 0;
		}

		static int a6xx_cp_init(struct msm_gpu *gpu)
		{
		struct msm_ringbuffer *ring = gpu->rb[0];
		@@ -630,6 +769,8 @@ static int a6xx_cp_init(struct msm_gpu *gpu)

		static int a7xx_cp_init(struct msm_gpu *gpu)
		{
		struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
		struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
		struct msm_ringbuffer *ring = gpu->rb[0];
		u32 mask;

		@@ -667,11 +808,11 @@ static int a7xx_cp_init(struct msm_gpu *gpu)

		/* Don't send a power up reg list for concurrent binning (TODO) */
		/* Lo address */
		OUT_RING(ring, 0x00000000);
		OUT_RING(ring, lower_32_bits(a6xx_gpu->pwrup_reglist_iova));
		/* Hi address */
		OUT_RING(ring, 0x00000000);
		OUT_RING(ring, upper_32_bits(a6xx_gpu->pwrup_reglist_iova));
		/* BIT(31) set => read the regs from the list */
		OUT_RING(ring, 0x00000000);
		OUT_RING(ring, BIT(31));

		a6xx_flush(gpu, ring);
		return a6xx_idle(gpu, ring) ? 0 : -EINVAL;
		@@ -795,6 +936,16 @@ static int a6xx_ucode_load(struct msm_gpu *gpu)
		msm_gem_object_set_name(a6xx_gpu->shadow_bo, "shadow");
		}

		a6xx_gpu->pwrup_reglist_ptr = msm_gem_kernel_new(gpu->dev, PAGE_SIZE,
		MSM_BO_WC \| MSM_BO_MAP_PRIV,
		gpu->aspace, &a6xx_gpu->pwrup_reglist_bo,
		&a6xx_gpu->pwrup_reglist_iova);

		if (IS_ERR(a6xx_gpu->pwrup_reglist_ptr))
		return PTR_ERR(a6xx_gpu->pwrup_reglist_ptr);

		msm_gem_object_set_name(a6xx_gpu->pwrup_reglist_bo, "pwrup_reglist");

		return 0;
		}

		@@ -1125,6 +1276,8 @@ static int hw_init(struct msm_gpu *gpu)
		if (a6xx_gpu->shadow_bo) {
		gpu_write64(gpu, REG_A6XX_CP_RB_RPTR_ADDR,
		shadowptr(a6xx_gpu, gpu->rb[0]));
		for (unsigned int i = 0; i < gpu->nr_rings; i++)
		a6xx_gpu->shadow[i] = 0;
		}

		/* ..which means "always" on A7xx, also for BV shadow */
		@@ -1133,6 +1286,8 @@ static int hw_init(struct msm_gpu *gpu)
		rbmemptr(gpu->rb[0], bv_rptr));
		}

		a6xx_preempt_hw_init(gpu);

		/* Always come up on rb 0 */
		a6xx_gpu->cur_ring = gpu->rb[0];

		@@ -1142,6 +1297,11 @@ static int hw_init(struct msm_gpu *gpu)
		/* Enable the SQE to start the CP engine */
		gpu_write(gpu, REG_A6XX_CP_SQE_CNTL, 1);

		if (adreno_is_a7xx(adreno_gpu) && !a6xx_gpu->pwrup_reglist_emitted) {
		a7xx_patch_pwrup_reglist(gpu);
		a6xx_gpu->pwrup_reglist_emitted = true;
		}

		ret = adreno_is_a7xx(adreno_gpu) ? a7xx_cp_init(gpu) : a6xx_cp_init(gpu);
		if (ret)
		goto out;
		@@ -1179,6 +1339,10 @@ static int hw_init(struct msm_gpu *gpu)
		out:
		if (adreno_has_gmu_wrapper(adreno_gpu))
		return ret;

		/* Last step - yield the ringbuffer */
		a7xx_preempt_start(gpu);

		/*
		* Tell the GMU that we are done touching the GPU and it can start power
		* management
		@@ -1556,8 +1720,13 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
		if (status & A6XX_RBBM_INT_0_MASK_SWFUSEVIOLATION)
		a7xx_sw_fuse_violation_irq(gpu);

		if (status & A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS)
		if (status & A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS) {
		msm_gpu_retire(gpu);
		a6xx_preempt_trigger(gpu);
		}

		if (status & A6XX_RBBM_INT_0_MASK_CP_SW)
		a6xx_preempt_irq(gpu);

		return IRQ_HANDLED;
		}
		@@ -2330,6 +2499,8 @@ struct msm_gpu a6xx_gpu_init(struct drm_device dev)
		a6xx_fault_handler);

		a6xx_calc_ubwc_config(adreno_gpu);
		/* Set up the preemption specific bits and pieces for each ringbuffer */
		a6xx_preempt_init(gpu);

		return gpu;
		}

drivers/gpu/drm/msm/adreno/a6xx_gpu.h

+162 −0

Original line number	Diff line number	Diff line
		@@ -12,6 +12,24 @@

		extern bool hang_debug;

		/*
		* struct cpu_gpu_lock - header and payload of the power-up register list
		* buffer. a7xx_patch_pwrup_reglist() overlays this struct on
		* a6xx_gpu->pwrup_reglist_ptr and fills it in.
		*
		* The layout is read by the GPU, so field order and sizes must not change.
		*/
		struct cpu_gpu_lock {
		/*
		* gpu_req, cpu_req and turn are zeroed by a7xx_patch_pwrup_reglist().
		* NOTE(review): presumably a CPU/GPU lock handshake - confirm.
		*/
		uint32_t gpu_req;
		uint32_t cpu_req;
		uint32_t turn;
		/* Two views of the same 32 bits describing the register list */
		union {
		/* Not referenced by the code visible in this diff */
		struct {
		uint16_t list_length;
		uint16_t list_offset;
		};
		/* View used by a7xx_patch_pwrup_reglist() */
		struct {
		/* Number of (offset, value) pairs in the IFPC-only list */
		uint8_t ifpc_list_len;
		/* Number of (offset, value) pairs in the IFPC + preemption list */
		uint8_t preemption_list_len;
		/* Number of (aperture, address, data) triplets in the dynamic list */
		uint16_t dynamic_list_len;
		};
		};
		/*
		* Register list storage. a7xx_patch_pwrup_reglist() writes it as
		* consecutive u32 (offset, value) pairs, so the 62 u64 slots hold up to
		* 62 such pairs.
		*/
		uint64_t regs[62];
		};

		/**
		* struct a6xx_info - a6xx specific information from device table
		*
		@@ -35,6 +53,23 @@ struct a6xx_gpu {
		uint64_t sqe_iova;

		struct msm_ringbuffer *cur_ring;
		struct msm_ringbuffer *next_ring;

		struct drm_gem_object *preempt_bo[MSM_GPU_MAX_RINGS];
		void *preempt[MSM_GPU_MAX_RINGS];
		uint64_t preempt_iova[MSM_GPU_MAX_RINGS];
		struct drm_gem_object *preempt_smmu_bo[MSM_GPU_MAX_RINGS];
		void *preempt_smmu[MSM_GPU_MAX_RINGS];
		uint64_t preempt_smmu_iova[MSM_GPU_MAX_RINGS];
		uint32_t last_seqno[MSM_GPU_MAX_RINGS];

		atomic_t preempt_state;
		spinlock_t eval_lock;
		struct timer_list preempt_timer;

		unsigned int preempt_level;
		bool uses_gmem;
		bool skip_save_restore;

		struct a6xx_gmu gmu;

		@@ -42,6 +77,11 @@ struct a6xx_gpu {
		uint64_t shadow_iova;
		uint32_t *shadow;

		struct drm_gem_object *pwrup_reglist_bo;
		void *pwrup_reglist_ptr;
		uint64_t pwrup_reglist_iova;
		bool pwrup_reglist_emitted;

		bool has_whereami;

		void __iomem *llc_mmio;
		@@ -53,6 +93,100 @@ struct a6xx_gpu {

		#define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)

		/*
		 * Preemption is driven locklessly by a small state machine. Each state
		 * and the states it may move to:
		 *
		 * PREEMPT_NONE      - no preemption in progress.
		 *                     Next state: START.
		 * PREEMPT_START     - the trigger is evaluating if preemption is possible.
		 *                     Next states: TRIGGERED, NONE.
		 * PREEMPT_FINISH    - an intermediate state before moving back to NONE.
		 *                     Next state: NONE.
		 * PREEMPT_TRIGGERED - a preemption has been executed on the hardware.
		 *                     Next states: FAULTED, PENDING.
		 * PREEMPT_FAULTED   - a preemption timed out (never completed). This will
		 *                     trigger recovery. Next state: N/A.
		 * PREEMPT_PENDING   - the preemption complete interrupt fired and the
		 *                     callback is checking the success of the operation.
		 *                     Next states: FAULTED, NONE.
		 */

		enum a6xx_preempt_state {
			PREEMPT_NONE      = 0,
			PREEMPT_START     = 1,
			PREEMPT_FINISH    = 2,
			PREEMPT_TRIGGERED = 3,
			PREEMPT_FAULTED   = 4,
			PREEMPT_PENDING   = 5,
		};

		/*
		* struct a6xx_preempt_record is a shared buffer between the microcode and the
		* CPU to store the state for preemption. The record itself is much larger
		* (2112k) but most of that is used by the CP for storage.
		* NOTE(review): PREEMPT_RECORD_SIZE defaults to 4192 * SZ_1K, which does not
		* match the 2112k figure above - confirm which one is current.
		*
		* There is a preemption record assigned per ringbuffer. When the CPU triggers a
		* preemption, it fills out the record with the useful information (wptr, ring
		* base, etc) and the microcode uses that information to set up the CP following
		* the preemption. When a ring is switched out, the CP will save the ringbuffer
		* state back to the record. In this way, once the records are properly set up
		* the CPU can quickly switch back and forth between ringbuffers by only
		* updating a few registers (often only the wptr).
		*
		* These are the CPU aware registers in the record:
		* @magic: Must always be 0xAE399D6EUL (A6XX_PREEMPT_RECORD_MAGIC)
		* @info: Type of the record - written 0 by the CPU, updated by the CP
		* @errno: preemption error record
		* @data: Data field in YIELD and SET_MARKER packets, written and used by CP
		* @cntl: Value of RB_CNTL written by CPU, save/restored by CP
		* @rptr: Value of RB_RPTR written by CPU, save/restored by CP
		* @wptr: Value of RB_WPTR written by CPU, save/restored by CP
		* @_pad: Reserved/padding
		* @rptr_addr: Value of RB_RPTR_ADDR_LO|HI written by CPU, save/restored by CP
		* @rbase: Value of RB_BASE written by CPU, save/restored by CP
		* @counter: GPU address of the storage area for the preemption counters
		* @bv_rptr_addr: Value of BV_RB_RPTR_ADDR_LO|HI written by CPU, save/restored by CP
		*
		* The layout is shared with the microcode, so field order and sizes must
		* not change.
		*/
		struct a6xx_preempt_record {
		u32 magic;
		u32 info;
		u32 errno;
		u32 data;
		u32 cntl;
		u32 rptr;
		u32 wptr;
		u32 _pad;
		u64 rptr_addr;
		u64 rbase;
		u64 counter;
		u64 bv_rptr_addr;
		};

		/* Value the magic field of struct a6xx_preempt_record must always hold */
		#define A6XX_PREEMPT_RECORD_MAGIC 0xAE399D6EUL

		/* NOTE(review): presumably the byte size of each SMMU info buffer - confirm */
		#define PREEMPT_SMMU_INFO_SIZE 4096

		/*
		 * Size of one preemption record for @adreno_gpu: the target's
		 * info->preempt_record_size when that is non-zero, otherwise the default
		 * of 4192 * SZ_1K.
		 *
		 * The argument is parenthesized so that any pointer-valued expression
		 * (e.g. "p + 1") expands correctly. It is evaluated more than once, so
		 * it must not have side effects.
		 */
		#define PREEMPT_RECORD_SIZE(adreno_gpu) \
			(((adreno_gpu)->info->preempt_record_size) == 0 ? \
			4192 * SZ_1K : ((adreno_gpu)->info->preempt_record_size))

		/*
		* The preemption counter block is a storage area for the value of the
		* preemption counters that are saved immediately before context switch. We
		* append it on to the end of the allocation for the preemption record.
		*
		* The size evaluates to 64 bytes.
		* NOTE(review): presumably 16 counters of 4 bytes each - confirm.
		*/
		#define A6XX_PREEMPT_COUNTER_SIZE (16 * 4)

		/*
		* struct a7xx_cp_smmu_info - SMMU information record.
		*
		* NOTE(review): the code that fills this in (a6xx_preempt.c) is not visible
		* here. Presumably this is the per-ring record the kernel writes itself
		* instead of letting the CP save SMMU state, and @magic holds
		* GEN7_CP_SMMU_INFO_MAGIC - confirm against that file.
		*/
		struct a7xx_cp_smmu_info {
		u32 magic;
		/* Padding ahead of the 64-bit ttbr0 field */
		u32 _pad4;
		u64 ttbr0;
		u32 asid;
		u32 context_idr;
		u32 context_bank;
		};

		/* Magic constant for struct a7xx_cp_smmu_info */
		#define GEN7_CP_SMMU_INFO_MAGIC 0x241350d5UL

		/*
		* Given a register and a count, return a value to program into
		* REG_CP_PROTECT_REG(n) - this will block both reads and writes for
		@@ -110,6 +244,34 @@ int a6xx_gmu_init(struct a6xx_gpu a6xx_gpu, struct device_node node);
		int a6xx_gmu_wrapper_init(struct a6xx_gpu a6xx_gpu, struct device_node node);
		void a6xx_gmu_remove(struct a6xx_gpu *a6xx_gpu);

		/*
		* Preemption entry points. NOTE(review): their definitions are not visible
		* here; the notes below only describe how the visible callers use them.
		*/
		/* Called once from a6xx_gpu_init() to set up per-ringbuffer preemption state */
		void a6xx_preempt_init(struct msm_gpu *gpu);
		/* Called from hw_init() before cur_ring is reset to ring 0 */
		void a6xx_preempt_hw_init(struct msm_gpu *gpu);
		/* Called after a submit is flushed and after retire on CP_CACHE_FLUSH_TS */
		void a6xx_preempt_trigger(struct msm_gpu *gpu);
		/* Called from a6xx_irq() when the CP_SW interrupt status bit is set */
		void a6xx_preempt_irq(struct msm_gpu *gpu);
		void a6xx_preempt_fini(struct msm_gpu *gpu);
		int a6xx_preempt_submitqueue_setup(struct msm_gpu *gpu,
		struct msm_gpu_submitqueue *queue);
		void a6xx_preempt_submitqueue_close(struct msm_gpu *gpu,
		struct msm_gpu_submitqueue *queue);

		/* Return true if we are in a preempt state */
		static inline bool a6xx_in_preempt(struct a6xx_gpu *a6xx_gpu)
		{
		/*
		* Make sure the read to preempt_state is ordered with respect to reads
		* of other variables before ...
		*/
		smp_rmb();

		int preempt_state = atomic_read(&a6xx_gpu->preempt_state);

		/* ... and after. */
		smp_rmb();

		return !(preempt_state == PREEMPT_NONE \|\|
		preempt_state == PREEMPT_FINISH);
		}

		void a6xx_gmu_set_freq(struct msm_gpu gpu, struct dev_pm_opp opp,
		bool suspended);
		unsigned long a6xx_gmu_get_freq(struct msm_gpu *gpu);

drivers/gpu/drm/msm/adreno/a6xx_preempt.c

0 → 100644

+393 −0

File added.

Preview size limit exceeded, changes collapsed.

drivers/gpu/drm/msm/msm_ringbuffer.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -36,6 +36,7 @@ struct msm_rbmemptrs {

		volatile struct msm_gpu_submit_stats stats[MSM_GPU_SUBMIT_STATS_COUNT];
		volatile u64 ttbr0;
		volatile u32 context_idr;
		};

		struct msm_cp_state {
		@@ -101,6 +102,12 @@ struct msm_ringbuffer {
		*/
		spinlock_t preempt_lock;

		/*
		* Whether we skipped writing wptr and it needs to be updated in the
		* future when the ring becomes current.
		*/
		bool restore_wptr;

		/**
		* cur_ctx_seqno:
		*