Commit 29b703d7, authored by Jay Cornwall, committed by Alex Deucher
Browse files

drm/amdkfd: gfx12.1 cluster barrier context save workaround



The trap cluster barrier may not serialize with a user cluster barrier
under some circumstances. During context save, check for a pending user
cluster barrier and wait until it has completed.

Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Tested-by: Gang Ba <Gang.Ba@amd.com>
Cc: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Reviewed-by: Lancelot Six <lancelot.six@amd.com>
Cc: Vladimir Indic <vladimir.indic@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent ea89b305
Loading
Loading
Loading
Loading
+18 −13
Original line number Diff line number Diff line
@@ -3754,11 +3754,11 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
	0x84708a70, 0x8070ff70,
	0x00000200, 0x7e000280,
	0x7e020280, 0x7e040280,
	0xbefd0080, 0xbe804ec2,
	0xbf94fffe, 0xb8faf804,
	0x8b7a847a, 0x91788478,
	0x8c787a78, 0xd7610002,
	0xbefd0080, 0xd7610002,
	0x0000fa71, 0x807d817d,
	0xbe804ec2, 0xbf94fffe,
	0xb8faf804, 0x8b7a847a,
	0x91788478, 0x8c787a78,
	0xd7610002, 0x0000fa6c,
	0x807d817d, 0x917aff6d,
	0x80000000, 0xd7610002,
@@ -4587,7 +4587,7 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
};

static const uint32_t cwsr_trap_gfx12_1_0_hex[] = {
	0xbfa00001, 0xbfa003aa,
	0xbfa00001, 0xbfa003b4,
	0xb0804009, 0xb8eef81a,
	0xbf880000, 0xb980081a,
	0x00000000, 0xb8f8f804,
@@ -4838,15 +4838,20 @@ static const uint32_t cwsr_trap_gfx12_1_0_hex[] = {
	0x84708a70, 0x8070ff70,
	0x00000200, 0x7e000280,
	0x7e020280, 0x7e040280,
	0xbefd0080, 0xb8faf802,
	0xbf0c8b7a, 0xbfa20003,
	0xbe804fc2, 0xbf94fffe,
	0xbfa10001, 0xbe804ec4,
	0xbf94fffc, 0xb8faf804,
	0x8b7aff7a, 0x0001000c,
	0x9178ff78, 0x0001000c,
	0x8c787a78, 0xd7610002,
	0xbefd0080, 0xd7610002,
	0x0000fa71, 0x807d817d,
	0xb8faf802, 0xbf0c8b7a,
	0xbfa20003, 0xbe804fc2,
	0xbf94fffe, 0xbfa10001,
	0xbe804ec4, 0xbf94fffc,
	0xbefa4c88, 0xbfc70000,
	0xbf0c807a, 0xbfa20006,
	0x9371ff7a, 0x00070004,
	0x937aff7a, 0x00070010,
	0xbf06717a, 0xbfa2fff6,
	0xb8faf804, 0x8b7aff7a,
	0x0001000c, 0x9178ff78,
	0x0001000c, 0x8c787a78,
	0xd7610002, 0x0000fa6c,
	0x807d817d, 0x917aff6d,
	0x80000000, 0xd7610002,
+29 −7
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@
#define HAVE_BANKED_VGPRS (ASIC_FAMILY == CHIP_GC_12_0_3)
#define NUM_NAMED_BARRIERS (ASIC_FAMILY == CHIP_GC_12_0_3 ? 0x10 : 0)
#define HAVE_CLUSTER_BARRIER (ASIC_FAMILY == CHIP_GC_12_0_3)
#define CLUSTER_BARRIER_SERIALIZE_WORKAROUND (ASIC_FAMILY == CHIP_GC_12_0_3)

#define SINGLE_STEP_MISSED_WORKAROUND 1	//workaround for lost TRAP_AFTER_INST exception when SAVECTX raised
#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12)
@@ -104,6 +105,7 @@ var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT = 0
var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE		= 2

var BARRIER_STATE_SIGNAL_OFFSET			= 16
var BARRIER_STATE_SIGNAL_SIZE			= 7
var BARRIER_STATE_MEMBER_OFFSET			= 4
var BARRIER_STATE_MEMBER_SIZE			= 7
var BARRIER_STATE_VALID_OFFSET			= 0
@@ -519,9 +521,11 @@ L_SAVE_HWREG:
	v_mov_b32	v2, 0x0							//Set of SGPRs for TCP store
	s_mov_b32	m0, 0x0							//Next lane of v2 to write to

	write_hwreg_to_v2(s_save_m0)

	// Ensure no further changes to barrier or LDS state.
	// STATE_PRIV.*BARRIER_COMPLETE may change up to this point.
	wait_trap_barriers(s_save_tmp)
	wait_trap_barriers(s_save_tmp, s_save_m0, 1)

	// Re-read final state of *BARRIER_COMPLETE fields for save.
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV)
@@ -529,7 +533,6 @@ L_SAVE_HWREG:
	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
	s_or_b32	s_save_state_priv, s_save_state_priv, s_save_tmp

	write_hwreg_to_v2(s_save_m0)
	write_hwreg_to_v2(s_save_pc_lo)
	s_andn2_b32	s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
	write_hwreg_to_v2(s_save_tmp)
@@ -1197,7 +1200,7 @@ L_SKIP_CLUSTER_BARRIER_RESTORE:

	// Make barrier and LDS state visible to all waves in the group/cluster.
	// STATE_PRIV.*BARRIER_COMPLETE may change after this point.
	wait_trap_barriers(s_restore_tmp)
	wait_trap_barriers(s_restore_tmp, 0, 0)

#if HAVE_CLUSTER_BARRIER
	// SCC is changed by wait_trap_barriers, restore it separately.
@@ -1210,7 +1213,7 @@ L_SKIP_CLUSTER_BARRIER_RESTORE:
L_END_PGM:
	// Make sure that no wave of the group/cluster can exit the trap handler
	// before the group/cluster barrier state is saved.
	wait_trap_barriers(s_restore_tmp)
	wait_trap_barriers(s_restore_tmp, 0, 0)

	s_endpgm_saved
end
@@ -1300,11 +1303,11 @@ function restore_xnack_state_priv(s_tmp)
end
#endif

function wait_trap_barriers(s_tmp)
function wait_trap_barriers(s_tmp1, s_tmp2, serialize_wa)
#if HAVE_CLUSTER_BARRIER
	// If not in a WG then wave cannot use s_barrier_signal_isfirst.
	s_getreg_b32	s_tmp, hwreg(HW_REG_WAVE_STATUS)
	s_bitcmp0_b32	s_tmp, SQ_WAVE_STATUS_IN_WG_SHIFT
	s_getreg_b32	s_tmp1, hwreg(HW_REG_WAVE_STATUS)
	s_bitcmp0_b32	s_tmp1, SQ_WAVE_STATUS_IN_WG_SHIFT
	s_cbranch_scc1	L_TRAP_CLUSTER_BARRIER_SIGNAL

	s_barrier_signal_isfirst	-2
@@ -1318,6 +1321,25 @@ L_TRAP_CLUSTER_BARRIER_SIGNAL:

L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL:
	s_barrier_wait	-4

#if CLUSTER_BARRIER_SERIALIZE_WORKAROUND
if serialize_wa
	// Trap cluster barrier may complete with a user cluster barrier in-flight.
	// This is indicated if user cluster member count and signal count are equal.
L_WAIT_USER_CLUSTER_BARRIER_COMPLETE:
	s_sendmsg_rtn_b32	s_tmp1, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
	s_wait_kmcnt	0
	s_bitcmp0_b32	s_tmp1, BARRIER_STATE_VALID_OFFSET
	s_cbranch_scc1	L_NOT_IN_CLUSTER

	s_bfe_u32	s_tmp2, s_tmp1, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 0x10))
	s_bfe_u32	s_tmp1, s_tmp1, (BARRIER_STATE_SIGNAL_OFFSET | (BARRIER_STATE_SIGNAL_SIZE << 0x10))
	s_cmp_eq_u32	s_tmp1, s_tmp2
	s_cbranch_scc1	L_WAIT_USER_CLUSTER_BARRIER_COMPLETE
end
L_NOT_IN_CLUSTER:
#endif

#else
	s_barrier_signal	-2
	s_barrier_wait	-2