Commit 5690011a authored by Lancelot SIX, committed by Alex Deucher
Browse files

drm/amdkfd: Handle save/restore of lds allocated in 1280B blocks



The gfx-9 trap handler reads the LDS allocation size in 256-byte
granularity (from SQ_WAVE_LDS_ALLOC), but it assumes that
this value is always even (i.e. that the LDS allocation is really done in
multiples of 512 bytes).  This was true so far, but gfx-950 allocates LDS
in chunks of 1280 bytes, making this assumption invalid.  This can cause
the trap handler to try to save / restore past the end of LDS, and past
the LDS slot allocated in the save area, overwriting data from the
following wave.

This patch updates the trap handler to support LDS allocated in
1280-byte blocks:
- During restore, copy from main memory directly to LDS in batches of 1280
  bytes.
- During save, continue to use 512-byte blocks (we only have 2 VGPRs we
  can use to hold data), making sure to mask the upper half of the wave
  when the LDS size is not a multiple of 512 bytes.

Signed-off-by: Lancelot SIX <lancelot.six@amd.com>
Co-authored-by: Alex Sierra <alex.sierra@amd.com>
Reviewed-by: Jay Cornwall <jay.cornwall@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 549120ed
Loading
Loading
Loading
Loading
+27 −33
Original line number Diff line number Diff line
@@ -4124,7 +4124,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
};

static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
	0xbf820001, 0xbf8202ea,
	0xbf820001, 0xbf8202d8,
	0xb8f8f802, 0x8978ff78,
	0x00020006, 0xb8fbf803,
	0x866eff78, 0x00002000,
@@ -4321,9 +4321,9 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
	0xe0724300, 0x701d0300,
	0xbefe00c1, 0xbeff00c1,
	0xb8fb5306, 0x867bc17b,
	0xbf840064, 0xbf8a0000,
	0xbf840052, 0xbf8a0000,
	0x867aff6f, 0x04000000,
	0xbf840060, 0x8e7b867b,
	0xbf84004e, 0x8e7b867b,
	0x8e7b827b, 0xbef6007b,
	0xb8f02985, 0x80708170,
	0x8e708a70, 0x8e708170,
@@ -4336,8 +4336,8 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
	0x000204c1, 0x867aff78,
	0x00400000, 0xbf850003,
	0xb8faf803, 0x897a7aff,
	0x10000000, 0xbf850030,
	0x24040682, 0xd86e4000,
	0x10000000, 0xbf85001d,
	0x24040682, 0xd86c0000,
	0x00000002, 0xbf8cc07f,
	0xbe840080, 0xd2890000,
	0x00000900, 0x80048104,
@@ -4348,29 +4348,20 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
	0x80048104, 0xc069003a,
	0x00000070, 0xbf8cc07f,
	0x80709070, 0xbf06c004,
	0xbf84ffee, 0xbe840080,
	0xd2890000, 0x00000901,
	0x80048104, 0xd2890001,
	0x00000901, 0x80048104,
	0xd2890002, 0x00000901,
	0x80048104, 0xd2890003,
	0x00000901, 0x80048104,
	0xc069003a, 0x00000070,
	0xbf8cc07f, 0x80709070,
	0xbf06c004, 0xbf84ffee,
	0x680404ff, 0x00000200,
	0xbf84ffee, 0x680404ff,
	0x00000100, 0xd0c9006a,
	0x0000f702, 0xbf87ffe5,
	0xbf820016, 0xd1060002,
	0x00011103, 0x7e0602ff,
	0x00000200, 0xbefc00ff,
	0x00010000, 0xbe800077,
	0x8677ff77, 0xff7fffff,
	0x8777ff77, 0x00058000,
	0xd8ec0000, 0x00000002,
	0xbf8cc07f, 0xe0765000,
	0x701d0002, 0x68040702,
	0xd0c9006a, 0x0000f702,
	0xbf87ffd2, 0xbf820015,
	0xd1060002, 0x00011103,
	0x7e0602ff, 0x00000200,
	0xbefc00ff, 0x00010000,
	0xbe800077, 0x8677ff77,
	0xff7fffff, 0x8777ff77,
	0x00058000, 0xd8ec0000,
	0x00000002, 0xbf8cc07f,
	0xe0765000, 0x701d0002,
	0x68040702, 0xd0c9006a,
	0x0000f702, 0xbf87fff7,
	0xbefe016a, 0xbf87fff6,
	0xbef70000, 0xbef000ff,
	0x00000400, 0xbefe00c1,
	0xbeff00c1, 0xb8fb2b05,
@@ -4497,15 +4488,15 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
	0x701d0300, 0x807c847c,
	0x8070ff70, 0x00000400,
	0xbf0a7b7c, 0xbf85ffeb,
	0xbf9c0000, 0xbf8200ee,
	0xbf9c0000, 0xbf8200f4,
	0xbef4007e, 0x8675ff7f,
	0x0000ffff, 0x8775ff75,
	0x00040000, 0xbef60080,
	0xbef700ff, 0x00807fac,
	0x866eff7f, 0x04000000,
	0xbf84001f, 0xbefe00c1,
	0xbf840025, 0xbefe00c1,
	0xbeff00c1, 0xb8ef5306,
	0x866fc16f, 0xbf84001a,
	0x866fc16f, 0xbf840020,
	0x8e6f866f, 0x8e6f826f,
	0xbef6006f, 0xb8f82985,
	0x80788178, 0x8e788a78,
@@ -4516,9 +4507,12 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
	0x01000000, 0xbefc0080,
	0xe0510000, 0x781d0000,
	0xe0510100, 0x781d0000,
	0x807cff7c, 0x00000200,
	0x8078ff78, 0x00000200,
	0xbf0a6f7c, 0xbf85fff6,
	0xe0510200, 0x781d0000,
	0xe0510300, 0x781d0000,
	0xe0510400, 0x781d0000,
	0x807cff7c, 0x00000500,
	0x8078ff78, 0x00000500,
	0xbf0a6f7c, 0xbf85fff0,
	0xbefe00c1, 0xbeff00c1,
	0xbef600ff, 0x01000000,
	0xb8ef2b05, 0x806f816f,
+22 −3
Original line number Diff line number Diff line
@@ -75,8 +75,10 @@ var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT	= 12
#if ASIC_FAMILY >= CHIP_GC_9_5_0
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE	= 11
var LDS_RESTORE_GRANULARITY_BYTES	= 1280
#else
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE	= 9
var LDS_RESTORE_GRANULARITY_BYTES	= 512
#endif
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE	= 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE	= 3			//FIXME	 sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
@@ -572,12 +574,21 @@ if SAVE_AFTER_XNACK_ERROR

	v_lshlrev_b32 v2, 2, v3
L_SAVE_LDS_LOOP_SQC:
#if ASIC_FAMILY < CHIP_GC_9_5_0
	ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
	s_waitcnt lgkmcnt(0)

	write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)

	v_add_u32 v2, 0x200, v2
#else
	// gfx950 needs to save in multiples of 256 bytes.
	ds_read_b32 v0, v2
	s_waitcnt lgkmcnt(0)
	write_vgprs_to_mem_with_sqc(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)

	v_add_u32 v2, 0x100, v2
#endif

	v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
	s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC

@@ -601,6 +612,9 @@ L_SAVE_LDS_LOOP_VECTOR:
//	v_add_u32 v2, vcc[0:1], v2, v3
      v_add_u32 v2, v2, v3
      v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
#if ASIC_FAMILY >= CHIP_GC_9_5_0
      s_mov_b64 exec, vcc
#endif
      s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR

      // restore rsrc3
@@ -763,8 +777,13 @@ L_RESTORE:
  L_RESTORE_LDS_LOOP:
	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1		       // first 64DW
	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256	       // second 64DW
    s_add_u32	    m0, m0, 256*2						// 128 DW
    s_add_u32	    s_restore_mem_offset, s_restore_mem_offset, 256*2		//mem offset increased by 128DW
#if ASIC_FAMILY >= CHIP_GC_9_5_0
	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:512	// third 64DW
	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:768	// forth 64DW
	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:1024	// fifth 64DW
#endif
    s_add_u32	    m0, m0, LDS_RESTORE_GRANULARITY_BYTES					// 128/320 DW
    s_add_u32	    s_restore_mem_offset, s_restore_mem_offset, LDS_RESTORE_GRANULARITY_BYTES	//mem offset increased by 128/320 DW
    s_cmp_lt_u32    m0, s_restore_alloc_size					//scc=(m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_RESTORE_LDS_LOOP							    //LDS restore is complete?