-
Notifications
You must be signed in to change notification settings - Fork 14.6k
[AMDGPU] introduce S_WAITCNT_LDS_DIRECT in the memory legalizer #150887
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
The new instruction represents the unknown number of waitcnts needed at a release operation to ensure that prior direct loads to LDS (formerly called LDS DMA) are completed. The instruction is replaced in SIInsertWaitcnts with a suitable value for vmcnt(). Co-authored-by: Austin Kerbow <[email protected]>.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Sameer Sahasrabuddhe (ssahasra) ChangesThe new instruction represents the unknown number of waitcnts needed at a Co-authored-by: Austin Kerbow <[email protected]>. Patch is 39.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150887.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index dd3f2fe25a239..9a4360374621d 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1381,6 +1381,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Modified = true;
} else
WaitcntInstr = &II;
+ } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) {
+ assert(ST->hasVMemToLDSLoad());
+ LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_LDS_DIRECT: " << II
+ << "Before: " << Wait.LoadCnt << '\n';);
+ ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+ LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
+
+ // It is possible (but unlikely) that this is the only wait instruction,
+ // in which case, we exit this loop without a WaitcntInstr to consume
+ // `Wait`. But that works because `Wait` was passed in by reference, and
+ // the callee eventually calls createNewWaitcnt on it. We test this
+ // possibility in an articial MIR test since such a situation cannot be
+ // recreated by running the memory legalizer.
+ II.eraseFromParent();
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1552,6 +1566,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) {
+ // Architectures higher than GFX10 do not have direct loads to
+ // LDS, so no work required here yet.
+ II.eraseFromParent();
+ continue;
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
@@ -2442,6 +2461,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+ Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT ||
counterTypeForInstr(Opcode).has_value();
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0e8a420fbb70a..30c180c0e420e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT));
+ Changed = true;
+ }
+
if (Pos == Position::AFTER)
--MI;
@@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT));
+ Changed = true;
+ }
+
if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e103ccc2f00e6..09630e20840cf 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
+// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
+
+def S_WAITCNT_LDS_DIRECT : SPseudoInstSI<(outs), (ins)> {
+ let hasSideEffects = 0;
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 66037615f0ba0..5fd8553820685 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -545,11 +545,13 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_release
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_release
@@ -578,12 +580,14 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
@@ -613,12 +617,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
@@ -1293,12 +1299,14 @@ define amdgpu_kernel void @workgroup_release() #0 {
; GFX10WGP-LABEL: name: workgroup_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_release
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_release
@@ -1330,6 +1338,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
@@ -1337,6 +1346,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
; GFX10CU-LABEL: name: workgroup_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_acq_rel
@@ -1369,6 +1379,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
@@ -1376,6 +1387,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
; GFX10CU-LABEL: name: workgroup_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
new file mode 100644
index 0000000000000..b376360157141
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
@@ -0,0 +1,133 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
+
+
+# Expected vmcnt(0) since the direct load is the only load.
+---
+name: dma_then_fence
+body: |
+ bb.0:
+ ; GCN-LABEL: name: dma_then_fence
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.
+
+---
+name: dma_then_global_load
+body: |
+ bb.0:
+ ; GCN-LABEL: name: dma_then_global_load
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3953
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Expected no vmcnt since there is no direct load to LDS, and the global load is not processed by SIInsertWaitcnts.
+
+---
+name: no_dma_just_fence
+body: |
+ bb.0:
+ ; GCN-LABEL: name: no_dma_just_fence
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.
+
+---
+name: dma_then_system_fence
+body: |
+ bb.0:
+ ; GCN-LABEL: name: dma_then_system_fence
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3953
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# The computed vmcnt(1) gets merged with the existing vmcnt(0).
+
+---
+name: merge_with_prev_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: merge_with_prev_wait
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# The computed vmcnt(1) gets merged with the existing vmcnt(0).
+
+---
+name: merge_with_next_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: merge_with_next_wait
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ S_WAITCNT 3952
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
new file mode 100644
index 0000000000000..d23509b5aa812
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -0,0 +1,543 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10WGP
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck %s -check-prefixes=GFX10CU
+
+; In each of these tests, an LDS DMA operation is followed by a release pattern
+; at workgroup scope. The fence in such a release (implicit or explicit) should
+; wait for the store component in the LDS DMA. The additional noalias metadata
+; is just meant to ensure that the wait counts are not generated due to some
+; unintended aliasing.
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
+; GFX900-LABEL: barrier_release:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX900-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX900-NEXT: v_mov_b32_e32 v1, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 m0, s12
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX900-NEXT: v_mov_b32_e32 v0, s13
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_barrier
+; GFX900-NEXT: ds_read_b32 v0, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_store_dword v1, v0, s[14:15]
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: barrier_release:
+; GFX90A: ; %bb.1:
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_branch .LBB0_0
+; GFX90A-NEXT: .p2align 8
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: .LBB0_0: ; %main_body
+; GFX90A-NEXT: s_mov_b32 m0, s12
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX90A-NEXT: v_mov_b32_e32 v0, s13
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_barrier
+; GFX90A-NEXT: ds_read_b32 v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: barrier_release:
+; GFX90A-TGSPLIT: ; %bb.1:
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_branch .LBB0_0
+; GFX90A-TGSPLIT-NEXT: .p2align 8
+; GFX90A-TGSPLIT-NEXT: ; %bb.2:
+; GFX90A-TGSPLIT-NEXT: .LBB0_0: ; %main_body
+; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_barrier
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-LABEL: barrier_release:
+; GFX942: ; %bb.1:
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_branch .LBB0_0
+; GFX942-NEXT: .p2align 8
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: .LBB0_0: ; %main_body
+; GFX942-NEXT: s_mov_b32 m0, s12
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX942-NEXT: v_mov_b32_e32 v0, s13
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_barrier
+; GFX942-NEXT: ds_read_b32 v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; ...
[truncated]
|
Reimplements:
Supersedes: |
// Represents the point at which a wave must wait for all outstanding direct loads to LDS. | ||
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. | ||
|
||
def S_WAITCNT_LDS_DIRECT : SPseudoInstSI<(outs), (ins)> { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Synthetic variants of real instructions should use a lowercase suffix
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I have a great deal of uncertainty on what this pseudo should be called. It seems the "_soft" suffix signifies a definite wait count argument, which the SIInsertWaitcnts pass is allowed to relax. Are you suggesting "S_WAITCNT_lds_direct"? Because "_direct" as a suffix doesn't contain enough information, and there is no "S_WAITCNT_LDS".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
WAIT_LDS_DIRECT_PSEUDO
(following a similar pattern to ATOMIC_FENCE
and other SOP pseudos?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
S_WAITCNT_something_lowercase
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am now using S_WAITCNT_lds_direct
.... it represents a wait count, for direct loads to LDS, but it's not real. Having it called S_WAITCNT_something
like everything else near it helps reinforce that its relevance is to the SIInsertWaitcnts pass, where it is included by the isWaitInstr()
predicate.
// SIInsertWaitcnts will later replace this with a vmcnt(). | ||
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && | ||
Scope == SIAtomicScope::WORKGROUP && | ||
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { | |
(AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { |
Just to be consistent with the rest of the file
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Meh. Fixed it. Although I would rather have the rest of the file be consistent with this (highly recommended and correct) use of any()
. In fact BitmaskEnum
there should be able to provide a much clearer way to check for individual bits.
// SIInsertWaitcnts will later replace this with a vmcnt(). | ||
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && | ||
Scope == SIAtomicScope::WORKGROUP && | ||
any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same here
- try renaming to S_WAITCNT_lds_direct - be consistent (even at the cost of brevity?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Small nit otherwise this LGTM. Name is ok for me, I'd prefer if there was a "_pseudo" at the end instead (maybe S_WAITCNT_LDS_DIRECT_pseudo
, just to make it clear it isn't a HW inst in any ISA, but it's really bikeshedding at this point
// SIInsertWaitcnts will later replace this with a vmcnt(). | ||
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && | ||
Scope == SIAtomicScope::WORKGROUP && | ||
((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS) != |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it need the cast still? I thought it wouldn't need it
The new instruction represents the unknown number of waitcnts needed at a
release operation to ensure that prior direct loads to LDS (formerly called LDS
DMA) are completed. The instruction is replaced in SIInsertWaitcnts with a
suitable value for vmcnt().
Co-authored-by: Austin Kerbow [email protected].