Skip to content

Commit 7ef153b

Browse files
PawelJurek authored and pszymich committed
Revert: Limit the scope of post-atomic fences
By default, the LSC scope of the fence was "gpu". For a post-atomic fence, "local" scope should be sufficient.
1 parent f68aa25 commit 7ef153b

File tree

9 files changed

+29
-40
lines changed

9 files changed

+29
-40
lines changed

IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ local uchar* __builtin_IB_AllocLocalMemPool(bool allocAllWorkgroups, uint numAdd
103103

104104
// Memory fences
105105
// See GenISAIntrinsics.td for documentation
106-
void __builtin_IB_memfence(bool commitEnable, bool flushRW, bool flushConstant, bool flushTexture, bool flushIcache, bool isGlobal, bool invalidateL1, bool forceLocalLSCScope);
106+
void __builtin_IB_memfence(bool commitEnable, bool flushRW, bool flushConstant, bool flushTexture, bool flushIcache, bool isGlobal, bool invalidateL1);
107107
void __builtin_IB_flush_sampler_cache(void);
108108
void __builtin_IB_typedmemfence(bool invalidateCache);
109109

IGC/BiFModule/Implementation/atomics.cl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ extern __constant int __HasThreadPauseSupport;
2828
__local int* __builtin_IB_get_local_lock();
2929
__global int* __builtin_IB_get_global_lock();
3030
void __builtin_IB_eu_thread_pause(uint value);
31-
void __intel_memfence_handler(bool flushRW, bool isGlobal, bool invalidateL1, bool forceLocalLSCScope);
31+
void __intel_memfence_handler(bool flushRW, bool isGlobal, bool invalidateL1);
3232

3333
#define LOCAL_SPINLOCK_START() \
3434
{ \
@@ -59,14 +59,14 @@ extern __constant int __HasThreadPauseSupport;
5959
if( ( (Semantics) & ( SEMANTICS_PRE_OP_NEED_FENCE ) ) > 0 ) \
6060
{ \
6161
bool flushL3 = (isGlobal) && ((Scope) == Device || (Scope) == CrossDevice); \
62-
__intel_memfence_handler(flushL3, isGlobal, false, false); \
62+
__intel_memfence_handler(flushL3, isGlobal, false); \
6363
}
6464

6565
#define FENCE_POST_OP(Scope, Semantics, isGlobal) \
6666
if( ( (Semantics) & ( SEMANTICS_POST_OP_NEEDS_FENCE ) ) > 0 ) \
6767
{ \
6868
bool flushL3 = (isGlobal) && ((Scope) == Device || (Scope) == CrossDevice); \
69-
__intel_memfence_handler(flushL3, isGlobal, isGlobal, true); \
69+
__intel_memfence_handler(flushL3, isGlobal, isGlobal); \
7070
}
7171

7272
// This fencing scheme allows us to obey the memory model when coherency is

IGC/BiFModule/Implementation/barrier.cl

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,39 +18,37 @@ extern __constant int __OptDisable;
1818

1919
// MEMFENCE IMPLEMENTATION
2020

21-
void __attribute__((optnone)) __intel_memfence_optnone(bool flushRW, bool isGlobal, bool invalidateL1, bool forceLocalLSCScope)
21+
void __attribute__((optnone)) __intel_memfence_optnone(bool flushRW, bool isGlobal, bool invalidateL1)
2222
{
23-
#define MEMFENCE_IF(V1, V5, V6, V7) \
24-
if (flushRW == V1 && isGlobal == V5 && invalidateL1 == V6 && forceLocalLSCScope == V7) \
25-
{ \
26-
__builtin_IB_memfence(true, V1, false, false, false, V5, V6, V7); \
23+
#define MEMFENCE_IF(V1, V5, V6) \
24+
if (flushRW == V1 && isGlobal == V5 && invalidateL1 == V6) \
25+
{ \
26+
__builtin_IB_memfence(true, V1, false, false, false, V5, V6); \
2727
} else
2828

2929
// Generate combinations for all MEMFENCE_IF cases, e.g.:
30-
// true, true, true, true
31-
// true, true, true, false etc.
32-
#define MF_L3(...) MF_L2(__VA_ARGS__,false) MF_L2(__VA_ARGS__,true)
30+
// true, true, true
31+
// true, true, false etc.
3332
#define MF_L2(...) MF_L1(__VA_ARGS__,false) MF_L1(__VA_ARGS__,true)
3433
#define MF_L1(...) MEMFENCE_IF(__VA_ARGS__,false) MEMFENCE_IF(__VA_ARGS__,true)
35-
MF_L3(false)
36-
MF_L3(true) {}
34+
MF_L2(false)
35+
MF_L2(true) {}
3736

3837
#undef MEMFENCE_IF
39-
#undef MF_L3
4038
#undef MF_L2
4139
#undef MF_L1
4240
}
43-
void __intel_memfence(bool flushRW, bool isGlobal, bool invalidateL1, bool forceLocalLSCScope)
41+
void __intel_memfence(bool flushRW, bool isGlobal, bool invalidateL1)
4442
{
45-
__builtin_IB_memfence(true, flushRW, false, false, false, isGlobal, invalidateL1, forceLocalLSCScope);
43+
__builtin_IB_memfence(true, flushRW, false, false, false, isGlobal, invalidateL1);
4644
}
4745

48-
void __intel_memfence_handler(bool flushRW, bool isGlobal, bool invalidateL1, bool forceLocalLSCScope)
46+
void __intel_memfence_handler(bool flushRW, bool isGlobal, bool invalidateL1)
4947
{
5048
if (__OptDisable)
51-
__intel_memfence_optnone(flushRW, isGlobal, invalidateL1, forceLocalLSCScope);
49+
__intel_memfence_optnone(flushRW, isGlobal, invalidateL1);
5250
else
53-
__intel_memfence(flushRW, isGlobal, invalidateL1, forceLocalLSCScope);
51+
__intel_memfence(flushRW, isGlobal, invalidateL1);
5452
}
5553

5654
// TYPEDMEMFENCE IMPLEMENTATION
@@ -99,12 +97,12 @@ static void __intel_atomic_work_item_fence( Scope_t Memory, uint Semantics )
9997
// although on some platforms they may be elided; platform-specific checks are performed in codegen
10098
if (Semantics & WorkgroupMemory)
10199
{
102-
__intel_memfence_handler(false, false, false,false);
100+
__intel_memfence_handler(false, false, false);
103101
}
104102
if (Semantics & CrossWorkgroupMemory)
105103
{
106104
bool flushL3 = Memory == Device || Memory == CrossDevice;
107-
__intel_memfence_handler(flushL3, true, invalidateL1, false);
105+
__intel_memfence_handler(flushL3, true, invalidateL1);
108106
}
109107
}
110108
}

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13520,12 +13520,12 @@ LSC_FENCE_OP EmitPass::getLSCMemoryFenceOp(bool IsGlobalMemFence, bool Invalidat
1352013520

1352113521
void EmitPass::emitMemoryFence(llvm::Instruction* inst)
1352213522
{
13523-
static constexpr int ExpectedNumberOfArguments = 8;
13523+
static constexpr int ExpectedNumberOfArguments = 7;
1352413524
IGC_ASSERT(IGCLLVM::getNumArgOperands(cast<CallInst>(inst)) == ExpectedNumberOfArguments);
1352513525
CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
1352613526

1352713527
// If passed a non-constant value for any of the parameters,
13528-
// be conservative and assume the default value defined below.
13528+
// be conservative and assume that the parameter is true.
1352913529
// This could happen in "optnone" scenarios.
1353013530
bool CommitEnable = true;
1353113531
bool L3_Flush_RW_Data = true;
@@ -13534,7 +13534,6 @@ void EmitPass::emitMemoryFence(llvm::Instruction* inst)
1353413534
bool L3_Flush_Instructions = true;
1353513535
bool Global_Mem_Fence = true;
1353613536
bool L1_Invalidate = ctx->platform.hasL1ReadOnlyCache();
13537-
bool Force_Local_LSC_Scope = false;
1353813537

1353913538
std::array<reference_wrapper<bool>, ExpectedNumberOfArguments> MemFenceArguments{
1354013539
CommitEnable,
@@ -13544,13 +13543,12 @@ void EmitPass::emitMemoryFence(llvm::Instruction* inst)
1354413543
L3_Flush_Instructions,
1354513544
Global_Mem_Fence,
1354613545
L1_Invalidate,
13547-
Force_Local_LSC_Scope
1354813546
};
1354913547

1355013548
for (size_t i = 0; i < MemFenceArguments.size(); ++i) {
1355113549
if (ConstantInt* CI = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(i)))
1355213550
{
13553-
MemFenceArguments[i].get() = CI->getValue().getBoolValue();
13551+
MemFenceArguments[i] &= CI->getValue().getBoolValue();
1355413552
}
1355513553
}
1355613554

@@ -13584,9 +13582,6 @@ void EmitPass::emitMemoryFence(llvm::Instruction* inst)
1358413582
LSC_SFID sfid = Global_Mem_Fence ? LSC_UGM : LSC_SLM;
1358513583
// ToDo: replace with fence instrinsics that take scope/op
1358613584
LSC_SCOPE scope = Global_Mem_Fence ? LSC_SCOPE_GPU : LSC_SCOPE_GROUP;
13587-
// When post-atomic fence is added with L1 invalidate, we may want to limit the scope to subslice.
13588-
if (Force_Local_LSC_Scope)
13589-
scope = (sfid == LSC_SLM) ? LSC_SCOPE_GROUP : LSC_SCOPE_LOCAL;
1359013585
// Change the scope from `GPU` to `Tile` on single-tile platforms to avoid L3 flush on DG2 and MTL
1359113586
if (scope == LSC_SCOPE_GPU &&
1359213587
!m_currShader->m_Platform->hasMultiTile() &&

IGC/Compiler/Optimizer/OpenCLPasses/Atomics/ResolveOCLAtomics.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,6 @@ void ResolveOCLAtomics::generateLockInitilization(Function* F)
372372
falseValue,
373373
falseValue,
374374
trueValue,
375-
falseValue
376375
};
377376
m_builder->SetInsertPoint(initSpinLockEndBB, initSpinLockEndBB->getFirstInsertionPt());
378377
Function* localMemFence = GenISAIntrinsic::getDeclaration(m_pModule, GenISAIntrinsic::GenISA_memoryfence);

IGC/Compiler/Optimizer/OpenCLPasses/NamedBarriers/NamedBarriersResolution.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,6 @@ void NamedBarriersResolution::HandleNamedBarrierSyncHW(CallInst& NBarrierSyncCal
257257
falseValue, // bool flushIcache
258258
isGlobal, // bool isGlobal
259259
falseValue, // bool invalidateL1
260-
falseValue, // bool foceLocalLSCScope
261260
},
262261
"",
263262
&(NBarrierSyncCall));

IGC/Compiler/tests/ResolveOCLAtomics/local_spin_lock.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ entry:
4646
; CHECK: br label %init_spinlock_var.end
4747

4848
; CHECK: init_spinlock_var.end:
49-
; CHECK: call void @llvm.genx.GenISA.memoryfence(i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false)
49+
; CHECK: call void @llvm.genx.GenISA.memoryfence(i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true)
5050
; CHECK: call void @llvm.genx.GenISA.threadgroupbarrier()
5151

5252
; ---------- LOCAL_SPINLOCK_START() --------------------
@@ -74,11 +74,11 @@ if.then3.i: ; preds = %while.body.i
7474
store volatile i8 1, i8* %done_alloca, align 1
7575
; CHECK-NOT: __builtin_IB_get_local_lock
7676
%call4.i = call spir_func i32 addrspace(3)* @__builtin_IB_get_local_lock()
77-
call spir_func void @__builtin_IB_memfence(i1 zeroext true, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false)
77+
call spir_func void @__builtin_IB_memfence(i1 zeroext true, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false)
7878
; CHECK: %[[SPINLOCK_AS_INT1:.*]] = ptrtoint i32 addrspace(3)* @spinlock to i32
7979
; CHECK: call i32 @llvm.genx.GenISA.intatomicraw.i32.p3i32.i32(i32 addrspace(3)* @spinlock, i32 %[[SPINLOCK_AS_INT1]], i32 0, i32 6)
8080
%call.i1.i = call spir_func i32 @__builtin_IB_atomic_xchg_local_i32(i32 addrspace(3)* %call4.i, i32 0)
81-
call spir_func void @__builtin_IB_memfence(i1 zeroext true, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false)
81+
call spir_func void @__builtin_IB_memfence(i1 zeroext true, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false, i1 zeroext false)
8282
br label %if.end5.i
8383

8484
if.end5.i: ; preds = %if.then3.i, %while.body.i
@@ -94,12 +94,12 @@ test_spinlock.exit: ; preds = %entry, %if.end5.i
9494
declare spir_func i32 addrspace(3)* @__builtin_IB_get_local_lock()
9595
declare spir_func i32 @__builtin_IB_atomic_xchg_local_i32(i32 addrspace(3)*, i32)
9696
declare spir_func i32 @__builtin_IB_atomic_cmpxchg_local_i32(i32 addrspace(3)*, i32, i32)
97-
declare spir_func void @__builtin_IB_memfence(i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext)
97+
declare spir_func void @__builtin_IB_memfence(i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext)
9898

9999
; CHECK: declare i32 @llvm.genx.GenISA.icmpxchgatomicraw.i32.p3i32.i32(i32 addrspace(3)*, i32, i32, i32)
100100
; CHECK: declare i32 @llvm.genx.GenISA.intatomicraw.i32.p3i32.i32(i32 addrspace(3)*, i32, i32, i32)
101101
; CHECK: declare i32 @__builtin_IB_get_local_id_x()
102102
; CHECK: declare i32 @__builtin_IB_get_local_id_y()
103103
; CHECK: declare i32 @__builtin_IB_get_local_id_z()
104-
; CHECK: declare void @llvm.genx.GenISA.memoryfence(i1, i1, i1, i1, i1, i1, i1, i1)
104+
; CHECK: declare void @llvm.genx.GenISA.memoryfence(i1, i1, i1, i1, i1, i1, i1)
105105
; CHECK: declare void @llvm.genx.GenISA.threadgroupbarrier()

IGC/GenISAIntrinsics/Intrinsic_definitions.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,8 +1426,7 @@
14261426
("bool", "L3_Flush_Texture_Data"),
14271427
("bool", "L3_Flush_Instructions"),
14281428
("bool", "Fence has global effect"),
1429-
("bool", "L1 Invalidate"),
1430-
("bool", "Force Local LSC scope")],
1429+
("bool", "L1 Invalidate")],
14311430
"Convergent"]],
14321431
####################################################################################################
14331432
"GenISA_mov_identity": ["",

IGC/LLVM3DBuilder/BuiltinsFrontendDefinitions.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -874,7 +874,6 @@ llvm::Value* LLVM3DBuilder<preserveNames, T, Inserter>::Create_MemoryFence(
874874
this->getInt1(flushInstructionCache),
875875
this->getInt1(globalFence),
876876
this->getInt1(false),
877-
this->getInt1(false),
878877
};
879878
llvm::Module* module = this->GetInsertBlock()->getParent()->getParent();
880879
return this->CreateCall(

0 commit comments

Comments
 (0)