[X86][Combine] Ensure single use chain in extract-load combine #136520
Conversation
The problem is that `SrcBC = peekThroughBitcasts(Src)` does not ensure a single-use chain. A bitcast on the way from `Src` to the load may have multiple users, so instead of replacing the broadcast load we end up introducing an additional one. The situation is made worse by the fact that we have replaced the chain token of the original load, so its correct memory ordering is no longer guaranteed.
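To make the new condition concrete, the guard can be restated as a standalone predicate. This is a sketch only: `canNarrowBroadcastLoad` is a hypothetical name, and the body merely combines the existing `llvm::peekThroughBitcasts` / `llvm::peekThroughOneUseBitcasts` helpers with the `Src`/`SrcBC` naming used in `combineExtractWithShuffle`; it is not the surrounding combine code.

// Sketch, not the actual combine code (hypothetical helper name).
static bool canNarrowBroadcastLoad(SDValue Src) {
  // peekThroughBitcasts strips every bitcast regardless of use counts, so
  // SrcBC.hasOneUse() alone says nothing about a bitcast sitting between the
  // extract and the broadcast load.
  SDValue SrcBC = peekThroughBitcasts(Src);
  // Requiring the one-use walk to also reach the load means no shared bitcast
  // below Src keeps other users of the original VBROADCAST_LOAD around while
  // its chain token is being rewired.
  return peekThroughOneUseBitcasts(Src).getOpcode() == X86ISD::VBROADCAST_LOAD &&
         SrcBC.hasOneUse();
}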
@llvm/pr-subscribers-backend-x86
Author: Evgenii Kudriashov (e-kud)
Changes: see the PR description above.
Full diff: https://github.com/llvm/llvm-project/pull/136520.diff
2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 024b653f78cb5..1faf55025db47 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46258,7 +46258,8 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
// If we're extracting a single element from a broadcast load and there are
// no other users, just create a single load.
- if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
+ if (peekThroughOneUseBitcasts(Src).getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ SrcBC.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 022b25a241533..ce68eebd5b752 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -528,3 +528,73 @@ define i32 @main() nounwind {
%r = add i32 %e1, %e2
ret i32 %r
}
+
+; A test for an incorrect combine of a single-element extraction from a VBROADCAST_LOAD.
+; The wrong combine makes the second call (%t8) use the result stored by the
+; preceding instructions instead of %t4.
+declare <2 x float> @ccosf(<2 x float>)
+define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr %p, ptr %arr) nounwind {
+; X86-SSE2-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT: movups 24(%esi), %xmm0
+; X86-SSE2-NEXT: movups %xmm0, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: movhps %xmm0, (%eax)
+; X86-SSE2-NEXT: movaps 32(%esi), %xmm0
+; X86-SSE2-NEXT: calll ccosf@PLT
+; X86-SSE2-NEXT: movlps %xmm0, 32(%esi)
+; X86-SSE2-NEXT: movups (%esp), %xmm0 # 16-byte Reload
+; X86-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X86-SSE2-NEXT: calll ccosf@PLT
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: retl
+;
+; X64-SSSE3-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+; X64-SSSE3: # %bb.0:
+; X64-SSSE3-NEXT: pushq %rbx
+; X64-SSSE3-NEXT: subq $16, %rsp
+; X64-SSSE3-NEXT: movq %rsi, %rbx
+; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-SSSE3-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill
+; X64-SSSE3-NEXT: movlpd %xmm0, (%rdi)
+; X64-SSSE3-NEXT: movaps 32(%rsi), %xmm0
+; X64-SSSE3-NEXT: callq ccosf@PLT
+; X64-SSSE3-NEXT: movlps %xmm0, 32(%rbx)
+; X64-SSSE3-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; X64-SSSE3-NEXT: callq ccosf@PLT
+; X64-SSSE3-NEXT: addq $16, %rsp
+; X64-SSSE3-NEXT: popq %rbx
+; X64-SSSE3-NEXT: retq
+;
+; X64-AVX-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rbx
+; X64-AVX-NEXT: subq $16, %rsp
+; X64-AVX-NEXT: movq %rsi, %rbx
+; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX-NEXT: vmovlps %xmm0, (%rdi)
+; X64-AVX-NEXT: vmovaps 32(%rsi), %xmm0
+; X64-AVX-NEXT: callq ccosf@PLT
+; X64-AVX-NEXT: vmovlps %xmm0, 32(%rbx)
+; X64-AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; X64-AVX-NEXT: callq ccosf@PLT
+; X64-AVX-NEXT: addq $16, %rsp
+; X64-AVX-NEXT: popq %rbx
+; X64-AVX-NEXT: retq
+ %p1 = getelementptr [5 x <2 x float>], ptr %arr, i64 0, i64 3
+ %p2 = getelementptr inbounds [5 x <2 x float>], ptr %arr, i64 0, i64 4, i32 0
+ %t3 = load <4 x float>, ptr %p1, align 8
+ %t4 = shufflevector <4 x float> %t3, <4 x float> poison, <2 x i32> <i32 2, i32 3>
+ store <2 x float> %t4, ptr %p, align 16
+ %t5 = load <4 x float>, ptr %p2, align 32
+ %t6 = shufflevector <4 x float> %t5, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+ %t7 = call <2 x float> @ccosf(<2 x float> %t6)
+ store <2 x float> %t7, ptr %p2, align 32
+ %t8 = call <2 x float> @ccosf(<2 x float> %t4)
+ ret <2 x float> %t8
+}
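For reference, the difference between the two helpers is what the fix relies on. Below is a rough, simplified restatement of the real implementations in SelectionDAG.cpp (illustrative only, with hypothetical *Sketch names; not verbatim LLVM code).

// Simplified sketches of the existing helpers; not verbatim LLVM code.
static SDValue peekThroughBitcastsSketch(SDValue V) {
  // Strips every bitcast, no matter how many users the stripped nodes have.
  while (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}

static SDValue peekThroughOneUseBitcastsSketch(SDValue V) {
  // Only steps through a bitcast when the node underneath has a single use,
  // i.e. nothing else in the DAG still observes the peeked-through value.
  while (V.getOpcode() == ISD::BITCAST && V.getOperand(0).hasOneUse())
    V = V.getOperand(0);
  return V;
}

So `peekThroughOneUseBitcasts(Src)` only reaches the VBROADCAST_LOAD if every step goes into a node with no other users, which is what makes it safe to fold the load into a narrow one and rewire its chain.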
There are two commits in the PR. If the test looks OK, I'll precommit it so that the PR itself shows only the change with the restored spills. Probably in the precommit test I need to expand …
LGTM - cheers