From b605aab2700d1f657f09e898e77a13251e238596 Mon Sep 17 00:00:00 2001 From: Evgenii Kudriashov Date: Sun, 20 Apr 2025 17:32:38 -0700 Subject: [PATCH 1/2] Test precommit --- llvm/test/CodeGen/X86/extractelement-load.ll | 67 ++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index 022b25a241533..dfbf9f3b19fc2 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -528,3 +528,70 @@ define i32 @main() nounwind { %r = add i32 %e1, %e2 ret i32 %r } + +; A test for incorrect combine for single value extraction from VBROADCAST_LOAD. +; Wrong combine makes the second call (%t8) use the stored result in the +; previous instructions instead of %t4. +declare <2 x float> @ccosf(<2 x float>) +define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr %p, ptr %arr) nounwind { +; X86-SSE2-LABEL: multiuse_of_single_value_from_vbroadcast_load: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE2-NEXT: movups 24(%esi), %xmm0 +; X86-SSE2-NEXT: movups %xmm0, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: movhps %xmm0, (%eax) +; X86-SSE2-NEXT: movaps 32(%esi), %xmm0 +; X86-SSE2-NEXT: calll ccosf@PLT +; X86-SSE2-NEXT: movlps %xmm0, 32(%esi) +; X86-SSE2-NEXT: movups (%esp), %xmm0 # 16-byte Reload +; X86-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE2-NEXT: calll ccosf@PLT +; X86-SSE2-NEXT: addl $16, %esp +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: retl +; +; X64-SSSE3-LABEL: multiuse_of_single_value_from_vbroadcast_load: +; X64-SSSE3: # %bb.0: +; X64-SSSE3-NEXT: pushq %rbx +; X64-SSSE3-NEXT: subq $16, %rsp +; X64-SSSE3-NEXT: movq %rsi, %rbx +; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; X64-SSSE3-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; X64-SSSE3-NEXT: movlpd %xmm0, (%rdi) +; X64-SSSE3-NEXT: movaps 32(%rsi), %xmm0 +; X64-SSSE3-NEXT: callq ccosf@PLT +; X64-SSSE3-NEXT: movlps %xmm0, 32(%rbx) +; X64-SSSE3-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSSE3-NEXT: callq ccosf@PLT +; X64-SSSE3-NEXT: addq $16, %rsp +; X64-SSSE3-NEXT: popq %rbx +; X64-SSSE3-NEXT: retq +; +; X64-AVX-LABEL: multiuse_of_single_value_from_vbroadcast_load: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: pushq %rbx +; X64-AVX-NEXT: movq %rsi, %rbx +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovsd %xmm0, (%rdi) +; X64-AVX-NEXT: vmovaps 32(%rsi), %xmm0 +; X64-AVX-NEXT: callq ccosf@PLT +; X64-AVX-NEXT: vmovlps %xmm0, 32(%rbx) +; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX-NEXT: callq ccosf@PLT +; X64-AVX-NEXT: popq %rbx +; X64-AVX-NEXT: retq + %p1 = getelementptr [5 x <2 x float>], ptr %arr, i64 0, i64 3 + %p2 = getelementptr inbounds [5 x <2 x float>], ptr %arr, i64 0, i64 4, i32 0 + %t3 = load <4 x float>, ptr %p1, align 8 + %t4 = shufflevector <4 x float> %t3, <4 x float> poison, <2 x i32> + store <2 x float> %t4, ptr %p, align 16 + %t5 = load <4 x float>, ptr %p2, align 32 + %t6 = shufflevector <4 x float> %t5, <4 x float> poison, <2 x i32> + %t7 = call <2 x float> @ccosf(<2 x float> %t6) + store <2 x float> %t7, ptr %p2, align 32 + %t8 = call <2 x float> @ccosf(<2 x float> %t4) + ret <2 x float> %t8 +} From 4d0a3ae7818b25f8f4f362b960912c5f0a2d3345 Mon Sep 17 00:00:00 2001 From: Evgenii Kudriashov Date: Sun, 20 Apr 2025 17:37:33 -0700 Subject: [PATCH 2/2] [X86][Combine] Ensure single use chain in extract-load combine The problem is that `SrcBC = peekThroughBitcasts(Src)` doesn't ensure single use chain. It results in the situation when a cast may have multiple users and instead of replacing a load we introduce a new one. The situation is worsened by the fact that we've replaced the token from the original load and its correct memory order now is not guaranteed. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 ++- llvm/test/CodeGen/X86/extractelement-load.ll | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 024b653f78cb5..1faf55025db47 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -46258,7 +46258,8 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, // If we're extracting a single element from a broadcast load and there are // no other users, just create a single load. - if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) { + if (peekThroughOneUseBitcasts(Src).getOpcode() == X86ISD::VBROADCAST_LOAD && + SrcBC.hasOneUse()) { auto *MemIntr = cast(SrcBC); unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index dfbf9f3b19fc2..ce68eebd5b752 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -573,14 +573,17 @@ define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr ; X64-AVX-LABEL: multiuse_of_single_value_from_vbroadcast_load: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: pushq %rbx +; X64-AVX-NEXT: subq $16, %rsp ; X64-AVX-NEXT: movq %rsi, %rbx -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: vmovsd %xmm0, (%rdi) +; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; X64-AVX-NEXT: vmovlps %xmm0, (%rdi) ; X64-AVX-NEXT: vmovaps 32(%rsi), %xmm0 ; X64-AVX-NEXT: callq ccosf@PLT ; X64-AVX-NEXT: vmovlps %xmm0, 32(%rbx) -; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; X64-AVX-NEXT: callq ccosf@PLT +; X64-AVX-NEXT: addq $16, %rsp ; X64-AVX-NEXT: popq %rbx ; X64-AVX-NEXT: retq %p1 = getelementptr [5 x <2 x float>], ptr %arr, i64 0, i64 3