diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dfaf58e753fb7..aab0809c92f43 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46258,7 +46258,8 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
 
   // If we're extracting a single element from a broadcast load and there are
   // no other users, just create a single load.
-  if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
+  if (peekThroughOneUseBitcasts(Src).getOpcode() == X86ISD::VBROADCAST_LOAD &&
+      SrcBC.hasOneUse()) {
     auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
     unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
     if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index c251f2a22f83a..ce68eebd5b752 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -573,14 +573,17 @@ define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr
 ; X64-AVX-LABEL: multiuse_of_single_value_from_vbroadcast_load:
 ; X64-AVX: # %bb.0:
 ; X64-AVX-NEXT: pushq %rbx
+; X64-AVX-NEXT: subq $16, %rsp
 ; X64-AVX-NEXT: movq %rsi, %rbx
-; X64-AVX-NEXT: vmovsd 32(%rsi), %xmm0 # xmm0 = mem[0],zero
-; X64-AVX-NEXT: vmovsd %xmm0, (%rdi)
+; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX-NEXT: vmovlps %xmm0, (%rdi)
 ; X64-AVX-NEXT: vmovaps 32(%rsi), %xmm0
 ; X64-AVX-NEXT: callq ccosf@PLT
 ; X64-AVX-NEXT: vmovlps %xmm0, 32(%rbx)
-; X64-AVX-NEXT: vmovddup 32(%rbx), %xmm0 # xmm0 = mem[0,0]
+; X64-AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; X64-AVX-NEXT: callq ccosf@PLT
+; X64-AVX-NEXT: addq $16, %rsp
 ; X64-AVX-NEXT: popq %rbx
 ; X64-AVX-NEXT: retq
   %p1 = getelementptr [5 x <2 x float>], ptr %arr, i64 0, i64 3