@@ -528,3 +528,70 @@ define i32 @main() nounwind {
 %r = add i32 %e1, %e2
 ret i32 %r
}
+
+ ; A test for incorrect combine for single value extraction from VBROADCAST_LOAD.
+ ; Wrong combine makes the second call (%t8) use the stored result in the
+ ; previous instructions instead of %t4.
+ declare <2 x float> @ccosf(<2 x float>)
+ define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr %p, ptr %arr) nounwind {
+ ; X86-SSE2-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+ ; X86-SSE2: # %bb.0:
+ ; X86-SSE2-NEXT: pushl %esi
+ ; X86-SSE2-NEXT: subl $16, %esp
+ ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+ ; X86-SSE2-NEXT: movups 24(%esi), %xmm0
+ ; X86-SSE2-NEXT: movups %xmm0, (%esp) # 16-byte Spill
+ ; X86-SSE2-NEXT: movhps %xmm0, (%eax)
+ ; X86-SSE2-NEXT: movaps 32(%esi), %xmm0
+ ; X86-SSE2-NEXT: calll ccosf@PLT
+ ; X86-SSE2-NEXT: movlps %xmm0, 32(%esi)
+ ; X86-SSE2-NEXT: movups (%esp), %xmm0 # 16-byte Reload
+ ; X86-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+ ; X86-SSE2-NEXT: calll ccosf@PLT
+ ; X86-SSE2-NEXT: addl $16, %esp
+ ; X86-SSE2-NEXT: popl %esi
+ ; X86-SSE2-NEXT: retl
+ ;
+ ; X64-SSSE3-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+ ; X64-SSSE3: # %bb.0:
+ ; X64-SSSE3-NEXT: pushq %rbx
+ ; X64-SSSE3-NEXT: subq $16, %rsp
+ ; X64-SSSE3-NEXT: movq %rsi, %rbx
+ ; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+ ; X64-SSSE3-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill
+ ; X64-SSSE3-NEXT: movlpd %xmm0, (%rdi)
+ ; X64-SSSE3-NEXT: movaps 32(%rsi), %xmm0
+ ; X64-SSSE3-NEXT: callq ccosf@PLT
+ ; X64-SSSE3-NEXT: movlps %xmm0, 32(%rbx)
+ ; X64-SSSE3-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+ ; X64-SSSE3-NEXT: callq ccosf@PLT
+ ; X64-SSSE3-NEXT: addq $16, %rsp
+ ; X64-SSSE3-NEXT: popq %rbx
+ ; X64-SSSE3-NEXT: retq
+ ;
+ ; X64-AVX-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+ ; X64-AVX: # %bb.0:
+ ; X64-AVX-NEXT: pushq %rbx
+ ; X64-AVX-NEXT: movq %rsi, %rbx
+ ; X64-AVX-NEXT: vmovsd 32(%rsi), %xmm0 # xmm0 = mem[0],zero
+ ; X64-AVX-NEXT: vmovsd %xmm0, (%rdi)
+ ; X64-AVX-NEXT: vmovaps 32(%rsi), %xmm0
+ ; X64-AVX-NEXT: callq ccosf@PLT
+ ; X64-AVX-NEXT: vmovlps %xmm0, 32(%rbx)
+ ; X64-AVX-NEXT: vmovddup 32(%rbx), %xmm0 # xmm0 = mem[0,0]
+ ; X64-AVX-NEXT: callq ccosf@PLT
+ ; X64-AVX-NEXT: popq %rbx
+ ; X64-AVX-NEXT: retq
+ %p1 = getelementptr [5 x <2 x float>], ptr %arr, i64 0, i64 3
+ %p2 = getelementptr inbounds [5 x <2 x float>], ptr %arr, i64 0, i64 4, i32 0
+ %t3 = load <4 x float>, ptr %p1, align 8
+ %t4 = shufflevector <4 x float> %t3, <4 x float> poison, <2 x i32> <i32 2, i32 3>
+ store <2 x float> %t4, ptr %p, align 16
+ %t5 = load <4 x float>, ptr %p2, align 32
+ %t6 = shufflevector <4 x float> %t5, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+ %t7 = call <2 x float> @ccosf(<2 x float> %t6)
+ store <2 x float> %t7, ptr %p2, align 32
+ %t8 = call <2 x float> @ccosf(<2 x float> %t4)
+ ret <2 x float> %t8
+ }
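
A quick sanity check on the aliasing the test comment describes (an illustrative calculation, assuming the usual 8-byte layout of <2 x float>): %p1 points at %arr+24 and %p2 at %arr+32, so the <4 x float> load through %p1 spans %arr+24..%arr+39 and its lanes 2-3 (%t4) are the same 8 bytes that lanes 0-1 of the load through %p2 (%t6) read. The store of %t7 to %p2 clobbers those bytes, so the second ccosf call must receive the pre-store value of %t4. The X86-SSE2 and X64-SSSE3 checks keep that value alive via a 16-byte stack spill and reload, while the X64-AVX checks re-read 32(%rbx) after the store, which appears to match the incorrect-combine behaviour the comment calls out.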