Skip to content

Commit 8cd7527

Browse files
committed
[VPlan] Use predicate in VPInstruction::computeCost for selects. (llvm#170278)
In some cases, the lowering of a select depends on the predicate. If the condition of a select is a compare instruction, thread the predicate through to the TTI hook. PR: llvm#170278 (cherry picked from commit 50916a4)
1 parent 751f98f commit 8cd7527

File tree

4 files changed

+70
-29
lines changed

4 files changed

+70
-29
lines changed

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -991,9 +991,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
991991

992992
switch (getOpcode()) {
993993
case Instruction::Select: {
994-
// TODO: It may be possible to improve this by analyzing where the
995-
// condition operand comes from.
996-
CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
994+
llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
995+
match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
997996
auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
998997
auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
999998
if (!vputils::onlyFirstLaneUsed(this)) {

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -996,14 +996,16 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
996996
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[TMP0]]
997997
; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]]
998998
; TFA_INTERLEAVE: [[VECTOR_BODY]]:
999-
; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP18:.*]] ]
1000-
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP18]] ]
1001-
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP18]] ]
999+
; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP19:.*]] ]
1000+
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP19]] ]
1001+
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP19]] ]
10021002
; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8
10031003
; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
10041004
; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
1005-
; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = fcmp ule double [[TMP5]], 0.000000e+00
1006-
; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ule double [[TMP6]], 0.000000e+00
1005+
; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00
1006+
; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00
1007+
; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = xor i1 [[TMP9]], true
1008+
; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = xor i1 [[TMP10]], true
10071009
; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP7]], i1 false
10081010
; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP8]], i1 false
10091011
; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP11]], double 1.000000e+00, double 0.000000e+00
@@ -1014,11 +1016,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
10141016
; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor i1 [[TMP13]], true
10151017
; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor i1 [[TMP14]], true
10161018
; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]]
1017-
; TFA_INTERLEAVE-NEXT: br i1 [[TMP17]], label %[[BB16:.*]], label %[[TMP18]]
1018-
; TFA_INTERLEAVE: [[BB16]]:
1019+
; TFA_INTERLEAVE-NEXT: br i1 [[TMP17]], label %[[BB18:.*]], label %[[TMP19]]
1020+
; TFA_INTERLEAVE: [[BB18]]:
10191021
; TFA_INTERLEAVE-NEXT: store double [[SPEC_SELECT]], ptr [[P]], align 8
1020-
; TFA_INTERLEAVE-NEXT: br label %[[TMP18]]
1021-
; TFA_INTERLEAVE: [[TMP18]]:
1022+
; TFA_INTERLEAVE-NEXT: br label %[[TMP19]]
1023+
; TFA_INTERLEAVE: [[TMP19]]:
10221024
; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
10231025
; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1
10241026
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]]

llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,26 @@ exit:
7373
%1 = select i1 %all.off, i32 1, i32 %0
7474
ret i32 %1
7575
}
76+
77+
define i32 @select_vpinst_for_tail_folding(i8 %n) {
78+
; CHECK: LV: Checking a loop in 'select_vpinst_for_tail_folding'
79+
; CHECK: Cost of 1 for VF 2: EMIT vp<{{.+}}> = select vp<{{.+}}>, ir<%red.next>, ir<%red>
80+
; CHECK: Cost of 1 for VF 4: EMIT vp<{{.+}}> = select vp<{{.+}}>, ir<%red.next>, ir<%red>
81+
; CHECK: LV: Selecting VF: 4
82+
83+
entry:
84+
%c = icmp ne i8 %n, 0
85+
%ext = zext i1 %c to i32
86+
br label %loop
87+
88+
loop:
89+
%iv = phi i32 [ %ext, %entry ], [ %iv.next, %loop ]
90+
%red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
91+
%iv.next = add i32 %iv, 1
92+
%red.next = mul i32 %red, %iv
93+
%ec = icmp eq i32 %iv, 12
94+
br i1 %ec, label %exit, label %loop
95+
96+
exit:
97+
ret i32 %red.next
98+
}

llvm/test/Transforms/LoopVectorize/select-cmp.ll

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,14 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) {
136136
; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
137137
; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
138138
; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
139-
; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP8]], 3
140-
; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP9]], 3
141-
; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP10]], 3
142-
; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP11]], 3
139+
; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP8]], 3
140+
; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP9]], 3
141+
; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP10]], 3
142+
; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP11]], 3
143+
; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true
144+
; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true
145+
; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true
146+
; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true
143147
; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
144148
; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
145149
; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -512,10 +516,14 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) {
512516
; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
513517
; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
514518
; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
515-
; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP8]], 3
516-
; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP9]], 3
517-
; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP10]], 3
518-
; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP11]], 3
519+
; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP8]], 3
520+
; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP9]], 3
521+
; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP10]], 3
522+
; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP11]], 3
523+
; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true
524+
; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true
525+
; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true
526+
; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true
519527
; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
520528
; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
521529
; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -700,10 +708,14 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) {
700708
; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4
701709
; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
702710
; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
703-
; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp fast one float [[TMP8]], 3.000000e+00
704-
; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp fast one float [[TMP9]], 3.000000e+00
705-
; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp fast one float [[TMP10]], 3.000000e+00
706-
; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp fast one float [[TMP11]], 3.000000e+00
711+
; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00
712+
; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00
713+
; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00
714+
; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00
715+
; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true
716+
; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true
717+
; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true
718+
; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true
707719
; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
708720
; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
709721
; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -888,10 +900,14 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) {
888900
; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4
889901
; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
890902
; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
891-
; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp one float [[TMP8]], 3.000000e+00
892-
; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp one float [[TMP9]], 3.000000e+00
893-
; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp one float [[TMP10]], 3.000000e+00
894-
; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp one float [[TMP11]], 3.000000e+00
903+
; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00
904+
; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00
905+
; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00
906+
; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00
907+
; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true
908+
; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true
909+
; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true
910+
; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true
895911
; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
896912
; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
897913
; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -1043,7 +1059,8 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
10431059
; CHECK-VF1IC4: [[VECTOR_PH]]:
10441060
; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
10451061
; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
1046-
; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = icmp ne i32 [[A]], 3
1062+
; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = icmp eq i32 [[A]], 3
1063+
; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = xor i1 [[TMP1]], true
10471064
; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
10481065
; CHECK-VF1IC4: [[VECTOR_BODY]]:
10491066
; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]

0 commit comments

Comments
 (0)