[VPlan] Use predicate in VPInstruction::computeCost for selects. (llvm#170278)

fhahn · fhahn · commit 8cd7527ce438 · 2025-12-09T13:53:33.000Z
In some cases, the lowering a select depends on the predicate. If the condition of a select is a compare instruction, thread the predicate through to the TTI hook. PR: llvm#170278 (cherry picked from commit 50916a4)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -991,9 +991,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
 
   switch (getOpcode()) {
   case Instruction::Select: {
-    // TODO: It may be possible to improve this by analyzing where the
-    // condition operand comes from.
-    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+    llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+    match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
     auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
     auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
     if (!vputils::onlyFirstLaneUsed(this)) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -996,14 +996,16 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[TMP0]]
 ; TFA_INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFA_INTERLEAVE:       [[VECTOR_BODY]]:
-; TFA_INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP18:.*]] ]
-; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP18]] ]
-; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP18]] ]
+; TFA_INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP19:.*]] ]
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP19]] ]
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP19]] ]
 ; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = load double, ptr [[P2]], align 8
 ; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
-; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = fcmp ule double [[TMP5]], 0.000000e+00
-; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = fcmp ule double [[TMP6]], 0.000000e+00
+; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00
+; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00
+; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = xor i1 [[TMP9]], true
+; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = xor i1 [[TMP10]], true
 ; TFA_INTERLEAVE-NEXT:    [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP7]], i1 false
 ; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP8]], i1 false
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP11]], double 1.000000e+00, double 0.000000e+00
@@ -1014,11 +1016,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = xor i1 [[TMP13]], true
 ; TFA_INTERLEAVE-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP14]], true
 ; TFA_INTERLEAVE-NEXT:    [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]]
-; TFA_INTERLEAVE-NEXT:    br i1 [[TMP17]], label %[[BB16:.*]], label %[[TMP18]]
-; TFA_INTERLEAVE:       [[BB16]]:
+; TFA_INTERLEAVE-NEXT:    br i1 [[TMP17]], label %[[BB18:.*]], label %[[TMP19]]
+; TFA_INTERLEAVE:       [[BB18]]:
 ; TFA_INTERLEAVE-NEXT:    store double [[SPEC_SELECT]], ptr [[P]], align 8
-; TFA_INTERLEAVE-NEXT:    br label %[[TMP18]]
-; TFA_INTERLEAVE:       [[TMP18]]:
+; TFA_INTERLEAVE-NEXT:    br label %[[TMP19]]
+; TFA_INTERLEAVE:       [[TMP19]]:
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 1
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
@@ -73,3 +73,26 @@ exit:
   %1 = select i1 %all.off, i32 1, i32 %0
   ret i32 %1
 }
+
+define i32 @select_vpinst_for_tail_folding(i8 %n) {
+; CHECK: LV: Checking a loop in 'select_vpinst_for_tail_folding'
+; CHECK: Cost of 1 for VF 2: EMIT vp<{{.+}}> = select vp<{{.+}}>, ir<%red.next>, ir<%red>
+; CHECK: Cost of 1 for VF 4: EMIT vp<{{.+}}> = select vp<{{.+}}>, ir<%red.next>, ir<%red>
+; CHECK: LV: Selecting VF: 4
+
+entry:
+  %c = icmp ne i8 %n, 0
+  %ext = zext i1 %c to i32
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %ext, %entry ], [ %iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %iv.next = add i32 %iv, 1
+  %red.next = mul i32 %red, %iv
+  %ec = icmp eq i32 %iv, 12
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %red.next
+}
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -136,10 +136,14 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP8]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP9]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP10]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = icmp ne i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP8]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP9]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP10]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = xor i1 [[TMP16]], true
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = xor i1 [[TMP17]], true
+; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = xor i1 [[TMP18]], true
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = xor i1 [[TMP19]], true
 ; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
 ; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
 ; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -512,10 +516,14 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP8]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP9]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP10]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = icmp ne i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP8]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP9]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP10]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = xor i1 [[TMP16]], true
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = xor i1 [[TMP17]], true
+; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = xor i1 [[TMP18]], true
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = xor i1 [[TMP19]], true
 ; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
 ; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
 ; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -700,10 +708,14 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
-; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = fcmp fast one float [[TMP8]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = fcmp fast one float [[TMP9]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = fcmp fast one float [[TMP10]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = fcmp fast one float [[TMP11]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP18:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP19:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = xor i1 [[TMP16]], true
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = xor i1 [[TMP17]], true
+; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = xor i1 [[TMP18]], true
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = xor i1 [[TMP19]], true
 ; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
 ; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
 ; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -888,10 +900,14 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
-; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = fcmp one float [[TMP8]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = fcmp one float [[TMP9]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = fcmp one float [[TMP10]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = fcmp one float [[TMP11]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP18:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP19:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = xor i1 [[TMP16]], true
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = xor i1 [[TMP17]], true
+; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = xor i1 [[TMP18]], true
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = xor i1 [[TMP19]], true
 ; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
 ; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
 ; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -1043,7 +1059,8 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4:       [[VECTOR_PH]]:
 ; CHECK-VF1IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; CHECK-VF1IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = icmp ne i32 [[A]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[A]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = xor i1 [[TMP1]], true
 ; CHECK-VF1IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF1IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF1IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]