Skip to content

Commit bfd4155

Browse files
authored
[VPlan] Don't apply predication discount to non-originally-predicated blocks (#160449)
Split off from #158690. Currently, if an instruction needs to be predicated only because of tail folding, a predication discount is still applied to it in multiple places. This is likely inaccurate, because a tail-folded instruction can be expected to execute on every iteration except the last. This commit fixes that by checking whether the instruction/block was originally predicated, and in doing so prevents vectorization with tail folding in cases where we would have had to scalarize the memory op anyway. On llvm-test-suite this causes 4 loops in total to no longer be vectorized with -O3 on arm64-apple-darwin, and there is no observable performance impact.
1 parent 1ffe79d commit bfd4155

File tree

10 files changed

+134
-567
lines changed

10 files changed

+134
-567
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,6 +1232,30 @@ class LoopVectorizationCostModel {
12321232
/// Superset of instructions that return true for isScalarWithPredication.
12331233
bool isPredicatedInst(Instruction *I) const;
12341234

1235+
/// A helper function that returns how much we should divide the cost of a
1236+
/// predicated block by. Typically this is the reciprocal of the block
1237+
/// probability, i.e. if we return X we are assuming the predicated block will
1238+
/// execute once for every X iterations of the loop header so the block should
1239+
/// only contribute 1/X of its cost to the total cost calculation, but when
1240+
/// optimizing for code size it will just be 1 as code size costs don't depend
1241+
/// on execution probabilities.
1242+
///
1243+
/// TODO: We should use actual block probability here, if available.
1244+
/// Currently, we always assume predicated blocks have a 50% chance of
1245+
/// executing, apart from blocks that are only predicated due to tail folding.
1246+
inline unsigned
1247+
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1248+
BasicBlock *BB) const {
1249+
// If a block wasn't originally predicated but was predicated due to
1250+
// e.g. tail folding, don't divide the cost. Tail folded loops may still be
1251+
// predicated in the final vector loop iteration, but for most loops that
1252+
// don't have low trip counts we can expect their probability to be close to
1253+
// zero.
1254+
if (!Legal->blockNeedsPredication(BB))
1255+
return 1;
1256+
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
1257+
}
1258+
12351259
/// Return the costs for our two available strategies for lowering a
12361260
/// div/rem operation which requires speculating at least one lane.
12371261
/// First result is for scalarization (will be invalid for scalable
@@ -2887,7 +2911,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
28872911
// Scale the cost by the probability of executing the predicated blocks.
28882912
// This assumes the predicated block for each vector lane is equally
28892913
// likely.
2890-
ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
2914+
ScalarizationCost =
2915+
ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
28912916
}
28922917

28932918
InstructionCost SafeDivisorCost = 0;
@@ -5032,7 +5057,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
50325057
}
50335058

50345059
// Scale the total scalar cost by block probability.
5035-
ScalarCost /= getPredBlockCostDivisor(CostKind);
5060+
ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());
50365061

50375062
// Compute the discount. A non-negative discount means the vector version
50385063
// of the instruction costs more, and scalarizing would be beneficial.
@@ -5082,10 +5107,11 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
50825107
// stores and instructions that may divide by zero) will now be
50835108
// unconditionally executed. For the scalar case, we may not always execute
50845109
// the predicated block, if it is an if-else block. Thus, scale the block's
5085-
// cost by the probability of executing it. blockNeedsPredication from
5086-
// Legal is used so as to not include all blocks in tail folded loops.
5087-
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5088-
BlockCost /= getPredBlockCostDivisor(CostKind);
5110+
// cost by the probability of executing it.
5111+
// getPredBlockCostDivisor will return 1 for blocks that are only predicated
5112+
// by the header mask when folding the tail.
5113+
if (VF.isScalar())
5114+
BlockCost /= getPredBlockCostDivisor(CostKind, BB);
50895115

50905116
Cost += BlockCost;
50915117
}
@@ -5164,7 +5190,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
51645190
// conditional branches, but may not be executed for each vector lane. Scale
51655191
// the cost by the probability of executing the predicated block.
51665192
if (isPredicatedInst(I)) {
5167-
Cost /= getPredBlockCostDivisor(CostKind);
5193+
Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
51685194

51695195
// Add the cost of an i1 extract and a branch
51705196
auto *VecI1Ty =
@@ -6732,6 +6758,10 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
67326758
SkipCostComputation.contains(UI);
67336759
}
67346760

6761+
unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
6762+
return CM.getPredBlockCostDivisor(CostKind, BB);
6763+
}
6764+
67356765
InstructionCost
67366766
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
67376767
VPCostContext &CostCtx) const {

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
5050
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
5151
int64_t Step);
5252

53-
/// A helper function that returns how much we should divide the cost of a
54-
/// predicated block by. Typically this is the reciprocal of the block
55-
/// probability, i.e. if we return X we are assuming the predicated block will
56-
/// execute once for every X iterations of the loop header so the block should
57-
/// only contribute 1/X of its cost to the total cost calculation, but when
58-
/// optimizing for code size it will just be 1 as code size costs don't depend
59-
/// on execution probabilities.
60-
///
61-
/// TODO: We should use actual block probability here, if available. Currently,
62-
/// we always assume predicated blocks have a 50% chance of executing.
63-
inline unsigned
64-
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
65-
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
66-
}
67-
6853
/// A range of powers-of-2 vectorization factors with fixed start and
6954
/// adjustable end. The range includes start and excludes end, e.g.,:
7055
/// [1, 16) = {1, 2, 4, 8}
@@ -367,6 +352,10 @@ struct VPCostContext {
367352
/// has already been pre-computed.
368353
bool skipCostComputation(Instruction *UI, bool IsVector) const;
369354

355+
/// \returns how much the cost of a predicated block should be divided by.
356+
/// Forwards to LoopVectorizationCostModel::getPredBlockCostDivisor.
357+
unsigned getPredBlockCostDivisor(BasicBlock *BB) const;
358+
370359
/// Returns the OperandInfo for \p V, if it is a live-in.
371360
TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
372361

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3349,7 +3349,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
33493349
// Scale the cost by the probability of executing the predicated blocks.
33503350
// This assumes the predicated block for each vector lane is equally
33513351
// likely.
3352-
ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
3352+
ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
33533353
return ScalarCost;
33543354
}
33553355
case Instruction::Load:

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 10 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -613,63 +613,17 @@ exit:
613613
define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
614614
; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
615615
; COMMON-SAME: ptr [[DST:%.*]]) {
616-
; COMMON-NEXT: [[ENTRY:.*:]]
617-
; COMMON-NEXT: br label %[[VECTOR_PH:.*]]
618-
; COMMON: [[VECTOR_PH]]:
619-
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
620-
; COMMON: [[VECTOR_BODY]]:
621-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
622-
; COMMON: [[PRED_STORE_IF]]:
623-
; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0
624-
; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1
625-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]]
626-
; COMMON: [[PRED_STORE_CONTINUE]]:
627-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
628-
; COMMON: [[PRED_STORE_IF1]]:
629-
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1
630-
; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1
631-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]]
632-
; COMMON: [[PRED_STORE_CONTINUE2]]:
633-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
634-
; COMMON: [[PRED_STORE_IF3]]:
635-
; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2
636-
; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1
637-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]]
638-
; COMMON: [[PRED_STORE_CONTINUE4]]:
639-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
640-
; COMMON: [[PRED_STORE_IF5]]:
641-
; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3
642-
; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1
643-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]]
644-
; COMMON: [[PRED_STORE_CONTINUE6]]:
645-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
646-
; COMMON: [[PRED_STORE_IF7]]:
647-
; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4
648-
; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1
649-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]]
650-
; COMMON: [[PRED_STORE_CONTINUE8]]:
651-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
652-
; COMMON: [[PRED_STORE_IF9]]:
653-
; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5
654-
; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1
655-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]]
656-
; COMMON: [[PRED_STORE_CONTINUE10]]:
657-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
658-
; COMMON: [[PRED_STORE_IF11]]:
659-
; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6
660-
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
661-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
662-
; COMMON: [[PRED_STORE_CONTINUE12]]:
663-
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
664-
; COMMON: [[PRED_STORE_IF13]]:
665-
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
666-
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
667-
; COMMON-NEXT: br label %[[EXIT]]
616+
; COMMON-NEXT: [[ENTRY:.*]]:
617+
; COMMON-NEXT: br label %[[LOOP:.*]]
618+
; COMMON: [[LOOP]]:
619+
; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
620+
; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
621+
; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
622+
; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
623+
; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
624+
; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
625+
; COMMON-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
668626
; COMMON: [[EXIT]]:
669-
; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
670-
; COMMON: [[SCALAR_PH]]:
671-
; COMMON-NEXT: br label %[[EXIT1:.*]]
672-
; COMMON: [[EXIT1]]:
673627
; COMMON-NEXT: ret void
674628
;
675629
entry:

0 commit comments

Comments
 (0)