@@ -180,6 +180,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
180
180
STATISTIC(LoopsVectorized, "Number of loops vectorized");
181
181
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
182
182
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
183
+ STATISTIC(CSAsVectorized,
184
+ "Number of conditional scalar assignments vectorized");
183
185
184
186
static cl::opt<bool> EnableEpilogueVectorization(
185
187
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -500,6 +502,10 @@ class InnerLoopVectorizer {
500
502
virtual std::pair<BasicBlock *, Value *>
501
503
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502
504
505
+ /// For all vectorized CSAs, replace uses of live-out scalar from the original
506
+ /// loop with the extracted scalar from the vector loop.
507
+ void fixCSALiveOuts(VPTransformState &State, VPlan &Plan);
508
+
503
509
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
504
510
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
505
511
@@ -2932,6 +2938,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2932
2938
TargetTransformInfo::TCK_RecipThroughput);
2933
2939
}
2934
2940
2941
+ void InnerLoopVectorizer::fixCSALiveOuts(VPTransformState &State, VPlan &Plan) {
2942
+ for (const auto &CSA: Plan.getCSAStates()) {
2943
+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second->getDataUpdate();
2944
+ assert(VPDataUpdate &&
2945
+ "VPDataUpdate must have been introduced prior to fixing live outs");
2946
+ Value *V = VPDataUpdate->getUnderlyingValue();
2947
+ Value *ExtractedScalar = State.get(CSA.second->getExtractScalarRecipe(), 0,
2948
+ /*NeedsScalar=*/true);
2949
+ // Fix LCSSAPhis
2950
+ llvm::SmallPtrSet<PHINode *, 2> ToFix;
2951
+ for (User *U : V->users())
2952
+ if (auto *Phi = dyn_cast<PHINode>(U);
2953
+ Phi && Phi->getParent() == LoopExitBlock)
2954
+ ToFix.insert(Phi);
2955
+ for (PHINode *Phi : ToFix)
2956
+ Phi->addIncoming(ExtractedScalar, LoopMiddleBlock);
2957
+ }
2958
+ }
2959
+
2935
2960
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
2936
2961
VPlan &Plan) {
2937
2962
// Fix widened non-induction PHIs by setting up the PHI operands.
@@ -2972,6 +2997,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
2972
2997
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
2973
2998
IVEndValues[Entry.first], LoopMiddleBlock,
2974
2999
VectorLoop->getHeader(), Plan, State);
3000
+
3001
+ fixCSALiveOuts(State, Plan);
2975
3002
}
2976
3003
2977
3004
// Fix live-out phis not already fixed earlier.
@@ -4110,7 +4137,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4110
4137
// found modulo the vectorization factor is not zero, try to fold the tail
4111
4138
// by masking.
4112
4139
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4113
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4114
4140
if (foldTailByMasking()) {
4115
4141
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4116
4142
LLVM_DEBUG(
@@ -4482,6 +4508,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4482
4508
case VPDef::VPEVLBasedIVPHISC:
4483
4509
case VPDef::VPPredInstPHISC:
4484
4510
case VPDef::VPBranchOnMaskSC:
4511
+ case VPRecipeBase::VPCSADataUpdateSC:
4512
+ case VPRecipeBase::VPCSAExtractScalarSC:
4513
+ case VPRecipeBase::VPCSAHeaderPHISC:
4485
4514
continue;
4486
4515
case VPDef::VPReductionSC:
4487
4516
case VPDef::VPActiveLaneMaskPHISC:
@@ -6995,6 +7024,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6995
7024
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
6996
7025
return;
6997
7026
7027
+ CM.setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
7028
+
6998
7029
// Invalidate interleave groups if all blocks of loop will be predicated.
6999
7030
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7000
7031
!useMaskedInterleavedAccesses(TTI)) {
@@ -8476,9 +8507,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8476
8507
return Recipe;
8477
8508
8478
8509
VPHeaderPHIRecipe *PhiRecipe = nullptr;
8479
- assert((Legal->isReductionVariable(Phi) ||
8480
- Legal->isFixedOrderRecurrence(Phi)) &&
8481
- "can only widen reductions and fixed-order recurrences here");
8482
8510
VPValue *StartV = Operands[0];
8483
8511
if (Legal->isReductionVariable(Phi)) {
8484
8512
const RecurrenceDescriptor &RdxDesc =
@@ -8488,12 +8516,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8488
8516
PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8489
8517
CM.isInLoopReduction(Phi),
8490
8518
CM.useOrderedReductions(RdxDesc));
8491
- } else {
8519
+ } else if (Legal->isFixedOrderRecurrence(Phi)) {
8492
8520
// TODO: Currently fixed-order recurrences are modeled as chains of
8493
8521
// first-order recurrences. If there are no users of the intermediate
8494
8522
// recurrences in the chain, the fixed order recurrence should be modeled
8495
8523
// directly, enabling more efficient codegen.
8496
8524
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8525
+ } else if (Legal->isCSAPhi(Phi)) {
8526
+ VPCSAState *State = Plan.getCSAStates().find(Phi)->second;
8527
+ VPValue *InitData = State->getVPInitData();
8528
+ // When the VF=getFixed(1), InitData is just InitScalar.
8529
+ if (!InitData)
8530
+ InitData = State->getVPInitScalar();
8531
+ PhiRecipe = new VPCSAHeaderPHIRecipe(Phi, InitData);
8532
+ State->setPhiRecipe(cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8533
+ } else {
8534
+ llvm_unreachable(
8535
+ "can only widen reductions, fixed-order recurrences, and CSAs here");
8497
8536
}
8498
8537
8499
8538
PhisToFix.push_back(PhiRecipe);
@@ -8523,6 +8562,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8523
8562
make_range(Operands.begin(), Operands.end()));
8524
8563
8525
8564
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8565
+ auto *CSADescIt = find_if(Legal->getCSAs(), [&](auto CSA) {
8566
+ return CSADescriptor::isCSASelect(CSA.second, SI);
8567
+ });
8568
+ if (CSADescIt != Legal->getCSAs().end()) {
8569
+ PHINode *CSAPhi = CSADescIt->first;
8570
+ VPCSAState *State = Plan.getCSAStates().find(CSAPhi)->second;
8571
+ VPValue *VPDataPhi = State->getPhiRecipe();
8572
+ auto *R = new VPCSADataUpdateRecipe(
8573
+ SI, {VPDataPhi, Operands[0], Operands[1], Operands[2]});
8574
+ State->setDataUpdate(R);
8575
+ return R;
8576
+ }
8577
+
8526
8578
return new VPWidenSelectRecipe(
8527
8579
*SI, make_range(Operands.begin(), Operands.end()));
8528
8580
}
@@ -8535,6 +8587,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8535
8587
return tryToWiden(Instr, Operands, VPBB);
8536
8588
}
8537
8589
8590
+ /// Add CSA Recipes that can occur before each instruction in the input IR
8591
+ /// is processed and introduced into VPlan.
8592
+ static void
8593
+ addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
8594
+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8595
+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8596
+ VPlan &Plan) {
8597
+
8598
+ // Don't build full CSA for VF=ElementCount::getFixed(1)
8599
+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange(
8600
+ [&](ElementCount VF) { return VF.isScalar(); }, Range);
8601
+
8602
+ for (const auto &CSA : CSAs) {
8603
+ VPValue *VPInitScalar = Plan.getOrAddLiveIn(
8604
+ CSA.first->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8605
+
8606
+ // Scalar VF builds the scalar version of the loop. In that case,
8607
+ // no maintenance of mask nor extraction in middle block is needed.
8608
+ if (IsScalarVF) {
8609
+ VPCSAState *S = new VPCSAState(VPInitScalar);
8610
+ Plan.addCSAState(CSA.first, S);
8611
+ continue;
8612
+ }
8613
+
8614
+ auto *VPInitMask = new VPInstruction(VPInstruction::CSAInitMask, {}, DL,
8615
+ "csa.init.mask");
8616
+ auto *VPInitData = new VPInstruction(VPInstruction::CSAInitData,
8617
+ {VPInitScalar}, DL, "csa.init.data");
8618
+ PreheaderVPBB->appendRecipe(VPInitMask);
8619
+ PreheaderVPBB->appendRecipe(VPInitData);
8620
+
8621
+ auto *VPMaskPhi = new VPInstruction(VPInstruction::CSAMaskPhi, {VPInitMask},
8622
+ DL, "csa.mask.phi");
8623
+ HeaderVPBB->appendRecipe(VPMaskPhi);
8624
+
8625
+ auto *S = new VPCSAState(VPInitScalar, VPInitData, VPMaskPhi);
8626
+ Plan.addCSAState(CSA.first, S);
8627
+ }
8628
+ }
8629
+
8630
+ /// Add CSA Recipes that must occur after each instruction in the input IR
8631
+ /// is processed and introduced into VPlan.
8632
+ static void
8633
+ addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
8634
+ const LoopVectorizationLegality::CSAList &CSAs,
8635
+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8636
+ VPlan &Plan) {
8637
+ // Don't build CSA for VF=ElementCount::getFixed(1)
8638
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
8639
+ [&](ElementCount VF) { return VF.isScalar(); }, Range))
8640
+ return;
8641
+
8642
+ for (const auto &CSA : CSAs) {
8643
+ VPCSAState *CSAState = Plan.getCSAStates().find(CSA.first)->second;
8644
+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate();
8645
+
8646
+ assert(VPDataUpdate &&
8647
+ "VPDataUpdate must have been introduced prior to postprocess");
8648
+ assert(CSA.second.getCond() &&
8649
+ "CSADescriptor must know how to describe the condition");
8650
+ auto GetVPValue = [&](Value *I) {
8651
+ return RecipeBuilder.getRecipe(cast<Instruction>(I))->getVPSingleValue();
8652
+ };
8653
+ VPValue *WidenedCond = GetVPValue(CSA.second.getCond());
8654
+ VPValue *VPInitScalar = CSAState->getVPInitScalar();
8655
+
8656
+ // The CSA optimization wants to use a condition such that when it is
8657
+ // true, a new value is assigned. However, it is possible that a true lane
8658
+ // in WidenedCond corresponds to selection of the initial value instead.
8659
+ // In that case, we must use the negation of WidenedCond.
8660
+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8661
+ VPValue *CondToUse = WidenedCond;
8662
+ if (cast<SelectInst>(CSA.second.getAssignment())->getTrueValue() ==
8663
+ CSA.first) {
8664
+ auto *VPNotCond = new VPInstruction(VPInstruction::Not, WidenedCond, DL);
8665
+ VPNotCond->insertBefore(
8666
+ GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
8667
+ CondToUse = VPNotCond;
8668
+ }
8669
+
8670
+ auto *VPAnyActive = new VPInstruction(
8671
+ VPInstruction::CSAAnyActive, {CondToUse}, DL, "csa.cond.anyactive");
8672
+ VPAnyActive->insertBefore(
8673
+ GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
8674
+
8675
+ auto *VPMaskSel = new VPInstruction(
8676
+ VPInstruction::CSAMaskSel,
8677
+ {CondToUse, CSAState->getVPMaskPhi(), VPAnyActive}, DL, "csa.mask.sel");
8678
+ VPMaskSel->insertAfter(VPAnyActive);
8679
+ VPDataUpdate->setVPNewMaskAndVPAnyActive(VPMaskSel, VPAnyActive);
8680
+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8681
+ new VPCSAExtractScalarRecipe({VPInitScalar, VPMaskSel, VPDataUpdate});
8682
+
8683
+ MiddleVPBB->insert(ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi());
8684
+
8685
+ // Update CSAState with new recipes
8686
+ CSAState->setExtractScalarRecipe(ExtractScalarRecipe);
8687
+ CSAState->setVPAnyActive(VPAnyActive);
8688
+ }
8689
+ }
8690
+
8538
8691
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8539
8692
ElementCount MaxVF) {
8540
8693
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8591,7 +8744,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8591
8744
// VPWidenPointerInductionRecipe and induction increments.
8592
8745
static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
8593
8746
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8594
- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8747
+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8748
+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
8595
8749
auto MiddleVPBB =
8596
8750
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
8597
8751
// No edge from the middle block to the unique exit block has been inserted
@@ -8620,6 +8774,17 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
8620
8774
return P && Inductions.contains(P);
8621
8775
})))
8622
8776
continue;
8777
+ // Exit values for CSAs are computed and updated outside of VPlan and
8778
+ // independent of induction recipes.
8779
+ // TODO: Compute CSA exit values in VPlan, use VPLiveOuts to update
8780
+ // live-outs.
8781
+ if (isa<VPCSADataUpdateRecipe>(V) &&
8782
+ (isa<Instruction>(IncomingValue) &&
8783
+ any_of(IncomingValue->users(), [&CSAs](User *U) {
8784
+ auto *P = dyn_cast<PHINode>(U);
8785
+ return P && CSAs.contains(P);
8786
+ })))
8787
+ continue;
8623
8788
ExitingValuesToFix.insert({&ExitPhi, V});
8624
8789
}
8625
8790
return ExitingValuesToFix;
@@ -8861,6 +9026,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8861
9026
bool HasNUW = Style == TailFoldingStyle::None;
8862
9027
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8863
9028
9029
+ // CSA vectorization is only supported for None or DataWithEVL tail folding
9030
+ // styles.
9031
+ // FIXME: Implement CSA for more tail folding styles
9032
+ if (Style != TailFoldingStyle::None &&
9033
+ Style != TailFoldingStyle::DataWithEVL && !Legal->getCSAs().empty())
9034
+ return nullptr;
9035
+
9036
+ addCSAPreprocessRecipes(Legal->getCSAs(), OrigLoop, Plan->getPreheader(),
9037
+ Plan->getVectorLoopRegion()->getEntryBasicBlock(), DL,
9038
+ Range, *Plan);
9039
+
8864
9040
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8865
9041
8866
9042
// ---------------------------------------------------------------------------
@@ -8967,6 +9143,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8967
9143
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8968
9144
}
8969
9145
9146
+ VPBasicBlock *MiddleVPBB =
9147
+ cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
9148
+ addCSAPostprocessRecipes(RecipeBuilder, Legal->getCSAs(), MiddleVPBB, DL,
9149
+ Range, *Plan);
9150
+
8970
9151
// After here, VPBB should not be used.
8971
9152
VPBB = nullptr;
8972
9153
@@ -8976,8 +9157,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8976
9157
"VPBasicBlock");
8977
9158
RecipeBuilder.fixHeaderPhis();
8978
9159
8979
- MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock(
8980
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
9160
+ MapVector<PHINode *, VPValue *> ExitingValuesToFix =
9161
+ collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
9162
+ Legal->getInductionVars(), Legal->getCSAs());
8981
9163
8982
9164
addLiveOutsForFirstOrderRecurrences(*Plan, ExitingValuesToFix);
8983
9165
addUsersInExitBlock(*Plan, ExitingValuesToFix);
@@ -10074,6 +10256,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10074
10256
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10075
10257
EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10076
10258
++LoopsVectorized;
10259
+ CSAsVectorized += LVL.getCSAs().size();
10077
10260
10078
10261
// Second pass vectorizes the epilogue and adjusts the control flow
10079
10262
// edges from the first pass.
@@ -10166,6 +10349,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10166
10349
PSI, Checks);
10167
10350
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10168
10351
++LoopsVectorized;
10352
+ CSAsVectorized += LVL.getCSAs().size();
10169
10353
10170
10354
// Add metadata to disable runtime unrolling a scalar loop when there
10171
10355
// are no runtime checks about strides and memory. A scalar loop that is
0 commit comments