Skip to content

Commit 9531454

Browse files
[LV] Build VPlan for CSA
1 parent fe63c2f commit 9531454

File tree

9 files changed

+1672
-193
lines changed

9 files changed

+1672
-193
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+192-8
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
180180
STATISTIC(LoopsVectorized, "Number of loops vectorized");
181181
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
182182
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
183+
STATISTIC(CSAsVectorized,
184+
"Number of conditional scalar assignments vectorized");
183185

184186
static cl::opt<bool> EnableEpilogueVectorization(
185187
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -500,6 +502,10 @@ class InnerLoopVectorizer {
500502
virtual std::pair<BasicBlock *, Value *>
501503
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502504

505+
/// For all vectorized CSAs, replace uses of live-out scalar from the original
506+
/// loop with the extracted scalar from the vector loop.
507+
void fixCSALiveOuts(VPTransformState &State, VPlan &Plan);
508+
503509
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
504510
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
505511

@@ -2932,6 +2938,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
29322938
TargetTransformInfo::TCK_RecipThroughput);
29332939
}
29342940

2941+
/// Patch the exit-block phis of every vectorized conditional scalar
/// assignment (CSA) so they also receive the scalar extracted from the vector
/// loop.
///
/// For each CSA state registered in \p Plan, the IR value underlying the CSA's
/// data-update recipe is looked up. Every PHI user of that value that lives in
/// LoopExitBlock then gets one more incoming value: the scalar generated for
/// the CSA's extract-scalar recipe, arriving from LoopMiddleBlock.
///
/// \param State transform state used to obtain the generated scalar for the
///              extract-scalar recipe.
/// \param Plan  the VPlan whose CSA states are fixed up.
void InnerLoopVectorizer::fixCSALiveOuts(VPTransformState &State, VPlan &Plan) {
  for (const auto &CSA : Plan.getCSAStates()) {
    VPCSAState *CSAState = CSA.second;

    VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate();
    assert(VPDataUpdate &&
           "VPDataUpdate must have been introduced prior to fixing live outs");

    // The extract-scalar recipe is only attached to the state when the CSA
    // post-processing runs for a vector VF; diagnose a missing recipe here
    // instead of handing a null value to State.get().
    auto *VPExtractScalar = CSAState->getExtractScalarRecipe();
    assert(VPExtractScalar &&
           "CSA extract scalar recipe must have been introduced prior to "
           "fixing live outs");

    Value *V = VPDataUpdate->getUnderlyingValue();
    Value *ExtractedScalar =
        State.get(VPExtractScalar, 0, /*NeedsScalar=*/true);

    // Fix LCSSA phis. Gather them first so the walk over V's users has
    // finished before any phi is modified.
    llvm::SmallPtrSet<PHINode *, 2> ToFix;
    for (User *U : V->users())
      if (auto *Phi = dyn_cast<PHINode>(U);
          Phi && Phi->getParent() == LoopExitBlock)
        ToFix.insert(Phi);

    for (PHINode *Phi : ToFix)
      Phi->addIncoming(ExtractedScalar, LoopMiddleBlock);
  }
}
2959+
29352960
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
29362961
VPlan &Plan) {
29372962
// Fix widened non-induction PHIs by setting up the PHI operands.
@@ -2972,6 +2997,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
29722997
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
29732998
IVEndValues[Entry.first], LoopMiddleBlock,
29742999
VectorLoop->getHeader(), Plan, State);
3000+
3001+
fixCSALiveOuts(State, Plan);
29753002
}
29763003

29773004
// Fix live-out phis not already fixed earlier.
@@ -4110,7 +4137,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
41104137
// found modulo the vectorization factor is not zero, try to fold the tail
41114138
// by masking.
41124139
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4113-
setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
41144140
if (foldTailByMasking()) {
41154141
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
41164142
LLVM_DEBUG(
@@ -4482,6 +4508,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
44824508
case VPDef::VPEVLBasedIVPHISC:
44834509
case VPDef::VPPredInstPHISC:
44844510
case VPDef::VPBranchOnMaskSC:
4511+
case VPRecipeBase::VPCSADataUpdateSC:
4512+
case VPRecipeBase::VPCSAExtractScalarSC:
4513+
case VPRecipeBase::VPCSAHeaderPHISC:
44854514
continue;
44864515
case VPDef::VPReductionSC:
44874516
case VPDef::VPActiveLaneMaskPHISC:
@@ -6995,6 +7024,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
69957024
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
69967025
return;
69977026

7027+
CM.setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
7028+
69987029
// Invalidate interleave groups if all blocks of loop will be predicated.
69997030
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
70007031
!useMaskedInterleavedAccesses(TTI)) {
@@ -8476,9 +8507,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
84768507
return Recipe;
84778508

84788509
VPHeaderPHIRecipe *PhiRecipe = nullptr;
8479-
assert((Legal->isReductionVariable(Phi) ||
8480-
Legal->isFixedOrderRecurrence(Phi)) &&
8481-
"can only widen reductions and fixed-order recurrences here");
84828510
VPValue *StartV = Operands[0];
84838511
if (Legal->isReductionVariable(Phi)) {
84848512
const RecurrenceDescriptor &RdxDesc =
@@ -8488,12 +8516,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
84888516
PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
84898517
CM.isInLoopReduction(Phi),
84908518
CM.useOrderedReductions(RdxDesc));
8491-
} else {
8519+
} else if (Legal->isFixedOrderRecurrence(Phi)){
84928520
// TODO: Currently fixed-order recurrences are modeled as chains of
84938521
// first-order recurrences. If there are no users of the intermediate
84948522
// recurrences in the chain, the fixed order recurrence should be modeled
84958523
// directly, enabling more efficient codegen.
84968524
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8525+
} else if (Legal->isCSAPhi(Phi)) {
8526+
VPCSAState *State = Plan.getCSAStates().find(Phi)->second;
8527+
VPValue *InitData = State->getVPInitData();
8528+
// When the VF=getFixed(1), InitData is just InitScalar.
8529+
if (!InitData)
8530+
InitData = State->getVPInitScalar();
8531+
PhiRecipe = new VPCSAHeaderPHIRecipe(Phi, InitData);
8532+
State->setPhiRecipe(cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8533+
} else {
8534+
llvm_unreachable(
8535+
"can only widen reductions, fixed-order recurrences, and CSAs here");
84978536
}
84988537

84998538
PhisToFix.push_back(PhiRecipe);
@@ -8523,6 +8562,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85238562
make_range(Operands.begin(), Operands.end()));
85248563

85258564
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8565+
auto *CSADescIt = find_if(Legal->getCSAs(), [&](auto CSA) {
8566+
return CSADescriptor::isCSASelect(CSA.second, SI);
8567+
});
8568+
if (CSADescIt != Legal->getCSAs().end()) {
8569+
PHINode *CSAPhi = CSADescIt->first;
8570+
VPCSAState *State = Plan.getCSAStates().find(CSAPhi)->second;
8571+
VPValue *VPDataPhi = State->getPhiRecipe();
8572+
auto *R = new VPCSADataUpdateRecipe(
8573+
SI, {VPDataPhi, Operands[0], Operands[1], Operands[2]});
8574+
State->setDataUpdate(R);
8575+
return R;
8576+
}
8577+
85268578
return new VPWidenSelectRecipe(
85278579
*SI, make_range(Operands.begin(), Operands.end()));
85288580
}
@@ -8535,6 +8587,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85358587
return tryToWiden(Instr, Operands, VPBB);
85368588
}
85378589

8590+
/// Add CSA Recipes that can occur before each instruction in the input IR
8591+
/// is processed and introduced into VPlan.
8592+
static void
8593+
addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
8594+
Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8595+
VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8596+
VPlan &Plan) {
8597+
8598+
// Don't build full CSA for VF=ElementCount::getFixed(1)
8599+
bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange(
8600+
[&](ElementCount VF) { return VF.isScalar(); }, Range);
8601+
8602+
for (const auto &CSA : CSAs) {
8603+
VPValue *VPInitScalar = Plan.getOrAddLiveIn(
8604+
CSA.first->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8605+
8606+
// Scalar VF builds the scalar version of the loop. In that case,
8607+
// no maintenence of mask nor extraction in middle block is needed.
8608+
if (IsScalarVF) {
8609+
VPCSAState *S = new VPCSAState(VPInitScalar);
8610+
Plan.addCSAState(CSA.first, S);
8611+
continue;
8612+
}
8613+
8614+
auto *VPInitMask = new VPInstruction(VPInstruction::CSAInitMask, {}, DL,
8615+
"csa.init.mask");
8616+
auto *VPInitData = new VPInstruction(VPInstruction::CSAInitData,
8617+
{VPInitScalar}, DL, "csa.init.data");
8618+
PreheaderVPBB->appendRecipe(VPInitMask);
8619+
PreheaderVPBB->appendRecipe(VPInitData);
8620+
8621+
auto *VPMaskPhi = new VPInstruction(VPInstruction::CSAMaskPhi, {VPInitMask},
8622+
DL, "csa.mask.phi");
8623+
HeaderVPBB->appendRecipe(VPMaskPhi);
8624+
8625+
auto *S = new VPCSAState(VPInitScalar, VPInitData, VPMaskPhi);
8626+
Plan.addCSAState(CSA.first, S);
8627+
}
8628+
}
8629+
8630+
/// Add CSA Recipes that must occur after each instruction in the input IR
8631+
/// is processed and introduced into VPlan.
8632+
static void
8633+
addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
8634+
const LoopVectorizationLegality::CSAList &CSAs,
8635+
VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8636+
VPlan &Plan) {
8637+
// Don't build CSA for VF=ElementCount::getFixed(1)
8638+
if (LoopVectorizationPlanner::getDecisionAndClampRange(
8639+
[&](ElementCount VF) { return VF.isScalar(); }, Range))
8640+
return;
8641+
8642+
for (const auto &CSA : CSAs) {
8643+
VPCSAState *CSAState = Plan.getCSAStates().find(CSA.first)->second;
8644+
VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate();
8645+
8646+
assert(VPDataUpdate &&
8647+
"VPDataUpdate must have been introduced prior to postprocess");
8648+
assert(CSA.second.getCond() &&
8649+
"CSADescriptor must know how to describe the condition");
8650+
auto GetVPValue = [&](Value *I) {
8651+
return RecipeBuilder.getRecipe(cast<Instruction>(I))->getVPSingleValue();
8652+
};
8653+
VPValue *WidenedCond = GetVPValue(CSA.second.getCond());
8654+
VPValue *VPInitScalar = CSAState->getVPInitScalar();
8655+
8656+
// The CSA optimization wants to use a condition such that when it is
8657+
// true, a new value is assigned. However, it is possible that a true lane
8658+
// in WidenedCond corresponds to selection of the initial value instead.
8659+
// In that case, we must use the negation of WidenedCond.
8660+
// i.e. select cond new_val old_val versus select cond.not old_val new_val
8661+
VPValue *CondToUse = WidenedCond;
8662+
if (cast<SelectInst>(CSA.second.getAssignment())->getTrueValue() ==
8663+
CSA.first) {
8664+
auto *VPNotCond = new VPInstruction(VPInstruction::Not, WidenedCond, DL);
8665+
VPNotCond->insertBefore(
8666+
GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
8667+
CondToUse = VPNotCond;
8668+
}
8669+
8670+
auto *VPAnyActive = new VPInstruction(
8671+
VPInstruction::CSAAnyActive, {CondToUse}, DL, "csa.cond.anyactive");
8672+
VPAnyActive->insertBefore(
8673+
GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
8674+
8675+
auto *VPMaskSel = new VPInstruction(
8676+
VPInstruction::CSAMaskSel,
8677+
{CondToUse, CSAState->getVPMaskPhi(), VPAnyActive}, DL, "csa.mask.sel");
8678+
VPMaskSel->insertAfter(VPAnyActive);
8679+
VPDataUpdate->setVPNewMaskAndVPAnyActive(VPMaskSel, VPAnyActive);
8680+
VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8681+
new VPCSAExtractScalarRecipe({VPInitScalar, VPMaskSel, VPDataUpdate});
8682+
8683+
MiddleVPBB->insert(ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi());
8684+
8685+
// Update CSAState with new recipes
8686+
CSAState->setExtractScalarRecipe(ExtractScalarRecipe);
8687+
CSAState->setVPAnyActive(VPAnyActive);
8688+
}
8689+
}
8690+
85388691
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
85398692
ElementCount MaxVF) {
85408693
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8591,7 +8744,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
85918744
// VPWidenPointerInductionRecipe and induction increments.
85928745
static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
85938746
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8594-
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8747+
const MapVector<PHINode *, InductionDescriptor> &Inductions,
8748+
const MapVector<PHINode *, CSADescriptor> &CSAs) {
85958749
auto MiddleVPBB =
85968750
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
85978751
// No edge from the middle block to the unique exit block has been inserted
@@ -8620,6 +8774,17 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
86208774
return P && Inductions.contains(P);
86218775
})))
86228776
continue;
8777+
// Exit values for CSAs are computed and updated outside of VPlan and
8778+
// independent of induction recipes.
8779+
// TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8780+
// live-outs.
8781+
if (isa<VPCSADataUpdateRecipe>(V) &&
8782+
(isa<Instruction>(IncomingValue) &&
8783+
any_of(IncomingValue->users(), [&CSAs](User *U) {
8784+
auto *P = dyn_cast<PHINode>(U);
8785+
return P && CSAs.contains(P);
8786+
})))
8787+
continue;
86238788
ExitingValuesToFix.insert({&ExitPhi, V});
86248789
}
86258790
return ExitingValuesToFix;
@@ -8861,6 +9026,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
88619026
bool HasNUW = Style == TailFoldingStyle::None;
88629027
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
88639028

9029+
// CSA vectorization is only supported for None or DataWithEVL tail folding
9030+
// styles.
9031+
// FIXME: Implement CSA for more tail folding styles
9032+
if (Style != TailFoldingStyle::None &&
9033+
Style != TailFoldingStyle::DataWithEVL && !Legal->getCSAs().empty())
9034+
return nullptr;
9035+
9036+
addCSAPreprocessRecipes(Legal->getCSAs(), OrigLoop, Plan->getPreheader(),
9037+
Plan->getVectorLoopRegion()->getEntryBasicBlock(), DL,
9038+
Range, *Plan);
9039+
88649040
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
88659041

88669042
// ---------------------------------------------------------------------------
@@ -8967,6 +9143,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89679143
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
89689144
}
89699145

9146+
VPBasicBlock *MiddleVPBB =
9147+
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
9148+
addCSAPostprocessRecipes(RecipeBuilder, Legal->getCSAs(), MiddleVPBB, DL,
9149+
Range, *Plan);
9150+
89709151
// After here, VPBB should not be used.
89719152
VPBB = nullptr;
89729153

@@ -8976,8 +9157,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89769157
"VPBasicBlock");
89779158
RecipeBuilder.fixHeaderPhis();
89789159

8979-
MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock(
8980-
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
9160+
MapVector<PHINode *, VPValue *> ExitingValuesToFix =
9161+
collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
9162+
Legal->getInductionVars(), Legal->getCSAs());
89819163

89829164
addLiveOutsForFirstOrderRecurrences(*Plan, ExitingValuesToFix);
89839165
addUsersInExitBlock(*Plan, ExitingValuesToFix);
@@ -10074,6 +10256,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1007410256
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
1007510257
EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
1007610258
++LoopsVectorized;
10259+
CSAsVectorized += LVL.getCSAs().size();
1007710260

1007810261
// Second pass vectorizes the epilogue and adjusts the control flow
1007910262
// edges from the first pass.
@@ -10166,6 +10349,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1016610349
PSI, Checks);
1016710350
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
1016810351
++LoopsVectorized;
10352+
CSAsVectorized += LVL.getCSAs().size();
1016910353

1017010354
// Add metadata to disable runtime unrolling a scalar loop when there
1017110355
// are no runtime checks about strides and memory. A scalar loop that is

llvm/lib/Transforms/Vectorize/VPlan.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
216216

217217
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
218218
iterator It = begin();
219-
while (It != end() && It->isPhi())
219+
while (It != end() && vputils::isPhi(*It))
220220
It++;
221221
return It;
222222
}
@@ -858,6 +858,9 @@ VPlan::~VPlan() {
858858
delete VPV;
859859
if (BackedgeTakenCount)
860860
delete BackedgeTakenCount;
861+
862+
for (std::pair<PHINode *, VPCSAState *> &S : CSAStates)
863+
delete S.second;
861864
}
862865

863866
VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE,
@@ -1032,7 +1035,7 @@ void VPlan::execute(VPTransformState *State) {
10321035
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
10331036
for (VPRecipeBase &R : Header->phis()) {
10341037
// Skip phi-like recipes that generate their backedge values themselves.
1035-
if (isa<VPWidenPHIRecipe>(&R))
1038+
if (vputils::isPhiThatGeneratesBackedge(R))
10361039
continue;
10371040

10381041
if (isa<VPWidenPointerInductionRecipe>(&R) ||

0 commit comments

Comments
 (0)