Skip to content

Commit fda0eee

Browse files
fhahn authored and aokblast committed
[VPlan] Add VPInstruction to unpack vector values to scalars. (llvm#155670)
Add a new Unpack VPInstruction (name to be improved) to explicitly extract scalar values from vectors. Test changes are movements of the extracts: they are now generated together and also directly after the producer. Depends on llvm#155102 (included in PR). PR: llvm#155670
1 parent 84d3c28 commit fda0eee

File tree

76 files changed

+1721
-1609
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+1721
-1609
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7192,7 +7192,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
71927192
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
71937193
// cost model is complete for better cost estimates.
71947194
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
7195-
VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
7195+
VPlanTransforms::runPass(VPlanTransforms::materializePacksAndUnpacks,
7196+
BestVPlan);
71967197
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
71977198
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
71987199
bool HasBranchWeights =

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1007,6 +1007,11 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10071007
/// Creates a fixed-width vector containing all operands. The number of
10081008
/// operands matches the vector element count.
10091009
BuildVector,
1010+
/// Extracts all lanes from its (non-scalable) vector operand. This is an
1011+
/// abstract VPInstruction whose single defined VPValue represents VF
1012+
/// scalars extracted from a vector, to be replaced by VF ExtractElement
1013+
/// VPInstructions.
1014+
Unpack,
10101015
/// Compute the final result of an AnyOf reduction with select(cmp(),x,y),
10111016
/// where one of (x,y) is loop invariant, and both x and y are integer type.
10121017
ComputeAnyOfResult,
@@ -2715,6 +2720,15 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags {
27152720
return R && classof(R);
27162721
}
27172722

2723+
static inline bool classof(const VPValue *VPV) {
2724+
const VPRecipeBase *R = VPV->getDefiningRecipe();
2725+
return R && classof(R);
2726+
}
2727+
2728+
static inline bool classof(const VPSingleDefRecipe *R) {
2729+
return classof(static_cast<const VPRecipeBase *>(R));
2730+
}
2731+
27182732
/// Generate the reduction in the loop.
27192733
void execute(VPTransformState &State) override;
27202734

@@ -3100,6 +3114,9 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
31003114
/// Returns true if this expression contains recipes that may have side
31013115
/// effects.
31023116
bool mayHaveSideEffects() const;
3117+
3118+
/// Returns true if the result of this VPExpressionRecipe is a single-scalar.
3119+
bool isSingleScalar() const;
31033120
};
31043121

31053122
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
110110
case VPInstruction::AnyOf:
111111
case VPInstruction::BuildStructVector:
112112
case VPInstruction::BuildVector:
113+
case VPInstruction::Unpack:
113114
return SetResultTyFromOp();
114115
case VPInstruction::ExtractLane:
115116
return inferScalarType(R->getOperand(1));

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,12 @@ m_ExtractLastElement(const Op0_t &Op0) {
388388
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
389389
}
390390

391+
template <typename Op0_t, typename Op1_t>
392+
inline VPInstruction_match<Instruction::ExtractElement, Op0_t, Op1_t>
393+
m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) {
394+
return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1);
395+
}
396+
391397
template <typename Op0_t>
392398
inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
393399
m_ExtractLastLanePerPart(const Op0_t &Op0) {

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
515515
case VPInstruction::ExtractPenultimateElement:
516516
case VPInstruction::FirstActiveLane:
517517
case VPInstruction::Not:
518+
case VPInstruction::Unpack:
518519
return 1;
519520
case Instruction::ICmp:
520521
case Instruction::FCmp:
@@ -1246,6 +1247,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
12461247
case VPInstruction::StepVector:
12471248
case VPInstruction::ReductionStartVector:
12481249
case VPInstruction::VScale:
1250+
case VPInstruction::Unpack:
12491251
return false;
12501252
default:
12511253
return true;
@@ -1290,7 +1292,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
12901292
case VPInstruction::PtrAdd:
12911293
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
12921294
case VPInstruction::WidePtrAdd:
1293-
return Op == getOperand(0);
1295+
// WidePtrAdd supports scalar and vector base addresses.
1296+
return false;
12941297
case VPInstruction::ComputeAnyOfResult:
12951298
case VPInstruction::ComputeFindIVResult:
12961299
return Op == getOperand(1);
@@ -1417,6 +1420,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
14171420
case VPInstruction::ResumeForEpilogue:
14181421
O << "resume-for-epilogue";
14191422
break;
1423+
case VPInstruction::Unpack:
1424+
O << "unpack";
1425+
break;
14201426
default:
14211427
O << Instruction::getOpcodeName(getOpcode());
14221428
}
@@ -2888,6 +2894,13 @@ bool VPExpressionRecipe::mayHaveSideEffects() const {
28882894
return false;
28892895
}
28902896

2897+
bool VPExpressionRecipe::isSingleScalar() const {
2898+
// Cannot use vputils::isSingleScalar(), because all external operands
2899+
// of the expression will be live-ins while bundled.
2900+
return isa<VPReductionRecipe>(ExpressionRecipes.back()) &&
2901+
!isa<VPPartialReductionRecipe>(ExpressionRecipes.back());
2902+
}
2903+
28912904
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
28922905

28932906
void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1224,6 +1224,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
12241224
return;
12251225
}
12261226

1227+
uint64_t Idx;
1228+
if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
1229+
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1230+
Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1231+
return;
1232+
}
1233+
12271234
if (auto *Phi = dyn_cast<VPPhi>(Def)) {
12281235
if (Phi->getNumOperands() == 1)
12291236
Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -3780,7 +3787,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
37803787
BTC->replaceAllUsesWith(TCMO);
37813788
}
37823789

3783-
void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
3790+
void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
37843791
if (Plan.hasScalarVFOnly())
37853792
return;
37863793

@@ -3828,6 +3835,50 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
38283835
});
38293836
}
38303837
}
3838+
3839+
// Create explicit VPInstructions to convert vectors to scalars. The current
3840+
// implementation is conservative - it may miss some cases that may or may not
3841+
// be vector values. TODO: introduce Unpacks speculatively - remove them later
3842+
// if they are known to operate on scalar values.
3843+
for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
3844+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3845+
if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
3846+
VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(&R))
3847+
continue;
3848+
for (VPValue *Def : R.definedValues()) {
3849+
// Skip recipes that are single-scalar or only have their first lane
3850+
// used.
3851+
// TODO: The Defs skipped here may or may not be vector values.
3852+
// Introduce Unpacks, and remove them later, if they are guaranteed to
3853+
// produce scalar values.
3854+
if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def))
3855+
continue;
3856+
3857+
// At the moment, we create unpacks only for scalar users outside
3858+
// replicate regions. Recipes inside replicate regions still extract the
3859+
// required lanes implicitly.
3860+
// TODO: Remove once replicate regions are unrolled completely.
3861+
auto IsCandidateUnpackUser = [Def](VPUser *U) {
3862+
VPRegionBlock *ParentRegion =
3863+
cast<VPRecipeBase>(U)->getParent()->getParent();
3864+
return U->usesScalars(Def) &&
3865+
(!ParentRegion || !ParentRegion->isReplicator());
3866+
};
3867+
if (none_of(Def->users(), IsCandidateUnpackUser))
3868+
continue;
3869+
3870+
auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
3871+
if (R.isPhi())
3872+
Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
3873+
else
3874+
Unpack->insertAfter(&R);
3875+
Def->replaceUsesWithIf(Unpack,
3876+
[&IsCandidateUnpackUser](VPUser &U, unsigned) {
3877+
return IsCandidateUnpackUser(&U);
3878+
});
3879+
}
3880+
}
3881+
}
38313882
}
38323883

38333884
void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -325,9 +325,10 @@ struct VPlanTransforms {
325325
static void materializeBackedgeTakenCount(VPlan &Plan,
326326
VPBasicBlock *VectorPH);
327327

328-
/// Add explicit Build[Struct]Vector recipes that combine multiple scalar
329-
/// values into single vectors.
330-
static void materializeBuildVectors(VPlan &Plan);
328+
/// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
329+
/// into vectors and Unpack recipes to extract scalars from vectors as
330+
/// needed.
331+
static void materializePacksAndUnpacks(VPlan &Plan);
331332

332333
/// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
333334
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -465,10 +465,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
465465
/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
466466
/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
467467
/// definitions for operands of \p DefR.
468-
static VPRecipeWithIRFlags *
468+
static VPValue *
469469
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
470470
VPRecipeWithIRFlags *DefR, VPLane Lane,
471471
const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
472+
VPValue *Op;
473+
if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
474+
auto LaneDefs = Def2LaneDefs.find(Op);
475+
if (LaneDefs != Def2LaneDefs.end())
476+
return LaneDefs->second[Lane.getKnownLane()];
477+
478+
VPValue *Idx =
479+
Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
480+
return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
481+
}
482+
472483
// Collect the operands at Lane, creating extracts as needed.
473484
SmallVector<VPValue *> NewOps;
474485
for (VPValue *Op : DefR->operands()) {
@@ -480,6 +491,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
480491
continue;
481492
}
482493
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
494+
// Look through mandatory Unpack.
495+
[[maybe_unused]] bool Matched =
496+
match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
497+
assert(Matched && "original op must have been Unpack");
483498
NewOps.push_back(
484499
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
485500
continue;
@@ -547,7 +562,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
547562
(isa<VPReplicateRecipe>(&R) &&
548563
cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
549564
(isa<VPInstruction>(&R) &&
550-
!cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
565+
!cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
566+
cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
551567
continue;
552568

553569
auto *DefR = cast<VPRecipeWithIRFlags>(&R);

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ inline bool isSingleScalar(const VPValue *VPV) {
8484
return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
8585
(PreservesUniformity(VPI->getOpcode()) &&
8686
all_of(VPI->operands(), isSingleScalar));
87+
if (isa<VPPartialReductionRecipe>(VPV))
88+
return false;
89+
if (isa<VPReductionRecipe>(VPV))
90+
return true;
91+
if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV))
92+
return Expr->isSingleScalar();
8793

8894
// VPExpandSCEVRecipes must be placed in the entry and are always uniform.
8995
return isa<VPExpandSCEVRecipe>(VPV);

llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,12 +241,12 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
241241
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
242242
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
243243
; CHECK-NEXT: [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
244+
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
244245
; CHECK-NEXT: [[TMP24:%.*]] = urem i64 [[INDEX]], [[MUL_2_I]]
245246
; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
246247
; CHECK-NEXT: [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
247248
; CHECK-NEXT: [[TMP27:%.*]] = udiv i64 [[TMP26]], [[X]]
248249
; CHECK-NEXT: [[TMP28:%.*]] = urem i64 [[TMP26]], [[X]]
249-
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
250250
; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[X]], [[TMP29]]
251251
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], [[TMP25]]
252252
; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], [[X]]

0 commit comments

Comments
 (0)