Skip to content

Commit a2a1372

Browse files
committed
[VPlan] Add VPInstruction to unpack vector values to scalars.
Add a new Unpack VPInstruction (name to be improved) to explicitly extract scalar values from vectors. Test changes are movements of the extracts: they are now generated together and also directly after the producer. Depends on llvm#155102 (included in PR) modified: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
1 parent f9cd2ee commit a2a1372

File tree

68 files changed

+1554
-1485
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+1554
-1485
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,6 +1060,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10601060
ResumeForEpilogue,
10611061
/// Returns the value for vscale.
10621062
VScale,
1063+
Unpack,
10631064
};
10641065

10651066
/// Returns true if this VPInstruction generates scalar values for all lanes.

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
109109
case VPInstruction::AnyOf:
110110
case VPInstruction::BuildStructVector:
111111
case VPInstruction::BuildVector:
112+
case VPInstruction::Unpack:
112113
return SetResultTyFromOp();
113114
case VPInstruction::ExtractLane:
114115
return inferScalarType(R->getOperand(1));

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
506506
case VPInstruction::ExtractPenultimateElement:
507507
case VPInstruction::FirstActiveLane:
508508
case VPInstruction::Not:
509+
case VPInstruction::Unpack:
509510
return 1;
510511
case Instruction::ICmp:
511512
case Instruction::FCmp:
@@ -1231,6 +1232,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
12311232
case VPInstruction::StepVector:
12321233
case VPInstruction::ReductionStartVector:
12331234
case VPInstruction::VScale:
1235+
case VPInstruction::Unpack:
12341236
return false;
12351237
default:
12361238
return true;
@@ -1274,8 +1276,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
12741276
return getNumOperands() > 1;
12751277
case VPInstruction::PtrAdd:
12761278
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1277-
case VPInstruction::WidePtrAdd:
1278-
return Op == getOperand(0);
12791279
case VPInstruction::ComputeAnyOfResult:
12801280
case VPInstruction::ComputeFindIVResult:
12811281
return Op == getOperand(1);
@@ -1399,6 +1399,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
13991399
case VPInstruction::ResumeForEpilogue:
14001400
O << "resume-for-epilogue";
14011401
break;
1402+
case VPInstruction::Unpack:
1403+
O << "unpack-into-scalars";
1404+
break;
14021405
default:
14031406
O << Instruction::getOpcodeName(getOpcode());
14041407
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,6 +1225,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
12251225
return;
12261226
}
12271227

1228+
VPValue *Idx;
1229+
if (match(&R, m_VPInstruction<Instruction::ExtractElement>(m_BuildVector(),
1230+
m_VPValue(Idx)))) {
1231+
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1232+
Def->replaceAllUsesWith(BuildVector->getOperand(
1233+
dyn_cast<ConstantInt>(Idx->getLiveInIRValue())->getZExtValue()));
1234+
return;
1235+
}
1236+
12281237
if (auto *Phi = dyn_cast<VPPhi>(Def)) {
12291238
if (Phi->getNumOperands() == 1)
12301239
Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -3734,6 +3743,47 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
37343743
});
37353744
}
37363745
}
3746+
3747+
// Create explicit VPInstructions to convert vectors to scalars.
3748+
for (VPBasicBlock *VPBB :
3749+
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3750+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3751+
if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe>(&R))
3752+
continue;
3753+
for (VPValue *Def : R.definedValues()) {
3754+
if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def))
3755+
continue;
3756+
3757+
if (VPBB->getParent() != Plan.getVectorLoopRegion())
3758+
continue;
3759+
3760+
auto UsesVectorOrInsideReplicateRegion = [LoopRegion](VPUser *U) {
3761+
VPRegionBlock *ParentRegion =
3762+
cast<VPRecipeBase>(U)->getParent()->getParent();
3763+
return ParentRegion && ParentRegion != LoopRegion;
3764+
};
3765+
3766+
if (none_of(Def->users(),
3767+
[Def, &UsesVectorOrInsideReplicateRegion](VPUser *U) {
3768+
return !UsesVectorOrInsideReplicateRegion(U) &&
3769+
U->usesScalars(Def);
3770+
}))
3771+
continue;
3772+
3773+
auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
3774+
if (R.isPhi())
3775+
Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
3776+
else
3777+
Unpack->insertAfter(&R);
3778+
Def->replaceUsesWithIf(
3779+
Unpack,
3780+
[Def, &UsesVectorOrInsideReplicateRegion](VPUser &U, unsigned) {
3781+
return !UsesVectorOrInsideReplicateRegion(&U) &&
3782+
U.usesScalars(Def);
3783+
});
3784+
}
3785+
}
3786+
}
37373787
}
37383788

37393789
void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -466,25 +466,39 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
466466
/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
467467
/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
468468
/// definitions for operands of \DefR.
469-
static VPRecipeWithIRFlags *
469+
static VPValue *
470470
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
471471
VPRecipeWithIRFlags *DefR, VPLane Lane,
472472
const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
473+
474+
VPValue *Op;
475+
if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
476+
auto LaneDefs = Def2LaneDefs.find(Op);
477+
if (LaneDefs != Def2LaneDefs.end())
478+
return LaneDefs->second[Lane.getKnownLane()];
479+
480+
VPValue *Idx =
481+
Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
482+
return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
483+
}
484+
473485
// Collect the operands at Lane, creating extracts as needed.
474486
SmallVector<VPValue *> NewOps;
475487
for (VPValue *Op : DefR->operands()) {
488+
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
489+
match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
490+
NewOps.push_back(
491+
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
492+
continue;
493+
}
494+
476495
// If Op is a definition that has been unrolled, directly use the clone for
477496
// the corresponding lane.
478497
auto LaneDefs = Def2LaneDefs.find(Op);
479498
if (LaneDefs != Def2LaneDefs.end()) {
480499
NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
481500
continue;
482501
}
483-
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
484-
NewOps.push_back(
485-
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
486-
continue;
487-
}
488502
if (vputils::isSingleScalar(Op)) {
489503
NewOps.push_back(Op);
490504
continue;
@@ -498,8 +512,8 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
498512
}
499513
VPValue *Idx =
500514
Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
501-
VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
502-
NewOps.push_back(Ext);
515+
NewOps.push_back(
516+
Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}));
503517
}
504518

505519
VPRecipeWithIRFlags *New;
@@ -548,7 +562,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
548562
(isa<VPReplicateRecipe>(&R) &&
549563
cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
550564
(isa<VPInstruction>(&R) &&
551-
!cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
565+
!cast<VPInstruction>(&R)->doesGeneratePerAllLanes() && cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
552566
continue;
553567

554568
auto *DefR = cast<VPRecipeWithIRFlags>(&R);

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
8686
all_of(VPI->operands(), isSingleScalar));
8787

8888
// VPExpandSCEVRecipes must be placed in the entry and are alway uniform.
89-
return isa<VPExpandSCEVRecipe>(VPV);
89+
return isa<VPExpandSCEVRecipe, VPPhi>(VPV);
9090
}
9191

9292
/// Return true if \p V is a header mask in \p Plan.

llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,12 +242,12 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
242242
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
243243
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
244244
; CHECK-NEXT: [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
245+
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
245246
; CHECK-NEXT: [[TMP24:%.*]] = urem i64 [[INDEX]], [[MUL_2_I]]
246247
; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
247248
; CHECK-NEXT: [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
248249
; CHECK-NEXT: [[TMP27:%.*]] = udiv i64 [[TMP26]], [[X]]
249250
; CHECK-NEXT: [[TMP28:%.*]] = urem i64 [[TMP26]], [[X]]
250-
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
251251
; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[X]], [[TMP29]]
252252
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], [[TMP25]]
253253
; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], [[X]]

llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,15 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
2727
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP2]], i32 0
2828
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 1
2929
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP5]], zeroinitializer
30+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
31+
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
3032
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <2 x ptr> [[TMP7]], zeroinitializer
31-
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
32-
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP10]])
33-
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
33+
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
34+
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
3435
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
35-
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
3636
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]])
37-
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
3837
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]])
38+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]])
3939
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
4040
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1
4141
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1
@@ -61,8 +61,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
6161
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP8]], i32 1
6262
; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer
6363
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0
64-
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
6564
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1
65+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
6666
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]])
6767
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP7]], align 1
6868
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2

llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) {
1717
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
1818
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[PRED_LOAD_CONTINUE6]] ]
1919
; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 1)
20+
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
2021
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
2122
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
2223
; CHECK: pred.load.if:
@@ -59,7 +60,6 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) {
5960
; CHECK-NEXT: [[TMP24]] = phi <4 x i16> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
6061
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP24]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
6162
; CHECK-NEXT: [[TMP26:%.*]] = sext <4 x i16> [[TMP25]] to <4 x i32>
62-
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
6363
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP27]]
6464
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP26]], ptr [[TMP28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
6565
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4

llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -288,17 +288,17 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
288288
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8
289289
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
290290
; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16>
291+
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
292+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
293+
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
294+
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
291295
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]]
292296
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]]
293297
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]]
294298
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]]
295-
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
296299
; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP6]], align 2
297-
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
298300
; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP7]], align 2
299-
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
300301
; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP8]], align 2
301-
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
302302
; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2
303303
; CHECK-NEXT: store i64 0, ptr [[A]], align 8
304304
; CHECK-NEXT: store i64 0, ptr [[B]], align 8

0 commit comments

Comments
 (0)