Skip to content

Commit c15cf30

Browse files
[VPlan] Add cost model for CSA
1 parent 9531454 commit c15cf30

File tree

4 files changed

+135
-83
lines changed

4 files changed

+135
-83
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+11-2
Original file line number | Diff line number | Diff line change
@@ -7279,7 +7279,16 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
72797279
static bool
72807280
planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF,
72817281
VPCostContext &CostCtx, Loop *TheLoop,
7282-
LoopVectorizationCostModel &CM) {
7282+
LoopVectorizationCostModel &CM,
7283+
LoopVectorizationLegality &Legal) {
7284+
7285+
// CSA cost is more complicated since there is significant overhead in the
7286+
// preheader and middle block. It also contains recipes that are not backed by
7287+
// underlying instructions in the original loop. This makes it difficult to
7288+
// model in the legacy cost model.
7289+
if (!Legal.getCSAs().empty())
7290+
return true;
7291+
72837292
// First collect all instructions for the recipes in Plan.
72847293
auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
72857294
if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
@@ -7391,7 +7400,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
73917400
assert((BestFactor.Width == LegacyVF.Width ||
73927401
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
73937402
BestFactor.Width, CostCtx,
7394-
OrigLoop, CM)) &&
7403+
OrigLoop, CM, *Legal)) &&
73957404
" VPlan cost model and legacy cost model disagreed");
73967405
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
73977406
"when vectorizing, the scalar cost must be computed.");

llvm/lib/Transforms/Vectorize/VPlan.h

+10-1
Original file line number | Diff line number | Diff line change
@@ -2498,6 +2498,9 @@ class VPCSAHeaderPHIRecipe final : public VPHeaderPHIRecipe {
24982498

24992499
void execute(VPTransformState &State) override;
25002500

2501+
InstructionCost computeCost(ElementCount VF,
2502+
VPCostContext &Ctx) const override;
2503+
25012504
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
25022505
/// Print the recipe.
25032506
void print(raw_ostream &O, const Twine &Indent,
@@ -2529,6 +2532,9 @@ class VPCSADataUpdateRecipe final : public VPSingleDefRecipe {
25292532

25302533
void execute(VPTransformState &State) override;
25312534

2535+
InstructionCost computeCost(ElementCount VF,
2536+
VPCostContext &Ctx) const override;
2537+
25322538
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
25332539
/// Print the recipe.
25342540
void print(raw_ostream &O, const Twine &Indent,
@@ -2575,6 +2581,9 @@ class VPCSAExtractScalarRecipe final : public VPSingleDefRecipe {
25752581

25762582
void execute(VPTransformState &State) override;
25772583

2584+
InstructionCost computeCost(ElementCount VF,
2585+
VPCostContext &Ctx) const override;
2586+
25782587
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
25792588
/// Print the recipe.
25802589
void print(raw_ostream &O, const Twine &Indent,
@@ -2585,7 +2594,7 @@ class VPCSAExtractScalarRecipe final : public VPSingleDefRecipe {
25852594
VPValue *getVPMaskSel() const { return getOperand(1); }
25862595
VPValue *getVPDataSel() const { return getOperand(2); }
25872596
VPValue *getVPCSAVLSel() const { return getOperand(3); }
2588-
bool usesEVL() { return getNumOperands() == 4; }
2597+
bool usesEVL() const { return getNumOperands() == 4; }
25892598
};
25902599

25912600
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

+100
Original file line number | Diff line number | Diff line change
@@ -2148,6 +2148,24 @@ void VPCSAHeaderPHIRecipe::execute(VPTransformState &State) {
21482148
State.set(this, DataPhi, Part);
21492149
}
21502150

2151+
/// Estimate the cost of this CSA header phi recipe for vectorization factor
/// \p VF, using the target information in \p Ctx.
///
/// A scalar \p VF costs nothing. Otherwise the cost charged is one broadcast
/// for each of the two start values (mask and data); no cost is added for the
/// phi nodes themselves.
InstructionCost VPCSAHeaderPHIRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  if (VF.isScalar())
    return 0;

  auto *VTy = VectorType::get(getUnderlyingValue()->getType(), VF);
  // The mask start value is a vector of i1, not a vector of the data type, so
  // it must be costed on its own type.
  auto *MaskTy = VectorType::get(IntegerType::getInt1Ty(VTy->getContext()), VF);
  const TargetTransformInfo &TTI = Ctx.TTI;

  InstructionCost C = 0;
  // FIXME: These costs should be moved into VPInstruction::computeCost. We put
  // them here for now since there is no VPInstruction::computeCost support.
  // CSAInitMask
  C += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, MaskTy);
  // CSAInitData
  C += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VTy);
  return C;
}
2168+
21512169
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
21522170
void VPCSADataUpdateRecipe::print(raw_ostream &O, const Twine &Indent,
21532171
VPSlotTracker &SlotTracker) const {
@@ -2176,6 +2194,34 @@ void VPCSADataUpdateRecipe::execute(VPTransformState &State) {
21762194
}
21772195
}
21782196

2197+
/// Estimate the cost of this CSA data-update recipe for vectorization factor
/// \p VF, using the target information in \p Ctx.
///
/// A scalar \p VF costs nothing. Otherwise the result is the sum of the data
/// select plus the helper operations listed in the FIXME below.
///
/// Selects are costed with getCmpSelInstrCost: Select is not an arithmetic
/// opcode, so getArithmeticInstrCost is the wrong query for it.
InstructionCost VPCSADataUpdateRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  if (VF.isScalar())
    return 0;

  auto *VTy = VectorType::get(getUnderlyingValue()->getType(), VF);
  Type *Int1Ty = IntegerType::getInt1Ty(VTy->getContext());
  auto *MaskTy = VectorType::get(Int1Ty, VF);
  // Must be declared before the local `TTI` reference below, which shadows
  // the `TTI` type name for the rest of the function.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  const TargetTransformInfo &TTI = Ctx.TTI;

  // Cost of a select over values of type ValTy, chosen by a single i1
  // condition.
  // NOTE(review): a scalar i1 condition is assumed for every select here;
  // confirm this also holds when EVL is used.
  auto SelectCost = [&](Type *ValTy) {
    return TTI.getCmpSelInstrCost(Instruction::Select, ValTy, Int1Ty,
                                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
  };

  InstructionCost C = 0;
  // Data Update
  C += SelectCost(VTy);

  // FIXME: These costs should be moved into VPInstruction::computeCost. We put
  // them here for now since they are related to updating the data and there is
  // no VPInstruction::computeCost support at the moment.
  // CSAInitMask AnyActive
  // NOTE(review): label reproduced from the original comment; confirm which
  // VPInstruction this select models.
  C += SelectCost(VTy);
  // vp.reduce.or: the OR-reduction runs over the i1 mask vector, not over the
  // data vector.
  C += TTI.getArithmeticReductionCost(Instruction::Or, MaskTy, std::nullopt,
                                      CostKind);
  // VPVLSel
  C += SelectCost(VTy);
  // MaskUpdate
  C += SelectCost(MaskTy);
  return C;
}
2224+
21792225
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
21802226
void VPCSAExtractScalarRecipe::print(raw_ostream &O, const Twine &Indent,
21812227
VPSlotTracker &SlotTracker) const {
@@ -2236,6 +2282,60 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
22362282
State.set(this, ChooseFromVecOrInit, 0, /*IsScalar=*/true);
22372283
}
22382284

2285+
/// Estimate the cost of this CSA extract-scalar recipe for vectorization
/// factor \p VF, using the target information in \p Ctx.
///
/// A scalar \p VF costs nothing. Otherwise the result is the sum of the
/// operations named in the comments below; the set depends on usesEVL().
///
/// Each operation goes through the TTI query for its opcode class:
/// getCmpSelInstrCost for Select/ICmp, getVectorInstrCost for ExtractElement,
/// and getArithmeticInstrCost only for And. Operand types follow the non-EVL
/// middle-block sequence in the accompanying csa.ll test.
InstructionCost
VPCSAExtractScalarRecipe::computeCost(ElementCount VF,
                                      VPCostContext &Ctx) const {
  if (VF.isScalar())
    return 0;

  auto *VTy = VectorType::get(getUnderlyingValue()->getType(), VF);
  Type *Int1Ty = IntegerType::getInt1Ty(VTy->getContext());
  Type *Int32Ty = IntegerType::getInt32Ty(VTy->getContext());
  Type *DataTy = VTy->getScalarType();
  auto *Int32VTy = VectorType::get(Int32Ty, VF);
  auto *MaskTy = VectorType::get(Int1Ty, VF);
  // Must be declared before the local `TTI` reference below, which shadows
  // the `TTI` type name for the rest of the function.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  const TargetTransformInfo &TTI = Ctx.TTI;

  // Cost of a select over values of type ValTy with a condition of type
  // CondTy.
  auto SelectCost = [&](Type *ValTy, Type *CondTy) {
    return TTI.getCmpSelInstrCost(Instruction::Select, ValTy, CondTy,
                                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
  };
  // Cost of a scalar i32 integer compare that produces an i1.
  auto ScalarIdxCmpCost = [&](CmpInst::Predicate Pred) {
    return TTI.getCmpSelInstrCost(Instruction::ICmp, Int32Ty, Int1Ty, Pred,
                                  CostKind);
  };

  InstructionCost C = 0;

  // StepVector
  ArrayRef<Value *> Args;
  IntrinsicCostAttributes CostAttrs(Intrinsic::stepvector, Int32VTy, Args);
  C += TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
  // NegOneSplat
  C += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, Int32VTy);
  // LastIdx
  if (usesEVL()) {
    // NOTE(review): the non-EVL path emits an unsigned max reduction; confirm
    // whether smax is the reduction actually emitted on the EVL path.
    C += TTI.getMinMaxReductionCost(Intrinsic::smax, Int32VTy, FastMathFlags(),
                                    CostKind);
  } else {
    // ActiveLaneIdxs: lane-wise select of i32 indices under the vector mask.
    C += SelectCost(Int32VTy, MaskTy);
    // MaybeLastIdx: emitted as llvm.vector.reduce.umax over the i32 indices.
    C += TTI.getMinMaxReductionCost(Intrinsic::umax, Int32VTy, FastMathFlags(),
                                    CostKind);
    // IsLaneZeroActive: extract of lane 0 from the mask.
    C += TTI.getVectorInstrCost(Instruction::ExtractElement, MaskTy, CostKind,
                                /*Index=*/0);
    // MaybeLastIdxEQZero: scalar i32 compare against zero.
    C += ScalarIdxCmpCost(CmpInst::ICMP_EQ);
    // And
    C += TTI.getArithmeticInstrCost(Instruction::And, Int1Ty, CostKind);
    // LastIdx: scalar select between two i32 indices.
    C += SelectCost(Int32Ty, Int1Ty);
  }
  // ExtractFromVec: the lane index is only known at run time, so no index is
  // passed.
  C += TTI.getVectorInstrCost(Instruction::ExtractElement, VTy, CostKind);
  // LastIdxGeZero: scalar i32 compare against zero.
  C += ScalarIdxCmpCost(CmpInst::ICMP_SGE);
  // ChooseFromVecOrInit: scalar select between the extracted element and the
  // initial value.
  C += SelectCost(DataTy, Int1Ty);
  return C;
}
2338+
22392339
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
22402340
assert(State.Instance && "Branch on Mask works only on single instance.");
22412341

llvm/test/Transforms/LoopVectorize/RISCV/csa.ll

+14-80
Original file line number | Diff line number | Diff line change
@@ -3815,91 +3815,25 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
38153815
; NO-EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
38163816
; NO-EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
38173817
; NO-EVL: for.body.preheader:
3818-
; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
3819-
; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
3820-
; NO-EVL: vector.ph:
3821-
; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
3822-
; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
3823-
; NO-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
3824-
; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
3825-
; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
3826-
; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
3827-
; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
3828-
; NO-EVL: vector.body:
3829-
; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
3830-
; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
3831-
; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
3832-
; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
3833-
; NO-EVL-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
3834-
; NO-EVL-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
3835-
; NO-EVL-NEXT: [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
3836-
; NO-EVL-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
3837-
; NO-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
3838-
; NO-EVL-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
3839-
; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
3840-
; NO-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
3841-
; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
3842-
; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
3843-
; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
3844-
; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
3845-
; NO-EVL-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
3846-
; NO-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
3847-
; NO-EVL-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
3848-
; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
3849-
; NO-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
3850-
; NO-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
3851-
; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
3852-
; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
3853-
; NO-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
3854-
; NO-EVL-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
3855-
; NO-EVL-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
3856-
; NO-EVL-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
3857-
; NO-EVL-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
3858-
; NO-EVL-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
3859-
; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
3860-
; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
3861-
; NO-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
3862-
; NO-EVL-NEXT: [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
3863-
; NO-EVL-NEXT: [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
3864-
; NO-EVL-NEXT: [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
3865-
; NO-EVL-NEXT: [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
3866-
; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
3867-
; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
3868-
; NO-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
3869-
; NO-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
3870-
; NO-EVL: middle.block:
3871-
; NO-EVL-NEXT: [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
3872-
; NO-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
3873-
; NO-EVL-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
3874-
; NO-EVL-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
3875-
; NO-EVL-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
3876-
; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
3877-
; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
3878-
; NO-EVL-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
3879-
; NO-EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
3880-
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
3881-
; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
3882-
; NO-EVL: scalar.ph:
3883-
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
38843818
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
38853819
; NO-EVL: for.cond.cleanup.loopexit:
3886-
; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
3820+
; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
38873821
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
38883822
; NO-EVL: for.cond.cleanup:
3889-
; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
3823+
; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
38903824
; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
38913825
; NO-EVL: for.body:
3892-
; NO-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
3893-
; NO-EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
3826+
; NO-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
3827+
; NO-EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
38943828
; NO-EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
3895-
; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
3896-
; NO-EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
3897-
; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
3898-
; NO-EVL-NEXT: [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
3899-
; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
3829+
; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
3830+
; NO-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
3831+
; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
3832+
; NO-EVL-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
3833+
; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
39003834
; NO-EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
39013835
; NO-EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
3902-
; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
3836+
; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
39033837
;
39043838
; DATA-LABEL: @idx_scalar_dec(
39053839
; DATA-NEXT: entry:
@@ -4037,7 +3971,7 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
40373971
; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
40383972
; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
40393973
; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
4040-
; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
3974+
; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
40413975
; NO-EVL: middle.block:
40423976
; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
40433977
; NO-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
@@ -4070,7 +4004,7 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
40704004
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
40714005
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
40724006
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
4073-
; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
4007+
; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
40744008
;
40754009
; DATA-LABEL: @simple_csa_int_select_neg_cond(
40764010
; DATA-NEXT: entry:
@@ -4207,7 +4141,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
42074141
; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
42084142
; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
42094143
; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
4210-
; NO-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
4144+
; NO-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
42114145
; NO-EVL: middle.block:
42124146
; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
42134147
; NO-EVL-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
@@ -4241,7 +4175,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
42414175
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
42424176
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
42434177
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
4244-
; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
4178+
; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
42454179
;
42464180
; DATA-LABEL: @simple_csa_ptr_select(
42474181
; DATA-NEXT: entry:

0 commit comments

Comments
 (0)