From c354bdcb27ad73a1625c1b665d4bc7773f5ec8b2 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 1 Jun 2025 21:58:24 +0100 Subject: [PATCH 1/8] [VPlan] Unroll VPReplicateRecipes by VF. --- .../Transforms/Vectorize/LoopVectorize.cpp | 1 + llvm/lib/Transforms/Vectorize/VPlan.cpp | 8 ++ llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++ .../Transforms/Vectorize/VPlanAnalysis.cpp | 2 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 61 +++++++++----- .../Transforms/Vectorize/VPlanTransforms.cpp | 16 ++++ .../Transforms/Vectorize/VPlanTransforms.h | 4 + llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 81 +++++++++++++++++++ .../X86/fixed-order-recurrence.ll | 6 -- ...terleave-ptradd-with-replicated-operand.ll | 51 +++++------- ...6-sunk-instruction-used-outside-of-loop.ll | 2 + .../invariant-store-vectorization-2.ll | 6 +- .../LoopVectorize/iv_outside_user.ll | 5 -- .../LoopVectorize/load-deref-pred-align.ll | 2 + .../Transforms/LoopVectorize/struct-return.ll | 8 +- .../Transforms/LoopVectorize/uniform-blend.ll | 4 + 16 files changed, 194 insertions(+), 69 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fc8ebebcf21b7..d008cfc464d35 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7291,6 +7291,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // cost model is complete for better cost estimates. 
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, OrigLoop->getHeader()->getContext()); + VPlanTransforms::runPass(VPlanTransforms::unrollByVF, BestVPlan, BestVF); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType()); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1838562f26b82..a9d0182800db8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -261,6 +261,14 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) { return Data.VPV2Scalars[Def][0]; } + // Look through BuildVector to avoid redundant extracts. + // TODO: Remove once replicate regions are unrolled explicitly. + auto *BV = dyn_cast(Def); + if (Lane.getKind() == VPLane::Kind::First && BV && + BV->getOpcode() == VPInstruction::BuildVector) { + return get(BV->getOperand(Lane.getKnownLane()), true); + } + assert(hasVectorValue(Def)); auto *VecPart = Data.VPV2Vector[Def]; if (!VecPart->getType()->isVectorTy()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 273df55188c16..3bc91eb9a1443 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -907,6 +907,12 @@ class VPInstruction : public VPRecipeWithIRFlags, BranchOnCount, BranchOnCond, Broadcast, + /// Creates a vector containing all operands. The vector element count + /// matches the number of operands. + BuildVector, + /// Creates a struct of vectors containing all operands. The vector element + /// count matches the number of operands. 
+ BuildStructVector, ComputeAnyOfResult, ComputeFindLastIVResult, ComputeReductionResult, diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 81fc93bbf51fd..fac7d0ee13bbd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -107,6 +107,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::AnyOf: + case VPInstruction::BuildVector: + case VPInstruction::BuildStructVector: return SetResultTyFromOp(); case VPInstruction::FirstActiveLane: return Type::getIntNTy(Ctx, 64); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 90a04af60e3d8..a2b494e85611c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -493,6 +493,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::ExtractElement: { assert(State.VF.isVector() && "Only extract elements from vectors"); + return State.get(getOperand(0), + VPLane(cast(getOperand(1)->getLiveInIRValue()) + ->getZExtValue())); Value *Vec = State.get(getOperand(0)); Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); return Builder.CreateExtractElement(Vec, Idx, Name); @@ -604,6 +607,33 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreateVectorSplat( State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast"); } + case VPInstruction::BuildVector: { + auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); + Value *Res = PoisonValue::get( + toVectorizedTy(ScalarTy, ElementCount::getFixed(getNumOperands()))); + for (const auto &[Idx, Op] : enumerate(operands())) + Res = State.Builder.CreateInsertElement(Res, State.get(Op, true), + State.Builder.getInt32(Idx)); + return 
Res; + } + case VPInstruction::BuildStructVector: { + // For struct types, we need to build a new 'wide' struct type, where each + // element is widened. + auto *STy = + cast(State.TypeAnalysis.inferScalarType(getOperand(0))); + Value *Res = PoisonValue::get( + toVectorizedTy(STy, ElementCount::getFixed(getNumOperands()))); + for (const auto &[Idx, Op] : enumerate(operands())) { + for (unsigned I = 0, E = STy->getNumElements(); I != E; I++) { + Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I); + Value *VectorValue = Builder.CreateExtractValue(Res, I); + VectorValue = + Builder.CreateInsertElement(VectorValue, ScalarValue, Idx); + Res = Builder.CreateInsertValue(Res, VectorValue, I); + } + } + return Res; + } case VPInstruction::ComputeAnyOfResult: { // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary // and will be removed by breaking up the recipe further. @@ -872,10 +902,11 @@ void VPInstruction::execute(VPTransformState &State) { if (!hasResult()) return; assert(GeneratedValue && "generate must produce a value"); - assert( - (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly || - State.VF.isScalar()) && - "scalar value but not only first lane defined"); + assert((((GeneratedValue->getType()->isVectorTy() || + GeneratedValue->getType()->isStructTy()) == + !GeneratesPerFirstLaneOnly) || + State.VF.isScalar()) && + "scalar value but not only first lane defined"); State.set(this, GeneratedValue, /*IsScalar*/ GeneratesPerFirstLaneOnly); } @@ -889,6 +920,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: + case VPInstruction::BuildVector: + case VPInstruction::BuildStructVector: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExtractLastElement: @@ -1008,6 +1041,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case 
VPInstruction::Broadcast: O << "broadcast"; break; + case VPInstruction::BuildVector: + O << "buildvector"; + break; + case VPInstruction::BuildStructVector: + O << "buildstructvector"; + break; case VPInstruction::ExtractLastElement: O << "extract-last-element"; break; @@ -2763,20 +2802,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) { scalarizeInstruction(UI, this, VPLane(0), State); return; } - - // A store of a loop varying value to a uniform address only needs the last - // copy of the store. - if (isa(UI) && vputils::isSingleScalar(getOperand(1))) { - auto Lane = VPLane::getLastLaneForVF(State.VF); - scalarizeInstruction(UI, this, VPLane(Lane), State); - return; - } - - // Generate scalar instances for all VF lanes. - assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - const unsigned EndLane = State.VF.getKnownMinValue(); - for (unsigned Lane = 0; Lane < EndLane; ++Lane) - scalarizeInstruction(UI, this, VPLane(Lane), State); } bool VPReplicateRecipe::shouldPack() const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ea617f042566b..4177b5b1b956d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1140,6 +1140,22 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } + // Look through Extract(Last|Penultimate)Element (BuildVector ....). + if (match(&R, + m_VPInstruction(m_VPValue(A))) || + match(&R, m_VPInstruction( + m_VPValue(A)))) { + unsigned Offset = cast(&R)->getOpcode() == + VPInstruction::ExtractLastElement + ? 1 + : 2; + auto *BV = dyn_cast(A); + if (BV && BV->getOpcode() == VPInstruction::BuildVector) { + Def->replaceAllUsesWith(BV->getOperand(BV->getNumOperands() - Offset)); + return; + } + } + // Some simplifications can only be applied after unrolling. Perform them // below. 
if (!Plan->isUnrolled()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 34e2de4eb3b74..f45b7a7969d04 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -99,6 +99,10 @@ struct VPlanTransforms { /// Explicitly unroll \p Plan by \p UF. static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx); + /// Explicitly unroll VPReplicateRecipes outside of replicate regions by \p + /// VF. + static void unrollByVF(VPlan &Plan, ElementCount VF); + /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the /// resulting plan to \p BestVF and \p BestUF. static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 335301a927ceb..e48611ab4b923 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -15,6 +15,7 @@ #include "VPlan.h" #include "VPlanAnalysis.h" #include "VPlanCFG.h" +#include "VPlanHelpers.h" #include "VPlanPatternMatch.h" #include "VPlanTransforms.h" #include "VPlanUtils.h" @@ -430,3 +431,83 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { VPlanTransforms::removeDeadRecipes(Plan); } + +/// Create a single-scalar clone of RepR for lane \p Lane. +static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, + Type *IdxTy, VPReplicateRecipe *RepR, + VPLane Lane) { + // Collect the operands at Lane, creating extracts as needed. + SmallVector NewOps; + for (VPValue *Op : RepR->operands()) { + if (vputils::isSingleScalar(Op)) { + NewOps.push_back(Op); + continue; + } + VPValue *Ext; + if (Lane.getKind() == VPLane::Kind::ScalableLast) { + Ext = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}); + } else { + // Look through buildvector to avoid unnecessary extracts. 
+ auto *BV = dyn_cast(Op); + if (BV && BV->getOpcode() == VPInstruction::BuildVector) { + NewOps.push_back(BV->getOperand(Lane.getKnownLane())); + continue; + } + VPValue *Idx = + Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); + } + NewOps.push_back(Ext); + } + + auto *New = + new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, + /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR); + New->insertBefore(RepR); + return New; +} + +void VPlanTransforms::unrollByVF(VPlan &Plan, ElementCount VF) { + Type *IdxTy = IntegerType::get( + Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *RepR = dyn_cast(&R); + if (!RepR || RepR->isSingleScalar()) + continue; + + VPBuilder Builder(RepR); + SmallVector LaneDefs; + // Stores to invariant addresses only need to store the last lane. + if (isa(RepR->getUnderlyingInstr()) && + vputils::isSingleScalar(RepR->getOperand(1))) { + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF)); + RepR->eraseFromParent(); + continue; + } + + /// Create single-scalar version of RepR for all lanes. + for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) + LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); + + /// Users that only demand the first lane can use the definition for lane + /// 0. + RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) { + return U.onlyFirstLaneUsed(RepR); + }); + + Type *ResTy = RepR->getUnderlyingInstr()->getType(); + // If needed, create a Build(Struct)Vector recipe to insert the scalar + // lane values into a vector. + if (!ResTy->isVoidTy()) { + VPValue *VecRes = Builder.createNaryOp( + ResTy->isStructTy() ? 
VPInstruction::BuildStructVector + : VPInstruction::BuildVector, + LaneDefs); + RepR->replaceAllUsesWith(VecRes); + } + RepR->eraseFromParent(); + } + } +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index 83e9d6146755d..743aedee38012 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -398,12 +398,6 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3 ; CHECK-NEXT: store float [[TMP30]], ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP31]], align 4 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1 -; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP33]], align 4 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2 -; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3 ; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP37]], align 4 ; CHECK-NEXT: store float [[TMP36]], ptr [[B:%.*]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll index cdc7839bfc0f0..95258e65bbe3d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll @@ -32,42 +32,31 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 104 ; 
CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 112 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 120 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]] +; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]] ; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP1]] ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP2]] ; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP3]] -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]] +; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]] ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP5]] ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP6]] ; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP7]] -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]] ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP9]] ; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP10]] -; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]] -; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]] -; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]] -; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]] +; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]] +; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]] +; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]] +; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]] ; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 4 -; CHECK-NEXT: 
[[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 4 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 4 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 4 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 4 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 4 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i64 4 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i64 4 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i64 4 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i64 4 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[NEXT_GEP11]], i64 4 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[NEXT_GEP12]], i64 4 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[NEXT_GEP13]], i64 4 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[NEXT_GEP14]], i64 4 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[NEXT_GEP15]], i64 4 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[NEXT_GEP16]], i64 4 -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP16]], i32 -4 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP20]], i32 -4 -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP24]], i32 -4 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -4 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP29]], i32 -4 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP30]], i32 -4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> @@ -85,7 +74,7 @@ define ptr 
@test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: [[TMP38:%.*]] = add <4 x i32> [[STRIDED_VEC23]], [[STRIDED_VEC22]] ; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[STRIDED_VEC26]], [[STRIDED_VEC25]] ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0 -; CHECK-NEXT: store i32 [[TMP40]], ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: store i32 [[TMP40]], ptr [[NEXT_GEP12]], align 4 ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1 ; CHECK-NEXT: store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2 @@ -93,7 +82,7 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP36]], i32 3 ; CHECK-NEXT: store i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4 ; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP37]], i32 0 -; CHECK-NEXT: store i32 [[TMP44]], ptr [[NEXT_GEP5]], align 4 +; CHECK-NEXT: store i32 [[TMP44]], ptr [[NEXT_GEP13]], align 4 ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP37]], i32 1 ; CHECK-NEXT: store i32 [[TMP45]], ptr [[NEXT_GEP6]], align 4 ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP37]], i32 2 @@ -101,19 +90,19 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP37]], i32 3 ; CHECK-NEXT: store i32 [[TMP47]], ptr [[NEXT_GEP8]], align 4 ; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP38]], i32 0 -; CHECK-NEXT: store i32 [[TMP48]], ptr [[NEXT_GEP9]], align 4 +; CHECK-NEXT: store i32 [[TMP48]], ptr [[NEXT_GEP14]], align 4 ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP38]], i32 1 ; CHECK-NEXT: store i32 [[TMP49]], ptr [[NEXT_GEP10]], align 4 ; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[TMP38]], i32 2 ; CHECK-NEXT: store i32 [[TMP50]], ptr [[NEXT_GEP11]], align 4 ; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> 
[[TMP38]], i32 3 -; CHECK-NEXT: store i32 [[TMP51]], ptr [[NEXT_GEP12]], align 4 +; CHECK-NEXT: store i32 [[TMP51]], ptr [[NEXT_GEP17]], align 4 ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 -; CHECK-NEXT: store i32 [[TMP52]], ptr [[NEXT_GEP13]], align 4 +; CHECK-NEXT: store i32 [[TMP52]], ptr [[NEXT_GEP15]], align 4 ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 -; CHECK-NEXT: store i32 [[TMP53]], ptr [[NEXT_GEP14]], align 4 +; CHECK-NEXT: store i32 [[TMP53]], ptr [[NEXT_GEP18]], align 4 ; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 -; CHECK-NEXT: store i32 [[TMP54]], ptr [[NEXT_GEP15]], align 4 +; CHECK-NEXT: store i32 [[TMP54]], ptr [[NEXT_GEP19]], align 4 ; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 ; CHECK-NEXT: store i32 [[TMP55]], ptr [[NEXT_GEP16]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -123,11 +112,11 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[M]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL26:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL26]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8 ; CHECK-NEXT: [[P_4:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 4 ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[P_4]], align 4 diff 
--git a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll index 644f10b617eb7..db9be20f7d190 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll @@ -10,6 +10,8 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x ptr> [[TMP16]], ptr [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[VEC_IND]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll index 0306567c5ce98..a02fddc4cf72d 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll @@ -134,18 +134,14 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 ; 
CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] -; CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope [[META12]] +; CHECK-NEXT: store i32 [[K]], ptr [[A]], align 4, !alias.scope [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index f801443b85d3f..7b7735434e67d 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -1101,7 +1101,6 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]] ; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 ; VEC-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP2]], align 2 -; VEC-NEXT: [[TMP4:%.*]] = add i32 [[STEP_2]], [[TMP0]] ; VEC-NEXT: [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP6]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; VEC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8 @@ -1293,8 +1292,6 @@ define i32 @iv_ext_used_outside( ptr %dst) { ; 
VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 ; VEC-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4 ; VEC-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i16> [[VEC_IND]], splat (i16 1) -; VEC-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 -; VEC-NEXT: [[TMP4:%.*]] = zext nneg i16 [[TMP3]] to i32 ; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 ; VEC-NEXT: [[TMP7:%.*]] = zext nneg i16 [[TMP8]] to i32 ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 @@ -1389,9 +1386,7 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 0 ; VEC-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1 ; VEC-NEXT: store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2 -; VEC-NEXT: [[TMP3:%.*]] = add i64 2, -1 ; VEC-NEXT: [[TMP5:%.*]] = add i64 1, -1 -; VEC-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 1 ; VEC-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 ; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VEC: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 7a33fd092e293..8a326c9d0c083 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -669,6 +669,8 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP12]], i32 1 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label 
[[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index 50b9ba12af82d..0c5cf09430bb9 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -79,20 +79,20 @@ define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly ; CHECK: [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}}) ; // Lane 0 ; CHECK: [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0 -; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i64 0 ; CHECK: [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0 ; CHECK: [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1 ; CHECK: [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1 -; CHECK: [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0 +; CHECK: [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i64 0 ; CHECK: [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1 ; // Lane 1 ; CHECK: [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0 ; CHECK: [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0 -; CHECK: [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1 +; CHECK: [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i64 1 ; CHECK: [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0 ; CHECK: [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1 ; CHECK: [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1 -; CHECK: [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float 
[[B_1]], i32 1 +; CHECK: [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i64 1 ; CHECK: [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x float> [[VEC_B]], 1 ; // Store wide values: ; CHECK: [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index 130db548ca8cb..bdd7fef37e66d 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -229,6 +229,10 @@ define void @redundant_branch_and_blends_without_mask(ptr %A) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x ptr> [[TMP35]], ptr [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x ptr> [[TMP36]], ptr [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x ptr> [[TMP37]], ptr [[TMP8]], i32 3 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: From 0800450605848882025ddee06fd65a49c1f813ad Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 10 Jun 2025 10:49:21 +0100 Subject: [PATCH 2/8] !fixup address latest comments, thanks --- .../Transforms/Vectorize/LoopVectorize.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 7 +- llvm/lib/Transforms/Vectorize/VPlan.h | 9 +-- .../Transforms/Vectorize/VPlanAnalysis.cpp | 2 +- .../Transforms/Vectorize/VPlanPatternMatch.h | 16 ++++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 81 +++++++++---------- 
.../Transforms/Vectorize/VPlanTransforms.cpp | 27 +++---- .../Transforms/Vectorize/VPlanTransforms.h | 7 +- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 28 +++---- 9 files changed, 95 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0ce6ee8a80556..87a5fb2fe741b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7294,7 +7294,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // cost model is complete for better cost estimates. VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, OrigLoop->getHeader()->getContext()); - VPlanTransforms::runPass(VPlanTransforms::unrollByVF, BestVPlan, BestVF); + VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType()); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index a9d0182800db8..ec641e569731e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -263,10 +263,9 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) { // Look through BuildVector to avoid redundant extracts. // TODO: Remove once replicate regions are unrolled explicitly. 
- auto *BV = dyn_cast(Def); - if (Lane.getKind() == VPLane::Kind::First && BV && - BV->getOpcode() == VPInstruction::BuildVector) { - return get(BV->getOperand(Lane.getKnownLane()), true); + if (Lane.getKind() == VPLane::Kind::First && match(Def, m_BuildVector())) { + auto *BuildVector = cast(Def); + return get(BuildVector->getOperand(Lane.getKnownLane()), true); } assert(hasVectorValue(Def)); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a4436e6b2532e..454accba7b4c1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -907,12 +907,11 @@ class VPInstruction : public VPRecipeWithIRFlags, BranchOnCount, BranchOnCond, Broadcast, - /// Creates a vector containing all operands. The vector element count - /// matches the number of operands. - BuildVector, - /// Creates a struct of vectors containing all operands. The vector element - /// count matches the number of operands. + /// Creates a struct of fixed-width vectors containing all operands. The number of operands +/// matches the number of fields in the struct. BuildStructVector, + /// Creates a fixed-width vector containing all operands. The number of operands matches the vector element count. 
+ BuildVector, ComputeAnyOfResult, ComputeFindLastIVResult, ComputeReductionResult, diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index b79badf986f7b..da4a52203db3f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -108,8 +108,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::AnyOf: - case VPInstruction::BuildVector: case VPInstruction::BuildStructVector: + case VPInstruction::BuildVector: return SetResultTyFromOp(); case VPInstruction::FirstActiveLane: return Type::getIntNTy(Ctx, 64); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index dfd9fc3d4d719..7c427ac18e92b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -221,6 +221,9 @@ struct Recipe_match { if ((!matchRecipeAndOpcode(R) && ...)) return false; + auto *VPI = dyn_cast(R); + if (VPI && VPI->getOpcode() == VPInstruction::BuildVector) + return true; assert(R->getNumOperands() == std::tuple_size::value && "recipe with matched opcode does not have the expected number of " "operands"); @@ -260,6 +263,10 @@ struct Recipe_match { } }; +template +using ZeroOpRecipe_match = + Recipe_match, Opcode, false, RecipeTys...>; + template using UnaryRecipe_match = Recipe_match, Opcode, false, RecipeTys...>; @@ -268,6 +275,10 @@ template using UnaryVPInstruction_match = UnaryRecipe_match; +template +using ZeroOpVPInstruction_match = + ZeroOpRecipe_match; + template using AllUnaryRecipe_match = UnaryRecipe_match; +inline ZeroOpVPInstruction_match +m_BuildVector() { + return ZeroOpVPInstruction_match(); +} + template inline UnaryVPInstruction_match m_VPInstruction(const Op0_t &Op0) { diff --git 
a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7e0cdafd2f0a2..2eb03ab640b8c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -493,12 +493,10 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::ExtractElement: { assert(State.VF.isVector() && "Only extract elements from vectors"); + unsigned IdxToExtract = cast(getOperand(1)->getLiveInIRValue()) + ->getZExtValue(); return State.get(getOperand(0), - VPLane(cast(getOperand(1)->getLiveInIRValue()) - ->getZExtValue())); - Value *Vec = State.get(getOperand(0)); - Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); - return Builder.CreateExtractElement(Vec, Idx, Name); + VPLane(IdxToExtract)); } case Instruction::Freeze: { Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); @@ -607,24 +605,17 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreateVectorSplat( State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast"); } - case VPInstruction::BuildVector: { - auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); - Value *Res = PoisonValue::get( - toVectorizedTy(ScalarTy, ElementCount::getFixed(getNumOperands()))); - for (const auto &[Idx, Op] : enumerate(operands())) - Res = State.Builder.CreateInsertElement(Res, State.get(Op, true), - State.Builder.getInt32(Idx)); - return Res; - } case VPInstruction::BuildStructVector: { // For struct types, we need to build a new 'wide' struct type, where each // element is widened. 
- auto *STy = + auto *StructTy = cast(State.TypeAnalysis.inferScalarType(getOperand(0))); + auto NumOfElements = ElementCount::getFixed(getNumOperands()); Value *Res = PoisonValue::get( - toVectorizedTy(STy, ElementCount::getFixed(getNumOperands()))); + toVectorizedTy(StructTy, NumOfElements)); + assert(NumOfElements.getKnownMinValue() == StructTy->getNumElements() && "number of operands must match number of elements in StructTy"); for (const auto &[Idx, Op] : enumerate(operands())) { - for (unsigned I = 0, E = STy->getNumElements(); I != E; I++) { + for (unsigned I = 0 ; I != NumOfElements .getKnownMinValue(); I++) { Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I); Value *VectorValue = Builder.CreateExtractValue(Res, I); VectorValue = @@ -634,6 +625,16 @@ Value *VPInstruction::generate(VPTransformState &State) { } return Res; } + case VPInstruction::BuildVector: { + auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); + auto NumOfElements = ElementCount::getFixed(getNumOperands()); + Value *Res = PoisonValue::get( + toVectorizedTy(ScalarTy, NumOfElements)); + for (const auto &[Idx, Op] : enumerate(operands())) + Res = State.Builder.CreateInsertElement(Res, State.get(Op, true), + State.Builder.getInt32(Idx)); + return Res; + } case VPInstruction::ReductionStartVector: { if (State.VF.isScalar()) return State.get(getOperand(0), true); @@ -933,8 +934,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: - case VPInstruction::BuildVector: case VPInstruction::BuildStructVector: + case VPInstruction::BuildVector: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExtractLastElement: @@ -1056,12 +1057,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::Broadcast: O << "broadcast"; break; - case VPInstruction::BuildVector: - O << "buildvector"; - 
break; case VPInstruction::BuildStructVector: O << "buildstructvector"; break; + case VPInstruction::BuildVector: + O << "buildvector"; + break; case VPInstruction::ExtractLastElement: O << "extract-last-element"; break; @@ -2797,30 +2798,28 @@ static void scalarizeInstruction(const Instruction *Instr, void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); - if (State.Lane) { // Generate a single instance. - assert((State.VF.isScalar() || !isSingleScalar()) && - "uniform recipe shouldn't be predicated"); - assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - scalarizeInstruction(UI, this, *State.Lane, State); - // Insert scalar instance packing it into a vector. - if (State.VF.isVector() && shouldPack()) { - // If we're constructing lane 0, initialize to start from poison. - if (State.Lane->isFirstLane()) { - assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); - Value *Poison = - PoisonValue::get(VectorType::get(UI->getType(), State.VF)); - State.set(this, Poison); - } - State.packScalarIntoVectorizedValue(this, *State.Lane); - } - return; - } - if (IsSingleScalar) { - // Uniform within VL means we need to generate lane 0. + if (!State.Lane) { + assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions must be unrolled"); scalarizeInstruction(UI, this, VPLane(0), State); return; } + + assert((State.VF.isScalar() || !isSingleScalar()) && + "uniform recipe shouldn't be predicated"); + assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); + scalarizeInstruction(UI, this, *State.Lane, State); + // Insert scalar instance packing it into a vector. + if (State.VF.isVector() && shouldPack()) { + // If we're constructing lane 0, initialize to start from poison. 
+ if (State.Lane->isFirstLane()) { + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); + Value *Poison = + PoisonValue::get(VectorType::get(UI->getType(), State.VF)); + State.set(this, Poison); + } + State.packScalarIntoVectorizedValue(this, *State.Lane); + } } bool VPReplicateRecipe::shouldPack() const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b9d029dd2eea5..df65874f605e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1140,21 +1140,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - // Look through Extract(Last|Penultimate)Element (BuildVector ....). - if (match(&R, - m_VPInstruction(m_VPValue(A))) || - match(&R, m_VPInstruction( - m_VPValue(A)))) { - unsigned Offset = cast(&R)->getOpcode() == - VPInstruction::ExtractLastElement - ? 1 - : 2; - auto *BV = dyn_cast(A); - if (BV && BV->getOpcode() == VPInstruction::BuildVector) { - Def->replaceAllUsesWith(BV->getOperand(BV->getNumOperands() - Offset)); - return; - } + // Look through ExtractLastElement (BuildVector ....). + if (match(&R, m_VPInstruction( + m_BuildVector()))) { + auto *BuildVector = cast(R.getOperand(0)); + Def->replaceAllUsesWith(BuildVector->getOperand(BuildVector->getNumOperands() - 1)); + return; } + // Look through ExtractPenultimateElement (BuildVector ....). + if (match(&R, m_VPInstruction( + m_BuildVector()))) { + auto *BuildVector = cast(R.getOperand(0)); + Def->replaceAllUsesWith(BuildVector->getOperand(BuildVector->getNumOperands() - 2)); + return; +} // Some simplifications can only be applied after unrolling. Perform them // below. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index f45b7a7969d04..c848c2057c6d1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -99,9 +99,10 @@ struct VPlanTransforms { /// Explicitly unroll \p Plan by \p UF. static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx); - /// Explicitly unroll VPReplicateRecipes outside of replicate regions by \p - /// VF. - static void unrollByVF(VPlan &Plan, ElementCount VF); + /// Replace replicating VPReplicateRecipes outside replicate regions in \p + /// Plan with \p VF single-scalar recipes. + /// TODO: Also unroll VPReplicateRegions by VF. + static void replicateByVF(VPlan &Plan, ElementCount VF); /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the /// resulting plan to \p BestVF and \p BestUF. diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 1a86926937f4e..9b04761149f00 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -447,7 +447,7 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { VPlanTransforms::removeDeadRecipes(Plan); } -/// Create a single-scalar clone of RepR for lane \p Lane. +/// Create a single-scalar clone of \p RepR for lane \p Lane. static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, VPReplicateRecipe *RepR, VPLane Lane) { @@ -458,20 +458,18 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, NewOps.push_back(Op); continue; } - VPValue *Ext; if (Lane.getKind() == VPLane::Kind::ScalableLast) { - Ext = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}); - } else { - // Look through buildvector to avoid unnecessary extracts. 
- auto *BV = dyn_cast(Op); - if (BV && BV->getOpcode() == VPInstruction::BuildVector) { - NewOps.push_back(BV->getOperand(Lane.getKnownLane())); - continue; - } - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); - Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); + NewOps.push_back(Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); + continue; + } + // Look through buildvector to avoid unnecessary extracts. + if (match(Op, m_BuildVector())) { + NewOps.push_back(cast(Op)->getOperand(Lane.getKnownLane())); + continue; } + VPValue *Idx = + Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); NewOps.push_back(Ext); } @@ -482,7 +480,7 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, return New; } -void VPlanTransforms::unrollByVF(VPlan &Plan, ElementCount VF) { +void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { Type *IdxTy = IntegerType::get( Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( @@ -494,7 +492,7 @@ void VPlanTransforms::unrollByVF(VPlan &Plan, ElementCount VF) { VPBuilder Builder(RepR); SmallVector LaneDefs; - // Stores to invariant addresses only need to store the last lane. + // Stores to invariant addresses need to store the last lane only. 
if (isa(RepR->getUnderlyingInstr()) && vputils::isSingleScalar(RepR->getOperand(1))) { cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF)); From 884b9d3cd83d8d699c12804337c28408ba4d2a9b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 10 Jun 2025 10:53:21 +0100 Subject: [PATCH 3/8] !fixup fix formatting --- llvm/lib/Transforms/Vectorize/VPlan.h | 8 ++++--- .../Transforms/Vectorize/VPlanPatternMatch.h | 6 ++---- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 21 +++++++++---------- .../Transforms/Vectorize/VPlanTransforms.cpp | 12 ++++++----- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 6 ++++-- 5 files changed, 28 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 454accba7b4c1..8764454a74985 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -907,10 +907,12 @@ class VPInstruction : public VPRecipeWithIRFlags, BranchOnCount, BranchOnCond, Broadcast, - /// Creates a struct of fixed-width vectors containing all operands. The number of operands -/// matches the number of fields in the struct. + /// Creates a struct of fixed-width vectors containing all operands. The + /// number of operands + /// matches the number of fields in the struct. BuildStructVector, - /// Creates a fixed-width vector containing all operands. The number of operands matches the vector element count. + /// Creates a fixed-width vector containing all operands. The number of + /// operands matches the vector element count. 
BuildVector, ComputeAnyOfResult, ComputeFindLastIVResult, diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 7c427ac18e92b..48a96e010473b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -276,8 +276,7 @@ using UnaryVPInstruction_match = UnaryRecipe_match; template -using ZeroOpVPInstruction_match = - ZeroOpRecipe_match; +using ZeroOpVPInstruction_match = ZeroOpRecipe_match; template using AllUnaryRecipe_match = @@ -310,8 +309,7 @@ using AllBinaryRecipe_match = BinaryRecipe_match; -inline ZeroOpVPInstruction_match -m_BuildVector() { +inline ZeroOpVPInstruction_match m_BuildVector() { return ZeroOpVPInstruction_match(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2eb03ab640b8c..4a1d7fd856e6e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -493,10 +493,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::ExtractElement: { assert(State.VF.isVector() && "Only extract elements from vectors"); - unsigned IdxToExtract = cast(getOperand(1)->getLiveInIRValue()) - ->getZExtValue(); - return State.get(getOperand(0), - VPLane(IdxToExtract)); + unsigned IdxToExtract = + cast(getOperand(1)->getLiveInIRValue())->getZExtValue(); + return State.get(getOperand(0), VPLane(IdxToExtract)); } case Instruction::Freeze: { Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); @@ -611,11 +610,11 @@ Value *VPInstruction::generate(VPTransformState &State) { auto *StructTy = cast(State.TypeAnalysis.inferScalarType(getOperand(0))); auto NumOfElements = ElementCount::getFixed(getNumOperands()); - Value *Res = PoisonValue::get( - toVectorizedTy(StructTy, NumOfElements)); - assert(NumOfElements.getKnownMinValue() == StructTy->getNumElements() && "number of operands must 
match number of elements in StructTy"); + Value *Res = PoisonValue::get(toVectorizedTy(StructTy, NumOfElements)); + assert(NumOfElements.getKnownMinValue() == StructTy->getNumElements() && + "number of operands must match number of elements in StructTy"); for (const auto &[Idx, Op] : enumerate(operands())) { - for (unsigned I = 0 ; I != NumOfElements .getKnownMinValue(); I++) { + for (unsigned I = 0; I != NumOfElements.getKnownMinValue(); I++) { Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I); Value *VectorValue = Builder.CreateExtractValue(Res, I); VectorValue = @@ -628,8 +627,7 @@ Value *VPInstruction::generate(VPTransformState &State) { case VPInstruction::BuildVector: { auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); auto NumOfElements = ElementCount::getFixed(getNumOperands()); - Value *Res = PoisonValue::get( - toVectorizedTy(ScalarTy, NumOfElements)); + Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements)); for (const auto &[Idx, Op] : enumerate(operands())) Res = State.Builder.CreateInsertElement(Res, State.get(Op, true), State.Builder.getInt32(Idx)); @@ -2800,7 +2798,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); if (!State.Lane) { - assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions must be unrolled"); + assert(IsSingleScalar && + "VPReplicateRecipes outside replicate regions must be unrolled"); scalarizeInstruction(UI, this, VPLane(0), State); return; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index df65874f605e5..906763e6d891d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1142,18 +1142,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // Look through ExtractLastElement (BuildVector ....). 
if (match(&R, m_VPInstruction( - m_BuildVector()))) { + m_BuildVector()))) { auto *BuildVector = cast(R.getOperand(0)); - Def->replaceAllUsesWith(BuildVector->getOperand(BuildVector->getNumOperands() - 1)); + Def->replaceAllUsesWith( + BuildVector->getOperand(BuildVector->getNumOperands() - 1)); return; } // Look through ExtractPenultimateElement (BuildVector ....). if (match(&R, m_VPInstruction( - m_BuildVector()))) { + m_BuildVector()))) { auto *BuildVector = cast(R.getOperand(0)); - Def->replaceAllUsesWith(BuildVector->getOperand(BuildVector->getNumOperands() - 2)); + Def->replaceAllUsesWith( + BuildVector->getOperand(BuildVector->getNumOperands() - 2)); return; -} + } // Some simplifications can only be applied after unrolling. Perform them // below. diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 9b04761149f00..2b15e9ca08f7a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -459,12 +459,14 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, continue; } if (Lane.getKind() == VPLane::Kind::ScalableLast) { - NewOps.push_back(Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); + NewOps.push_back( + Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); continue; } // Look through buildvector to avoid unnecessary extracts. if (match(Op, m_BuildVector())) { - NewOps.push_back(cast(Op)->getOperand(Lane.getKnownLane())); + NewOps.push_back( + cast(Op)->getOperand(Lane.getKnownLane())); continue; } VPValue *Idx = From 14e296c503a6959d1e688b0bb64ed4f687f5396f Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 17 Jun 2025 22:50:38 +0100 Subject: [PATCH 4/8] !fixup update tests. 
--- .../LoopVectorize/first-order-recurrence.ll | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 7684d274a75cf..17c2be64f1a31 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -1055,16 +1055,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 12 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 14 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2 -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2 ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2 -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]] -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]] ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -1152,16 +1146,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) { ; SINK-AFTER-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2 ; SINK-AFTER-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4 ; SINK-AFTER-NEXT: 
[[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 6 -; SINK-AFTER-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2 -; SINK-AFTER-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2 ; SINK-AFTER-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2 ; SINK-AFTER-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2 -; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]] -; SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; SINK-AFTER-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; SINK-AFTER-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]] ; SINK-AFTER-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] -; SINK-AFTER-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4 -; SINK-AFTER-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4 ; SINK-AFTER-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4 ; SINK-AFTER-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 From cc1a77960d908c7cb76ba920eea5dba6dbb160df Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 21 Jun 2025 20:43:52 +0100 Subject: [PATCH 5/8] !fixup address comments, thanks --- llvm/lib/Transforms/Vectorize/VPlan.h | 6 +++--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 ++++------ .../Transforms/Vectorize/VPlanTransforms.cpp | 1 + .../Transforms/Vectorize/VPlanTransforms.h | 7 ++++--- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 19 +++++++++++-------- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 91d901a154102..0aeaa496674d0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -936,9 +936,9 @@ class VPInstruction : public VPRecipeWithIRFlags, BranchOnCount, BranchOnCond, Broadcast, - /// Creates a struct of fixed-width vectors containing all 
operands. The - /// number of operands - /// matches the number of fields in the struct. + /// Given operands of (the same) struct type, creates a struct of fixed- + /// width vectors each containing a struct field of all operands. The + /// number of operands matches the element count of every vector. BuildStructVector, /// Creates a fixed-width vector containing all operands. The number of /// operands matches the vector element count. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 70a24d6467648..790e5d22b5f46 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -610,15 +610,12 @@ Value *VPInstruction::generate(VPTransformState &State) { } case VPInstruction::BuildStructVector: { // For struct types, we need to build a new 'wide' struct type, where each - // element is widened. + // element is widened, i.e. we crate a struct of vectors . auto *StructTy = cast(State.TypeAnalysis.inferScalarType(getOperand(0))); - auto NumOfElements = ElementCount::getFixed(getNumOperands()); - Value *Res = PoisonValue::get(toVectorizedTy(StructTy, NumOfElements)); - assert(NumOfElements.getKnownMinValue() == StructTy->getNumElements() && - "number of operands must match number of elements in StructTy"); + Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF)); for (const auto &[Idx, Op] : enumerate(operands())) { - for (unsigned I = 0; I != NumOfElements.getKnownMinValue(); I++) { + for (unsigned I = 0; I != StructTy->getNumElements(); I++) { Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I); Value *VectorValue = Builder.CreateExtractValue(Res, I); VectorValue = @@ -2688,6 +2685,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { } State.set(this, State.packScalarIntoVectorizedValue(this, WideValue, *State.Lane)); + } } bool VPReplicateRecipe::shouldPack() const { diff --git 
a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 6186747ebf1e7..da198845378aa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1148,6 +1148,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { BuildVector->getOperand(BuildVector->getNumOperands() - 1)); return; } + // Look through ExtractPenultimateElement (BuildVector ....). if (match(&R, m_VPInstruction( m_BuildVector()))) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 652289483e555..40885cd52a127 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -99,9 +99,10 @@ struct VPlanTransforms { /// Explicitly unroll \p Plan by \p UF. static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx); - /// Replace replicating VPReplicateRecipes outside replicate regions in \p - /// Plan with \p VF single-scalar recipes. - /// TODO: Also unroll VPReplicateRegions by VF. + /// Replace each VPReplicateRecipe outside on any replicate region in \p Plan + /// with \p VF single-scalar recipes. + /// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby + /// dissolving the latter. static void replicateByVF(VPlan &Plan, ElementCount VF); /// Optimize \p Plan based on \p BestVF and \p BestUF. 
This may restrict the diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 52919df530145..1efca5ee40164 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -511,22 +511,25 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); + if (RepR->getNumUsers() == 0) { + RepR->eraseFromParent(); + continue; + } + /// Users that only demand the first lane can use the definition for lane /// 0. RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) { return U.onlyFirstLaneUsed(RepR); }); - Type *ResTy = RepR->getUnderlyingInstr()->getType(); // If needed, create a Build(Struct)Vector recipe to insert the scalar // lane values into a vector. - if (!ResTy->isVoidTy()) { - VPValue *VecRes = Builder.createNaryOp( - ResTy->isStructTy() ? VPInstruction::BuildStructVector - : VPInstruction::BuildVector, - LaneDefs); - RepR->replaceAllUsesWith(VecRes); - } + Type *ResTy = RepR->getUnderlyingInstr()->getType(); + VPValue *VecRes = Builder.createNaryOp( + ResTy->isStructTy() ? 
VPInstruction::BuildStructVector + : VPInstruction::BuildVector, + LaneDefs); + RepR->replaceAllUsesWith(VecRes); RepR->eraseFromParent(); } } From af2d2c065fcf95022a865148dbbd8e18424f12bc Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 25 Jun 2025 13:05:39 +0100 Subject: [PATCH 6/8] !fixup update after merge --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 +- .../LoopVectorize/struct-return-replicate.ll | 96 +++++++++---------- 2 files changed, 55 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 45d8e2ed09567..3fae1ebffb703 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -551,9 +551,13 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::ExtractElement: { assert(State.VF.isVector() && "Only extract elements from vectors"); - unsigned IdxToExtract = - cast(getOperand(1)->getLiveInIRValue())->getZExtValue(); - return State.get(getOperand(0), VPLane(IdxToExtract)); + if (getOperand(1)->isLiveIn()) { + unsigned IdxToExtract = cast(getOperand(1)->getLiveInIRValue())->getZExtValue(); + return State.get(getOperand(0), VPLane(IdxToExtract)); + } + Value *Vec = State.get(getOperand(0)); + Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); + return Builder.CreateExtractElement(Vec, Idx, Name); } case Instruction::Freeze: { Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll index fe53334cb25a7..717d1f9ae6fdf 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll @@ -23,19 +23,19 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> 
[[WIDE_LOAD]], i32 3 ; VF4-NEXT: [[TMP9:%.*]] = tail call { i64 } @fn1(float [[TMP8]]) #[[ATTR0]] ; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i64 } [[TMP3]], 0 -; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0 +; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0 ; VF4-NEXT: [[TMP12:%.*]] = insertvalue { <4 x i64> } poison, <4 x i64> [[TMP11]], 0 ; VF4-NEXT: [[TMP13:%.*]] = extractvalue { i64 } [[TMP5]], 0 ; VF4-NEXT: [[TMP14:%.*]] = extractvalue { <4 x i64> } [[TMP12]], 0 -; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP13]], i32 1 +; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP13]], i64 1 ; VF4-NEXT: [[TMP16:%.*]] = insertvalue { <4 x i64> } [[TMP12]], <4 x i64> [[TMP15]], 0 ; VF4-NEXT: [[TMP17:%.*]] = extractvalue { i64 } [[TMP7]], 0 ; VF4-NEXT: [[TMP18:%.*]] = extractvalue { <4 x i64> } [[TMP16]], 0 -; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP17]], i32 2 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP17]], i64 2 ; VF4-NEXT: [[TMP20:%.*]] = insertvalue { <4 x i64> } [[TMP16]], <4 x i64> [[TMP19]], 0 ; VF4-NEXT: [[TMP21:%.*]] = extractvalue { i64 } [[TMP9]], 0 ; VF4-NEXT: [[TMP22:%.*]] = extractvalue { <4 x i64> } [[TMP20]], 0 -; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP21]], i32 3 +; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP21]], i64 3 ; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x i64> } [[TMP20]], <4 x i64> [[TMP23]], 0 ; VF4-NEXT: [[TMP25:%.*]] = extractvalue { <4 x i64> } [[TMP24]], 0 ; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]] @@ -64,22 +64,22 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 ; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i64 } @fn1(float [[TMP5]]) #[[ATTR0]] ; VF2IC2-NEXT: 
[[TMP7:%.*]] = extractvalue { i64 } [[TMP4]], 0 -; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 +; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i64 0 ; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP8]], 0 ; VF2IC2-NEXT: [[TMP10:%.*]] = extractvalue { i64 } [[TMP6]], 0 ; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x i64> } [[TMP9]], 0 -; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 +; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i64 1 ; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x i64> } [[TMP9]], <2 x i64> [[TMP12]], 0 ; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 ; VF2IC2-NEXT: [[TMP15:%.*]] = tail call { i64 } @fn1(float [[TMP14]]) #[[ATTR0]] ; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 ; VF2IC2-NEXT: [[TMP17:%.*]] = tail call { i64 } @fn1(float [[TMP16]]) #[[ATTR0]] ; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { i64 } [[TMP15]], 0 -; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i64 0 ; VF2IC2-NEXT: [[TMP20:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP19]], 0 ; VF2IC2-NEXT: [[TMP21:%.*]] = extractvalue { i64 } [[TMP17]], 0 ; VF2IC2-NEXT: [[TMP22:%.*]] = extractvalue { <2 x i64> } [[TMP20]], 0 -; VF2IC2-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i32 1 +; VF2IC2-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i64 1 ; VF2IC2-NEXT: [[TMP24:%.*]] = insertvalue { <2 x i64> } [[TMP20]], <2 x i64> [[TMP23]], 0 ; VF2IC2-NEXT: [[TMP25:%.*]] = extractvalue { <2 x i64> } [[TMP13]], 0 ; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { <2 x i64> } [[TMP24]], 0 @@ -133,35 +133,35 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl ; 
VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3 ; VF4-NEXT: [[TMP9:%.*]] = tail call { float, float } @fn2(float [[TMP8]]) #[[ATTR1]] ; VF4-NEXT: [[TMP10:%.*]] = extractvalue { float, float } [[TMP3]], 0 -; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0 +; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 ; VF4-NEXT: [[TMP12:%.*]] = insertvalue { <4 x float>, <4 x float> } poison, <4 x float> [[TMP11]], 0 ; VF4-NEXT: [[TMP13:%.*]] = extractvalue { float, float } [[TMP3]], 1 ; VF4-NEXT: [[TMP14:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP12]], 1 -; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP13]], i32 0 +; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP13]], i64 0 ; VF4-NEXT: [[TMP16:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP12]], <4 x float> [[TMP15]], 1 ; VF4-NEXT: [[TMP17:%.*]] = extractvalue { float, float } [[TMP5]], 0 ; VF4-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP16]], 0 -; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP17]], i32 1 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP17]], i64 1 ; VF4-NEXT: [[TMP20:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP16]], <4 x float> [[TMP19]], 0 ; VF4-NEXT: [[TMP21:%.*]] = extractvalue { float, float } [[TMP5]], 1 ; VF4-NEXT: [[TMP22:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP20]], 1 -; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP21]], i32 1 +; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP21]], i64 1 ; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP20]], <4 x float> [[TMP23]], 1 ; VF4-NEXT: [[TMP25:%.*]] = extractvalue { float, float } [[TMP7]], 0 ; VF4-NEXT: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP24]], 0 -; VF4-NEXT: [[TMP27:%.*]] = 
insertelement <4 x float> [[TMP26]], float [[TMP25]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP25]], i64 2 ; VF4-NEXT: [[TMP28:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP24]], <4 x float> [[TMP27]], 0 ; VF4-NEXT: [[TMP29:%.*]] = extractvalue { float, float } [[TMP7]], 1 ; VF4-NEXT: [[TMP30:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP28]], 1 -; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP29]], i32 2 +; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP29]], i64 2 ; VF4-NEXT: [[TMP32:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP28]], <4 x float> [[TMP31]], 1 ; VF4-NEXT: [[TMP33:%.*]] = extractvalue { float, float } [[TMP9]], 0 ; VF4-NEXT: [[TMP34:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP32]], 0 -; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP33]], i32 3 +; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP33]], i64 3 ; VF4-NEXT: [[TMP36:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP32]], <4 x float> [[TMP35]], 0 ; VF4-NEXT: [[TMP37:%.*]] = extractvalue { float, float } [[TMP9]], 1 ; VF4-NEXT: [[TMP38:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP36]], 1 -; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP37]], i32 3 +; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP37]], i64 3 ; VF4-NEXT: [[TMP40:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP36]], <4 x float> [[TMP39]], 1 ; VF4-NEXT: [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 0 ; VF4-NEXT: [[TMP42:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 1 @@ -194,38 +194,38 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 ; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR1]] ; 
VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { float, float } [[TMP4]], 0 -; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 +; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 ; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP8]], 0 ; VF2IC2-NEXT: [[TMP10:%.*]] = extractvalue { float, float } [[TMP4]], 1 ; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP9]], 1 -; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP10]], i32 0 +; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP10]], i64 0 ; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP9]], <2 x float> [[TMP12]], 1 ; VF2IC2-NEXT: [[TMP14:%.*]] = extractvalue { float, float } [[TMP6]], 0 ; VF2IC2-NEXT: [[TMP15:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP13]], 0 -; VF2IC2-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i32 1 +; VF2IC2-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i64 1 ; VF2IC2-NEXT: [[TMP17:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP13]], <2 x float> [[TMP16]], 0 ; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { float, float } [[TMP6]], 1 ; VF2IC2-NEXT: [[TMP19:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP17]], 1 -; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i32 1 +; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 ; VF2IC2-NEXT: [[TMP21:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP17]], <2 x float> [[TMP20]], 1 ; VF2IC2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 ; VF2IC2-NEXT: [[TMP23:%.*]] = tail call { float, float } @fn2(float [[TMP22]]) #[[ATTR1]] ; VF2IC2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 ; VF2IC2-NEXT: [[TMP25:%.*]] = tail call { float, 
float } @fn2(float [[TMP24]]) #[[ATTR1]] ; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { float, float } [[TMP23]], 0 -; VF2IC2-NEXT: [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i64 0 ; VF2IC2-NEXT: [[TMP28:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP27]], 0 ; VF2IC2-NEXT: [[TMP29:%.*]] = extractvalue { float, float } [[TMP23]], 1 ; VF2IC2-NEXT: [[TMP30:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP28]], 1 -; VF2IC2-NEXT: [[TMP31:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i64 0 ; VF2IC2-NEXT: [[TMP32:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP28]], <2 x float> [[TMP31]], 1 ; VF2IC2-NEXT: [[TMP33:%.*]] = extractvalue { float, float } [[TMP25]], 0 ; VF2IC2-NEXT: [[TMP34:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP32]], 0 -; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x float> [[TMP34]], float [[TMP33]], i32 1 +; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x float> [[TMP34]], float [[TMP33]], i64 1 ; VF2IC2-NEXT: [[TMP36:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP32]], <2 x float> [[TMP35]], 0 ; VF2IC2-NEXT: [[TMP37:%.*]] = extractvalue { float, float } [[TMP25]], 1 ; VF2IC2-NEXT: [[TMP38:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP36]], 1 -; VF2IC2-NEXT: [[TMP39:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i32 1 +; VF2IC2-NEXT: [[TMP39:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; VF2IC2-NEXT: [[TMP40:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP36]], <2 x float> [[TMP39]], 1 ; VF2IC2-NEXT: [[TMP41:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 0 ; VF2IC2-NEXT: [[TMP42:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 0 @@ -290,51 +290,51 @@ define void @struct_return_3xi32_replicate(ptr 
noalias %in, ptr noalias writeonl ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 ; VF4-NEXT: [[TMP9:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP8]]) #[[ATTR2]] ; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 0 -; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0 +; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 ; VF4-NEXT: [[TMP12:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP11]], 0 ; VF4-NEXT: [[TMP13:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 1 ; VF4-NEXT: [[TMP14:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 1 -; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP13]], i32 0 +; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP13]], i64 0 ; VF4-NEXT: [[TMP16:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], <4 x i32> [[TMP15]], 1 ; VF4-NEXT: [[TMP17:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 2 ; VF4-NEXT: [[TMP18:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], 2 -; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP17]], i32 0 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP17]], i64 0 ; VF4-NEXT: [[TMP20:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], <4 x i32> [[TMP19]], 2 ; VF4-NEXT: [[TMP21:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 0 ; VF4-NEXT: [[TMP22:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], 0 -; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP21]], i32 1 +; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP21]], i64 1 ; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], <4 x i32> [[TMP23]], 0 ; VF4-NEXT: [[TMP25:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 1 ; VF4-NEXT: [[TMP26:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x 
i32> } [[TMP24]], 1 -; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP25]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP25]], i64 1 ; VF4-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP24]], <4 x i32> [[TMP27]], 1 ; VF4-NEXT: [[TMP29:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 2 ; VF4-NEXT: [[TMP30:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], 2 -; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP29]], i32 1 +; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP29]], i64 1 ; VF4-NEXT: [[TMP32:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], <4 x i32> [[TMP31]], 2 ; VF4-NEXT: [[TMP33:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 0 ; VF4-NEXT: [[TMP34:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], 0 -; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP33]], i32 2 +; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP33]], i64 2 ; VF4-NEXT: [[TMP36:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], <4 x i32> [[TMP35]], 0 ; VF4-NEXT: [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 1 ; VF4-NEXT: [[TMP38:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], 1 -; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP37]], i32 2 +; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP37]], i64 2 ; VF4-NEXT: [[TMP40:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], <4 x i32> [[TMP39]], 1 ; VF4-NEXT: [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 2 ; VF4-NEXT: [[TMP42:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP40]], 2 -; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP41]], i32 2 +; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP41]], i64 2 ; VF4-NEXT: [[TMP44:%.*]] = insertvalue { <4 
x i32>, <4 x i32>, <4 x i32> } [[TMP40]], <4 x i32> [[TMP43]], 2 ; VF4-NEXT: [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 0 ; VF4-NEXT: [[TMP46:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], 0 -; VF4-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP45]], i32 3 +; VF4-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP45]], i64 3 ; VF4-NEXT: [[TMP48:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], <4 x i32> [[TMP47]], 0 ; VF4-NEXT: [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 1 ; VF4-NEXT: [[TMP50:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], 1 -; VF4-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[TMP49]], i32 3 +; VF4-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[TMP49]], i64 3 ; VF4-NEXT: [[TMP52:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], <4 x i32> [[TMP51]], 1 ; VF4-NEXT: [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 2 ; VF4-NEXT: [[TMP54:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], 2 -; VF4-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP53]], i32 3 +; VF4-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP53]], i64 3 ; VF4-NEXT: [[TMP56:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], <4 x i32> [[TMP55]], 2 ; VF4-NEXT: [[TMP57:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 0 ; VF4-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]] @@ -371,54 +371,54 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 ; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP5]]) #[[ATTR2]] ; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 0 -; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; 
VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0 ; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP8]], 0 ; VF2IC2-NEXT: [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 1 ; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], 1 -; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP10]], i32 0 +; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP10]], i64 0 ; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], <2 x i32> [[TMP12]], 1 ; VF2IC2-NEXT: [[TMP14:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 2 ; VF2IC2-NEXT: [[TMP15:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], 2 -; VF2IC2-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 0 +; VF2IC2-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i64 0 ; VF2IC2-NEXT: [[TMP17:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], <2 x i32> [[TMP16]], 2 ; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 0 ; VF2IC2-NEXT: [[TMP19:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], 0 -; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i32 1 +; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i64 1 ; VF2IC2-NEXT: [[TMP21:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], <2 x i32> [[TMP20]], 0 ; VF2IC2-NEXT: [[TMP22:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 1 ; VF2IC2-NEXT: [[TMP23:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], 1 -; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP22]], i32 1 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP22]], i64 1 ; VF2IC2-NEXT: [[TMP25:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], <2 x i32> 
[[TMP24]], 1 ; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 2 ; VF2IC2-NEXT: [[TMP27:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], 2 -; VF2IC2-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[TMP26]], i64 1 ; VF2IC2-NEXT: [[TMP29:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], <2 x i32> [[TMP28]], 2 ; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 0 ; VF2IC2-NEXT: [[TMP31:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP30]]) #[[ATTR2]] ; VF2IC2-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 1 ; VF2IC2-NEXT: [[TMP33:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP32]]) #[[ATTR2]] ; VF2IC2-NEXT: [[TMP34:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 0 -; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i32 0 +; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i64 0 ; VF2IC2-NEXT: [[TMP36:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP35]], 0 ; VF2IC2-NEXT: [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 1 ; VF2IC2-NEXT: [[TMP38:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], 1 -; VF2IC2-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP37]], i32 0 +; VF2IC2-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP37]], i64 0 ; VF2IC2-NEXT: [[TMP40:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], <2 x i32> [[TMP39]], 1 ; VF2IC2-NEXT: [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 2 ; VF2IC2-NEXT: [[TMP42:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], 2 -; VF2IC2-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[TMP41]], i32 0 +; VF2IC2-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[TMP41]], i64 0 ; VF2IC2-NEXT: [[TMP44:%.*]] = 
insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], <2 x i32> [[TMP43]], 2 ; VF2IC2-NEXT: [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 0 ; VF2IC2-NEXT: [[TMP46:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], 0 -; VF2IC2-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> [[TMP46]], i32 [[TMP45]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> [[TMP46]], i32 [[TMP45]], i64 1 ; VF2IC2-NEXT: [[TMP48:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], <2 x i32> [[TMP47]], 0 ; VF2IC2-NEXT: [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 1 ; VF2IC2-NEXT: [[TMP50:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], 1 -; VF2IC2-NEXT: [[TMP51:%.*]] = insertelement <2 x i32> [[TMP50]], i32 [[TMP49]], i32 1 +; VF2IC2-NEXT: [[TMP51:%.*]] = insertelement <2 x i32> [[TMP50]], i32 [[TMP49]], i64 1 ; VF2IC2-NEXT: [[TMP52:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], <2 x i32> [[TMP51]], 1 ; VF2IC2-NEXT: [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 2 ; VF2IC2-NEXT: [[TMP54:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], 2 -; VF2IC2-NEXT: [[TMP55:%.*]] = insertelement <2 x i32> [[TMP54]], i32 [[TMP53]], i32 1 +; VF2IC2-NEXT: [[TMP55:%.*]] = insertelement <2 x i32> [[TMP54]], i32 [[TMP53]], i64 1 ; VF2IC2-NEXT: [[TMP56:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], <2 x i32> [[TMP55]], 2 ; VF2IC2-NEXT: [[TMP57:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 0 ; VF2IC2-NEXT: [[TMP58:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 0 From ab6665cb0dafa69b4221a25dc740e0e9490374b5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 25 Jun 2025 20:01:26 +0100 Subject: [PATCH 7/8] !fixup address latest comments, thanks --- .../Transforms/Vectorize/VPlanPatternMatch.h | 12 +++++-- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 33 +++++++++---------- 
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 23 ++++++------- 3 files changed, 37 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 8178f9b87a88e..efea99f22d086 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -221,12 +221,16 @@ struct Recipe_match { } bool match(const VPRecipeBase *R) const { + if (std::tuple_size::value == 0) { + assert(Opcode == VPInstruction::BuildVector && + "can only match BuildVector with empty ops"); + auto *VPI = dyn_cast(R); + return VPI && VPI->getOpcode() == VPInstruction::BuildVector; + } + if ((!matchRecipeAndOpcode(R) && ...)) return false; - auto *VPI = dyn_cast(R); - if (VPI && VPI->getOpcode() == VPInstruction::BuildVector) - return true; assert(R->getNumOperands() == std::tuple_size::value && "recipe with matched opcode does not have the expected number of " "operands"); @@ -312,6 +316,8 @@ using AllBinaryRecipe_match = BinaryRecipe_match; +/// BuildVector matches only its opcode, w/o matching its operands as the +/// number of operands is not fixed. inline ZeroOpVPInstruction_match m_BuildVector() { return ZeroOpVPInstruction_match(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3fae1ebffb703..29312ec6b36ca 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -670,17 +670,19 @@ Value *VPInstruction::generate(VPTransformState &State) { } case VPInstruction::BuildStructVector: { // For struct types, we need to build a new 'wide' struct type, where each - // element is widened, i.e. we crate a struct of vectors . + // element is widened, i.e., we create a struct of vectors.
auto *StructTy = cast(State.TypeAnalysis.inferScalarType(getOperand(0))); Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF)); - for (const auto &[Idx, Op] : enumerate(operands())) { - for (unsigned I = 0; I != StructTy->getNumElements(); I++) { - Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I); - Value *VectorValue = Builder.CreateExtractValue(Res, I); + for (const auto &[LaneIndex, Op] : enumerate(operands())) { + for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); + FieldIndex++) { + Value *ScalarValue = + Builder.CreateExtractValue(State.get(Op, true), FieldIndex); + Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex); VectorValue = - Builder.CreateInsertElement(VectorValue, ScalarValue, Idx); - Res = Builder.CreateInsertValue(Res, VectorValue, I); + Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex); + Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex); } } return Res; @@ -2727,25 +2729,22 @@ void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); if (!State.Lane) { - assert(IsSingleScalar && - "VPReplicateRecipes outside replicate regions must be unrolled"); + assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions " + "must have already been unrolled"); scalarizeInstruction(UI, this, VPLane(0), State); return; } assert((State.VF.isScalar() || !isSingleScalar()) && "uniform recipe shouldn't be predicated"); + assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); scalarizeInstruction(UI, this, *State.Lane, State); // Insert scalar instance packing it into a vector. if (State.VF.isVector() && shouldPack()) { - Value *WideValue; - // If we're constructing lane 0, initialize to start from poison. 
- if (State.Lane->isFirstLane()) { - assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); - WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF)); - } else { - WideValue = State.get(this); - } + Value *WideValue = + State.Lane->isFirstLane() + ? PoisonValue::get(VectorType::get(UI->getType(), State.VF)) + : State.get(this); State.set(this, State.packScalarIntoVectorizedValue(this, WideValue, *State.Lane)); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 1efca5ee40164..f198fbc7a0a08 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -498,24 +498,25 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { continue; VPBuilder Builder(RepR); - SmallVector LaneDefs; - // Stores to invariant addresses need to store the last lane only. - if (isa(RepR->getUnderlyingInstr()) && - vputils::isSingleScalar(RepR->getOperand(1))) { - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF)); + if (RepR->getNumUsers() == 0) { + if (isa(RepR->getUnderlyingInstr()) && + vputils::isSingleScalar(RepR->getOperand(1))) { + // Stores to invariant addresses need to store the last lane only. + cloneForLane(Plan, Builder, IdxTy, RepR, + VPLane::getLastLaneForVF(VF)); + } else { + // Create single-scalar version of RepR for all lanes. + for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)); + } RepR->eraseFromParent(); continue; } - /// Create single-scalar version of RepR for all lanes. + SmallVector LaneDefs; for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); - if (RepR->getNumUsers() == 0) { - RepR->eraseFromParent(); - continue; - } - /// Users that only demand the first lane can use the definition for lane /// 0. 
RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) { From ae3e3c43bc51c00b1bf8336c626a41b54d9d58af Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 26 Jun 2025 09:55:13 +0100 Subject: [PATCH 8/8] !fixup fix formatting --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 29312ec6b36ca..f51ed706efd88 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -552,8 +552,9 @@ Value *VPInstruction::generate(VPTransformState &State) { case Instruction::ExtractElement: { assert(State.VF.isVector() && "Only extract elements from vectors"); if (getOperand(1)->isLiveIn()) { - unsigned IdxToExtract = cast(getOperand(1)->getLiveInIRValue())->getZExtValue(); - return State.get(getOperand(0), VPLane(IdxToExtract)); + unsigned IdxToExtract = + cast(getOperand(1)->getLiveInIRValue())->getZExtValue(); + return State.get(getOperand(0), VPLane(IdxToExtract)); } Value *Vec = State.get(getOperand(0)); Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);