Skip to content

Commit 133f50a

Browse files
committed
!fixup Address latest comments, thanks
1 parent 14f2feb commit 133f50a

File tree

6 files changed

+61
-43
lines changed

6 files changed

+61
-43
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,6 +1012,11 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10121012
/// Creates a fixed-width vector containing all operands. The number of
10131013
/// operands matches the vector element count.
10141014
BuildVector,
1015+
/// Extracts all lanes from its (non-scalable) vector operand. This is an
1016+
/// abstract VPInstruction whose single defined VPValue represents VF
1017+
/// scalars extracted from a vector, to be replaced by VF ExtractElement
1018+
/// VPInstructions.
1019+
Unpack,
10151020
/// Compute the final result of an AnyOf reduction with select(cmp(),x,y),
10161021
/// where one of (x,y) is loop invariant, and both x and y are integer type.
10171022
ComputeAnyOfResult,
@@ -1064,11 +1069,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10641069
ResumeForEpilogue,
10651070
/// Returns the value for vscale.
10661071
VScale,
1067-
/// Extracts all lanes from its (non-scalable) vector operand. This is an
1068-
/// abstract VPInstruction whose single defined VPValue represents VF
1069-
/// scalars extracted from a vector, to be replaced by VF ExtractElement
1070-
/// VPInstructions.
1071-
Unpack,
10721072
};
10731073

10741074
/// Returns true if this VPInstruction generates scalar values for all lanes.
@@ -3116,6 +3116,9 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
31163116
/// Returns true if this expression contains recipes that may have side
31173117
/// effects.
31183118
bool mayHaveSideEffects() const;
3119+
3120+
/// Returns true if the result of this VPExpressionRecipe is a single-scalar.
3121+
bool isSingleScalar() const;
31193122
};
31203123

31213124
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2886,6 +2886,13 @@ bool VPExpressionRecipe::mayHaveSideEffects() const {
28862886
return false;
28872887
}
28882888

2889+
bool VPExpressionRecipe::isSingleScalar() const {
2890+
// Cannot use vputils::isSingleScalar(), because all external operands
2891+
// of the expression will be live-ins while bundled.
2892+
return isa<VPReductionRecipe>(ExpressionRecipes.back()) &&
2893+
!isa<VPPartialReductionRecipe>(ExpressionRecipes.back());
2894+
}
2895+
28892896
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
28902897

28912898
void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3795,7 +3795,10 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
37953795
}
37963796
}
37973797

3798-
// Create explicit VPInstructions to convert vectors to scalars.
3798+
// Create explicit VPInstructions to convert vectors to scalars. The current
3799+
// implementation is conservative - it may miss some cases that may or may not
3800+
// be vector values. TODO: introduce Unpacks speculatively - remove them later
3801+
// if they are known to operate on scalar values.
37993802
for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
38003803
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
38013804
if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
@@ -3810,11 +3813,10 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
38103813
if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def))
38113814
continue;
38123815

3813-
// At the moment, we only create unpacks for scalar users outside
3814-
// replicate regions. Recipes inside replicate regions still manually
3815-
// extract the required lanes.
3816-
// TODO: Remove once replicate regions are
3817-
// unrolled completely.
3816+
// At the moment, we create unpacks only for scalar users outside
3817+
// replicate regions. Recipes inside replicate regions still extract the
3818+
// required lanes implicitly.
3819+
// TODO: Remove once replicate regions are unrolled completely.
38183820
auto IsCandidateUnpackUser = [Def](VPUser *U) {
38193821
VPRegionBlock *ParentRegion =
38203822
cast<VPRecipeBase>(U)->getParent()->getParent();

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -325,9 +325,9 @@ struct VPlanTransforms {
325325
static void materializeBackedgeTakenCount(VPlan &Plan,
326326
VPBasicBlock *VectorPH);
327327

328-
/// Add explicit Build[Struct]Vector recipes that combine multiple scalar
329-
/// values into single vectors and Unpack recipes to extract scalars from a
330-
/// vector as needed.
328+
/// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
329+
/// into vectors and Unpack recipes to extract scalars from vectors as
330+
/// needed.
331331
static void materializePacksAndUnpacks(VPlan &Plan);
332332

333333
/// Materialize VF and VFxUF to be computed explicitly using VPInstructions.

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ inline bool isSingleScalar(const VPValue *VPV) {
8484
return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
8585
(PreservesUniformity(VPI->getOpcode()) &&
8686
all_of(VPI->operands(), isSingleScalar));
87+
if (isa<VPPartialReductionRecipe>(VPV))
88+
return false;
89+
if (isa<VPReductionRecipe>(VPV))
90+
return true;
91+
if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV))
92+
return Expr->isSingleScalar();
8793

8894
// VPExpandSCEVRecipes must be placed in the entry and are always uniform.
8995
return isa<VPExpandSCEVRecipe>(VPV);

llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -693,20 +693,20 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %
693693
; I32-NEXT: [[TMP76:%.*]] = insertelement <8 x ptr> [[TMP75]], ptr [[TMP68]], i32 7
694694
; I32-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP51]], <8 x ptr> [[TMP76]], <8 x ptr> [[BROADCAST_SPLAT2]]
695695
; I32-NEXT: [[TMP77:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 0
696-
; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4
697696
; I32-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1
698-
; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4
699697
; I32-NEXT: [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2
700-
; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4
701698
; I32-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3
702-
; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4
703699
; I32-NEXT: [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4
704-
; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4
705700
; I32-NEXT: [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5
706-
; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4
707701
; I32-NEXT: [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6
708-
; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4
709702
; I32-NEXT: [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7
703+
; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4
704+
; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4
705+
; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4
706+
; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4
707+
; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4
708+
; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4
709+
; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4
710710
; I32-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP91]], align 4
711711
; I32-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
712712
; I32-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
@@ -847,32 +847,32 @@ define void @address_use_in_different_block(ptr noalias %dst, ptr %src.0, ptr %s
847847
; I64-NEXT: [[TMP70:%.*]] = insertelement <2 x double> poison, double [[TMP68]], i32 0
848848
; I64-NEXT: [[TMP71:%.*]] = insertelement <2 x double> [[TMP70]], double [[TMP69]], i32 1
849849
; I64-NEXT: [[TMP72:%.*]] = fsub <2 x double> zeroinitializer, [[TMP59]]
850-
; I64-NEXT: [[TMP73:%.*]] = fsub <2 x double> zeroinitializer, [[TMP63]]
851-
; I64-NEXT: [[TMP74:%.*]] = fsub <2 x double> zeroinitializer, [[TMP67]]
852-
; I64-NEXT: [[TMP75:%.*]] = fsub <2 x double> zeroinitializer, [[TMP71]]
853-
; I64-NEXT: [[TMP76:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP8]]
854-
; I64-NEXT: [[TMP77:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP9]]
855-
; I64-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]]
856-
; I64-NEXT: [[TMP79:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]]
857-
; I64-NEXT: [[TMP80:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]]
858-
; I64-NEXT: [[TMP81:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]]
859-
; I64-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]]
860-
; I64-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]]
861850
; I64-NEXT: [[TMP84:%.*]] = extractelement <2 x double> [[TMP72]], i32 0
862-
; I64-NEXT: store double [[TMP84]], ptr [[TMP76]], align 8
863851
; I64-NEXT: [[TMP85:%.*]] = extractelement <2 x double> [[TMP72]], i32 1
864-
; I64-NEXT: store double [[TMP85]], ptr [[TMP77]], align 8
852+
; I64-NEXT: [[TMP73:%.*]] = fsub <2 x double> zeroinitializer, [[TMP63]]
865853
; I64-NEXT: [[TMP86:%.*]] = extractelement <2 x double> [[TMP73]], i32 0
866-
; I64-NEXT: store double [[TMP86]], ptr [[TMP78]], align 8
867854
; I64-NEXT: [[TMP87:%.*]] = extractelement <2 x double> [[TMP73]], i32 1
868-
; I64-NEXT: store double [[TMP87]], ptr [[TMP79]], align 8
855+
; I64-NEXT: [[TMP74:%.*]] = fsub <2 x double> zeroinitializer, [[TMP67]]
869856
; I64-NEXT: [[TMP88:%.*]] = extractelement <2 x double> [[TMP74]], i32 0
870-
; I64-NEXT: store double [[TMP88]], ptr [[TMP80]], align 8
871857
; I64-NEXT: [[TMP89:%.*]] = extractelement <2 x double> [[TMP74]], i32 1
872-
; I64-NEXT: store double [[TMP89]], ptr [[TMP81]], align 8
858+
; I64-NEXT: [[TMP75:%.*]] = fsub <2 x double> zeroinitializer, [[TMP71]]
873859
; I64-NEXT: [[TMP90:%.*]] = extractelement <2 x double> [[TMP75]], i32 0
874-
; I64-NEXT: store double [[TMP90]], ptr [[TMP82]], align 8
875860
; I64-NEXT: [[TMP91:%.*]] = extractelement <2 x double> [[TMP75]], i32 1
861+
; I64-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP8]]
862+
; I64-NEXT: [[TMP94:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP9]]
863+
; I64-NEXT: [[TMP95:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]]
864+
; I64-NEXT: [[TMP96:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]]
865+
; I64-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]]
866+
; I64-NEXT: [[TMP98:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]]
867+
; I64-NEXT: [[TMP99:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]]
868+
; I64-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]]
869+
; I64-NEXT: store double [[TMP84]], ptr [[TMP93]], align 8
870+
; I64-NEXT: store double [[TMP85]], ptr [[TMP94]], align 8
871+
; I64-NEXT: store double [[TMP86]], ptr [[TMP95]], align 8
872+
; I64-NEXT: store double [[TMP87]], ptr [[TMP96]], align 8
873+
; I64-NEXT: store double [[TMP88]], ptr [[TMP97]], align 8
874+
; I64-NEXT: store double [[TMP89]], ptr [[TMP98]], align 8
875+
; I64-NEXT: store double [[TMP90]], ptr [[TMP99]], align 8
876876
; I64-NEXT: store double [[TMP91]], ptr [[TMP83]], align 8
877877
; I64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
878878
; I64-NEXT: [[TMP92:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
@@ -928,17 +928,17 @@ define void @address_use_in_different_block(ptr noalias %dst, ptr %src.0, ptr %s
928928
; I32-NEXT: [[TMP34:%.*]] = insertelement <4 x double> [[TMP33]], double [[TMP30]], i32 2
929929
; I32-NEXT: [[TMP35:%.*]] = insertelement <4 x double> [[TMP34]], double [[TMP31]], i32 3
930930
; I32-NEXT: [[TMP36:%.*]] = fsub <4 x double> zeroinitializer, [[TMP35]]
931+
; I32-NEXT: [[TMP41:%.*]] = extractelement <4 x double> [[TMP36]], i32 0
932+
; I32-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP36]], i32 1
933+
; I32-NEXT: [[TMP43:%.*]] = extractelement <4 x double> [[TMP36]], i32 2
934+
; I32-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP36]], i32 3
931935
; I32-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]]
932936
; I32-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP5]]
933937
; I32-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP6]]
934938
; I32-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP7]]
935-
; I32-NEXT: [[TMP41:%.*]] = extractelement <4 x double> [[TMP36]], i32 0
936939
; I32-NEXT: store double [[TMP41]], ptr [[TMP37]], align 8
937-
; I32-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP36]], i32 1
938940
; I32-NEXT: store double [[TMP42]], ptr [[TMP38]], align 8
939-
; I32-NEXT: [[TMP43:%.*]] = extractelement <4 x double> [[TMP36]], i32 2
940941
; I32-NEXT: store double [[TMP43]], ptr [[TMP39]], align 8
941-
; I32-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP36]], i32 3
942942
; I32-NEXT: store double [[TMP44]], ptr [[TMP40]], align 8
943943
; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
944944
; I32-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100

0 commit comments

Comments
 (0)