diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 5ab31a687ec5e..3590522e0e4d1 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -497,10 +497,10 @@ class TargetLoweringBase {
     return true;
   }
 
-  /// Return true if multiple condition registers are available.
-  bool hasMultipleConditionRegisters() const {
-    return HasMultipleConditionRegisters;
-  }
+  /// Return true if multiple (allocatable) predicate registers are available
+  /// for \p VT. If there is only a single register, the code generator will
+  /// sink comparisons into the blocks of their users.
+  virtual bool hasMultiplePredicateRegisters(EVT VT) const { return false; }
 
   /// Return true if the target has BitExtract instructions.
   bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2389,7 +2389,7 @@ class TargetLoweringBase {
                                                EVT VT) const {
     // If a target has multiple condition registers, then it likely has logical
     // operations on those registers.
-    if (hasMultipleConditionRegisters())
+    if (hasMultiplePredicateRegisters(VT))
       return false;
     // Only do the transform if the value won't be split into multiple
     // registers.
@@ -2496,15 +2496,6 @@ class TargetLoweringBase {
     StackPointerRegisterToSaveRestore = R;
   }
 
-  /// Tells the code generator that the target has multiple (allocatable)
-  /// condition registers that can be used to store the results of comparisons
-  /// for use by selects and conditional branches. With multiple condition
-  /// registers, the code generator will not aggressively sink comparisons into
-  /// the blocks of their users.
-  void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
-    HasMultipleConditionRegisters = hasManyRegs;
-  }
-
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
@@ -3470,13 +3461,6 @@ class TargetLoweringBase {
 private:
   const TargetMachine &TM;
 
-  /// Tells the code generator that the target has multiple (allocatable)
-  /// condition registers that can be used to store the results of comparisons
-  /// for use by selects and conditional branches. With multiple condition
-  /// registers, the code generator will not aggressively sink comparisons into
-  /// the blocks of their users.
-  bool HasMultipleConditionRegisters;
-
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
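Note for downstream targets: the setter is gone, so a target that used to call setHasMultipleConditionRegisters(true) in its constructor now overrides the query instead, and can refine the answer per type. A minimal sketch, assuming a hypothetical out-of-tree MyTargetLowering (the class and its VT.isVector() policy are illustrative, not part of this patch):

  #include "llvm/CodeGen/TargetLowering.h"

  // Hypothetical migration sketch, not part of this patch.
  class MyTargetLowering : public llvm::TargetLowering {
  public:
    using llvm::TargetLowering::TargetLowering;

    // Previously: setHasMultipleConditionRegisters(true) in the constructor.
    // Now: answer the query directly, optionally keyed on the type.
    bool hasMultiplePredicateRegisters(llvm::EVT VT) const override {
      return VT.isVector(); // e.g. only vector predicates are plentiful
    }
  };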
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 86f28293ba9ff..e8ab4011b86a2 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1771,8 +1771,10 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
 /// lose; some adjustment may be wanted there.
 ///
 /// Return true if any changes are made.
-static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
-  if (TLI.hasMultipleConditionRegisters())
+static bool sinkCmpExpression(const DataLayout &DL, CmpInst *Cmp,
+                              const TargetLowering &TLI) {
+  EVT ResVT = TLI.getValueType(DL, Cmp->getType());
+  if (TLI.hasMultiplePredicateRegisters(ResVT))
     return false;
 
   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
@@ -2137,7 +2139,7 @@ static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
 }
 
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
-  if (sinkCmpExpression(Cmp, *TLI))
+  if (sinkCmpExpression(*DL, Cmp, *TLI))
     return true;
 
   if (combineToUAddWithOverflow(Cmp, ModifiedDT))
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 7a28f7892cbf3..c8dc633cb6753 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -625,7 +625,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
   MaxGluedStoresPerMemcpy = 0;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
       MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
-  HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index cf2ae5fd027c7..f303c26bd0827 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1358,6 +1358,10 @@ class AArch64TargetLowering : public TargetLowering {
   unsigned getMinimumJumpTableEntries() const override;
 
   bool softPromoteHalfType() const override { return true; }
+
+  bool hasMultiplePredicateRegisters(EVT VT) const override {
+    return VT.isVector();
+  }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0f65df0763cc8..ec0353feb3dc7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -583,14 +583,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
 
-  // FIXME: This is only partially true. If we have to do vector compares, any
-  // SGPR pair can be a condition register. If we have a uniform condition, we
-  // are better off doing SALU operations, where there is only one SCC. For now,
-  // we don't have a way of knowing during instruction selection if a condition
-  // will be uniform and we always use vector compares. Assume we are using
-  // vector compares until that is fixed.
-  setHasMultipleConditionRegisters(true);
-
   setMinCmpXchgSizeInBits(32);
   setSupportsUnalignedAtomics(false);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index b2fd31cb2346e..166a9099a0d47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,6 +387,16 @@ class AMDGPUTargetLowering : public TargetLowering {
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+
+  bool hasMultiplePredicateRegisters(EVT VT) const override {
+    // FIXME: This is only partially true. If we have to do vector compares,
+    // any SGPR pair can be a condition register. If we have a uniform
+    // condition, we are better off doing SALU operations, where there is only
+    // one SCC. For now, we don't have a way of knowing during instruction
+    // selection if a condition will be uniform and we always use vector
+    // compares. Assume we are using vector compares until that is fixed.
+    return true;
+  }
 };
 
 namespace AMDGPUISD {
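The EVT reaching the hook is derived from the comparison's result type, not its operands: an icmp on <4 x i64> is queried as v4i1. That is what lets AArch64 key its answer on VT.isVector() while AMDGPU ignores the type entirely. A condensed restatement of the call site added to sinkCmpExpression above (assumes Cmp and TLI are in scope; here the DataLayout is fetched from the module rather than passed in):

  const llvm::DataLayout &DL = Cmp->getModule()->getDataLayout();
  llvm::EVT ResVT = TLI.getValueType(DL, Cmp->getType()); // i1 or <N x i1>
  if (TLI.hasMultiplePredicateRegisters(ResVT))
    return false; // plenty of predicate registers: do not sink or duplicate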
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index cec1e507f08f2..c49e2e1ffbd12 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1454,10 +1454,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
 
   // With 32 condition bits, we don't need to sink (and duplicate) compares
   // aggressively in CodeGenPrep.
-  if (Subtarget.useCRBits()) {
-    setHasMultipleConditionRegisters();
+  if (Subtarget.useCRBits())
     setJumpIsExpensive();
-  }
 
   // TODO: The default entry number is set to 64. This stops most jump table
   // generation on PPC. But it is good for current PPC HWs because the indirect
@@ -19044,3 +19042,9 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
   return Builder.CreateOr(
       Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
 }
+
+bool PPCTargetLowering::hasMultiplePredicateRegisters(EVT VT) const {
+  // With 32 condition bits, we don't need to sink (and duplicate) compares
+  // aggressively in CodeGenPrep.
+  return Subtarget.useCRBits();
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 8907c3c5a81c3..7c6ab7bf5cf70 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1490,6 +1490,8 @@ namespace llvm {
     /// through to determine the optimal load/store instruction format.
     unsigned computeMOFlags(const SDNode *Parent, SDValue N,
                             SelectionDAG &DAG) const;
+
+    bool hasMultiplePredicateRegisters(EVT VT) const override;
   }; // end class PPCTargetLowering
 
   namespace PPC {
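On PowerPC the answer is deliberately type-independent: with CR bits enabled, any comparison result can live in one of the 32 condition-register bits, so the override reproduces the removed flag's unconditional behaviour. Spelled out as a standalone helper (illustrative only, not part of the patch):

  #include "llvm/CodeGen/ValueTypes.h"

  // What the PPC override above computes: the queried type plays no role,
  // only the subtarget's useCRBits() setting does.
  static bool ppcHasMultiplePredicateRegs(bool UseCRBits, llvm::EVT /*VT*/) {
    return UseCRBits;
  }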
diff --git a/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll b/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
index c7e80b1c3dbb6..93879d41a2543 100644
--- a/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
@@ -6,68 +6,64 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
 ; CHECK-LABEL: vector_loop_with_icmp:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #15 // =0xf
+; CHECK-NEXT:    mov w9, #15 // =0xf
 ; CHECK-NEXT:    mov w10, #4 // =0x4
-; CHECK-NEXT:    adrp x9, .LCPI0_0
+; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    adrp x11, .LCPI0_1
-; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    dup v0.2d, x9
 ; CHECK-NEXT:    dup v1.2d, x10
-; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI0_0]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT:    ldr q3, [x11, :lo12:.LCPI0_1]
-; CHECK-NEXT:    add x9, x0, #8
-; CHECK-NEXT:    mov w10, #16 // =0x10
-; CHECK-NEXT:    mov w11, #1 // =0x1
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    mov w9, #16 // =0x10
+; CHECK-NEXT:    mov w10, #1 // =0x1
 ; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: // %pred.store.continue18
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    add v2.2d, v2.2d, v1.2d
 ; CHECK-NEXT:    add v3.2d, v3.2d, v1.2d
-; CHECK-NEXT:    subs x10, x10, #4
-; CHECK-NEXT:    add x9, x9, #16
+; CHECK-NEXT:    subs x9, x9, #4
+; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    b.eq .LBB0_10
 ; CHECK-NEXT:  .LBB0_2: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    cmhi v4.2d, v0.2d, v3.2d
-; CHECK-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEXT:    uzp1 v4.4h, v4.4h, v0.4h
-; CHECK-NEXT:    umov w12, v4.h[0]
-; CHECK-NEXT:    tbz w12, #0, .LBB0_4
-; CHECK-NEXT:  // %bb.3: // %pred.store.if
+; CHECK-NEXT:    cmhi v4.2d, v0.2d, v2.2d
+; CHECK-NEXT:    cmhi v5.2d, v0.2d, v3.2d
+; CHECK-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
+; CHECK-NEXT:    xtn v4.4h, v4.4s
+; CHECK-NEXT:    umov w11, v4.h[0]
+; CHECK-NEXT:    tbnz w11, #0, .LBB0_6
+; CHECK-NEXT:  // %bb.3: // %pred.store.continue
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    stur w11, [x9, #-8]
-; CHECK-NEXT:  .LBB0_4: // %pred.store.continue
+; CHECK-NEXT:    umov w11, v4.h[1]
+; CHECK-NEXT:    tbnz w11, #0, .LBB0_7
+; CHECK-NEXT:  .LBB0_4: // %pred.store.continue6
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    dup v4.2d, x8
-; CHECK-NEXT:    cmhi v4.2d, v4.2d, v3.2d
-; CHECK-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEXT:    uzp1 v4.4h, v4.4h, v0.4h
-; CHECK-NEXT:    umov w12, v4.h[1]
-; CHECK-NEXT:    tbz w12, #0, .LBB0_6
-; CHECK-NEXT:  // %bb.5: // %pred.store.if5
+; CHECK-NEXT:    umov w11, v4.h[2]
+; CHECK-NEXT:    tbnz w11, #0, .LBB0_8
+; CHECK-NEXT:  .LBB0_5: // %pred.store.continue8
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    stur w11, [x9, #-4]
-; CHECK-NEXT:  .LBB0_6: // %pred.store.continue6
+; CHECK-NEXT:    umov w11, v4.h[3]
+; CHECK-NEXT:    tbz w11, #0, .LBB0_1
+; CHECK-NEXT:    b .LBB0_9
+; CHECK-NEXT:  .LBB0_6: // %pred.store.if
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    dup v4.2d, x8
-; CHECK-NEXT:    cmhi v4.2d, v4.2d, v2.2d
-; CHECK-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEXT:    uzp1 v4.4h, v0.4h, v4.4h
-; CHECK-NEXT:    umov w12, v4.h[2]
-; CHECK-NEXT:    tbz w12, #0, .LBB0_8
-; CHECK-NEXT:  // %bb.7: // %pred.store.if7
+; CHECK-NEXT:    stur w10, [x8, #-8]
+; CHECK-NEXT:    umov w11, v4.h[1]
+; CHECK-NEXT:    tbz w11, #0, .LBB0_4
+; CHECK-NEXT:  .LBB0_7: // %pred.store.if5
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    str w11, [x9]
-; CHECK-NEXT:  .LBB0_8: // %pred.store.continue8
+; CHECK-NEXT:    stur w10, [x8, #-4]
+; CHECK-NEXT:    umov w11, v4.h[2]
+; CHECK-NEXT:    tbz w11, #0, .LBB0_5
+; CHECK-NEXT:  .LBB0_8: // %pred.store.if7
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    dup v4.2d, x8
-; CHECK-NEXT:    cmhi v4.2d, v4.2d, v2.2d
-; CHECK-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEXT:    uzp1 v4.4h, v0.4h, v4.4h
-; CHECK-NEXT:    umov w12, v4.h[3]
-; CHECK-NEXT:    tbz w12, #0, .LBB0_1
-; CHECK-NEXT:  // %bb.9: // %pred.store.if9
+; CHECK-NEXT:    str w10, [x8]
+; CHECK-NEXT:    umov w11, v4.h[3]
+; CHECK-NEXT:    tbz w11, #0, .LBB0_1
+; CHECK-NEXT:  .LBB0_9: // %pred.store.if9
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    str w11, [x9, #4]
+; CHECK-NEXT:    str w10, [x8, #4]
 ; CHECK-NEXT:    b .LBB0_1
 ; CHECK-NEXT:  .LBB0_10: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
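The updated CHECK lines show the intended effect on AArch64: the vector compare is now materialized once per iteration (two cmhi feeding a single uzp1/xtn) and each lane is tested with umov plus tbz/tbnz, where the old code rebuilt dup/cmhi/xtn/uzp1 before every lane test. The type-level decision the test exercises, as a hypothetical helper mirroring the AArch64 override above:

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/CodeGenTypes/MachineValueType.h"

  // Sinking in CodeGenPrepare is suppressed exactly for vector predicates.
  static bool sinkSuppressed(llvm::EVT PredVT) { return PredVT.isVector(); }

  // sinkSuppressed(llvm::MVT::i1)   -> false: scalar compares sink as before.
  // sinkSuppressed(llvm::MVT::v4i1) -> true:  this test's compare stays put.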