Skip to content

Commit

Permalink
[CodeGen] Avoid sinking vector comparisons during CodeGenPrepare
Browse files Browse the repository at this point in the history
Whilst reviewing PR #109289 and doing some analysis with various
tests involving predicated blocks I noticed that we're making
codegen and performance worse by sinking vector comparisons
multiple times into blocks. It looks like the sinkCmpExpression
in CodeGenPrepare was written for scalar comparisons where there
is only a single condition register, whereas vector comparisons
typically produce a vector result. For some targets, such a NEON
or SVE, there are multiple allocatable vector registers that can
store the result and so we should avoid sinking in that case.
  • Loading branch information
david-arm committed Oct 24, 2024
1 parent 6b93bd0 commit 97fcc44
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 43 deletions.
19 changes: 19 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,11 @@ class TargetLoweringBase {
return HasMultipleConditionRegisters;
}

/// Return true if multiple vector predicate registers are available.
bool hasMultipleVectorPredicateRegisters() const {
return HasMultipleVectorPredicateRegisters;
}

/// Return true if the target has BitExtract instructions.
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }

Expand Down Expand Up @@ -2505,6 +2510,15 @@ class TargetLoweringBase {
HasMultipleConditionRegisters = hasManyRegs;
}

/// Tells the code generator that the target has multiple (allocatable)
/// vector predicate registers that can be used to store the results of
/// vector comparisons. With multiple predicate registers, the code
/// generator will not aggressively sink vector comparisons into the blocks
/// of their users.
void setHasMultipleVectorPredicateRegisters(bool hasManyRegs = true) {
HasMultipleVectorPredicateRegisters = hasManyRegs;
}

/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
Expand Down Expand Up @@ -3477,6 +3491,11 @@ class TargetLoweringBase {
/// the blocks of their users.
bool HasMultipleConditionRegisters;

/// Tells the code generator that the target has multiple (allocatable)
/// vector predicate registers that can be used to store the results of
/// vector comparisons.
bool HasMultipleVectorPredicateRegisters;

/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1775,6 +1775,13 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
if (TLI.hasMultipleConditionRegisters())
return false;

// If this is a vector comparison the result may not depend upon setting a
// condition register, and if so it's probably better not to sink.
VectorType *VecType = dyn_cast<VectorType>(Cmp->getType());
if (VecType && VecType->getElementCount().isVector() &&
TLI.hasMultipleVectorPredicateRegisters())
return false;

// Avoid sinking soft-FP comparisons, since this can move them into a loop.
if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
return false;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/CodeGen/TargetLoweringBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
HasMultipleConditionRegisters = false;
HasMultipleVectorPredicateRegisters = false;
HasExtractBitsInsn = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Compute derived properties from the register classes
computeRegisterProperties(Subtarget->getRegisterInfo());

if (Subtarget->hasNEON() || Subtarget->hasSVE())
setHasMultipleVectorPredicateRegisters(true);

// Provide all sorts of operation actions
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
Expand Down
82 changes: 39 additions & 43 deletions llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,68 +6,64 @@ target triple = "aarch64-unknown-linux-gnu"
define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
; CHECK-LABEL: vector_loop_with_icmp:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w8, #15 // =0xf
; CHECK-NEXT: mov w9, #15 // =0xf
; CHECK-NEXT: mov w10, #4 // =0x4
; CHECK-NEXT: adrp x9, .LCPI0_0
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: adrp x11, .LCPI0_1
; CHECK-NEXT: dup v0.2d, x8
; CHECK-NEXT: dup v0.2d, x9
; CHECK-NEXT: dup v1.2d, x10
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_0]
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI0_1]
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: mov w10, #16 // =0x10
; CHECK-NEXT: mov w11, #1 // =0x1
; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: mov w10, #1 // =0x1
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_1: // %pred.store.continue18
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
; CHECK-NEXT: add v3.2d, v3.2d, v1.2d
; CHECK-NEXT: subs x10, x10, #4
; CHECK-NEXT: add x9, x9, #16
; CHECK-NEXT: subs x9, x9, #4
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: b.eq .LBB0_10
; CHECK-NEXT: .LBB0_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmhi v4.2d, v0.2d, v3.2d
; CHECK-NEXT: xtn v4.2s, v4.2d
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
; CHECK-NEXT: umov w12, v4.h[0]
; CHECK-NEXT: tbz w12, #0, .LBB0_4
; CHECK-NEXT: // %bb.3: // %pred.store.if
; CHECK-NEXT: cmhi v4.2d, v0.2d, v2.2d
; CHECK-NEXT: cmhi v5.2d, v0.2d, v3.2d
; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
; CHECK-NEXT: xtn v4.4h, v4.4s
; CHECK-NEXT: umov w11, v4.h[0]
; CHECK-NEXT: tbnz w11, #0, .LBB0_6
; CHECK-NEXT: // %bb.3: // %pred.store.continue
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: stur w11, [x9, #-8]
; CHECK-NEXT: .LBB0_4: // %pred.store.continue
; CHECK-NEXT: umov w11, v4.h[1]
; CHECK-NEXT: tbnz w11, #0, .LBB0_7
; CHECK-NEXT: .LBB0_4: // %pred.store.continue6
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: dup v4.2d, x8
; CHECK-NEXT: cmhi v4.2d, v4.2d, v3.2d
; CHECK-NEXT: xtn v4.2s, v4.2d
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
; CHECK-NEXT: umov w12, v4.h[1]
; CHECK-NEXT: tbz w12, #0, .LBB0_6
; CHECK-NEXT: // %bb.5: // %pred.store.if5
; CHECK-NEXT: umov w11, v4.h[2]
; CHECK-NEXT: tbnz w11, #0, .LBB0_8
; CHECK-NEXT: .LBB0_5: // %pred.store.continue8
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: stur w11, [x9, #-4]
; CHECK-NEXT: .LBB0_6: // %pred.store.continue6
; CHECK-NEXT: umov w11, v4.h[3]
; CHECK-NEXT: tbz w11, #0, .LBB0_1
; CHECK-NEXT: b .LBB0_9
; CHECK-NEXT: .LBB0_6: // %pred.store.if
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: dup v4.2d, x8
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
; CHECK-NEXT: xtn v4.2s, v4.2d
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
; CHECK-NEXT: umov w12, v4.h[2]
; CHECK-NEXT: tbz w12, #0, .LBB0_8
; CHECK-NEXT: // %bb.7: // %pred.store.if7
; CHECK-NEXT: stur w10, [x8, #-8]
; CHECK-NEXT: umov w11, v4.h[1]
; CHECK-NEXT: tbz w11, #0, .LBB0_4
; CHECK-NEXT: .LBB0_7: // %pred.store.if5
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: str w11, [x9]
; CHECK-NEXT: .LBB0_8: // %pred.store.continue8
; CHECK-NEXT: stur w10, [x8, #-4]
; CHECK-NEXT: umov w11, v4.h[2]
; CHECK-NEXT: tbz w11, #0, .LBB0_5
; CHECK-NEXT: .LBB0_8: // %pred.store.if7
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: dup v4.2d, x8
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
; CHECK-NEXT: xtn v4.2s, v4.2d
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
; CHECK-NEXT: umov w12, v4.h[3]
; CHECK-NEXT: tbz w12, #0, .LBB0_1
; CHECK-NEXT: // %bb.9: // %pred.store.if9
; CHECK-NEXT: str w10, [x8]
; CHECK-NEXT: umov w11, v4.h[3]
; CHECK-NEXT: tbz w11, #0, .LBB0_1
; CHECK-NEXT: .LBB0_9: // %pred.store.if9
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: str w11, [x9, #4]
; CHECK-NEXT: str w10, [x8, #4]
; CHECK-NEXT: b .LBB0_1
; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup
; CHECK-NEXT: ret
Expand Down

0 comments on commit 97fcc44

Please sign in to comment.