Skip to content

Commit

Permalink
[CodeGen] Avoid sinking vector comparisons during CodeGenPrepare
Browse files Browse the repository at this point in the history
Whilst reviewing PR #109289 and doing some analysis with various
tests involving predicated blocks I noticed that we're making
codegen and performance worse by sinking vector comparisons
multiple times into blocks. It looks like the sinkCmpExpression
in CodeGenPrepare was written for scalar comparisons where there
is only a single condition register, whereas vector comparisons
typically produce a vector result and register pressure is much
lower. Given they are likely to be more expensive than scalar
comparisons it makes sense to avoid sinking too many. The
CodeGen/SystemZ/vec-perm-14.ll test does rely upon sinking a
vector comparison so I've kept that behaviour by permitting one
sink.

Alternatively, I could also introduce a TLI hook to query the
target if this is a preferred solution?
  • Loading branch information
david-arm committed Oct 21, 2024
1 parent deba620 commit fb59ac6
Show file tree
Hide file tree
Showing 4 changed files with 296 additions and 312 deletions.
53 changes: 32 additions & 21 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1779,29 +1779,35 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
return false;

// Only insert a cmp in each block once.
DenseMap<BasicBlock *, CmpInst *> InsertedCmps;
// Collect a list of non-phis users that are in blocks that are different to
// the definition block.
BasicBlock *DefBB = Cmp->getParent();
SmallSet<User *, 4> Users;
for (auto *U : Cmp->users()) {
Instruction *User = cast<Instruction>(U);
if (isa<PHINode>(User))
continue;

bool MadeChange = false;
for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();
UI != E;) {
Use &TheUse = UI.getUse();
Instruction *User = cast<Instruction>(*UI);
if (User->getParent() == DefBB)
continue;

// Preincrement use iterator so we don't invalidate it.
++UI;
Users.insert(User);
}

// Don't bother for PHI nodes.
if (isa<PHINode>(User))
continue;
// If this is a vector comparison the result will likely not depend upon
// setting a condition register, and it's probably too expensive to sink too
// many times.
VectorType *VecType = dyn_cast<VectorType>(Cmp->getType());
if (VecType && VecType->getElementCount().isVector() && Users.size() > 1)
return false;

// Figure out which BB this cmp is used in.
BasicBlock *UserBB = User->getParent();
BasicBlock *DefBB = Cmp->getParent();
// Only insert a cmp in each block once.
DenseMap<BasicBlock *, CmpInst *> InsertedCmps;

// If this user is in the same block as the cmp, don't change the cmp.
if (UserBB == DefBB)
continue;
bool MadeChange = false;
for (auto *U : Users) {
Instruction *UI = cast<Instruction>(U);
BasicBlock *UserBB = UI->getParent();

// If we have already inserted a cmp into this block, use it.
CmpInst *&InsertedCmp = InsertedCmps[UserBB];
Expand All @@ -1816,10 +1822,15 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
InsertedCmp->setDebugLoc(Cmp->getDebugLoc());
}

// Replace a use of the cmp with a use of the new cmp.
TheUse = InsertedCmp;
// Replace all uses of the cmp with a use of the new cmp and update the
// number of uses.
for (unsigned I = 0; I < U->getNumOperands(); I++)
if (U->getOperand(I) == Cmp) {
U->setOperand(I, InsertedCmp);
NumCmpUses++;
}

MadeChange = true;
++NumCmpUses;
}

// If we removed all uses, nuke the cmp.
Expand Down
82 changes: 39 additions & 43 deletions llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,68 +6,64 @@ target triple = "aarch64-unknown-linux-gnu"
define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
; CHECK-LABEL: vector_loop_with_icmp:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w8, #15 // =0xf
; CHECK-NEXT: mov w9, #15 // =0xf
; CHECK-NEXT: mov w10, #4 // =0x4
; CHECK-NEXT: adrp x9, .LCPI0_0
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: adrp x11, .LCPI0_1
; CHECK-NEXT: dup v0.2d, x8
; CHECK-NEXT: dup v0.2d, x9
; CHECK-NEXT: dup v1.2d, x10
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_0]
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI0_1]
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: mov w10, #16 // =0x10
; CHECK-NEXT: mov w11, #1 // =0x1
; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: mov w10, #1 // =0x1
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_1: // %pred.store.continue18
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
; CHECK-NEXT: add v3.2d, v3.2d, v1.2d
; CHECK-NEXT: subs x10, x10, #4
; CHECK-NEXT: add x9, x9, #16
; CHECK-NEXT: subs x9, x9, #4
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: b.eq .LBB0_10
; CHECK-NEXT: .LBB0_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmhi v4.2d, v0.2d, v3.2d
; CHECK-NEXT: xtn v4.2s, v4.2d
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
; CHECK-NEXT: umov w12, v4.h[0]
; CHECK-NEXT: tbz w12, #0, .LBB0_4
; CHECK-NEXT: // %bb.3: // %pred.store.if
; CHECK-NEXT: cmhi v4.2d, v0.2d, v2.2d
; CHECK-NEXT: cmhi v5.2d, v0.2d, v3.2d
; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
; CHECK-NEXT: xtn v4.4h, v4.4s
; CHECK-NEXT: umov w11, v4.h[0]
; CHECK-NEXT: tbnz w11, #0, .LBB0_6
; CHECK-NEXT: // %bb.3: // %pred.store.continue
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: stur w11, [x9, #-8]
; CHECK-NEXT: .LBB0_4: // %pred.store.continue
; CHECK-NEXT: umov w11, v4.h[1]
; CHECK-NEXT: tbnz w11, #0, .LBB0_7
; CHECK-NEXT: .LBB0_4: // %pred.store.continue6
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: dup v4.2d, x8
; CHECK-NEXT: cmhi v4.2d, v4.2d, v3.2d
; CHECK-NEXT: xtn v4.2s, v4.2d
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
; CHECK-NEXT: umov w12, v4.h[1]
; CHECK-NEXT: tbz w12, #0, .LBB0_6
; CHECK-NEXT: // %bb.5: // %pred.store.if5
; CHECK-NEXT: umov w11, v4.h[2]
; CHECK-NEXT: tbnz w11, #0, .LBB0_8
; CHECK-NEXT: .LBB0_5: // %pred.store.continue8
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: stur w11, [x9, #-4]
; CHECK-NEXT: .LBB0_6: // %pred.store.continue6
; CHECK-NEXT: umov w11, v4.h[3]
; CHECK-NEXT: tbz w11, #0, .LBB0_1
; CHECK-NEXT: b .LBB0_9
; CHECK-NEXT: .LBB0_6: // %pred.store.if
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: dup v4.2d, x8
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
; CHECK-NEXT: xtn v4.2s, v4.2d
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
; CHECK-NEXT: umov w12, v4.h[2]
; CHECK-NEXT: tbz w12, #0, .LBB0_8
; CHECK-NEXT: // %bb.7: // %pred.store.if7
; CHECK-NEXT: stur w10, [x8, #-8]
; CHECK-NEXT: umov w11, v4.h[1]
; CHECK-NEXT: tbz w11, #0, .LBB0_4
; CHECK-NEXT: .LBB0_7: // %pred.store.if5
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: str w11, [x9]
; CHECK-NEXT: .LBB0_8: // %pred.store.continue8
; CHECK-NEXT: stur w10, [x8, #-4]
; CHECK-NEXT: umov w11, v4.h[2]
; CHECK-NEXT: tbz w11, #0, .LBB0_5
; CHECK-NEXT: .LBB0_8: // %pred.store.if7
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: dup v4.2d, x8
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
; CHECK-NEXT: xtn v4.2s, v4.2d
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
; CHECK-NEXT: umov w12, v4.h[3]
; CHECK-NEXT: tbz w12, #0, .LBB0_1
; CHECK-NEXT: // %bb.9: // %pred.store.if9
; CHECK-NEXT: str w10, [x8]
; CHECK-NEXT: umov w11, v4.h[3]
; CHECK-NEXT: tbz w11, #0, .LBB0_1
; CHECK-NEXT: .LBB0_9: // %pred.store.if9
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: str w11, [x9, #4]
; CHECK-NEXT: str w10, [x8, #4]
; CHECK-NEXT: b .LBB0_1
; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup
; CHECK-NEXT: ret
Expand Down
Loading

0 comments on commit fb59ac6

Please sign in to comment.