Skip to content

[LSR] Only apply postincrement discount on address uses #149341

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1420,8 +1420,9 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
}

unsigned LoopCost = 1;
if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
if (LU.Kind == LSRUse::Address &&
(TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType()))) {
const SCEV *Start;
const SCEVConstant *Step;
if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
Expand Down
70 changes: 37 additions & 33 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,77 +6,81 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: subs.w r9, r1, #1
; CHECK-NEXT: beq .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
; CHECK-NEXT: and r8, r9, #3
; CHECK-NEXT: and r6, r9, #3
; CHECK-NEXT: subs r7, r1, #2
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: b .LBB0_6
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: cbnz r6, .LBB0_7
; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .LBB0_4: @ %while.body.preheader.new
; CHECK-NEXT: bic r7, r9, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: str r6, [sp] @ 4-byte Spill
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: movs r7, #4
; CHECK-NEXT: .LBB0_5: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r10, [r0, #16]!
; CHECK-NEXT: sub.w r9, r9, #4
; CHECK-NEXT: ldrd r5, r4, [r0, #-12]
; CHECK-NEXT: ldr r11, [r0, #-4]
; CHECK-NEXT: ldr r11, [r0, #16]!
; CHECK-NEXT: ldrd r5, r7, [r0, #-12]
; CHECK-NEXT: ldr r4, [r0, #-4]
; CHECK-NEXT: cmp r12, r5
; CHECK-NEXT: it gt
; CHECK-NEXT: subgt r6, r7, #3
; CHECK-NEXT: csel r5, r5, r12, gt
; CHECK-NEXT: cmp r5, r4
; CHECK-NEXT: csinc r6, r10, r8, le
; CHECK-NEXT: cmp r5, r7
; CHECK-NEXT: it gt
; CHECK-NEXT: subgt r6, r7, #2
; CHECK-NEXT: csel r5, r4, r5, gt
; CHECK-NEXT: cmp r5, r11
; CHECK-NEXT: addgt.w r6, r8, #2
; CHECK-NEXT: csel r7, r7, r5, gt
; CHECK-NEXT: cmp r7, r4
; CHECK-NEXT: it gt
; CHECK-NEXT: subgt r6, r7, #1
; CHECK-NEXT: csel r5, r11, r5, gt
; CHECK-NEXT: cmp r5, r10
; CHECK-NEXT: csel r6, r7, r6, gt
; CHECK-NEXT: add.w r7, r7, #4
; CHECK-NEXT: csel r12, r10, r5, gt
; CHECK-NEXT: addgt.w r6, r8, #3
; CHECK-NEXT: csel r7, r4, r7, gt
; CHECK-NEXT: add.w r8, r8, #4
; CHECK-NEXT: cmp r7, r11
; CHECK-NEXT: csel r10, r8, r6, gt
; CHECK-NEXT: csel r12, r11, r7, gt
; CHECK-NEXT: le lr, .LBB0_5
; CHECK-NEXT: .LBB0_6: @ %while.end.loopexit.unr-lcssa
; CHECK-NEXT: cmp.w r8, #0
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.7: @ %while.body.epil
; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit
; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload
; CHECK-NEXT: sub.w r9, r9, r8
; CHECK-NEXT: cbz r6, .LBB0_10
; CHECK-NEXT: .LBB0_7: @ %while.body.epil
; CHECK-NEXT: ldr r7, [r0, #4]
; CHECK-NEXT: sub.w r1, r1, r9
; CHECK-NEXT: cmp r12, r7
; CHECK-NEXT: csel r6, r1, r6, gt
; CHECK-NEXT: csel r10, r1, r10, gt
; CHECK-NEXT: csel r12, r7, r12, gt
; CHECK-NEXT: cmp.w r8, #1
; CHECK-NEXT: cmp r6, #1
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.8: @ %while.body.epil.1
; CHECK-NEXT: ldr r7, [r0, #8]
; CHECK-NEXT: cmp r12, r7
; CHECK-NEXT: csinc r6, r6, r1, le
; CHECK-NEXT: csinc r10, r10, r1, le
; CHECK-NEXT: csel r12, r7, r12, gt
; CHECK-NEXT: cmp.w r8, #2
; CHECK-NEXT: cmp r6, #2
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.9: @ %while.body.epil.2
; CHECK-NEXT: ldr r0, [r0, #12]
; CHECK-NEXT: cmp r12, r0
; CHECK-NEXT: it gt
; CHECK-NEXT: addgt r6, r1, #2
; CHECK-NEXT: addgt.w r10, r1, #2
; CHECK-NEXT: csel r12, r0, r12, gt
; CHECK-NEXT: .LBB0_10: @ %while.end
; CHECK-NEXT: str.w r12, [r2]
; CHECK-NEXT: str r6, [r3]
; CHECK-NEXT: str.w r10, [r3]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%0 = load i32, ptr %pSrc, align 4
Expand Down
130 changes: 74 additions & 56 deletions llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
Original file line number Diff line number Diff line change
@@ -1,24 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; FIXME: Loop strength reduction makes suboptimal choices here due to the
; isLSRCostLess function preferring to minimise the number of addrecs even
; when it increases the total number of adds.

Comment on lines +4 to +7
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll be trying to fix this by adding ARMTTIImpl::isLSRCostLess, which would use the Insns part of the cost (like AArch64TTIImpl::isLSRCostLess). However, that causes other problems: something odd in the low overhead loops pass prevents tail predication from happening, and loop strength reduction calculates NumBaseAdds incorrectly when multiple parts of a formula are loop-invariant. So I may not be able to.

define void @ptr_iv_v4i32(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i32 %y) {
; CHECK-LABEL: ptr_iv_v4i32:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r3, .LCPI0_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: add.w r4, r0, r12
; CHECK-NEXT: add.w r3, r1, r12
; CHECK-NEXT: vldrw.u32 q1, [r4, q0, uxtw #2]
; CHECK-NEXT: add.w r12, r12, #64
; CHECK-NEXT: vadd.i32 q1, q1, r2
; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: vstrw.32 q1, [r3, q0, uxtw #2]
; CHECK-NEXT: le lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
Expand Down Expand Up @@ -110,21 +116,23 @@ end:
define void @ptr_iv_v8i16(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i16 %y) {
; CHECK-LABEL: ptr_iv_v8i16:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r3, .LCPI2_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: add.w r4, r0, r12
; CHECK-NEXT: add.w r3, r1, r12
; CHECK-NEXT: vldrh.u16 q1, [r4, q0, uxtw #1]
; CHECK-NEXT: add.w r12, r12, #64
; CHECK-NEXT: vadd.i16 q1, q1, r2
; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: vstrh.16 q1, [r3, q0, uxtw #1]
; CHECK-NEXT: le lr, .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI2_0:
Expand Down Expand Up @@ -164,23 +172,25 @@ end:
define void @ptr_iv_v8i16_mult(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i16 %y) {
; CHECK-LABEL: ptr_iv_v8i16_mult:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr.w r12, .LCPI3_0
; CHECK-NEXT: adr r3, .LCPI3_1
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vldrw.u32 q1, [r12]
; CHECK-NEXT: adr r4, .LCPI3_1
; CHECK-NEXT: vldrw.u32 q0, [r12]
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q2, [r0, q0]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: adds r4, r0, r3
; CHECK-NEXT: add.w r12, r1, r3
; CHECK-NEXT: vldrh.u16 q2, [r4, q1]
; CHECK-NEXT: adds r3, #64
; CHECK-NEXT: vadd.i16 q2, q2, r2
; CHECK-NEXT: vstrh.16 q2, [r1, q1]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: vstrh.16 q2, [r12, q0]
; CHECK-NEXT: le lr, .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI3_0:
Expand Down Expand Up @@ -230,21 +240,23 @@ end:
define void @ptr_iv_v16i8(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i8 %y) {
; CHECK-LABEL: ptr_iv_v16i8:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r3, .LCPI4_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q1, [r0, q0]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: add.w r4, r0, r12
; CHECK-NEXT: add.w r3, r1, r12
; CHECK-NEXT: vldrb.u8 q1, [r4, q0]
; CHECK-NEXT: add.w r12, r12, #64
; CHECK-NEXT: vadd.i8 q1, q1, r2
; CHECK-NEXT: vstrb.8 q1, [r1, q0]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: vstrb.8 q1, [r3, q0]
; CHECK-NEXT: le lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI4_0:
Expand Down Expand Up @@ -292,23 +304,25 @@ end:
define void @ptr_iv_v16i8_mult(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i8 %y) {
; CHECK-LABEL: ptr_iv_v16i8_mult:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr.w r12, .LCPI5_0
; CHECK-NEXT: adr r3, .LCPI5_1
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vldrw.u32 q1, [r12]
; CHECK-NEXT: adr r4, .LCPI5_1
; CHECK-NEXT: vldrw.u32 q0, [r12]
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q2, [r0, q0]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: adds r4, r0, r3
; CHECK-NEXT: add.w r12, r1, r3
; CHECK-NEXT: vldrb.u8 q2, [r4, q1]
; CHECK-NEXT: adds r3, #64
; CHECK-NEXT: vadd.i8 q2, q2, r2
; CHECK-NEXT: vstrb.8 q2, [r1, q1]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: vstrb.8 q2, [r12, q0]
; CHECK-NEXT: le lr, .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI5_0:
Expand Down Expand Up @@ -374,21 +388,23 @@ end:
define void @ptr_iv_v4f32(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, float %y) {
; CHECK-LABEL: ptr_iv_v4f32:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r3, .LCPI6_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: add.w r4, r0, r12
; CHECK-NEXT: add.w r3, r1, r12
; CHECK-NEXT: vldrw.u32 q1, [r4, q0, uxtw #2]
; CHECK-NEXT: add.w r12, r12, #64
; CHECK-NEXT: vadd.f32 q1, q1, r2
; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: vstrw.32 q1, [r3, q0, uxtw #2]
; CHECK-NEXT: le lr, .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI6_0:
Expand Down Expand Up @@ -485,16 +501,18 @@ define void @ptr_iv_v8f16(ptr noalias nocapture readonly %A, ptr noalias nocaptu
; CHECK-NEXT: vmov s0, r2
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: adr r3, .LCPI8_0
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: adr r2, .LCPI8_0
; CHECK-NEXT: vmov.f16 r12, s0
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: vadd.f16 q1, q1, r2
; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: adds r2, r0, r3
; CHECK-NEXT: vldrh.u16 q1, [r2, q0, uxtw #1]
; CHECK-NEXT: adds r2, r1, r3
; CHECK-NEXT: adds r3, #64
; CHECK-NEXT: vadd.f16 q1, q1, r12
; CHECK-NEXT: vstrh.16 q1, [r2, q0, uxtw #1]
; CHECK-NEXT: le lr, .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
Expand Down
Loading
Loading