diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8e0cdc6f1a5e7..92999a9001917 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -497,10 +497,10 @@ class TargetLoweringBase {
     return true;
   }
 
-  /// Return true if multiple condition registers are available.
-  bool hasMultipleConditionRegisters() const {
-    return HasMultipleConditionRegisters;
-  }
+  /// Return true if multiple (allocatable) predicate registers are available
+  /// for \p VT. If there is only a single register the code generator will
+  /// sink comparisons into the blocks of their users.
+  virtual bool hasMultiplePredicateRegisters(EVT VT) const { return false; }
 
   /// Return true if the target has BitExtract instructions.
   bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2389,7 +2389,7 @@ class TargetLoweringBase {
                                                EVT VT) const {
     // If a target has multiple condition registers, then it likely has logical
     // operations on those registers.
-    if (hasMultipleConditionRegisters())
+    if (hasMultiplePredicateRegisters(MVT::i1))
       return false;
     // Only do the transform if the value won't be split into multiple
     // registers.
@@ -2496,15 +2496,6 @@ class TargetLoweringBase {
     StackPointerRegisterToSaveRestore = R;
   }
 
-  /// Tells the code generator that the target has multiple (allocatable)
-  /// condition registers that can be used to store the results of comparisons
-  /// for use by selects and conditional branches. With multiple condition
-  /// registers, the code generator will not aggressively sink comparisons into
-  /// the blocks of their users.
-  void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
-    HasMultipleConditionRegisters = hasManyRegs;
-  }
-
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
@@ -3473,13 +3464,6 @@ class TargetLoweringBase {
 private:
   const TargetMachine &TM;
 
-  /// Tells the code generator that the target has multiple (allocatable)
-  /// condition registers that can be used to store the results of comparisons
-  /// for use by selects and conditional branches. With multiple condition
-  /// registers, the code generator will not aggressively sink comparisons into
-  /// the blocks of their users.
-  bool HasMultipleConditionRegisters;
-
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index f1ac3d95a8dd8..48dfe86327b08 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1770,8 +1770,10 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
 /// lose; some adjustment may be wanted there.
 ///
 /// Return true if any changes are made.
-static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
-  if (TLI.hasMultipleConditionRegisters())
+static bool sinkCmpExpression(const DataLayout &DL, CmpInst *Cmp,
+                              const TargetLowering &TLI) {
+  EVT ResVT = TLI.getValueType(DL, Cmp->getType());
+  if (TLI.hasMultiplePredicateRegisters(ResVT))
     return false;
 
   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
@@ -2176,7 +2178,7 @@ static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
 }
 
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
-  if (sinkCmpExpression(Cmp, *TLI))
+  if (sinkCmpExpression(*DL, Cmp, *TLI))
     return true;
 
   if (combineToUAddWithOverflow(Cmp, ModifiedDT))
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 5bcde0e1bbec8..a7d22ccd45175 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -625,7 +625,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
   MaxGluedStoresPerMemcpy = 0;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
       MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
-  HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d696355bb062a..f1350e1eb53f3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1346,6 +1346,10 @@ class AArch64TargetLowering : public TargetLowering {
   unsigned getMinimumJumpTableEntries() const override;
 
   bool softPromoteHalfType() const override { return true; }
+
+  virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
+    return VT.isVector();
+  }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e4b54c7d72b08..6e592e47e6a68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -583,14 +583,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
 
-  // FIXME: This is only partially true. If we have to do vector compares, any
-  // SGPR pair can be a condition register. If we have a uniform condition, we
-  // are better off doing SALU operations, where there is only one SCC. For now,
-  // we don't have a way of knowing during instruction selection if a condition
-  // will be uniform and we always use vector compares. Assume we are using
-  // vector compares until that is fixed.
-  setHasMultipleConditionRegisters(true);
-
   setMinCmpXchgSizeInBits(32);
   setSupportsUnalignedAtomics(false);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index b2fd31cb2346e..166a9099a0d47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,6 +387,16 @@ class AMDGPUTargetLowering : public TargetLowering {
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+
+  virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
+    // FIXME: This is only partially true. If we have to do vector compares,
+    // any SGPR pair can be a condition register. If we have a uniform
+    // condition, we are better off doing SALU operations, where there is only
+    // one SCC. For now, we don't have a way of knowing during instruction
+    // selection if a condition will be uniform and we always use vector
+    // compares. Assume we are using vector compares until that is fixed.
+    return true;
+  }
 };
 
 namespace AMDGPUISD {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d8f3095ed7fb6..fc4ca03eb85f7 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1461,10 +1461,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
 
   // With 32 condition bits, we don't need to sink (and duplicate) compares
   // aggressively in CodeGenPrep.
-  if (Subtarget.useCRBits()) {
-    setHasMultipleConditionRegisters();
+  if (Subtarget.useCRBits())
     setJumpIsExpensive();
-  }
 
   // TODO: The default entry number is set to 64. This stops most jump table
   // generation on PPC. But it is good for current PPC HWs because the indirect
@@ -19137,3 +19135,9 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
   return Builder.CreateOr(
       Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
 }
+
+bool PPCTargetLowering::hasMultiplePredicateRegisters(EVT VT) const {
+  // With 32 condition bits, we don't need to sink (and duplicate) compares
+  // aggressively in CodeGenPrep.
+  return Subtarget.useCRBits();
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index dde45e4cf6f4a..7f2287f659990 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1492,6 +1492,8 @@ namespace llvm {
     /// through to determine the optimal load/store instruction format.
     unsigned computeMOFlags(const SDNode *Parent, SDValue N,
                             SelectionDAG &DAG) const;
+
+    virtual bool hasMultiplePredicateRegisters(EVT VT) const override;
   }; // end class PPCTargetLowering
 
   namespace PPC {
diff --git a/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll b/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
new file mode 100644
index 0000000000000..8b94cb2414f0d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <4 x i32> @no_sink_simple(<4 x i32> %a, <4 x i32> %b, i1 %c, ptr %p) {
+; CHECK-LABEL: no_sink_simple:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v2.4h, v2.4s
+; CHECK-NEXT:    tbz w0, #0, .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %s
+; CHECK-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %t
+; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %d = icmp slt <4 x i32> %a, %b
+  br i1 %c, label %s, label %t
+
+s:
+  %s1 = select <4 x i1> %d, <4 x i32> %a, <4 x i32> zeroinitializer
+  store <4 x i32> %s1, ptr %p
+  ret <4 x i32> %s1
+
+t:
+  %s2 = select <4 x i1> %d, <4 x i32> %b, <4 x i32> zeroinitializer
+  ret <4 x i32> %s2
+}
+
+define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w9, #15 // =0xf
+; CHECK-NEXT:    mov w10, #4 // =0x4
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    adrp x11, .LCPI1_1
+; CHECK-NEXT:    dup v0.2d, x9
+; CHECK-NEXT:    dup v1.2d, x10
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    ldr q3, [x11, :lo12:.LCPI1_1]
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    mov w9, #16 // =0x10
+; CHECK-NEXT:    mov w10, #1 // =0x1
+; CHECK-NEXT:    b .LBB1_2
+; CHECK-NEXT:  .LBB1_1: // %pred.store.continue18
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    add v2.2d, v2.2d, v1.2d
+; CHECK-NEXT:    add v3.2d, v3.2d, v1.2d
+; CHECK-NEXT:    subs x9, x9, #4
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.eq .LBB1_10
+; CHECK-NEXT:  .LBB1_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmhi v4.2d, v0.2d, v2.2d
+; CHECK-NEXT:    cmhi v5.2d, v0.2d, v3.2d
+; CHECK-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
+; CHECK-NEXT:    xtn v4.4h, v4.4s
+; CHECK-NEXT:    umov w11, v4.h[0]
+; CHECK-NEXT:    tbnz w11, #0, .LBB1_6
+; CHECK-NEXT:  // %bb.3: // %pred.store.continue
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    umov w11, v4.h[1]
+; CHECK-NEXT:    tbnz w11, #0, .LBB1_7
+; CHECK-NEXT:  .LBB1_4: // %pred.store.continue6
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    umov w11, v4.h[2]
+; CHECK-NEXT:    tbnz w11, #0, .LBB1_8
+; CHECK-NEXT:  .LBB1_5: // %pred.store.continue8
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    umov w11, v4.h[3]
+; CHECK-NEXT:    tbz w11, #0, .LBB1_1
+; CHECK-NEXT:    b .LBB1_9
+; CHECK-NEXT:  .LBB1_6: // %pred.store.if
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    stur w10, [x8, #-8]
+; CHECK-NEXT:    umov w11, v4.h[1]
+; CHECK-NEXT:    tbz w11, #0, .LBB1_4
+; CHECK-NEXT:  .LBB1_7: // %pred.store.if5
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    stur w10, [x8, #-4]
+; CHECK-NEXT:    umov w11, v4.h[2]
+; CHECK-NEXT:    tbz w11, #0, .LBB1_5
+; CHECK-NEXT:  .LBB1_8: // %pred.store.if7
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    str w10, [x8]
+; CHECK-NEXT:    umov w11, v4.h[3]
+; CHECK-NEXT:    tbz w11, #0, .LBB1_1
+; CHECK-NEXT:  .LBB1_9: // %pred.store.if9
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    str w10, [x8, #4]
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_10: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
+  %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
+  %0 = icmp ult <4 x i64> %vec.ind, <i64 15, i64 15, i64 15, i64 15>
+  %1 = extractelement <4 x i1> %0, i64 0
+  br i1 %1, label %pred.store.if, label %pred.store.continue
+
+pred.store.if:
+  %2 = getelementptr inbounds i32, ptr %dest, i64 %index
+  store i32 1, ptr %2, align 4
+  br label %pred.store.continue
+
+pred.store.continue:
+  %3 = extractelement <4 x i1> %0, i64 1
+  br i1 %3, label %pred.store.if5, label %pred.store.continue6
+
+pred.store.if5:
+  %4 = or disjoint i64 %index, 1
+  %5 = getelementptr inbounds i32, ptr %dest, i64 %4
+  store i32 1, ptr %5, align 4
+  br label %pred.store.continue6
+
+pred.store.continue6:
+  %6 = extractelement <4 x i1> %0, i64 2
+  br i1 %6, label %pred.store.if7, label %pred.store.continue8
+
+pred.store.if7:
+  %7 = or disjoint i64 %index, 2
+  %8 = getelementptr inbounds i32, ptr %dest, i64 %7
+  store i32 1, ptr %8, align 4
+  br label %pred.store.continue8
+
+pred.store.continue8:
+  %9 = extractelement <4 x i1> %0, i64 3
+  br i1 %9, label %pred.store.if9, label %pred.store.continue18
+
+pred.store.if9:
+  %10 = or disjoint i64 %index, 3
+  %11 = getelementptr inbounds i32, ptr %dest, i64 %10
+  store i32 1, ptr %11, align 4
+  br label %pred.store.continue18
+
+pred.store.continue18:
+  %index.next = add i64 %index, 4
+  %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+  %24 = icmp eq i64 %index.next, 16
+  br i1 %24, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-sink-vector-cmp.ll b/llvm/test/CodeGen/Thumb2/mve-sink-vector-cmp.ll
new file mode 100644
index 0000000000000..ca049763c5d2f
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-sink-vector-cmp.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve < %s | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x i32> @no_sink_simple(<4 x i32> %a, <4 x i32> %b, i1 %c, ptr %p) {
+; CHECK-LABEL: no_sink_simple:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    lsls r0, r0, #31
+; CHECK-NEXT:    beq .LBB0_2
+; CHECK-NEXT:  @ %bb.1: @ %s
+; CHECK-NEXT:    vcmp.s32 gt, q1, q0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:  .LBB0_2: @ %t
+; CHECK-NEXT:    vcmp.s32 gt, q1, q0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    bx lr
+  %d = icmp slt <4 x i32> %a, %b
+  br i1 %c, label %s, label %t
+
+s:
+  %s1 = select <4 x i1> %d, <4 x i32> %a, <4 x i32> zeroinitializer
+  store <4 x i32> %s1, ptr %p
+  ret <4 x i32> %s1
+
+t:
+  %s2 = select <4 x i1> %d, <4 x i32> %b, <4 x i32> zeroinitializer
+  ret <4 x i32> %s2
+}
+
+define arm_aapcs_vfpcc void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    adr r1, .LCPI1_0
+; CHECK-NEXT:    vmov.i32 q1, #0xf
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    mov.w lr, #1
+; CHECK-NEXT:    vmov.i32 q2, #0xf
+; CHECK-NEXT:    vmov.i32 q3, #0xf
+; CHECK-NEXT:    vmov.i32 q4, #0xf
+; CHECK-NEXT:    mov.w r12, #4
+; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:  .LBB1_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vcmp.u32 hi, q1, q0
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    vcmp.u32 hi, q2, q0
+; CHECK-NEXT:    lsls r2, r2, #31
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    strne.w lr, [r0, r1, lsl #2]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    vcmp.u32 hi, q3, q0
+; CHECK-NEXT:    lsrs r2, r2, #4
+; CHECK-NEXT:    lsls r2, r2, #31
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    orrne r2, r1, #1
+; CHECK-NEXT:    strne.w lr, [r0, r2, lsl #2]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    vcmp.u32 hi, q4, q0
+; CHECK-NEXT:    vadd.i32 q0, q0, r12
+; CHECK-NEXT:    lsrs r2, r2, #8
+; CHECK-NEXT:    lsls r2, r2, #31
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    orrne r2, r1, #2
+; CHECK-NEXT:    strne.w lr, [r0, r2, lsl #2]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    lsrs r2, r2, #12
+; CHECK-NEXT:    lsls r2, r2, #31
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    orrne r2, r1, #3
+; CHECK-NEXT:    strne.w lr, [r0, r2, lsl #2]
+; CHECK-NEXT:    adds r1, #4
+; CHECK-NEXT:    adc r3, r3, #0
+; CHECK-NEXT:    eor r2, r1, #16
+; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    bne .LBB1_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI1_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 3 @ 0x3
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
+  %0 = icmp ult <4 x i32> %vec.ind, <i32 15, i32 15, i32 15, i32 15>
+  %1 = extractelement <4 x i1> %0, i64 0
+  br i1 %1, label %pred.store.if, label %pred.store.continue
+
+pred.store.if:
+  %2 = getelementptr inbounds i32, ptr %dest, i64 %index
+  store i32 1, ptr %2, align 4
+  br label %pred.store.continue
+
+pred.store.continue:
+  %3 = extractelement <4 x i1> %0, i64 1
+  br i1 %3, label %pred.store.if5, label %pred.store.continue6
+
+pred.store.if5:
+  %4 = or disjoint i64 %index, 1
+  %5 = getelementptr inbounds i32, ptr %dest, i64 %4
+  store i32 1, ptr %5, align 4
+  br label %pred.store.continue6
+
+pred.store.continue6:
+  %6 = extractelement <4 x i1> %0, i64 2
+  br i1 %6, label %pred.store.if7, label %pred.store.continue8
+
+pred.store.if7:
+  %7 = or disjoint i64 %index, 2
+  %8 = getelementptr inbounds i32, ptr %dest, i64 %7
+  store i32 1, ptr %8, align 4
+  br label %pred.store.continue8
+
+pred.store.continue8:
+  %9 = extractelement <4 x i1> %0, i64 3
+  br i1 %9, label %pred.store.if9, label %pred.store.continue18
+
+pred.store.if9:
+  %10 = or disjoint i64 %index, 3
+  %11 = getelementptr inbounds i32, ptr %dest, i64 %10
+  store i32 1, ptr %11, align 4
+  br label %pred.store.continue18
+
+pred.store.continue18:
+  %index.next = add i64 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+  %24 = icmp eq i64 %index.next, 16
+  br i1 %24, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}