diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3711870276295..7f9e01dbf7a26 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5580,10 +5580,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // Scale the total scalar cost by block probability. ScalarCost /= getReciprocalPredBlockProb(); - // Compute the discount. A non-negative discount means the vector version - // of the instruction costs more, and scalarizing would be beneficial. - Discount += VectorCost - ScalarCost; - ScalarCosts[I] = ScalarCost; + // Compute the discount, unless this instruction must be scalarized due to + // tail folding, as then the vector cost is already the scalar cost. A + // non-negative discount means the vector version of the instruction costs + // more, and scalarizing would be beneficial. + if (!foldTailByMasking() || getWideningDecision(I, VF) != CM_Scalarize) { + Discount += VectorCost - ScalarCost; + ScalarCosts[I] = ScalarCost; + } } return Discount; diff --git a/llvm/test/Analysis/CostModel/AArch64/tail-folding.ll b/llvm/test/Analysis/CostModel/AArch64/tail-folding.ll new file mode 100644 index 0000000000000..ed84b73bd2128 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/tail-folding.ll @@ -0,0 +1,281 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "fold tail" --filter "estimated cost" --filter "costs" --filter "Selecting VF" --filter "loop costs" --version 5 +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s + +; REQUIRES: asserts + +target triple = "aarch64-unknown-linux-gnu" + +; These tests check that if the only way to vectorize is to tail fold a store by +; masking then we properly account for the cost of creating a predicated block +; for each vector element. 
+ +define void @store_const_fixed_trip_count(ptr %dst) { +; CHECK-LABEL: 'store_const_fixed_trip_count' +; CHECK: LV: can fold tail by masking. +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 1, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Scalar loop costs: 4. +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 1, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Scalar loop costs: 4. 
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 1, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 2 costs: 5. +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 16 for VF 4 For instruction: store i8 1, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 4 costs: 4. 
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 32 for VF 8 For instruction: store i8 1, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 8 costs: 4. +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 64 for VF 16 For instruction: store i8 1, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 16 costs: 4. +; CHECK: LV: Selecting VF: 1. +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i8, ptr %dst, i64 %iv + store i8 1, ptr %gep, align 1 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 7 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @store_trunc_iv_fixed_trip_count(ptr %dst) { +; CHECK-LABEL: 'store_trunc_iv_fixed_trip_count' +; CHECK: LV: can fold tail by masking. 
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Scalar loop costs: 4. +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Scalar loop costs: 4. 
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 2 costs: 6. +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 20 for VF 4 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 4 costs: 5. 
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 40 for VF 8 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 8 costs: 5. +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 80 for VF 16 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %ec = icmp eq i64 %iv.next, 7 +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 16 costs: 5. +; CHECK: LV: Selecting VF: 1. 
+; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.trunc = trunc i64 %iv to i8 + %gep = getelementptr i8, ptr %dst, i64 %iv + store i8 %iv.trunc, ptr %gep, align 1 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 7 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; With an unknown trip count we need to use optsize otherwise we use a scalar +; epilogue instead of tail folding. +define void @store_const_unknown_trip_count(ptr %dst, i64 %limit) optsize { +; CHECK-LABEL: 'store_const_unknown_trip_count' +; CHECK: LV: can fold tail by masking. +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Scalar loop costs: 4. 
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Scalar loop costs: 4. +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 2 costs: 6. 
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 20 for VF 4 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 4 costs: 5. +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 40 for VF 8 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 8 costs: 5. 
+; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 80 for VF 16 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 16 costs: 5. +; CHECK: LV: Selecting VF: 1. +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.trunc = trunc i64 %iv to i8 + %gep = getelementptr i8, ptr %dst, i64 %iv + store i8 %iv.trunc, ptr %gep, align 1 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %limit + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @store_trunc_iv_unknown_trip_count(ptr %dst, i64 %limit) optsize { +; CHECK-LABEL: 'store_trunc_iv_unknown_trip_count' +; CHECK: LV: can fold tail by masking. 
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Scalar loop costs: 4. +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Scalar loop costs: 4. 
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 2 costs: 6. +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 20 for VF 4 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 4 costs: 5. 
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 40 for VF 8 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 8 costs: 5. +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %iv.trunc = trunc i64 %iv to i8 +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 80 for VF 16 For instruction: store i8 %iv.trunc, ptr %gep, align 1 +; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction: %iv.next = add i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %ec = icmp eq i64 %iv.next, %limit +; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: LV: Vector loop of width 16 costs: 5. +; CHECK: LV: Selecting VF: 1. 
+; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.trunc = trunc i64 %iv to i8 + %gep = getelementptr i8, ptr %dst, i64 %iv + store i8 %iv.trunc, ptr %gep, align 1 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %limit + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 6956030570636..22b1ce1327623 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -384,93 +384,14 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-LABEL: define void @latch_branch_cost( ; PRED-SAME: ptr [[DST:%.*]]) { ; PRED-NEXT: entry: -; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; PRED: vector.ph: -; PRED-NEXT: br label [[VECTOR_BODY:%.*]] -; PRED: vector.body: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 99) -; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 -; PRED-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; PRED: pred.store.if: -; PRED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; PRED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]] -; PRED-NEXT: store i8 0, ptr [[TMP3]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] -; PRED: pred.store.continue: -; PRED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 -; PRED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; PRED: pred.store.if1: -; PRED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 -; 
PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]] -; PRED-NEXT: store i8 0, ptr [[TMP6]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]] -; PRED: pred.store.continue2: -; PRED-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 -; PRED-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; PRED: pred.store.if3: -; PRED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]] -; PRED-NEXT: store i8 0, ptr [[TMP9]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]] -; PRED: pred.store.continue4: -; PRED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 -; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] -; PRED: pred.store.if5: -; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3 -; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] -; PRED-NEXT: store i8 0, ptr [[TMP12]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]] -; PRED: pred.store.continue6: -; PRED-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 -; PRED-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; PRED: pred.store.if7: -; PRED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]] -; PRED-NEXT: store i8 0, ptr [[TMP15]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE8]] -; PRED: pred.store.continue8: -; PRED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 -; PRED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; PRED: pred.store.if9: -; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5 -; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]] -; PRED-NEXT: store i8 0, ptr [[TMP18]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE10]] -; PRED: pred.store.continue10: -; PRED-NEXT: 
[[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 -; PRED-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] -; PRED: pred.store.if11: -; PRED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6 -; PRED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]] -; PRED-NEXT: store i8 0, ptr [[TMP21]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE12]] -; PRED: pred.store.continue12: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 -; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE6]] -; PRED: pred.store.if13: -; PRED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7 -; PRED-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]] -; PRED-NEXT: store i8 0, ptr [[TMP24]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]] -; PRED: pred.store.continue14: -; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) -; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104 -; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 104, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; PRED-NEXT: br label [[FOR_BODY:%.*]] ; PRED: loop: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] ; PRED-NEXT: store i8 0, ptr [[GEP]], align 1 ; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[IV]], 1 ; PRED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; PRED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop 
[[LOOP3:![0-9]+]] +; PRED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -613,112 +534,9 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; PRED-LABEL: define i32 @header_mask_and_invariant_compare( ; PRED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { ; PRED-NEXT: entry: -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; PRED: vector.memcheck: -; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4 -; PRED-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2 -; PRED-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4 -; PRED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP2]] -; PRED-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 4 -; PRED-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 4 -; PRED-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[C]], i64 4 -; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[E]], [[SCEVGEP1]] -; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[D]], [[SCEVGEP]] -; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; PRED-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[E]], [[SCEVGEP2]] -; PRED-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] -; PRED-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]] -; PRED-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]] -; PRED-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[E]], [[SCEVGEP3]] -; PRED-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] -; PRED-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]] -; PRED-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]] -; PRED-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[E]], [[SCEVGEP4]] -; PRED-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] -; PRED-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], 
[[BOUND113]] -; PRED-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]] -; PRED-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[D]], [[SCEVGEP2]] -; PRED-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; PRED-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] -; PRED-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]] -; PRED-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[D]], [[SCEVGEP3]] -; PRED-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] -; PRED-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]] -; PRED-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]] -; PRED-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[D]], [[SCEVGEP4]] -; PRED-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]] -; PRED-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]] -; PRED-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]] -; PRED-NEXT: br i1 [[CONFLICT_RDX27]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; PRED: vector.ph: -; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3 -; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 -; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; PRED-NEXT: [[TMP12:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: br label [[VECTOR_BODY:%.*]] -; PRED: vector.body: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE37]] ] -; PRED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0 -; PRED-NEXT: [[TMP7:%.*]] = load i32, ptr [[A]], 
align 4, !alias.scope [[META4:![0-9]+]] -; PRED-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: [[TMP8:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META7:![0-9]+]] -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: [[TMP9:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]] -; PRED-NEXT: [[TMP10:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META9:![0-9]+]] -; PRED-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: [[TMP11:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP9]] -; PRED-NEXT: [[TMP25:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer -; PRED-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[D]], i64 [[TMP15]] -; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP25]], i32 0 -; PRED-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; PRED: pred.store.if: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP9]], i32 0 -; PRED-NEXT: store i32 [[TMP27]], ptr [[E]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] -; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] -; PRED: pred.store.continue: -; PRED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP25]], i32 1 -; PRED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] -; PRED: pred.store.if32: -; PRED-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP9]], 
i32 1 -; PRED-NEXT: store i32 [[TMP17]], ptr [[E]], align 4, !alias.scope [[META11]], !noalias [[META13]] -; PRED-NEXT: br label [[PRED_STORE_CONTINUE33]] -; PRED: pred.store.continue33: -; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP25]], i32 2 -; PRED-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] -; PRED: pred.store.if34: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP9]], i32 2 -; PRED-NEXT: store i32 [[TMP19]], ptr [[E]], align 4, !alias.scope [[META11]], !noalias [[META13]] -; PRED-NEXT: br label [[PRED_STORE_CONTINUE35]] -; PRED: pred.store.continue35: -; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP25]], i32 3 -; PRED-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] -; PRED: pred.store.if36: -; PRED-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP9]], i32 3 -; PRED-NEXT: store i32 [[TMP21]], ptr [[E]], align 4, !alias.scope [[META11]], !noalias [[META13]] -; PRED-NEXT: br label [[PRED_STORE_CONTINUE37]] -; PRED: pred.store.continue37: -; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0 -; PRED-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP23]], i32 4, <4 x i1> [[TMP25]]), !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]] -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP14]]) -; PRED-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) -; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP28]], i32 0 -; PRED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, 
[[VECTOR_MEMCHECK]] ] ; PRED-NEXT: br label [[LOOP_HEADER:%.*]] ; PRED: loop.header: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; PRED-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 ; PRED-NEXT: [[L_B:%.*]] = load i32, ptr [[B]], align 4 ; PRED-NEXT: [[OR:%.*]] = or i32 [[L_B]], [[L_A]] @@ -733,7 +551,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; PRED: loop.latch: ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; PRED-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV]], [[N]] -; PRED-NEXT: br i1 [[C_1]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP18:![0-9]+]] +; PRED-NEXT: br i1 [[C_1]], label [[EXIT:%.*]], label [[LOOP_HEADER]] ; PRED: exit: ; PRED-NEXT: ret i32 0 ; @@ -850,7 +668,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]]) ; PRED-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; PRED-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 -; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: @@ -870,7 +688,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8 ; PRED-NEXT: [[IV_CLAMP:%.*]] = and i64 [[IV]], 4294967294 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_CLAMP]], 512 -; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop 
[[LOOP3:![0-9]+]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -894,208 +712,34 @@ exit: ret void } -define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { -; DEFAULT-LABEL: define void @low_trip_count_fold_tail_scalarized_store( +define void @low_trip_count_store(ptr %dst) { +; DEFAULT-LABEL: define void @low_trip_count_store( ; DEFAULT-SAME: ptr [[DST:%.*]]) { ; DEFAULT-NEXT: entry: -; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; DEFAULT: vector.ph: -; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] -; DEFAULT: vector.body: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] -; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ] -; DEFAULT-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8 -; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6) -; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 -; DEFAULT-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; DEFAULT: pred.store.if: -; DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] -; DEFAULT-NEXT: [[TMP5:%.*]] = add i8 [[TMP0]], 0 -; DEFAULT-NEXT: store i8 [[TMP5]], ptr [[TMP4]], align 1 -; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE]] -; DEFAULT: pred.store.continue: -; DEFAULT-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 -; DEFAULT-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; DEFAULT: pred.store.if1: -; DEFAULT-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1 -; DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]] -; DEFAULT-NEXT: [[TMP9:%.*]] = add i8 [[TMP0]], 1 -; DEFAULT-NEXT: store i8 [[TMP9]], ptr [[TMP8]], align 1 -; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE2]] -; DEFAULT: pred.store.continue2: -; DEFAULT-NEXT: 
[[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 -; DEFAULT-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; DEFAULT: pred.store.if3: -; DEFAULT-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 2 -; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] -; DEFAULT-NEXT: [[TMP13:%.*]] = add i8 [[TMP0]], 2 -; DEFAULT-NEXT: store i8 [[TMP13]], ptr [[TMP12]], align 1 -; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE4]] -; DEFAULT: pred.store.continue4: -; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 -; DEFAULT-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] -; DEFAULT: pred.store.if5: -; DEFAULT-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 3 -; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]] -; DEFAULT-NEXT: [[TMP17:%.*]] = add i8 [[TMP0]], 3 -; DEFAULT-NEXT: store i8 [[TMP17]], ptr [[TMP16]], align 1 -; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE6]] -; DEFAULT: pred.store.continue6: -; DEFAULT-NEXT: [[TMP18:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 -; DEFAULT-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; DEFAULT: pred.store.if7: -; DEFAULT-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 4 -; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]] -; DEFAULT-NEXT: [[TMP21:%.*]] = add i8 [[TMP0]], 4 -; DEFAULT-NEXT: store i8 [[TMP21]], ptr [[TMP20]], align 1 -; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE8]] -; DEFAULT: pred.store.continue8: -; DEFAULT-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 -; DEFAULT-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; DEFAULT: pred.store.if9: -; DEFAULT-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 5 -; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]] -; DEFAULT-NEXT: [[TMP25:%.*]] = add i8 [[TMP0]], 5 -; DEFAULT-NEXT: store i8 [[TMP25]], ptr 
[[TMP24]], align 1 -; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE10]] -; DEFAULT: pred.store.continue10: -; DEFAULT-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 -; DEFAULT-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] -; DEFAULT: pred.store.if11: -; DEFAULT-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 6 -; DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]] -; DEFAULT-NEXT: [[TMP29:%.*]] = add i8 [[TMP0]], 6 -; DEFAULT-NEXT: store i8 [[TMP29]], ptr [[TMP28]], align 1 -; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE12]] -; DEFAULT: pred.store.continue12: -; DEFAULT-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 -; DEFAULT-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] -; DEFAULT: pred.store.if13: -; DEFAULT-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 7 -; DEFAULT-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP31]] -; DEFAULT-NEXT: [[TMP33:%.*]] = add i8 [[TMP0]], 7 -; DEFAULT-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1 -; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE14]] -; DEFAULT: pred.store.continue14: -; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; DEFAULT-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] -; DEFAULT: middle.block: -; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; DEFAULT: scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: -; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; DEFAULT-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8 ; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], 
i64 [[IV]] ; DEFAULT-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1 ; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7 -; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; DEFAULT: exit: ; DEFAULT-NEXT: ret void ; -; PRED-LABEL: define void @low_trip_count_fold_tail_scalarized_store( +; PRED-LABEL: define void @low_trip_count_store( ; PRED-SAME: ptr [[DST:%.*]]) { ; PRED-NEXT: entry: -; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; PRED: vector.ph: -; PRED-NEXT: br label [[VECTOR_BODY:%.*]] -; PRED: vector.body: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ] -; PRED-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8 -; PRED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6) -; PRED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 -; PRED-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; PRED: pred.store.if: -; PRED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; PRED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] -; PRED-NEXT: [[TMP5:%.*]] = add i8 [[TMP0]], 0 -; PRED-NEXT: store i8 [[TMP5]], ptr [[TMP4]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] -; PRED: pred.store.continue: -; PRED-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 -; PRED-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; PRED: pred.store.if1: -; PRED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1 -; PRED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]] -; PRED-NEXT: [[TMP9:%.*]] = add i8 [[TMP0]], 1 -; PRED-NEXT: store i8 [[TMP9]], ptr [[TMP8]], align 1 -; PRED-NEXT: br 
label [[PRED_STORE_CONTINUE2]] -; PRED: pred.store.continue2: -; PRED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 -; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; PRED: pred.store.if3: -; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] -; PRED-NEXT: [[TMP13:%.*]] = add i8 [[TMP0]], 2 -; PRED-NEXT: store i8 [[TMP13]], ptr [[TMP12]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]] -; PRED: pred.store.continue4: -; PRED-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 -; PRED-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] -; PRED: pred.store.if5: -; PRED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 3 -; PRED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]] -; PRED-NEXT: [[TMP17:%.*]] = add i8 [[TMP0]], 3 -; PRED-NEXT: store i8 [[TMP17]], ptr [[TMP16]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]] -; PRED: pred.store.continue6: -; PRED-NEXT: [[TMP18:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 -; PRED-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; PRED: pred.store.if7: -; PRED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: [[TMP21:%.*]] = add i8 [[TMP0]], 4 -; PRED-NEXT: store i8 [[TMP21]], ptr [[TMP20]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE8]] -; PRED: pred.store.continue8: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 -; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; PRED: pred.store.if9: -; PRED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 5 -; PRED-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]] -; PRED-NEXT: [[TMP25:%.*]] = add i8 [[TMP0]], 5 -; PRED-NEXT: store i8 [[TMP25]], ptr [[TMP24]], align 1 -; 
PRED-NEXT: br label [[PRED_STORE_CONTINUE10]] -; PRED: pred.store.continue10: -; PRED-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 -; PRED-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] -; PRED: pred.store.if11: -; PRED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 6 -; PRED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]] -; PRED-NEXT: [[TMP29:%.*]] = add i8 [[TMP0]], 6 -; PRED-NEXT: store i8 [[TMP29]], ptr [[TMP28]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE12]] -; PRED: pred.store.continue12: -; PRED-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 -; PRED-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] -; PRED: pred.store.if13: -; PRED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 7 -; PRED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP31]] -; PRED-NEXT: [[TMP33:%.*]] = add i8 [[TMP0]], 7 -; PRED-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE14]] -; PRED: pred.store.continue14: -; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) -; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; PRED-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] -; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; PRED-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8 ; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] ; PRED-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1 ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; 
PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7 -; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -1296,7 +940,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) ; DEFAULT-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; DEFAULT-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; DEFAULT-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; DEFAULT: middle.block: ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -1327,7 +971,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias ; DEFAULT: loop.latch: ; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP27:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]] ; DEFAULT: exit: ; DEFAULT-NEXT: ret void ; @@ -1519,7 +1163,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias ; PRED-NEXT: [[TMP84:%.*]] = xor <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) ; PRED-NEXT: [[TMP85:%.*]] = extractelement <8 x i1> [[TMP84]], i32 0 -; PRED-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; PRED-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: middle.block: ; PRED-NEXT: 
br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: @@ -1549,7 +1193,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias ; PRED: loop.latch: ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP24:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -1632,7 +1276,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize { ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; DEFAULT: middle.block: ; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; DEFAULT: scalar.ph: @@ -1648,7 +1292,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize { ; DEFAULT-NEXT: [[T:%.*]] = trunc nuw nsw i64 [[IV_NEXT]] to i32 ; DEFAULT-NEXT: store i32 [[T]], ptr [[DST]], align 4 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 21 -; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP27:![0-9]+]] ; DEFAULT: exit: ; DEFAULT-NEXT: ret void ; @@ -1695,7 +1339,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize { ; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; PRED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; PRED-NEXT: br i1 [[TMP11]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; PRED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: @@ -1711,7 +1355,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize { ; PRED-NEXT: [[T:%.*]] = trunc nuw nsw i64 [[IV_NEXT]] to i32 ; PRED-NEXT: store i32 [[T]], ptr [[DST]], align 4 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 21 -; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP26:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -1768,37 +1412,16 @@ attributes #2 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" } ; DEFAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]} ; DEFAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]} ; DEFAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} -; DEFAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]} +; DEFAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]]} ; DEFAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]} -; DEFAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META1]]} -; DEFAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]} -; DEFAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]} +; DEFAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]} ;. 
; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; PRED: [[META4]] = !{[[META5:![0-9]+]]} -; PRED: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]} -; PRED: [[META6]] = distinct !{[[META6]], !"LVerDomain"} -; PRED: [[META7]] = !{[[META8:![0-9]+]]} -; PRED: [[META8]] = distinct !{[[META8]], [[META6]]} -; PRED: [[META9]] = !{[[META10:![0-9]+]]} -; PRED: [[META10]] = distinct !{[[META10]], [[META6]]} -; PRED: [[META11]] = !{[[META12:![0-9]+]]} -; PRED: [[META12]] = distinct !{[[META12]], [[META6]]} -; PRED: [[META13]] = !{[[META14:![0-9]+]], [[META5]], [[META8]], [[META10]]} -; PRED: [[META14]] = distinct !{[[META14]], [[META6]]} -; PRED: [[META15]] = !{[[META14]]} -; PRED: [[META16]] = !{[[META5]], [[META8]], [[META10]]} -; PRED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} -; PRED: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]]} -; PRED: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} -; PRED: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]} -; PRED: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} -; PRED: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]} -; PRED: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]} -; PRED: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]]} -; PRED: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]} -; PRED: [[LOOP26]] = distinct !{[[LOOP26]], [[META2]], [[META1]]} +; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; PRED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index 4bb67c890f3cf..f1ba191397a24 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -257,71 +257,9 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; PRED-NEXT: entry: ; PRED-NEXT: [[MUL_X:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] -; PRED: vector.scevcheck: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; PRED: vector.ph: -; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1 -; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 -; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP14:%.*]] 
= icmp ugt i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; PRED-NEXT: br label [[VECTOR_BODY:%.*]] -; PRED: vector.body: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; PRED: pred.store.if: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: store i32 1, ptr [[TMP20]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] -; PRED: pred.store.continue: -; PRED-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] -; PRED: pred.store.if1: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]] -; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]] -; PRED: 
pred.store.continue2: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) -; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; PRED-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0 -; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label [[FOR_BODY:%.*]] ; PRED: for.body: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; PRED-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32 ; PRED-NEXT: [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]] ; PRED-NEXT: [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64 @@ -329,7 +267,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 1, ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[FOR_BODY]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -428,106 +366,20 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-LABEL: define void @trunc_ivs_and_store( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; PRED-NEXT: entry: -; PRED-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label 
[[VECTOR_SCEVCHECK:%.*]] -; PRED: vector.scevcheck: ; PRED-NEXT: [[TMP1:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP1]] -; PRED-NEXT: [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0 -; PRED-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]] -; PRED-NEXT: [[TMP5:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 -; PRED-NEXT: [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0 -; PRED-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false -; PRED-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0 -; PRED-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]] -; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]] -; PRED-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; PRED: vector.ph: -; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3 -; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 -; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 -; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label [[VECTOR_BODY:%.*]] -; PRED: vector.body: -; PRED-NEXT: [[INDEX:%.*]] 
= phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64> -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; PRED: pred.store.if: -; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0 -; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]] -; PRED-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] -; PRED: pred.store.continue: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] -; PRED: pred.store.if2: -; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1 -; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]] -; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE3]] -; PRED: pred.store.continue3: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; PRED: pred.store.if4: -; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 -; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 
[[TMP28]] -; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE5]] -; PRED: pred.store.continue5: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] -; PRED: pred.store.if6: -; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3 -; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]] -; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]] -; PRED: pred.store.continue7: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]]) -; PRED-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0 -; PRED-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; PRED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] +; PRED-NEXT: 
[[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] ; PRED-NEXT: [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 -; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]] +; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[TMP1]], [[IV_1_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 ; PRED-NEXT: [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64 ; PRED-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]] ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label [[EXIT:%.*]], label [[LOOP]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -627,95 +479,10 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; PRED-NEXT: entry: ; PRED-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] -; PRED: vector.scevcheck: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: 
[[TMP10:%.*]] = icmp ne i32 [[ADD]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; PRED: vector.ph: -; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3 -; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 -; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label [[VECTOR_BODY:%.*]] -; PRED: vector.body: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; PRED: pred.store.if: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr 
[[DST]], i64 [[TMP19]] -; PRED-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] -; PRED: pred.store.continue: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; PRED: pred.store.if1: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]] -; PRED-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]] -; PRED: pred.store.continue2: -; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; PRED: pred.store.if3: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; PRED-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]] -; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]] -; PRED: pred.store.continue4: -; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] -; PRED: pred.store.if5: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 -; PRED-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]] -; PRED-NEXT: [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]] -; PRED: pred.store.continue6: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 
[[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0 -; PRED-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; PRED-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] +; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] ; PRED-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 ; PRED-NEXT: [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 @@ -724,7 +491,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -805,67 +572,16 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-LABEL: define void @exit_cond_zext_iv( ; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; PRED-NEXT: entry: -; PRED-NEXT: [[UMAX1:%.*]] = call i64 
@llvm.umax.i64(i64 [[N]], i64 1) -; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] -; PRED: vector.scevcheck: -; PRED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 -; PRED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 -; PRED-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32 -; PRED-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]] -; PRED-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 -; PRED-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 -; PRED-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] -; PRED-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; PRED: vector.ph: -; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1 -; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 -; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 -; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1 -; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer -; PRED-NEXT: br label [[VECTOR_BODY:%.*]] -; PRED: vector.body: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE5:%.*]] ] -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; PRED-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], -; PRED-NEXT: [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT3]] -; PRED-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 -; PRED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; PRED: pred.store.if: -; 
PRED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; PRED-NEXT: [[TMP10:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP9]], i32 2 -; PRED-NEXT: store i32 0, ptr [[TMP10]], align 8 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] -; PRED: pred.store.continue: -; PRED-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 -; PRED-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5]] -; PRED: pred.store.if4: -; PRED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 1 -; PRED-NEXT: [[TMP13:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP12]], i32 2 -; PRED-NEXT: store i32 0, ptr [[TMP13]], align 8 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE5]] -; PRED: pred.store.continue5: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; PRED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; PRED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: -; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] -; PRED-NEXT: [[IV_CONV:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] +; PRED-NEXT: [[IV_CONV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_EXT:%.*]], [[LOOP]] ] ; PRED-NEXT: [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2 ; PRED-NEXT: store i32 0, ptr [[GEP]], align 8 ; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; 
PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64 ; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]] -; PRED-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; PRED-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -906,12 +622,4 @@ attributes #0 = { "target-features"="+sve" } ; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} -; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} -; PRED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} -; PRED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} -; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; PRED: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll index a0696b3204dbd..cbbb9e7ed0847 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll @@ -21,7 +21,31 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_INC:%.*]]> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>, vp<[[VF]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> +; CHECK-NEXT: Successor(s): pred.load +; CHECK-EMPTY: +; CHECK-NEXT: pred.load: { +; CHECK-NEXT: pred.load.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[CMP]]> +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx> +; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2> +; CHECK-NEXT: Successor(s): pred.load.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[LOAD0:%.+]]> = ir<%0> +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[LOAD1:%.+]]> = ir<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): for.body.1 +; CHECK-EMPTY: +; CHECK-NEXT: for.body.1: +; CHECK-NEXT: WIDEN ir<%add> = add nsw vp<[[LOAD1]]>, vp<[[LOAD0]]> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -30,13 +54,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; 
CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx> -; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2> ; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index 55ff26c55b512..ed0de2fb6ad21 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -145,61 +145,98 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK: vector.body15: ; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE28:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX16]] +; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX16]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT18]], ; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT20]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0 ; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] -; CHECK: pred.store.if21: +; CHECK: 
pred.load.if: ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP23]], [[TMP21]] -; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP23]], i64 0 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] -; CHECK: pred.store.continue22: +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY15]] ], [ [[TMP25]], [[PRED_STORE_IF21]] ] +; CHECK-NEXT: [[TMP40:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY15]] ], [ [[TMP32]], [[PRED_STORE_IF21]] ] ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1 ; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] -; CHECK: pred.store.if23: -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK: pred.load.if21: ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP27]] ; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP29]], i64 1 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[TMP27]] ; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP27]] -; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP31]], [[TMP29]] -; CHECK-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = 
insertelement <4 x i32> [[TMP40]], i32 [[TMP31]], i64 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] -; CHECK: pred.store.continue24: +; CHECK: pred.load.continue22: +; CHECK-NEXT: [[TMP53:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_STORE_CONTINUE22]] ], [ [[TMP41]], [[PRED_STORE_IF23]] ] +; CHECK-NEXT: [[TMP54:%.*]] = phi <4 x i32> [ [[TMP40]], [[PRED_STORE_CONTINUE22]] ], [ [[TMP51]], [[PRED_STORE_IF23]] ] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2 ; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] -; CHECK: pred.store.if25: -; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK: pred.load.if23: ; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP35]] ; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[TMP37]], i64 2 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[TMP35]] ; CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP38]], align 4 -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP35]] -; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP39]], [[TMP37]] -; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP39]], i64 2 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] -; CHECK: pred.store.continue26: +; CHECK: pred.load.continue24: +; CHECK-NEXT: [[TMP71:%.*]] = phi <4 x i32> [ [[TMP53]], [[PRED_STORE_CONTINUE24]] ], [ [[TMP60]], [[PRED_STORE_IF25]] ] +; CHECK-NEXT: [[TMP72:%.*]] = phi <4 x i32> [ [[TMP54]], [[PRED_STORE_CONTINUE24]] ], [ [[TMP61]], [[PRED_STORE_IF25]] ] ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3 -; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28]] -; CHECK: pred.store.if27: -; CHECK-NEXT: [[TMP43:%.*]] = add i64 
[[OFFSET_IDX]], 3 +; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; CHECK: pred.load.if25: ; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP43]] ; CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> [[TMP71]], i32 [[TMP45]], i64 3 ; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[TMP43]] ; CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP43]] -; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP47]], [[TMP45]] +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP72]], i32 [[TMP47]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] +; CHECK: pred.load.continue26: +; CHECK-NEXT: [[TMP56:%.*]] = phi <4 x i32> [ [[TMP71]], [[PRED_STORE_CONTINUE26]] ], [ [[TMP52]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP57:%.*]] = phi <4 x i32> [ [[TMP72]], [[PRED_STORE_CONTINUE26]] ], [ [[TMP55]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP58:%.*]] = and <4 x i32> [[TMP57]], [[TMP56]] +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; CHECK: pred.store.if27: +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP58]], i64 0 ; CHECK-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]] ; CHECK: pred.store.continue28: +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1 +; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] +; CHECK: pred.store.if29: +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2048 
x i32], ptr @a, i64 0, i64 [[TMP27]] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i32> [[TMP58]], i64 1 +; CHECK-NEXT: store i32 [[TMP64]], ptr [[TMP63]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] +; CHECK: pred.store.continue30: +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2 +; CHECK-NEXT: br i1 [[TMP65]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] +; CHECK: pred.store.if31: +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP35]] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i32> [[TMP58]], i64 2 +; CHECK-NEXT: store i32 [[TMP67]], ptr [[TMP66]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]] +; CHECK: pred.store.continue32: +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3 +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE28]] +; CHECK: pred.store.if33: +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP43]] +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <4 x i32> [[TMP58]], i64 3 +; CHECK-NEXT: store i32 [[TMP70]], ptr [[TMP69]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] +; CHECK: pred.store.continue34: ; CHECK-NEXT: [[INDEX_NEXT29]] = add nuw i64 [[INDEX16]], 4 ; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC12]] ; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY15]], !llvm.loop [[LOOP5:![0-9]+]] @@ -460,54 +497,80 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[VEC_IV]], splat (i64 257) ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP8:%.*]] = 
load i16, ptr [[NEXT_GEP1]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> poison, i16 [[TMP8]], i64 0 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_STORE_IF]] ] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1 +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK: pred.load.if10: +; CHECK-NEXT: [[TMP29:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[NEXT_GEP]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i16> [[TMP27]], i16 [[TMP3]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] +; CHECK: pred.load.continue11: +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i16> [ [[TMP27]], [[PRED_STORE_CONTINUE]] ], [ [[TMP9]], [[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK: pred.load.if12: +; CHECK-NEXT: [[TMP31:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP32:%.*]] = load i16, ptr [[NEXT_GEP3]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i16> [[TMP10]], i16 [[TMP32]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] +; CHECK: pred.load.continue13: +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i16> [ [[TMP10]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP14]], [[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; CHECK: pred.load.if14: +; CHECK-NEXT: [[TMP33:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, 
ptr [[SRC]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP34:%.*]] = load i16, ptr [[NEXT_GEP4]], align 2 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP34]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; CHECK: pred.load.continue15: +; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i16> [ [[TMP15]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP35]], [[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i16> [[TMP20]] to <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw <4 x i32> [[TMP21]], splat (i32 7) +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE1:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX5]] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP22]], i64 0 ; CHECK-NEXT: store i32 [[TMP5]], ptr [[NEXT_GEP6]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE1]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1 ; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] -; CHECK: pred.store.if10: +; CHECK: pred.store.if16: ; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 4 ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[NEXT_GEP2]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = zext i16 [[TMP9]] to i32 -; CHECK-NEXT: [[TMP11:%.*]] = 
shl nuw nsw i32 [[TMP10]], 7 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP22]], i64 1 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[NEXT_GEP7]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] -; CHECK: pred.store.continue11: +; CHECK: pred.store.continue17: ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2 ; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] -; CHECK: pred.store.if12: +; CHECK: pred.store.if18: ; CHECK-NEXT: [[TMP13:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 8 ; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[NEXT_GEP3]], align 2 -; CHECK-NEXT: [[TMP16:%.*]] = zext i16 [[TMP15]] to i32 -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 7 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP22]], i64 2 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[NEXT_GEP8]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]] -; CHECK: pred.store.continue13: +; CHECK: pred.store.continue19: ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3 ; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]] -; CHECK: pred.store.if14: +; CHECK: pred.store.if20: ; CHECK-NEXT: [[TMP19:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 12 ; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[NEXT_GEP4]], align 2 -; CHECK-NEXT: [[TMP22:%.*]] = zext i16 [[TMP21]] to i32 -; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP22]], i64 3 ; 
CHECK-NEXT: store i32 [[TMP23]], ptr [[NEXT_GEP9]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] -; CHECK: pred.store.continue15: +; CHECK: pred.store.continue21: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -515,9 +578,9 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK-NEXT: br i1 true, label [[TMP26:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP25:%.*]] -; CHECK: 25: +; CHECK: 35: ; CHECK-NEXT: br i1 poison, label [[TMP26]], label [[TMP25]], !llvm.loop [[LOOP13:![0-9]+]] -; CHECK: 26: +; CHECK: 36: ; CHECK-NEXT: ret void ; br label %1