diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index bf2e04caa0a61..a360c592d3ecc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -2719,18 +2719,52 @@ WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, Ops[OpIdx++] = Op.getOperand(0); Ops[OpIdx++] = Op.getOperand(1); + std::bitset<16> DefinedLaneBytes = 0xFFFF; // Expand mask indices to byte indices and materialize them as operands for (int M : Mask) { for (size_t J = 0; J < LaneBytes; ++J) { // Lower undefs (represented by -1 in mask) to {0..J}, which use a // whole lane of vector input, to allow further reduction at VM. E.g. // match an 8x16 byte shuffle to an equivalent cheaper 32x4 shuffle. + if (M == -1) { + DefinedLaneBytes[OpIdx - 2] = 0; + } uint64_t ByteIndex = M == -1 ? J : (uint64_t)M * LaneBytes + J; Ops[OpIdx++] = DAG.getConstant(ByteIndex, DL, MVT::i32); } } - - return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops); + EVT VT = Op.getValueType(); + SDValue Shuffle = DAG.getNode(WebAssemblyISD::SHUFFLE, DL, VT, Ops); + + // If only the lower four or eight bytes are actually defined by the + // shuffle, insert an AND so a VM can know that it can ignore the higher, + // undef, lanes. 
+ if (DefinedLaneBytes == 0xF) { + SDValue LowLaneMask[] = { + DAG.getConstant(uint32_t(-1), DL, MVT::i32), + DAG.getConstant(uint32_t(0), DL, MVT::i32), + DAG.getConstant(uint32_t(0), DL, MVT::i32), + DAG.getConstant(uint32_t(0), DL, MVT::i32), + }; + SDValue UndefMask = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, LowLaneMask); + SDValue MaskedShuffle = + DAG.getNode(ISD::AND, DL, MVT::v4i32, + DAG.getBitcast(MVT::v4i32, Shuffle), UndefMask); + return DAG.getBitcast(VT, MaskedShuffle); + } else if (DefinedLaneBytes == 0xFF) { + SDValue LowLaneMask[] = { + DAG.getConstant(uint64_t(-1), DL, MVT::i64), + DAG.getConstant(uint64_t(0), DL, MVT::i64), + }; + SDValue UndefMask = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64, LowLaneMask); + SDValue MaskedShuffle = + DAG.getNode(ISD::AND, DL, MVT::v2i64, + DAG.getBitcast(MVT::v2i64, Shuffle), UndefMask); + return DAG.getBitcast(VT, MaskedShuffle); + } + return Shuffle; } SDValue WebAssemblyTargetLowering::LowerSETCC(SDValue Op, diff --git a/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll b/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll index 7736e78271e55..0085c6cd82797 100644 --- a/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll +++ b/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll @@ -10,9 +10,11 @@ define <4 x i32> @sext_high_v4i8(<8 x i8> %in) { ; SIMD128: .functype sext_high_v4i8 (v128) -> (v128) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push1=, $pop0 -; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push2=, $pop1 -; SIMD128-NEXT: return $pop2 +; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push3=, $pop2 +; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push4=, $pop3 +; SIMD128-NEXT: return $pop4 %shuffle = shufflevector <8 x i8> %in, <8 x i8> poison, <4 x i32> %res = sext <4 x i8> %shuffle to <4 x 
i32> %res @@ -23,9 +25,11 @@ define <4 x i32> @zext_high_v4i8(<8 x i8> %in) { ; SIMD128: .functype zext_high_v4i8 (v128) -> (v128) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push1=, $pop0 -; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push2=, $pop1 -; SIMD128-NEXT: return $pop2 +; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push3=, $pop2 +; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push4=, $pop3 +; SIMD128-NEXT: return $pop4 %shuffle = shufflevector <8 x i8> %in, <8 x i8> poison, <4 x i32> %res = zext <4 x i8> %shuffle to <4 x i32> ret <4 x i32> %res @@ -58,8 +62,10 @@ define <2 x i32> @sext_high_v2i16(<4 x i16> %in) { ; SIMD128: .functype sext_high_v2i16 (v128) -> (v128) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push1=, $pop0 -; SIMD128-NEXT: return $pop1 +; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push3=, $pop2 +; SIMD128-NEXT: return $pop3 %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <2 x i32> %res = sext <2 x i16> %shuffle to <2 x i32> ret <2 x i32> %res @@ -70,8 +76,10 @@ define <2 x i32> @zext_high_v2i16(<4 x i16> %in) { ; SIMD128: .functype zext_high_v2i16 (v128) -> (v128) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push1=, $pop0 -; SIMD128-NEXT: return $pop1 +; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push3=, $pop2 +; SIMD128-NEXT: return $pop3 %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <2 x i32> %res = 
zext <2 x i16> %shuffle to <2 x i32> ret <2 x i32> %res diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll index 7190e162eb010..27b7e8c6b01cd 100644 --- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll @@ -32,6 +32,8 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NEXT: v128.bitselect ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -76,6 +78,8 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) { ; CHECK-NEXT: v128.bitselect ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptoui <2 x double> %x to <2 x i64> @@ -112,6 +116,8 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -301,6 +307,8 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) { ; CHECK-NEXT: i32x4.max_s ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0, 0, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -328,6 +336,8 @@ define <2 x i16> @utest_f64i16(<2 x double> %x) { ; CHECK-NEXT: i32x4.min_u ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0, 0, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv 
= fptoui <2 x double> %x to <2 x i32> @@ -355,6 +365,8 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK-NEXT: i32x4.max_s ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0, 0, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -378,6 +390,8 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) { ; CHECK-NEXT: i32x4.max_s ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <4 x float> %x to <4 x i32> @@ -399,6 +413,8 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) { ; CHECK-NEXT: i32x4.min_u ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptoui <4 x float> %x to <4 x i32> @@ -420,6 +436,8 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-NEXT: i32x4.max_s ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <4 x float> %x to <4 x i32> @@ -1484,6 +1502,8 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) { ; CHECK-NEXT: v128.bitselect ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -1526,6 +1546,8 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) { ; CHECK-NEXT: v128.bitselect ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; 
CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptoui <2 x double> %x to <2 x i64> @@ -1561,6 +1583,8 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -1738,6 +1762,8 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) { ; CHECK-NEXT: i32x4.max_s ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0, 0, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -1763,6 +1789,8 @@ define <2 x i16> @utest_f64i16_mm(<2 x double> %x) { ; CHECK-NEXT: i32x4.min_u ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0, 0, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptoui <2 x double> %x to <2 x i32> @@ -1789,6 +1817,8 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) { ; CHECK-NEXT: i32x4.max_s ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0, 0, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -1810,6 +1840,8 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) { ; CHECK-NEXT: i32x4.max_s ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <4 x float> %x to <4 x i32> @@ -1829,6 +1861,8 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) { ; CHECK-NEXT: i32x4.min_u ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 
0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptoui <4 x float> %x to <4 x i32> @@ -1849,6 +1883,8 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-NEXT: i32x4.max_s ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return entry: %conv = fptosi <4 x float> %x to <4 x i32> diff --git a/llvm/test/CodeGen/WebAssembly/simd-concat.ll b/llvm/test/CodeGen/WebAssembly/simd-concat.ll index 42ded8a47c199..4473f7ffc6a93 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-concat.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-concat.ll @@ -24,6 +24,8 @@ define <8 x i8> @concat_v4i8(<4 x i8> %a, <4 x i8> %b) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return %v = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> ret <8 x i8> %v @@ -48,6 +50,8 @@ define <4 x i8> @concat_v2i8(<2 x i8> %a, <2 x i8> %b) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shuffle 0, 1, 16, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: v128.const -1, 0, 0, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return %v = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> ret <4 x i8> %v @@ -60,6 +64,8 @@ define <4 x i16> @concat_v2i16(<2 x i16> %a, <2 x i16> %b) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: # fallthrough-return %v = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> ret <4 x i16> %v diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll 
b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll index 8459ec8101ff2..c98567eaaf7d6 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll @@ -313,14 +313,16 @@ define <4 x double> @convert_low_s_v4f64(<8 x i32> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 ; CHECK-NEXT: f64x2.convert_low_i32x4_s -; CHECK-NEXT: v128.store 16 +; CHECK-NEXT: v128.store 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: f64x2.convert_low_i32x4_s -; CHECK-NEXT: v128.store 0 +; CHECK-NEXT: v128.store 16 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> %a = sitofp <4 x i32> %v to <4 x double> @@ -333,14 +335,16 @@ define <4 x double> @convert_low_u_v4f64(<8 x i32> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 ; CHECK-NEXT: f64x2.convert_low_i32x4_u -; CHECK-NEXT: v128.store 16 +; CHECK-NEXT: v128.store 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: f64x2.convert_low_i32x4_u -; CHECK-NEXT: v128.store 0 +; CHECK-NEXT: v128.store 16 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> %a = uitofp <4 x i32> %v to <4 x double> @@ -354,14 +358,16 @@ define <4 x double> @convert_low_s_v4f64_2(<8 x i32> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: 
local.get 1 -; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 ; CHECK-NEXT: f64x2.convert_low_i32x4_s -; CHECK-NEXT: v128.store 16 +; CHECK-NEXT: v128.store 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: f64x2.convert_low_i32x4_s -; CHECK-NEXT: v128.store 0 +; CHECK-NEXT: v128.store 16 ; CHECK-NEXT: # fallthrough-return %v = sitofp <8 x i32> %x to <8 x double> %a = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> @@ -374,14 +380,16 @@ define <4 x double> @convert_low_u_v4f64_2(<8 x i32> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 ; CHECK-NEXT: f64x2.convert_low_i32x4_u -; CHECK-NEXT: v128.store 16 +; CHECK-NEXT: v128.store 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: f64x2.convert_low_i32x4_u -; CHECK-NEXT: v128.store 0 +; CHECK-NEXT: v128.store 16 ; CHECK-NEXT: # fallthrough-return %v = uitofp <8 x i32> %x to <8 x double> %a = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> @@ -394,14 +402,16 @@ define <4 x double> @promote_low_v4f64(<8 x float> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 ; CHECK-NEXT: f64x2.promote_low_f32x4 -; CHECK-NEXT: v128.store 16 +; CHECK-NEXT: v128.store 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: 
v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: f64x2.promote_low_f32x4 -; CHECK-NEXT: v128.store 0 +; CHECK-NEXT: v128.store 16 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> %a = fpext <4 x float> %v to <4 x double> @@ -414,14 +424,16 @@ define <4 x double> @promote_low_v4f64_2(<8 x float> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 ; CHECK-NEXT: f64x2.promote_low_f32x4 -; CHECK-NEXT: v128.store 16 +; CHECK-NEXT: v128.store 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: f64x2.promote_low_f32x4 -; CHECK-NEXT: v128.store 0 +; CHECK-NEXT: v128.store 16 ; CHECK-NEXT: # fallthrough-return %v = fpext <8 x float> %x to <8 x double> %a = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> @@ -435,6 +447,8 @@ define <2 x double> @promote_mixed_v2f64(<4 x float> %x, <4 x float> %y) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: f64x2.promote_low_f32x4 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <4 x float> %x, <4 x float> %y, <2 x i32> diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll index c93b8aa7fb42e..19a6d8bc9d735 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll @@ -53,6 +53,8 @@ define <4 x float> @extend_to_float_high_i8x16_u(<8 x i8> %x) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0 +; CHECK-NEXT: v128.const -1, 0, 0, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: i16x8.extend_low_i8x16_u ; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: f32x4.convert_i32x4_u @@ -109,6 +111,8 @@ define <4 x float> @extend_to_float_high_i8x16_s(<8 x i8> %x) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: v128.const -1, 0, 0, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: i16x8.extend_low_i8x16_s ; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: f32x4.convert_i32x4_s diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending.ll b/llvm/test/CodeGen/WebAssembly/simd-extending.ll index 2445570bb8fa9..fa6e59198b3e3 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-extending.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending.ll @@ -171,6 +171,8 @@ define <8 x i16> @extend_lowish_i8x16_s(<16 x i8> %v) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: i16x8.extend_low_i8x16_s ; CHECK-NEXT: # fallthrough-return %lowish = shufflevector <16 x i8> %v, <16 x i8> undef, @@ -186,6 +188,8 @@ define <4 x i32> @extend_lowish_i16x8_s(<8 x i16> %v) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %lowish = shufflevector <8 x i16> %v, <8 x i16> undef, @@ -218,6 +222,8 @@ define <8 x i8> @extend_i1x8_i8(<8 x i1> %v) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: v128.const -1, 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: i32.const 7 ; CHECK-NEXT: i8x16.shl ; CHECK-NEXT: i32.const 7 diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll 
b/llvm/test/CodeGen/WebAssembly/simd.ll index 7228d5335a33f..53d8fc0bddb41 100644 --- a/llvm/test/CodeGen/WebAssembly/simd.ll +++ b/llvm/test/CodeGen/WebAssembly/simd.ll @@ -1261,7 +1261,9 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) { ; SIMD128: .functype shuffle_undef_v4i32 (v128, v128) -> (v128) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: return $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: return $pop2 ; ; NO-SIMD128-LABEL: shuffle_undef_v4i32: ; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () @@ -1520,7 +1522,9 @@ define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) { ; SIMD128: .functype shuffle_undef_v2i64 (v128, v128) -> (v128) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: return $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: return $pop2 ; ; NO-SIMD128-LABEL: shuffle_undef_v2i64: ; NO-SIMD128: .functype shuffle_undef_v2i64 (i32, i64, i64, i64, i64) -> () @@ -1788,7 +1792,9 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) { ; SIMD128: .functype shuffle_undef_v4f32 (v128, v128) -> (v128) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: return $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: return $pop2 ; ; NO-SIMD128-LABEL: shuffle_undef_v4f32: ; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () @@ -2048,7 +2054,9 @@ define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) { ; SIMD128: .functype 
shuffle_undef_v2f64 (v128, v128) -> (v128) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: return $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: return $pop2 ; ; NO-SIMD128-LABEL: shuffle_undef_v2f64: ; NO-SIMD128: .functype shuffle_undef_v2f64 (i32, f64, f64, f64, f64) -> () diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll index 1d194b640eab2..b1e201914922d 100644 --- a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll +++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll @@ -18,12 +18,14 @@ define i32 @pairwise_add_v4i32(<4 x i32> %arg) { ; SIMD128: .functype pairwise_add_v4i32 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.add $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.add $push2=, $pop4, $pop1 -; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i32x4.add $push7=, $0, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.add $push4=, $pop6, $pop3 +; SIMD128-NEXT: i32x4.extract_lane $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 %res = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg) ret i32 %res } @@ -33,15 +35,19 @@ define i16 @pairwise_add_v8i16(<8 x i16> %arg) { ; SIMD128: .functype pairwise_add_v8i16 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 
0, 1, 0, 1 -; SIMD128-NEXT: i16x8.add $push8=, $0, $pop0 -; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.add $push6=, $pop7, $pop1 -; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.add $push3=, $pop5, $pop2 -; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 -; SIMD128-NEXT: return $pop4 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i16x8.add $push12=, $0, $pop2 +; SIMD128-NEXT: local.tee $push11=, $0=, $pop12 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i16x8.add $push10=, $pop11, $pop5 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.add $push7=, $pop9, $pop6 +; SIMD128-NEXT: i16x8.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %arg) ret i16 %res } @@ -51,18 +57,22 @@ define i8 @pairwise_add_v16i8(<16 x i8> %arg) { ; SIMD128: .functype pairwise_add_v16i8 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.add $push11=, $0, $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i8x16.add $push15=, $0, $pop2 +; SIMD128-NEXT: local.tee $push14=, $0=, $pop15 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; 
SIMD128-NEXT: i8x16.add $push13=, $pop14, $pop5 +; SIMD128-NEXT: local.tee $push12=, $0=, $pop13 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push11=, $pop12, $pop6 ; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.add $push9=, $pop10, $pop1 -; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.add $push7=, $pop8, $pop2 -; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 -; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.add $push4=, $pop6, $pop3 -; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 -; SIMD128-NEXT: return $pop5 +; SIMD128-NEXT: i8x16.shuffle $push7=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push8=, $pop10, $pop7 +; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $pop8, 0 +; SIMD128-NEXT: return $pop9 %res = tail call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %arg) ret i8 %res } @@ -84,12 +94,14 @@ define i32 @pairwise_mul_v4i32(<4 x i32> %arg) { ; SIMD128: .functype pairwise_mul_v4i32 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.mul $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.mul $push2=, $pop4, $pop1 -; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i32x4.mul $push7=, $0, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: 
i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.mul $push4=, $pop6, $pop3 +; SIMD128-NEXT: i32x4.extract_lane $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 %res = tail call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %arg) ret i32 %res } @@ -99,15 +111,19 @@ define i16 @pairwise_mul_v8i16(<8 x i16> %arg) { ; SIMD128: .functype pairwise_mul_v8i16 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.mul $push8=, $0, $pop0 -; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.mul $push6=, $pop7, $pop1 -; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.mul $push3=, $pop5, $pop2 -; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 -; SIMD128-NEXT: return $pop4 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i16x8.mul $push12=, $0, $pop2 +; SIMD128-NEXT: local.tee $push11=, $0=, $pop12 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i16x8.mul $push10=, $pop11, $pop5 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.mul $push7=, $pop9, $pop6 +; SIMD128-NEXT: i16x8.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %arg) ret i16 %res } @@ -171,12 +187,14 @@ define i32 @pairwise_and_v4i32(<4 x i32> %arg) { ; SIMD128: .functype pairwise_and_v4i32 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: 
; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: v128.and $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: v128.and $push2=, $pop4, $pop1 -; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: v128.and $push7=, $0, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.and $push4=, $pop6, $pop3 +; SIMD128-NEXT: i32x4.extract_lane $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 %res = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %arg) ret i32 %res } @@ -186,15 +204,19 @@ define i16 @pairwise_and_v8i16(<8 x i16> %arg) { ; SIMD128: .functype pairwise_and_v8i16 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: v128.and $push8=, $0, $pop0 -; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: v128.and $push6=, $pop7, $pop1 -; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: v128.and $push3=, $pop5, $pop2 -; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 -; SIMD128-NEXT: return $pop4 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: v128.and $push12=, $0, $pop2 +; SIMD128-NEXT: local.tee $push11=, $0=, $pop12 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 
0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: v128.and $push10=, $pop11, $pop5 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.and $push7=, $pop9, $pop6 +; SIMD128-NEXT: i16x8.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %arg) ret i16 %res } @@ -204,18 +226,22 @@ define i8 @pairwise_and_v16i8(<16 x i8> %arg) { ; SIMD128: .functype pairwise_and_v16i8 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.and $push11=, $0, $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: v128.and $push15=, $0, $pop2 +; SIMD128-NEXT: local.tee $push14=, $0=, $pop15 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: v128.and $push13=, $pop14, $pop5 +; SIMD128-NEXT: local.tee $push12=, $0=, $pop13 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push11=, $pop12, $pop6 ; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.and $push9=, $pop10, $pop1 -; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.and $push7=, $pop8, $pop2 -; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 -; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.and $push4=, $pop6, $pop3 -; SIMD128-NEXT: i8x16.extract_lane_u $push5=, 
$pop4, 0 -; SIMD128-NEXT: return $pop5 +; SIMD128-NEXT: i8x16.shuffle $push7=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push8=, $pop10, $pop7 +; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $pop8, 0 +; SIMD128-NEXT: return $pop9 %res = tail call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %arg) ret i8 %res } @@ -237,12 +263,14 @@ define i32 @pairwise_or_v4i32(<4 x i32> %arg) { ; SIMD128: .functype pairwise_or_v4i32 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: v128.or $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: v128.or $push2=, $pop4, $pop1 -; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: v128.or $push7=, $0, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.or $push4=, $pop6, $pop3 +; SIMD128-NEXT: i32x4.extract_lane $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 %res = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %arg) ret i32 %res } @@ -252,15 +280,19 @@ define i16 @pairwise_or_v8i16(<8 x i16> %arg) { ; SIMD128: .functype pairwise_or_v8i16 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: v128.or $push8=, $0, $pop0 -; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: v128.or $push6=, $pop7, $pop1 -; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: v128.or $push3=, $pop5, $pop2 -; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 -; SIMD128-NEXT: return $pop4 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: v128.or $push12=, $0, $pop2 +; SIMD128-NEXT: local.tee $push11=, $0=, $pop12 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: v128.or $push10=, $pop11, $pop5 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.or $push7=, $pop9, $pop6 +; SIMD128-NEXT: i16x8.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %arg) ret i16 %res } @@ -270,18 +302,22 @@ define i8 @pairwise_or_v16i8(<16 x i8> %arg) { ; SIMD128: .functype pairwise_or_v16i8 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.or $push11=, $0, $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: v128.or $push15=, $0, $pop2 +; SIMD128-NEXT: local.tee $push14=, $0=, $pop15 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: v128.or $push13=, $pop14, $pop5 +; SIMD128-NEXT: local.tee $push12=, $0=, $pop13 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push11=, $pop12, $pop6 ; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0 -; SIMD128-NEXT: v128.or $push9=, $pop10, $pop1 -; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.or $push7=, $pop8, $pop2 -; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 -; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.or $push4=, $pop6, $pop3 -; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 -; SIMD128-NEXT: return $pop5 +; SIMD128-NEXT: i8x16.shuffle $push7=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push8=, $pop10, $pop7 +; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $pop8, 0 +; SIMD128-NEXT: return $pop9 %res = tail call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %arg) ret i8 %res } @@ -303,12 +339,14 @@ define i32 @pairwise_xor_v4i32(<4 x i32> %arg) { ; SIMD128: .functype pairwise_xor_v4i32 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: v128.xor $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: v128.xor $push2=, $pop4, $pop1 -; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: v128.xor $push7=, $0, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.xor $push4=, $pop6, $pop3 +; SIMD128-NEXT: i32x4.extract_lane $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 %res = tail call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %arg) ret i32 %res } @@ -318,15 +356,19 @@ define i16 @pairwise_xor_v8i16(<8 x i16> %arg) { ; SIMD128: .functype pairwise_xor_v8i16 
(v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: v128.xor $push8=, $0, $pop0 -; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: v128.xor $push6=, $pop7, $pop1 -; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: v128.xor $push3=, $pop5, $pop2 -; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 -; SIMD128-NEXT: return $pop4 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: v128.xor $push12=, $0, $pop2 +; SIMD128-NEXT: local.tee $push11=, $0=, $pop12 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: v128.xor $push10=, $pop11, $pop5 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.xor $push7=, $pop9, $pop6 +; SIMD128-NEXT: i16x8.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %arg) ret i16 %res } @@ -336,18 +378,22 @@ define i8 @pairwise_xor_v16i8(<16 x i8> %arg) { ; SIMD128: .functype pairwise_xor_v16i8 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.xor $push11=, $0, $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: v128.xor $push15=, $0, $pop2 +; SIMD128-NEXT: local.tee $push14=, $0=, $pop15 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: v128.xor $push13=, $pop14, $pop5 +; SIMD128-NEXT: local.tee $push12=, $0=, $pop13 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push11=, $pop12, $pop6 ; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.xor $push9=, $pop10, $pop1 -; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.xor $push7=, $pop8, $pop2 -; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 -; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: v128.xor $push4=, $pop6, $pop3 -; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 -; SIMD128-NEXT: return $pop5 +; SIMD128-NEXT: i8x16.shuffle $push7=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push8=, $pop10, $pop7 +; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $pop8, 0 +; SIMD128-NEXT: return $pop9 %res = tail call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %arg) ret i8 %res } @@ -356,12 +402,14 @@ define i64 @pairwise_smax_v2i64(<2 x i64> %arg) { ; SIMD128-LABEL: pairwise_smax_v2i64: ; SIMD128: .functype pairwise_smax_v2i64 (v128) -> (i64) ; SIMD128-NEXT: # %bb.0: -; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 -; SIMD128-NEXT: i64x2.gt_s $push0=, $0, $1 -; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 -; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 -; SIMD128-NEXT: return $pop2 +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; 
SIMD128-NEXT: v128.and $push6=, $pop0, $pop1 +; SIMD128-NEXT: local.tee $push5=, $1=, $pop6 +; SIMD128-NEXT: i64x2.gt_s $push2=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push3=, $0, $pop5, $pop2 +; SIMD128-NEXT: i64x2.extract_lane $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 %res = tail call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %arg) ret i64 %res } @@ -371,12 +419,16 @@ define i32 @pairwise_smax_v4i32(<4 x i32> %arg) { ; SIMD128: .functype pairwise_smax_v4i32 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.max_s $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.max_s $push2=, $pop4, $pop1 -; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i32x4.max_s $push9=, $0, $pop2 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i32x4.max_s $push6=, $pop8, $pop5 +; SIMD128-NEXT: i32x4.extract_lane $push7=, $pop6, 0 +; SIMD128-NEXT: return $pop7 %res = tail call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %arg) ret i32 %res } @@ -386,15 +438,19 @@ define i16 @pairwise_smax_v8i16(<8 x i16> %arg) { ; SIMD128: .functype pairwise_smax_v8i16 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.max_s $push8=, $0, $pop0 -; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: 
i16x8.max_s $push6=, $pop7, $pop1 -; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.max_s $push3=, $pop5, $pop2 -; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 -; SIMD128-NEXT: return $pop4 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i16x8.max_s $push12=, $0, $pop2 +; SIMD128-NEXT: local.tee $push11=, $0=, $pop12 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i16x8.max_s $push10=, $pop11, $pop5 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_s $push7=, $pop9, $pop6 +; SIMD128-NEXT: i16x8.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %arg) ret i16 %res } @@ -404,18 +460,22 @@ define i8 @pairwise_smax_v16i8(<16 x i8> %arg) { ; SIMD128: .functype pairwise_smax_v16i8 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.max_s $push11=, $0, $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i8x16.max_s $push15=, $0, $pop2 +; SIMD128-NEXT: local.tee $push14=, $0=, $pop15 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i8x16.max_s $push13=, $pop14, $pop5 +; SIMD128-NEXT: local.tee $push12=, $0=, $pop13 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; 
SIMD128-NEXT: i8x16.max_s $push11=, $pop12, $pop6 ; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.max_s $push9=, $pop10, $pop1 -; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.max_s $push7=, $pop8, $pop2 -; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 -; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.max_s $push4=, $pop6, $pop3 -; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 -; SIMD128-NEXT: return $pop5 +; SIMD128-NEXT: i8x16.shuffle $push7=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_s $push8=, $pop10, $pop7 +; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $pop8, 0 +; SIMD128-NEXT: return $pop9 %res = tail call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %arg) ret i8 %res } @@ -424,12 +484,14 @@ define i64 @pairwise_smin_v2i64(<2 x i64> %arg) { ; SIMD128-LABEL: pairwise_smin_v2i64: ; SIMD128: .functype pairwise_smin_v2i64 (v128) -> (i64) ; SIMD128-NEXT: # %bb.0: -; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 -; SIMD128-NEXT: i64x2.lt_s $push0=, $0, $1 -; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 -; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 -; SIMD128-NEXT: return $pop2 +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push6=, $pop0, $pop1 +; SIMD128-NEXT: local.tee $push5=, $1=, $pop6 +; SIMD128-NEXT: i64x2.lt_s $push2=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push3=, $0, $pop5, $pop2 +; SIMD128-NEXT: i64x2.extract_lane $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 %res = tail call 
i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %arg) ret i64 %res } @@ -439,12 +501,16 @@ define i32 @pairwise_smin_v4i32(<4 x i32> %arg) { ; SIMD128: .functype pairwise_smin_v4i32 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.min_s $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.min_s $push2=, $pop4, $pop1 -; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i32x4.min_s $push9=, $0, $pop2 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i32x4.min_s $push6=, $pop8, $pop5 +; SIMD128-NEXT: i32x4.extract_lane $push7=, $pop6, 0 +; SIMD128-NEXT: return $pop7 %res = tail call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %arg) ret i32 %res } @@ -454,15 +520,19 @@ define i16 @pairwise_smin_v8i16(<8 x i16> %arg) { ; SIMD128: .functype pairwise_smin_v8i16 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.min_s $push8=, $0, $pop0 -; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.min_s $push6=, $pop7, $pop1 -; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.min_s $push3=, $pop5, $pop2 -; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 -; 
SIMD128-NEXT: return $pop4 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i16x8.min_s $push12=, $0, $pop2 +; SIMD128-NEXT: local.tee $push11=, $0=, $pop12 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i16x8.min_s $push10=, $pop11, $pop5 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_s $push7=, $pop9, $pop6 +; SIMD128-NEXT: i16x8.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %arg) ret i16 %res } @@ -472,18 +542,22 @@ define i8 @pairwise_smin_v16i8(<16 x i8> %arg) { ; SIMD128: .functype pairwise_smin_v16i8 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.min_s $push11=, $0, $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i8x16.min_s $push15=, $0, $pop2 +; SIMD128-NEXT: local.tee $push14=, $0=, $pop15 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i8x16.min_s $push13=, $pop14, $pop5 +; SIMD128-NEXT: local.tee $push12=, $0=, $pop13 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push11=, $pop12, $pop6 ; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.min_s $push9=, $pop10, $pop1 -; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 
-; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.min_s $push7=, $pop8, $pop2 -; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 -; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.min_s $push4=, $pop6, $pop3 -; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 -; SIMD128-NEXT: return $pop5 +; SIMD128-NEXT: i8x16.shuffle $push7=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push8=, $pop10, $pop7 +; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $pop8, 0 +; SIMD128-NEXT: return $pop9 %res = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %arg) ret i8 %res } @@ -492,18 +566,20 @@ define i64 @pairwise_umax_v2i64(<2 x i64> %arg) { ; SIMD128-LABEL: pairwise_umax_v2i64: ; SIMD128: .functype pairwise_umax_v2i64 (v128) -> (i64) ; SIMD128-NEXT: # %bb.0: -; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: local.tee $push9=, $1=, $pop10 -; SIMD128-NEXT: i64.const $push4=, -1 -; SIMD128-NEXT: i64.const $push3=, 0 -; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0 -; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0 -; SIMD128-NEXT: i64.gt_u $push2=, $pop1, $pop0 -; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2 -; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5 -; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6 -; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0 -; SIMD128-NEXT: return $pop8 +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push12=, $pop0, $pop1 +; SIMD128-NEXT: local.tee $push11=, $1=, $pop12 +; SIMD128-NEXT: i64.const $push6=, -1 +; SIMD128-NEXT: i64.const $push5=, 0 +; SIMD128-NEXT: i64x2.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $1, 0 +; 
SIMD128-NEXT: i64.gt_u $push4=, $pop3, $pop2 +; SIMD128-NEXT: i64.select $push7=, $pop6, $pop5, $pop4 +; SIMD128-NEXT: i64x2.replace_lane $push8=, $0, 0, $pop7 +; SIMD128-NEXT: v128.bitselect $push9=, $0, $pop11, $pop8 +; SIMD128-NEXT: i64x2.extract_lane $push10=, $pop9, 0 +; SIMD128-NEXT: return $pop10 %res = tail call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %arg) ret i64 %res } @@ -513,12 +589,16 @@ define i32 @pairwise_umax_v4i32(<4 x i32> %arg) { ; SIMD128: .functype pairwise_umax_v4i32 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.max_u $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.max_u $push2=, $pop4, $pop1 -; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i32x4.max_u $push9=, $0, $pop2 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i32x4.max_u $push6=, $pop8, $pop5 +; SIMD128-NEXT: i32x4.extract_lane $push7=, $pop6, 0 +; SIMD128-NEXT: return $pop7 %res = tail call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %arg) ret i32 %res } @@ -528,15 +608,19 @@ define i16 @pairwise_umax_v8i16(<8 x i16> %arg) { ; SIMD128: .functype pairwise_umax_v8i16 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.max_u $push8=, $0, $pop0 -; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 
1, 0, 1 -; SIMD128-NEXT: i16x8.max_u $push6=, $pop7, $pop1 -; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.max_u $push3=, $pop5, $pop2 -; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 -; SIMD128-NEXT: return $pop4 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i16x8.max_u $push12=, $0, $pop2 +; SIMD128-NEXT: local.tee $push11=, $0=, $pop12 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i16x8.max_u $push10=, $pop11, $pop5 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_u $push7=, $pop9, $pop6 +; SIMD128-NEXT: i16x8.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %arg) ret i16 %res } @@ -546,18 +630,22 @@ define i8 @pairwise_umax_v16i8(<16 x i8> %arg) { ; SIMD128: .functype pairwise_umax_v16i8 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.max_u $push11=, $0, $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i8x16.max_u $push15=, $0, $pop2 +; SIMD128-NEXT: local.tee $push14=, $0=, $pop15 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i8x16.max_u $push13=, $pop14, $pop5 +; SIMD128-NEXT: local.tee $push12=, $0=, $pop13 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push11=, $pop12, $pop6 ; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.max_u $push9=, $pop10, $pop1 -; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.max_u $push7=, $pop8, $pop2 -; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 -; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.max_u $push4=, $pop6, $pop3 -; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 -; SIMD128-NEXT: return $pop5 +; SIMD128-NEXT: i8x16.shuffle $push7=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push8=, $pop10, $pop7 +; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $pop8, 0 +; SIMD128-NEXT: return $pop9 %res = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %arg) ret i8 %res } @@ -566,18 +654,20 @@ define i64 @pairwise_umin_v2i64(<2 x i64> %arg) { ; SIMD128-LABEL: pairwise_umin_v2i64: ; SIMD128: .functype pairwise_umin_v2i64 (v128) -> (i64) ; SIMD128-NEXT: # %bb.0: -; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: local.tee $push9=, $1=, $pop10 -; SIMD128-NEXT: i64.const $push4=, -1 -; SIMD128-NEXT: i64.const $push3=, 0 -; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0 -; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0 -; SIMD128-NEXT: i64.lt_u $push2=, $pop1, $pop0 -; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2 -; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5 -; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6 -; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0 -; SIMD128-NEXT: return $pop8 +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: 
v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push12=, $pop0, $pop1 +; SIMD128-NEXT: local.tee $push11=, $1=, $pop12 +; SIMD128-NEXT: i64.const $push6=, -1 +; SIMD128-NEXT: i64.const $push5=, 0 +; SIMD128-NEXT: i64x2.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $1, 0 +; SIMD128-NEXT: i64.lt_u $push4=, $pop3, $pop2 +; SIMD128-NEXT: i64.select $push7=, $pop6, $pop5, $pop4 +; SIMD128-NEXT: i64x2.replace_lane $push8=, $0, 0, $pop7 +; SIMD128-NEXT: v128.bitselect $push9=, $0, $pop11, $pop8 +; SIMD128-NEXT: i64x2.extract_lane $push10=, $pop9, 0 +; SIMD128-NEXT: return $pop10 %res = tail call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %arg) ret i64 %res } @@ -587,12 +677,16 @@ define i32 @pairwise_umin_v4i32(<4 x i32> %arg) { ; SIMD128: .functype pairwise_umin_v4i32 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.min_u $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: i32x4.min_u $push2=, $pop4, $pop1 -; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i32x4.min_u $push9=, $0, $pop2 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i32x4.min_u $push6=, $pop8, $pop5 +; SIMD128-NEXT: i32x4.extract_lane $push7=, $pop6, 0 +; SIMD128-NEXT: return $pop7 %res = tail call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %arg) ret i32 %res } @@ -602,15 +696,19 @@ define i16 @pairwise_umin_v8i16(<8 x i16> %arg) { ; SIMD128: .functype pairwise_umin_v8i16 (v128) -> (i32) ; 
SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.min_u $push8=, $0, $pop0 -; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.min_u $push6=, $pop7, $pop1 -; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 -; SIMD128-NEXT: i16x8.min_u $push3=, $pop5, $pop2 -; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 -; SIMD128-NEXT: return $pop4 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i16x8.min_u $push12=, $0, $pop2 +; SIMD128-NEXT: local.tee $push11=, $0=, $pop12 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i16x8.min_u $push10=, $pop11, $pop5 +; SIMD128-NEXT: local.tee $push9=, $0=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_u $push7=, $pop9, $pop6 +; SIMD128-NEXT: i16x8.extract_lane_u $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %arg) ret i16 %res } @@ -620,18 +718,22 @@ define i8 @pairwise_umin_v16i8(<16 x i8> %arg) { ; SIMD128: .functype pairwise_umin_v16i8 (v128) -> (i32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.min_u $push11=, $0, $pop0 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: i8x16.min_u $push15=, $0, $pop2 +; SIMD128-NEXT: local.tee $push14=, $0=, $pop15 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: i8x16.min_u $push13=, $pop14, $pop5 +; SIMD128-NEXT: local.tee $push12=, $0=, $pop13 +; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push11=, $pop12, $pop6 ; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.min_u $push9=, $pop10, $pop1 -; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 -; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.min_u $push7=, $pop8, $pop2 -; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 -; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; SIMD128-NEXT: i8x16.min_u $push4=, $pop6, $pop3 -; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 -; SIMD128-NEXT: return $pop5 +; SIMD128-NEXT: i8x16.shuffle $push7=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push8=, $pop10, $pop7 +; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $pop8, 0 +; SIMD128-NEXT: return $pop9 %res = tail call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %arg) ret i8 %res } @@ -763,14 +865,16 @@ define float @pairwise_mul_v4f32_reassoc(<4 x float> %arg) { ; SIMD128: .functype pairwise_mul_v4f32_reassoc (v128) -> (f32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: f32x4.mul $push7=, $0, $pop0 -; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: f32x4.mul $push2=, $pop6, $pop1 -; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: f32.const $push4=, -0x0p0 -; SIMD128-NEXT: f32.mul $push5=, 
$pop3, $pop4 -; SIMD128-NEXT: return $pop5 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: f32x4.mul $push9=, $0, $pop2 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.mul $push4=, $pop8, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $pop4, 0 +; SIMD128-NEXT: f32.const $push6=, -0x0p0 +; SIMD128-NEXT: f32.mul $push7=, $pop5, $pop6 +; SIMD128-NEXT: return $pop7 %res = tail call reassoc float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg) ret float %res } @@ -792,9 +896,11 @@ define double @pairwise_max_v2f64_fast(<2 x double> %arg) { ; SIMD128: .functype pairwise_max_v2f64_fast (v128) -> (f64) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: f64x2.pmax $push1=, $0, $pop0 -; SIMD128-NEXT: f64x2.extract_lane $push2=, $pop1, 0 -; SIMD128-NEXT: return $pop2 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: f64x2.pmax $push3=, $0, $pop2 +; SIMD128-NEXT: f64x2.extract_lane $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 %res = tail call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg) ret double%res } @@ -820,12 +926,16 @@ define float @pairwise_max_v4f32_fast(<4 x float> %arg) { ; SIMD128: .functype pairwise_max_v4f32_fast (v128) -> (f32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: f32x4.pmax $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: f32x4.pmax $push2=, $pop4, $pop1 -; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 
0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: f32x4.pmax $push9=, $0, $pop2 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: f32x4.pmax $push6=, $pop8, $pop5 +; SIMD128-NEXT: f32x4.extract_lane $push7=, $pop6, 0 +; SIMD128-NEXT: return $pop7 %res = tail call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg) ret float %res } @@ -863,9 +973,11 @@ define double @pairwise_min_v2f64_fast(<2 x double> %arg) { ; SIMD128: .functype pairwise_min_v2f64_fast (v128) -> (f64) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: f64x2.pmin $push1=, $0, $pop0 -; SIMD128-NEXT: f64x2.extract_lane $push2=, $pop1, 0 -; SIMD128-NEXT: return $pop2 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: f64x2.pmin $push3=, $0, $pop2 +; SIMD128-NEXT: f64x2.extract_lane $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 %res = tail call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg) ret double%res } @@ -891,12 +1003,16 @@ define float @pairwise_min_v4f32_fast(<4 x float> %arg) { ; SIMD128: .functype pairwise_min_v4f32_fast (v128) -> (f32) ; SIMD128-NEXT: # %bb.0: ; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: f32x4.pmin $push5=, $0, $pop0 -; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 -; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -; SIMD128-NEXT: f32x4.pmin $push2=, $pop4, $pop1 -; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 -; SIMD128-NEXT: return $pop3 +; SIMD128-NEXT: v128.const $push1=, -1, 0 +; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1 +; SIMD128-NEXT: f32x4.pmin $push9=, 
$0, $pop2 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.const $push4=, -1, 0, 0, 0 +; SIMD128-NEXT: v128.and $push5=, $pop3, $pop4 +; SIMD128-NEXT: f32x4.pmin $push6=, $pop8, $pop5 +; SIMD128-NEXT: f32x4.extract_lane $push7=, $pop6, 0 +; SIMD128-NEXT: return $pop7 %res = tail call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg) ret float %res } diff --git a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll index 94aa197bfd564..736bff1ad80fe 100644 --- a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll +++ b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll @@ -121,14 +121,16 @@ define <8 x i32> @sext_zext_mul_v8i8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-NEXT: i32x4.extend_low_i16x8_u $push1=, $pop0 ; CHECK-NEXT: i32x4.mul $push4=, $pop3, $pop1 ; CHECK-NEXT: v128.store 0($0), $pop4 -; CHECK-NEXT: i8x16.shuffle $push11=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK-NEXT: local.tee $push10=, $1=, $pop11 -; CHECK-NEXT: i16x8.extend_low_i8x16_s $push7=, $pop10 -; CHECK-NEXT: i32x4.extend_low_i16x8_s $push8=, $pop7 -; CHECK-NEXT: i16x8.extend_low_i8x16_u $push5=, $1 -; CHECK-NEXT: i32x4.extend_low_i16x8_u $push6=, $pop5 -; CHECK-NEXT: i32x4.mul $push9=, $pop8, $pop6 -; CHECK-NEXT: v128.store 16($0), $pop9 +; CHECK-NEXT: i8x16.shuffle $push5=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: v128.const $push6=, -1, 0, 0, 0 +; CHECK-NEXT: v128.and $push13=, $pop5, $pop6 +; CHECK-NEXT: local.tee $push12=, $1=, $pop13 +; CHECK-NEXT: i16x8.extend_low_i8x16_s $push9=, $pop12 +; CHECK-NEXT: i32x4.extend_low_i16x8_s $push10=, $pop9 +; CHECK-NEXT: i16x8.extend_low_i8x16_u $push7=, $1 +; CHECK-NEXT: i32x4.extend_low_i16x8_u $push8=, $pop7 +; CHECK-NEXT: i32x4.mul $push11=, $pop10, $pop8 +; CHECK-NEXT: v128.store 16($0), $pop11 ; CHECK-NEXT: return %wide.a = sext <8 x i8> %a to 
<8 x i32> %wide.b = zext <8 x i8> %a to <8 x i32> @@ -146,30 +148,36 @@ define <16 x i32> @sext_zext_mul_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK-NEXT: i32x4.extend_low_i16x8_u $push1=, $pop0 ; CHECK-NEXT: i32x4.mul $push4=, $pop3, $pop1 ; CHECK-NEXT: v128.store 0($0), $pop4 -; CHECK-NEXT: i8x16.shuffle $push25=, $1, $1, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK-NEXT: local.tee $push24=, $3=, $pop25 -; CHECK-NEXT: i16x8.extend_low_i8x16_s $push7=, $pop24 -; CHECK-NEXT: i32x4.extend_low_i16x8_s $push8=, $pop7 -; CHECK-NEXT: i16x8.extend_low_i8x16_u $push5=, $3 -; CHECK-NEXT: i32x4.extend_low_i16x8_u $push6=, $pop5 -; CHECK-NEXT: i32x4.mul $push9=, $pop8, $pop6 -; CHECK-NEXT: v128.store 48($0), $pop9 -; CHECK-NEXT: i8x16.shuffle $push23=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK-NEXT: local.tee $push22=, $3=, $pop23 -; CHECK-NEXT: i16x8.extend_low_i8x16_s $push12=, $pop22 -; CHECK-NEXT: i32x4.extend_low_i16x8_s $push13=, $pop12 -; CHECK-NEXT: i16x8.extend_low_i8x16_u $push10=, $3 -; CHECK-NEXT: i32x4.extend_low_i16x8_u $push11=, $pop10 -; CHECK-NEXT: i32x4.mul $push14=, $pop13, $pop11 -; CHECK-NEXT: v128.store 32($0), $pop14 -; CHECK-NEXT: i8x16.shuffle $push21=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK-NEXT: local.tee $push20=, $1=, $pop21 -; CHECK-NEXT: i16x8.extend_low_i8x16_s $push17=, $pop20 -; CHECK-NEXT: i32x4.extend_low_i16x8_s $push18=, $pop17 -; CHECK-NEXT: i16x8.extend_low_i8x16_u $push15=, $1 -; CHECK-NEXT: i32x4.extend_low_i16x8_u $push16=, $pop15 -; CHECK-NEXT: i32x4.mul $push19=, $pop18, $pop16 -; CHECK-NEXT: v128.store 16($0), $pop19 +; CHECK-NEXT: i8x16.shuffle $push5=, $1, $1, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: v128.const $push31=, -1, 0, 0, 0 +; CHECK-NEXT: local.tee $push30=, $3=, $pop31 +; CHECK-NEXT: v128.and $push29=, $pop5, $pop30 +; CHECK-NEXT: local.tee $push28=, $4=, $pop29 +; CHECK-NEXT: i16x8.extend_low_i8x16_s $push8=, $pop28 +; 
CHECK-NEXT: i32x4.extend_low_i16x8_s $push9=, $pop8 +; CHECK-NEXT: i16x8.extend_low_i8x16_u $push6=, $4 +; CHECK-NEXT: i32x4.extend_low_i16x8_u $push7=, $pop6 +; CHECK-NEXT: i32x4.mul $push10=, $pop9, $pop7 +; CHECK-NEXT: v128.store 48($0), $pop10 +; CHECK-NEXT: i8x16.shuffle $push11=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: v128.const $push12=, -1, 0 +; CHECK-NEXT: v128.and $push27=, $pop11, $pop12 +; CHECK-NEXT: local.tee $push26=, $4=, $pop27 +; CHECK-NEXT: i16x8.extend_low_i8x16_s $push15=, $pop26 +; CHECK-NEXT: i32x4.extend_low_i16x8_s $push16=, $pop15 +; CHECK-NEXT: i16x8.extend_low_i8x16_u $push13=, $4 +; CHECK-NEXT: i32x4.extend_low_i16x8_u $push14=, $pop13 +; CHECK-NEXT: i32x4.mul $push17=, $pop16, $pop14 +; CHECK-NEXT: v128.store 32($0), $pop17 +; CHECK-NEXT: i8x16.shuffle $push18=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: v128.and $push25=, $pop18, $3 +; CHECK-NEXT: local.tee $push24=, $1=, $pop25 +; CHECK-NEXT: i16x8.extend_low_i8x16_s $push21=, $pop24 +; CHECK-NEXT: i32x4.extend_low_i16x8_s $push22=, $pop21 +; CHECK-NEXT: i16x8.extend_low_i8x16_u $push19=, $1 +; CHECK-NEXT: i32x4.extend_low_i16x8_u $push20=, $pop19 +; CHECK-NEXT: i32x4.mul $push23=, $pop22, $pop20 +; CHECK-NEXT: v128.store 16($0), $pop23 ; CHECK-NEXT: return %wide.a = sext <16 x i8> %a to <16 x i32> %wide.b = zext <16 x i8> %a to <16 x i32>