Skip to content

[WebAssembly] Mask undef shuffle lanes #149084

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2719,18 +2719,52 @@ WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
Ops[OpIdx++] = Op.getOperand(0);
Ops[OpIdx++] = Op.getOperand(1);

std::bitset<16> DefinedLaneBytes = 0xFFFF;
// Expand mask indices to byte indices and materialize them as operands
for (int M : Mask) {
for (size_t J = 0; J < LaneBytes; ++J) {
// Lower undef lanes (represented by -1 in the mask) to byte indices
// {0..LaneBytes-1}, i.e. a whole lane of the first vector input, to allow
// further reduction by the VM. E.g. this lets it match an 8x16 byte shuffle
// to an equivalent, cheaper 32x4 shuffle.
if (M == -1) {
DefinedLaneBytes[OpIdx - 2] = 0;
}
uint64_t ByteIndex = M == -1 ? J : (uint64_t)M * LaneBytes + J;
Ops[OpIdx++] = DAG.getConstant(ByteIndex, DL, MVT::i32);
}
}

return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
EVT VT = Op.getValueType();
SDValue Shuffle = DAG.getNode(WebAssemblyISD::SHUFFLE, DL, VT, Ops);

// If only the lower four or eight bytes are actually defined by the
// shuffle, insert an AND that zeroes the remaining upper bytes, so that a VM
// can tell that the higher, undef lanes may be ignored.
if (DefinedLaneBytes == 0xF) {
SDValue LowLaneMask[] = {
DAG.getConstant(uint32_t(-1), DL, MVT::i32),
DAG.getConstant(uint32_t(0), DL, MVT::i32),
DAG.getConstant(uint32_t(0), DL, MVT::i32),
DAG.getConstant(uint32_t(0), DL, MVT::i32),
};
SDValue UndefMask =
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, LowLaneMask);
SDValue MaskedShuffle =
DAG.getNode(ISD::AND, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, Shuffle), UndefMask);
return DAG.getBitcast(VT, MaskedShuffle);
} else if (DefinedLaneBytes == 0xFF) {
SDValue LowLaneMask[] = {
DAG.getConstant(uint64_t(-1), DL, MVT::i64),
DAG.getConstant(uint32_t(0), DL, MVT::i64),
};
SDValue UndefMask =
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64, LowLaneMask);
SDValue MaskedShuffle =
DAG.getNode(ISD::AND, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, Shuffle), UndefMask);
return DAG.getBitcast(VT, MaskedShuffle);
}
return Shuffle;
}

SDValue WebAssemblyTargetLowering::LowerSETCC(SDValue Op,
Expand Down
28 changes: 18 additions & 10 deletions llvm/test/CodeGen/WebAssembly/extend-shuffles.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ define <4 x i32> @sext_high_v4i8(<8 x i8> %in) {
; SIMD128: .functype sext_high_v4i8 (v128) -> (v128)
; SIMD128-NEXT: # %bb.0:
; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push1=, $pop0
; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push2=, $pop1
; SIMD128-NEXT: return $pop2
; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0
; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1
; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push3=, $pop2
; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push4=, $pop3
; SIMD128-NEXT: return $pop4
%shuffle = shufflevector <8 x i8> %in, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%res = sext <4 x i8> %shuffle to <4 x i32>
ret <4 x i32> %res
Expand All @@ -23,9 +25,11 @@ define <4 x i32> @zext_high_v4i8(<8 x i8> %in) {
; SIMD128: .functype zext_high_v4i8 (v128) -> (v128)
; SIMD128-NEXT: # %bb.0:
; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push1=, $pop0
; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push2=, $pop1
; SIMD128-NEXT: return $pop2
; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0
; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1
; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push3=, $pop2
; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push4=, $pop3
; SIMD128-NEXT: return $pop4
%shuffle = shufflevector <8 x i8> %in, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%res = zext <4 x i8> %shuffle to <4 x i32>
ret <4 x i32> %res
Expand Down Expand Up @@ -58,8 +62,10 @@ define <2 x i32> @sext_high_v2i16(<4 x i16> %in) {
; SIMD128: .functype sext_high_v2i16 (v128) -> (v128)
; SIMD128-NEXT: # %bb.0:
; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push1=, $pop0
; SIMD128-NEXT: return $pop1
; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0
; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1
; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push3=, $pop2
; SIMD128-NEXT: return $pop3
%shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
%res = sext <2 x i16> %shuffle to <2 x i32>
ret <2 x i32> %res
Expand All @@ -70,8 +76,10 @@ define <2 x i32> @zext_high_v2i16(<4 x i16> %in) {
; SIMD128: .functype zext_high_v2i16 (v128) -> (v128)
; SIMD128-NEXT: # %bb.0:
; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push1=, $pop0
; SIMD128-NEXT: return $pop1
; SIMD128-NEXT: v128.const $push1=, -1, 0, 0, 0
; SIMD128-NEXT: v128.and $push2=, $pop0, $pop1
; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push3=, $pop2
; SIMD128-NEXT: return $pop3
%shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
%res = zext <2 x i16> %shuffle to <2 x i32>
ret <2 x i32> %res
Expand Down
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) {
; CHECK-NEXT: v128.bitselect
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
Expand Down Expand Up @@ -76,6 +78,8 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) {
; CHECK-NEXT: v128.bitselect
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptoui <2 x double> %x to <2 x i64>
Expand Down Expand Up @@ -112,6 +116,8 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) {
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
Expand Down Expand Up @@ -301,6 +307,8 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) {
; CHECK-NEXT: i32x4.max_s
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0, 0, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
Expand Down Expand Up @@ -328,6 +336,8 @@ define <2 x i16> @utest_f64i16(<2 x double> %x) {
; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0, 0, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptoui <2 x double> %x to <2 x i32>
Expand Down Expand Up @@ -355,6 +365,8 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) {
; CHECK-NEXT: i32x4.max_s
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0, 0, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
Expand All @@ -378,6 +390,8 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) {
; CHECK-NEXT: i32x4.max_s
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
Expand All @@ -399,6 +413,8 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) {
; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptoui <4 x float> %x to <4 x i32>
Expand All @@ -420,6 +436,8 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) {
; CHECK-NEXT: i32x4.max_s
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
Expand Down Expand Up @@ -1484,6 +1502,8 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) {
; CHECK-NEXT: v128.bitselect
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
Expand Down Expand Up @@ -1526,6 +1546,8 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) {
; CHECK-NEXT: v128.bitselect
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptoui <2 x double> %x to <2 x i64>
Expand Down Expand Up @@ -1561,6 +1583,8 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) {
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
Expand Down Expand Up @@ -1738,6 +1762,8 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) {
; CHECK-NEXT: i32x4.max_s
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0, 0, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
Expand All @@ -1763,6 +1789,8 @@ define <2 x i16> @utest_f64i16_mm(<2 x double> %x) {
; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0, 0, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptoui <2 x double> %x to <2 x i32>
Expand All @@ -1789,6 +1817,8 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) {
; CHECK-NEXT: i32x4.max_s
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0, 0, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
Expand All @@ -1810,6 +1840,8 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) {
; CHECK-NEXT: i32x4.max_s
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
Expand All @@ -1829,6 +1861,8 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) {
; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptoui <4 x float> %x to <4 x i32>
Expand All @@ -1849,6 +1883,8 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) {
; CHECK-NEXT: i32x4.max_s
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
Expand Down
6 changes: 6 additions & 0 deletions llvm/test/CodeGen/WebAssembly/simd-concat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ define <8 x i8> @concat_v4i8(<4 x i8> %a, <4 x i8> %b) {
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
%v = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %v
Expand All @@ -48,6 +50,8 @@ define <4 x i8> @concat_v2i8(<2 x i8> %a, <2 x i8> %b) {
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 1, 16, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: v128.const -1, 0, 0, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
%v = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i8> %v
Expand All @@ -60,6 +64,8 @@ define <4 x i16> @concat_v2i16(<2 x i16> %a, <2 x i16> %b) {
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK-NEXT: v128.const -1, 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: # fallthrough-return
%v = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i16> %v
Expand Down
Loading
Loading