diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 96714adf78e43..70034aa3b9107 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49803,8 +49803,31 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
   SDLoc DL(N);
 
+  // If both operands of a 64-bit multiply are known to have their upper 48 bits
+  // zero, each operand is at most 0xFFFF, so the product is at most
+  // 0xFFFF * 0xFFFF = 0xFFFE0001. That always fits in an unsigned 32-bit
+  // integer (though not necessarily in a signed one), so the multiplication
+  // can be performed as a 32-bit `mul` followed by a zero-extension to i64.
+  if (VT == MVT::i64 && Subtarget.is64Bit()) {
+    APInt HiMask = APInt::getHighBitsSet(64, 48);
+    if (DAG.MaskedValueIsZero(Op0, HiMask) &&
+        DAG.MaskedValueIsZero(Op1, HiMask)) {
+      SDValue LHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op0);
+      SDValue RHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
+      // Recompute the wrap flags for the narrowed operation; flags valid at
+      // i64 are not necessarily valid at i32.
+      SDNodeFlags Flags;
+      Flags.setNoUnsignedWrap(DAG.willNotOverflowMul(false, LHS, RHS));
+      Flags.setNoSignedWrap(DAG.willNotOverflowMul(true, LHS, RHS));
+      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i32, LHS, RHS, Flags);
+      return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Mul);
+    }
+  }
+
   if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
     return V;
 
@@ -58070,8 +58093,27 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   SDLoc DL(N);
 
+  // Use a 32-bit add+zext if the upper 33 bits of both operands are known
+  // zero: each operand is then below 2^31, so the sum is below 2^32 and fits
+  // in an unsigned 32-bit integer.
+  if (VT == MVT::i64 && Subtarget.is64Bit()) {
+    APInt HiMask = APInt::getHighBitsSet(64, 33);
+    if (DAG.MaskedValueIsZero(Op0, HiMask) &&
+        DAG.MaskedValueIsZero(Op1, HiMask)) {
+      SDValue LHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op0);
+      SDValue RHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
+      // Recompute the wrap flags for the narrowed operation; flags valid at
+      // i64 are not necessarily valid at i32.
+      SDNodeFlags Flags;
+      Flags.setNoUnsignedWrap(DAG.willNotOverflowAdd(false, LHS, RHS));
+      Flags.setNoSignedWrap(DAG.willNotOverflowAdd(true, LHS, RHS));
+      SDValue Sum = DAG.getNode(ISD::ADD, DL, MVT::i32, LHS, RHS, Flags);
+      return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Sum);
+    }
+  }
+
   if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
     return Select;
 
@@ -58297,8 +58339,29 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   SDLoc DL(N);
 
+  // Use a 32-bit sub+zext if the upper 33 bits of both operands are known
+  // zero and the subtraction is known not to wrap as an unsigned operation.
+  // Without the no-wrap guarantee the i64 result may be negative (e.g. 0 - 1),
+  // which a zero-extended 32-bit result cannot represent.
+  if (VT == MVT::i64 && Subtarget.is64Bit()) {
+    APInt HiMask = APInt::getHighBitsSet(64, 33);
+    if (DAG.MaskedValueIsZero(Op0, HiMask) &&
+        DAG.MaskedValueIsZero(Op1, HiMask) &&
+        DAG.willNotOverflowSub(false, Op0, Op1)) {
+      SDValue LHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op0);
+      SDValue RHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
+      // The unsigned no-wrap property was just established; recompute the
+      // signed one for the narrowed operands.
+      SDNodeFlags Flags;
+      Flags.setNoUnsignedWrap(true);
+      Flags.setNoSignedWrap(DAG.willNotOverflowSub(true, LHS, RHS));
+      SDValue Sub = DAG.getNode(ISD::SUB, DL, MVT::i32, LHS, RHS, Flags);
+      return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Sub);
+    }
+  }
+
   auto IsNonOpaqueConstant = [&](SDValue Op) {
     return DAG.isConstantIntBuildVectorOrConstantInt(Op,
                                                      /*AllowOpaques*/ false);
diff --git a/llvm/test/CodeGen/X86/reduce-i64-add.ll b/llvm/test/CodeGen/X86/reduce-i64-add.ll
new file mode 100644
index 0000000000000..97d98df739d80
--- /dev/null
+++ b/llvm/test/CodeGen/X86/reduce-i64-add.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s --check-prefix=X64-LINUX
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefix=X64-WIN32
+
+define i64 @test1(i16 %a) {
+; X86-LABEL: test1:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $42, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LINUX-LABEL: test1:
+; X64-LINUX:       # %bb.0:
+; X64-LINUX-NEXT:    movzwl %di, %eax
+; X64-LINUX-NEXT:    addl $42, %eax
+; X64-LINUX-NEXT:    retq
+;
+; X64-WIN32-LABEL: test1:
+; X64-WIN32:       # %bb.0:
+; X64-WIN32-NEXT:    movzwl %cx, %eax
+; X64-WIN32-NEXT:    addl $42, %eax
+; X64-WIN32-NEXT:    retq
+  %zext_a = zext i16 %a to i64
+  %sum = add i64 %zext_a, 42
+  ret i64 %sum
+}
diff --git a/llvm/test/CodeGen/X86/reduce-i64-mul.ll b/llvm/test/CodeGen/X86/reduce-i64-mul.ll
new file mode 100644
index 0000000000000..b592c0ba270bc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/reduce-i64-mul.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s --check-prefix=X64-LINUX
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefix=X64-WIN32
+
+define i64 @test1(i16 %a) {
+; X86-LABEL: test1:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $42, %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    retl
+;
+; X64-LINUX-LABEL: test1:
+; X64-LINUX:       # %bb.0:
+; X64-LINUX-NEXT:    movzwl %di, %eax
+; X64-LINUX-NEXT:    imull $42, %eax, %eax
+; X64-LINUX-NEXT:    retq
+;
+; X64-WIN32-LABEL: test1:
+; X64-WIN32:       # %bb.0:
+; X64-WIN32-NEXT:    movzwl %cx, %eax
+; X64-WIN32-NEXT:    imull $42, %eax, %eax
+; X64-WIN32-NEXT:    retq
+
+  %zext_a = zext i16 %a to i64
+  %mul = mul i64 %zext_a, 42
+  ret i64 %mul
+}
diff --git a/llvm/test/CodeGen/X86/reduce-i64-sub.ll b/llvm/test/CodeGen/X86/reduce-i64-sub.ll
new file mode 100644
index 0000000000000..9cabe7fc1a1e6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/reduce-i64-sub.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s --check-prefix=X64-LINUX
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefix=X64-WIN32
+
+; FIXME: The CHECK lines below expect `addl $42` for `sub i64 %zext_a, -42`, so
+; this test does not appear to exercise a subtraction; consider adding a case
+; that subtracts two variable operands.
+
+define i64 @test1(i16 %a) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    addl $42, %eax
+; CHECK-NEXT:    retq
+;
+; X86-LABEL: test1:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $42, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LINUX-LABEL: test1:
+; X64-LINUX:       # %bb.0:
+; X64-LINUX-NEXT:    movzwl %di, %eax
+; X64-LINUX-NEXT:    addl $42, %eax
+; X64-LINUX-NEXT:    retq
+;
+; X64-WIN32-LABEL: test1:
+; X64-WIN32:       # %bb.0:
+; X64-WIN32-NEXT:    movzwl %cx, %eax
+; X64-WIN32-NEXT:    addl $42, %eax
+; X64-WIN32-NEXT:    retq
+  %zext_a = zext i16 %a to i64
+  %sub = sub i64 %zext_a, -42
+  ret i64 %sub
+}