Skip to content

[NVPTX] Fixup under-aligned dynamic alloca lowering #139628

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

AlexMaclean
Copy link
Member

The alignment on an ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that the default stack alignment should be used. Prior to this change, we passed this alignment through unchanged, leading to an error in ptxas. Now, we use the default stack alignment in this case. Also did a little cleanup while I'm here.

@llvmbot
Copy link
Member

llvmbot commented May 12, 2025

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

The alignment on an ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that the default stack alignment should be used. Prior to this change, we passed this alignment through unchanged, leading to an error in ptxas. Now, we use the default stack alignment in this case. Also did a little cleanup while I'm here.


Full diff: https://github.com/llvm/llvm-project/pull/139628.diff

3 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+17-7)
  • (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+9-17)
  • (modified) llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll (+88-27)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3769aae7b620f..8bf0723220093 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2056,18 +2056,28 @@ SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     return DAG.getMergeValues(Ops, SDLoc());
   }
 
+  SDLoc DL(Op.getNode());
   SDValue Chain = Op.getOperand(0);
   SDValue Size = Op.getOperand(1);
-  uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
-  SDLoc DL(Op.getNode());
+  uint64_t Align = Op.getConstantOperandVal(2);
+
+  // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
+  // the default stack alignment should be used.
+  if (Align == 0)
+    Align = DAG.getSubtarget().getFrameLowering()->getStackAlign().value();
 
   // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
-  MVT ValueSizeTy = nvTM->is64Bit() ? MVT::i64 : MVT::i32;
+  const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
+
+  SDValue Alloc =
+      DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
+                  {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
+                   DAG.getTargetConstant(Align, DL, MVT::i32)});
+
+  SDValue ASC = DAG.getAddrSpaceCast(
+      DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
 
-  SDValue AllocOps[] = {Chain, DAG.getZExtOrTrunc(Size, DL, ValueSizeTy),
-                        DAG.getTargetConstant(Align, DL, MVT::i32)};
-  EVT RetTypes[] = {ValueSizeTy, MVT::Other};
-  return DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, RetTypes, AllocOps);
+  return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
 }
 
 SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6639554e450f2..a90dfe7a0e6ca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3102,28 +3102,20 @@ def CALL_PROTOTYPE :
             "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
 
 def SDTDynAllocaOp :
-  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
+  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>;
 
 def dyn_alloca :
   SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
          [SDNPHasChain, SDNPSideEffect]>;
 
-def DYNAMIC_STACKALLOC32 :
-  NVPTXInst<(outs Int32Regs:$ptr),
-            (ins Int32Regs:$size, i32imm:$align),
-            "alloca.u32 \t$ptr, $size, $align;\n\t"
-            "cvta.local.u32 \t$ptr, $ptr;",
-            [(set i32:$ptr, (dyn_alloca i32:$size, (i32 timm:$align)))]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
-def DYNAMIC_STACKALLOC64 :
-  NVPTXInst<(outs Int64Regs:$ptr),
-            (ins Int64Regs:$size, i32imm:$align),
-            "alloca.u64 \t$ptr, $size, $align;\n\t"
-            "cvta.local.u64 \t$ptr, $ptr;",
-            [(set i64:$ptr, (dyn_alloca i64:$size, (i32 timm:$align)))]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
+foreach t = [I32RT, I64RT] in {
+  def DYNAMIC_STACKALLOC # t.Size :
+    NVPTXInst<(outs t.RC:$ptr),
+              (ins t.RC:$size, i32imm:$align),
+              "alloca.u" # t.Size # " \t$ptr, $size, $align;",
+              [(set t.Ty:$ptr, (dyn_alloca t.Ty:$size, timm:$align))]>,
+              Requires<[hasPTX<73>, hasSM<52>]>;
+}
 
 //
 // BRX
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 664569e3c525c..28bef0de48166 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -1,42 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
 ; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
 
-; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32
-; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64
+; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-32
+; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-64
 ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
 
 ; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.
 
-; CHECK-LABEL: .visible .func  (.param .b32 func_retval0) test_dynamic_stackalloc(
-; CHECK-NOT: __local_depot
-
-; CHECK-32:       ld.param.b32  %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
-; CHECK-32-NEXT:  add.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 7;
-; CHECK-32-NEXT:  and.b32         %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
-; CHECK-32-NEXT:  alloca.u32  %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
-; CHECK-32-NEXT:  cvta.local.u32  %r[[ALLOCA]], %r[[ALLOCA]];
-; CHECK-32-NEXT:  { // callseq 0, 0
-; CHECK-32-NEXT:  .param .b32 param0;
-; CHECK-32-NEXT:  st.param.b32  [param0], %r[[ALLOCA]];
-
-; CHECK-64:       ld.param.b64  %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
-; CHECK-64-NEXT:  add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
-; CHECK-64-NEXT:  and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
-; CHECK-64-NEXT:  alloca.u64  %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
-; CHECK-64-NEXT:  cvta.local.u64  %rd[[ALLOCA]], %rd[[ALLOCA]];
-; CHECK-64-NEXT:  { // callseq 0, 0
-; CHECK-64-NEXT:  .param .b64 param0;
-; CHECK-64-NEXT:  st.param.b64  [param0], %rd[[ALLOCA]];
-
-; CHECK-NEXT:     .param .b32 retval0;
-; CHECK-NEXT:     call.uni (retval0),
-; CHECK-NEXT:     bar,
-
 define i32 @test_dynamic_stackalloc(i64 %n) {
+; CHECK-32-LABEL: test_dynamic_stackalloc(
+; CHECK-32:       {
+; CHECK-32-NEXT:    .reg .b32 %r<8>;
+; CHECK-32-EMPTY:
+; CHECK-32-NEXT:  // %bb.0:
+; CHECK-32-NEXT:    ld.param.b32 %r1, [test_dynamic_stackalloc_param_0];
+; CHECK-32-NEXT:    add.s32 %r2, %r1, 7;
+; CHECK-32-NEXT:    and.b32 %r3, %r2, -8;
+; CHECK-32-NEXT:    alloca.u32 %r4, %r3, 16;
+; CHECK-32-NEXT:    cvta.local.u32 %r5, %r4;
+; CHECK-32-NEXT:    { // callseq 0, 0
+; CHECK-32-NEXT:    .param .b32 param0;
+; CHECK-32-NEXT:    st.param.b32 [param0], %r5;
+; CHECK-32-NEXT:    .param .b32 retval0;
+; CHECK-32-NEXT:    call.uni (retval0),
+; CHECK-32-NEXT:    bar,
+; CHECK-32-NEXT:    (
+; CHECK-32-NEXT:    param0
+; CHECK-32-NEXT:    );
+; CHECK-32-NEXT:    ld.param.b32 %r6, [retval0];
+; CHECK-32-NEXT:    } // callseq 0
+; CHECK-32-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-32-NEXT:    ret;
+;
+; CHECK-64-LABEL: test_dynamic_stackalloc(
+; CHECK-64:       {
+; CHECK-64-NEXT:    .reg .b32 %r<3>;
+; CHECK-64-NEXT:    .reg .b64 %rd<6>;
+; CHECK-64-EMPTY:
+; CHECK-64-NEXT:  // %bb.0:
+; CHECK-64-NEXT:    ld.param.b64 %rd1, [test_dynamic_stackalloc_param_0];
+; CHECK-64-NEXT:    add.s64 %rd2, %rd1, 7;
+; CHECK-64-NEXT:    and.b64 %rd3, %rd2, -8;
+; CHECK-64-NEXT:    alloca.u64 %rd4, %rd3, 16;
+; CHECK-64-NEXT:    cvta.local.u64 %rd5, %rd4;
+; CHECK-64-NEXT:    { // callseq 0, 0
+; CHECK-64-NEXT:    .param .b64 param0;
+; CHECK-64-NEXT:    st.param.b64 [param0], %rd5;
+; CHECK-64-NEXT:    .param .b32 retval0;
+; CHECK-64-NEXT:    call.uni (retval0),
+; CHECK-64-NEXT:    bar,
+; CHECK-64-NEXT:    (
+; CHECK-64-NEXT:    param0
+; CHECK-64-NEXT:    );
+; CHECK-64-NEXT:    ld.param.b32 %r1, [retval0];
+; CHECK-64-NEXT:    } // callseq 0
+; CHECK-64-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-64-NEXT:    ret;
   %alloca = alloca i8, i64 %n, align 16
   %call = call i32 @bar(ptr %alloca)
   ret i32 %call
 }
 
+define float @test_dynamic_stackalloc_unaligned(i64 %0) {
+; CHECK-32-LABEL: test_dynamic_stackalloc_unaligned(
+; CHECK-32:       {
+; CHECK-32-NEXT:    .reg .b32 %r<6>;
+; CHECK-32-NEXT:    .reg .b32 %f<2>;
+; CHECK-32-EMPTY:
+; CHECK-32-NEXT:  // %bb.0:
+; CHECK-32-NEXT:    ld.param.b32 %r1, [test_dynamic_stackalloc_unaligned_param_0];
+; CHECK-32-NEXT:    shl.b32 %r2, %r1, 2;
+; CHECK-32-NEXT:    add.s32 %r3, %r2, 7;
+; CHECK-32-NEXT:    and.b32 %r4, %r3, -8;
+; CHECK-32-NEXT:    alloca.u32 %r5, %r4, 8;
+; CHECK-32-NEXT:    ld.local.b32 %f1, [%r5];
+; CHECK-32-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-32-NEXT:    ret;
+;
+; CHECK-64-LABEL: test_dynamic_stackalloc_unaligned(
+; CHECK-64:       {
+; CHECK-64-NEXT:    .reg .b32 %f<2>;
+; CHECK-64-NEXT:    .reg .b64 %rd<6>;
+; CHECK-64-EMPTY:
+; CHECK-64-NEXT:  // %bb.0:
+; CHECK-64-NEXT:    ld.param.b64 %rd1, [test_dynamic_stackalloc_unaligned_param_0];
+; CHECK-64-NEXT:    shl.b64 %rd2, %rd1, 2;
+; CHECK-64-NEXT:    add.s64 %rd3, %rd2, 7;
+; CHECK-64-NEXT:    and.b64 %rd4, %rd3, -8;
+; CHECK-64-NEXT:    alloca.u64 %rd5, %rd4, 8;
+; CHECK-64-NEXT:    ld.local.b32 %f1, [%rd5];
+; CHECK-64-NEXT:    st.param.b32 [func_retval0], %f1;
+; CHECK-64-NEXT:    ret;
+  %4 = alloca float, i64 %0, align 4
+  %5 = getelementptr float, ptr %4, i64 0
+  %6 = load float, ptr %5, align 4
+  ret float %6
+}
+
 declare i32 @bar(ptr)
+

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants