-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AMDGPU] Fix llvm.amdgcn.fcmp.bf16 #166877
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
When the source is of type bfloat, we should first expand the bfloats to floats, and then do the comparison.
|
@llvm/pr-subscribers-backend-amdgpu Author: Changpeng Fang (changpeng) ChangesWhen the source is of type bfloat, we should first expand the bfloats to floats, and then do the comparison. Full diff: https://github.com/llvm/llvm-project/pull/166877.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8bb28084159e8..aac546f6c8213 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7012,7 +7012,7 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
EVT CmpVT = Src0.getValueType();
SDLoc SL(N);
- if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
+ if ((CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) || CmpVT == MVT::bf16) {
Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll
new file mode 100644
index 0000000000000..831e63669c335
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll
@@ -0,0 +1,297 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+declare i32 @llvm.amdgcn.fcmp.bf16(bfloat, bfloat, i32)
+declare bfloat @llvm.fabs.bf16(bfloat)
+
+define amdgpu_kernel void @v_fcmp_bf16_oeq_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) {
+; GFX12-LABEL: v_fcmp_bf16_oeq_with_fabs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshr_b32 s3, s2, 16
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_and_b32 s3, s3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s3, s3, 16
+; GFX12-NEXT: v_cmp_eq_f32_e64 s2, s2, s3
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %temp = call bfloat @llvm.fabs.bf16(bfloat %a)
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat %temp, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_oeq_both_operands_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) {
+; GFX12-LABEL: v_fcmp_bf16_oeq_both_operands_with_fabs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshr_b32 s3, s2, 16
+; GFX12-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX12-NEXT: s_and_b32 s3, s3, 0x7fff
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_lshl_b32 s3, s3, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_eq_f32_e64 s2, s2, s3
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %temp = call bfloat @llvm.fabs.bf16(bfloat %a)
+ %src_input = call bfloat @llvm.fabs.bf16(bfloat %src)
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src_input, bfloat %temp, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 -1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_oeq(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_oeq:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_one(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_one:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 6)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ogt(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ogt:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 2)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_oge(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_oge:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 3)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_olt(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_olt:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 4)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ole(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ole:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 5)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ueq(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ueq:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 9)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_une(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_une:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 14)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ugt(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ugt:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 10)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_uge(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_uge:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 11)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ult(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ult:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 12)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_o(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_o:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 7)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_uo(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_uo:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 8)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ule(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ule:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
+ %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 13)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
|
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s | ||
|
|
||
| declare i32 @llvm.amdgcn.fcmp.bf16(bfloat, bfloat, i32) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These intrinsics are deprecated, why is someone trying to use them with bfloat?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Didn't find any use of the intrinsic with bfloat. You are right, this intrinsic seems not necessary. Will clang emit a warning of deprecated intrinsic? I am thinking abandon this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
clang does not directly emit intrinsics, it would have to go through a builtin
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, is __builtin_amdgcn_fcmp deprecated then?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but that doesn't have a bfloat version
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but that doesn't have a bfloat version
Right, and also does not have a half version. However, when you mentioned "deprecated", I think you meant types other than bfloat. I think amdgcn_fcmp is not necessary and thus deprecated, no matter what types.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If a combine is folding an FP cast into the intrinsic, that would be a problem
| @@ -0,0 +1,297 @@ | |||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | |||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s | |||
|
|
|||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a check that these error appropriately for a target without bfloat compares
| ; GFX12-NEXT: s_and_b32 s3, s3, 0x7fff | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) | ||
| ; GFX12-NEXT: s_lshl_b32 s3, s3, 16 | ||
| ; GFX12-NEXT: v_cmp_eq_f32_e64 s2, s2, s3 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we don't have bfloat compares, we shouldn't lower the intrinsic
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1. intrinsic should not be used like a builtin.
When the source is of type bfloat, we should first expand the bfloats to floats, and then do the comparison.