Merge branch 'develop' into enable_types_sdxl
richagadgil authored Mar 4, 2025
2 parents 2729e67 + 04b82df commit 8a15a51
Showing 12 changed files with 150 additions and 21 deletions.
62 changes: 61 additions & 1 deletion CHANGELOG.md
@@ -3,6 +3,66 @@
Full documentation for MIGraphX is available at
[https://rocmdocs.amd.com/projects/AMDMIGraphX/en/latest/](https://rocmdocs.amd.com/projects/AMDMIGraphX/en/latest/).

## MIGraphX 2.12 for ROCm 6.4.0

### Added

* Support for gfx1200 and gfx1201
* hipBLASLt support for contiguous transpose GEMM fusion and GEMM pointwise fusions for improved performance
* Support for hardware-specific FP8 datatypes (FP8 OCP and FP8 FNUZ)
* Support for the BF16 datatype
* ONNX operator support for `com.microsoft.MultiHeadAttention`, `com.microsoft.NhwcConv`, and `com.microsoft.MatMulIntegerToFloat`
* migraphx-driver can now produce output for use with Netron
* migraphx-driver now includes a `time` parameter (similar to `perf`) that is more accurate for very fast kernels
* An end-to-end Stable Diffusion 3 example, with an option to disable the T5 encoder on VRAM-limited GPUs
* Support for tracking broadcast axes in `shape_transform_descriptor`
* Support for unsigned types with `rocMLIR`
* A script to convert MXR files to ONNX models
* The `MIGRAPHX_SET_GEMM_PROVIDER` environment variable to choose between rocBLAS and hipBLASLt: set it to `rocblas` to use rocBLAS, or to `hipblaslt` to use hipBLASLt, as in the sketch below
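A minimal usage sketch, assuming an ONNX model file (the filename here is hypothetical):

```
MIGRAPHX_SET_GEMM_PROVIDER=hipblaslt migraphx-driver perf model.onnx
```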


### Changed

* Switched to using hipBLASLt instead of rocBLAS, with the exception of gfx90a
* Included the min/max/median of the `perf` run as part of the summary report
* Enabled non-packed inputs for `rocMLIR`
* Always output a packed type for q/dq after determining that non-packed tensors were inefficient
* Even when using NHWC, MIGraphX now always converts group convolutions to NCHW for best performance
* Renamed the `layout_nhwc` pass to `layout_convolution`, which ensures that the weights have the same layout as the inputs or sets both the inputs and weights to NHWC
* The minimum required CMake version is now 3.27


### Removed

* Removed `fp8e5m2fnuz` rocBLAS support
* Removed use of `__AMDGCN_WAVEFRONT_SIZE`, which has been deprecated
* Removed a warning that printed to stdout when using FP8 types
* Removed the zero-point parameter for `dequantizelinear` when it is zero


### Optimized

* Prefill buffers when MLIR produces a multi-output buffer
* Improved `resize` operator performance, which should improve the overall performance of models that use it
* Allowed the `reduce` operator to be split across an axis to improve fusion performance. The `MIGRAPHX_SPLIT_REDUCE_SIZE` environment variable adjusts the minimum size of the reduction for a possible model-specific performance improvement (see the sketch after this list)
* Added the `MIGRAPHX_DISABLE_PASSES` environment variable for debugging
* Added the `MIGRAPHX_MLIR_DUMP` environment variable, which can be set to a folder where each final rocMLIR module is saved for investigation
* Improved the C++ API to allow onnxruntime access to fp8 quantization
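As a sketch, these variables can be combined on a single driver run; the dump folder and split size below are hypothetical values for illustration:

```
MIGRAPHX_MLIR_DUMP=/tmp/mlir_modules MIGRAPHX_SPLIT_REDUCE_SIZE=8192 migraphx-driver time model.onnx
```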



### Resolved Issues

* Fixed multistream execution with larger models (#3757)
* Fixed a Peephole LSTM error (#3768)
* Fixed the BertSquad example, which could include a broken tokenizers package (#3556)
* Fixed Attention fusion to not error with a shape mismatch when a trailing pointwise contains a literal (#3758)
* Fixed instruction::replace() logic to handle more complex cases (#3574)
* Fixed MatMulNBits failing with a shape error (#3698)
* Fixed a bug where some models could fail to compile with the error `flatten: Shapes are not in standard layout` (#3579)



## MIGraphX 2.11 for ROCm 6.3.0

### Added
@@ -18,7 +78,7 @@ Full documentation for MIGraphX is available at
* Split-K as an optional performance improvement
* Scripts to validate ONNX models from the ONNX Model Zoo
* GPU Pooling Kernel
- * --mlir flag to the migraphx-driver program to offload entire module to mlir
+ * --mlir flag to the migraphx-driver program to offload entire module to rocMLIR
* Fusing split-reduce with MLIR
* Multiple outputs for the MLIR + Pointwise fusions
* Pointwise fusions with MLIR across reshape operations
5 changes: 5 additions & 0 deletions docs/dev/quantization.rst
@@ -10,6 +10,11 @@ quantize_fp16

.. doxygenfunction:: migraphx::internal::quantize_fp16

quantize_bf16
-------------

.. doxygenfunction:: migraphx::internal::quantize_bf16

quantize_int8
-------------

4 changes: 4 additions & 0 deletions docs/driver/compile.rst
@@ -34,6 +34,10 @@ Perform an exhaustive search to find the fastest version of generated kernels fo

Quantize for fp16

.. option:: --bf16

Quantize for bf16

.. option:: --int8

Quantize for int8
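For example, a compile invocation that applies the new bf16 quantization flag might look like this (the model path is hypothetical):

```
migraphx-driver compile model.onnx --bf16
```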
2 changes: 2 additions & 0 deletions docs/migraphx-driver.rst
@@ -113,6 +113,8 @@ To learn which options can be used with which commands, see the :ref:`MIGraphX d
- Enables exhaustive search to find the fastest kernel
* - --fp16
- Quantizes for fp16
* - --bf16
- Quantizes for bf16
* - --int8
- Quantizes for int8
* - --fp8
2 changes: 2 additions & 0 deletions docs/reference/cpp.rst
@@ -60,6 +60,8 @@ quantize

.. doxygenfunction:: migraphx::quantize_fp16(const program&, const quantize_op_names&)

.. doxygenfunction:: migraphx::quantize_bf16(const program&)

.. doxygenstruct:: migraphx::quantize_int8_options
:members:
:undoc-members:
7 changes: 7 additions & 0 deletions docs/reference/py.rst
@@ -305,6 +305,13 @@ program
:param ins_names: List of instructions to quantize.
:type ins_names: list[str]

.. py:function:: quantize_bf16(prog, ins_names=["all"])
Quantizes the program to use bf16.

:param program prog: Program to quantize.
:param ins_names: List of instructions to quantize.
:type ins_names: list[str]
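A minimal sketch of the new function in a parse/quantize/compile flow; the model path is hypothetical and the target lookup follows the existing `get_target` API:

```python
import migraphx

prog = migraphx.parse_onnx("model.onnx")   # hypothetical model file
migraphx.quantize_bf16(prog, ins_names=["all"])
prog.compile(migraphx.get_target("gpu"))   # quantized program compiled for the GPU target
```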

.. py:function:: quantize_int8(prog, t, calibration=[], ins_names=["dot", "convolution"])
12 changes: 11 additions & 1 deletion src/targets/gpu/fuse_mlir.cpp
@@ -641,9 +641,19 @@ struct find_mlir_fused_ops
{
mlir_mode conv_mode = mlir_mode::none;
mlir_mode dot_mode = mlir_mode::none;

static auto make_conv_dot_reshaper_names()
{
auto names = reshaper_names();
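// Note: unlike reshaper_names(), broadcasts are excluded from the skip list
// so that a conv/dot whose output is broadcast before the pointwise op is
// not pulled into the fusion (exercised by the conv_broadcast_mul test).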
names.erase("broadcast");
names.erase("multibroadcast");
return names;
}

auto matcher() const
{
- auto dot_or_conv = match::skip(match::name(reshaper_names()))(
+ static const auto conv_dot_reshaper_names = make_conv_dot_reshaper_names();
+ auto dot_or_conv = match::skip(match::name(conv_dot_reshaper_names))(
match::any_of(is_mlir_dot(dot_mode), is_mlir_conv(conv_mode)).bind("gemm_based_op"));
return mlir_pointwise()(match::any_of[match::inputs()](dot_or_conv.bind("x")));
}
30 changes: 16 additions & 14 deletions src/targets/gpu/hip_gemm_impl.cpp
@@ -507,9 +507,8 @@ struct hip_gemm_impl
* and calls matmulIsAlgoSupported() to get the workspace size.
*/

- size_t get_workspace_size(context& ctx,
- const std::vector<shape>& input_shapes,
- int32_t solution_idx) const
+ size_t
+ get_workspace_size(context& ctx, const std::vector<shape>& input_shapes, int32_t solution_idx)
{
size_t workspace_size = hipblaslt_workspace_size;
std::vector<argument> input_args;
Expand All @@ -521,20 +520,23 @@ struct hip_gemm_impl
std::vector<int32_t> algo_index = {solution_idx};
std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;

- // TODO: Use hipblasLtMatmulAlgoGetHeuristic instead of getAlgosFromIndex
- // for solution index '0'.
- hipblaslt_invoke([&]() {
- return hipblaslt_ext::getAlgosFromIndex(
- ctx.get_stream().get_hipblaslt(), algo_index, heuristic_result);
- });
+ if(solution_idx == 0)
+ {
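+ // A solution index of 0 means no tuned solution was recorded, so query
+ // the heuristic result instead of looking up an algo by index.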
+ heuristic_result = solution.get_result(ctx, *this, 0);
+ }
+ else
+ {
+ hipblaslt_invoke([&]() {
+ return hipblaslt_ext::getAlgosFromIndex(
+ ctx.get_stream().get_hipblaslt(), algo_index, heuristic_result);
+ });
+ }

+ // Return default workspace size when no algo is provided.
if(heuristic_result.empty())
{
- if(solution_idx != 0)
- {
- std::cout << "No hipBLASLt algo returned for solution index: " << solution_idx
- << std::endl;
- }
+ std::cout << "No hipBLASLt algo returned for solution index: " << solution_idx
+ << std::endl;
return workspace_size;
}

3 changes: 2 additions & 1 deletion src/targets/gpu/hipblaslt.cpp
@@ -1,7 +1,7 @@
/*
* The MIT License (MIT)
*
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -59,6 +59,7 @@ bool hipblaslt_supported()
// hipblaslt is supported for MI200 and above, and Navi3x and above.
return (device_name == "gfx90a" or
(starts_with(device_name, "gfx94") and device_name >= "gfx940") or
+ (starts_with(device_name, "gfx95") and device_name >= "gfx950") or
starts_with(device_name, "gfx110") or starts_with(device_name, "gfx120"));
}

4 changes: 1 addition & 3 deletions src/targets/gpu/rocblas.cpp
@@ -1,7 +1,7 @@
/*
* The MIT License (MIT)
*
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -34,8 +34,6 @@ namespace gpu {
#if MIGRAPHX_USE_ROCBLAS
rocblas_handle_ptr create_rocblas_handle_ptr()
{
- // add a call to rocblas_initialize() to workaround a rocblas bug SWDEV-438929
- rocblas_initialize();
rocblas_handle handle;
rocblas_create_handle(&handle);
return rocblas_handle_ptr{handle};
38 changes: 38 additions & 0 deletions test/gpu/fuse_mlir.cpp
@@ -199,6 +199,44 @@ TEST_CASE(dot_transpose_reshape_add)
EXPECT(p1.sort() == p2.sort());
}

TEST_CASE(conv_broadcast_mul)
{
migraphx::shape os{migraphx::shape::float_type, {4, 56, 122, 122}};
migraphx::shape is{migraphx::shape::float_type, {4, 14, 1, 1}};
migraphx::shape ws{migraphx::shape::float_type, {56, 14, 1, 1}};
migraphx::program p1;
{
auto* mm = p1.get_main_module();
auto x = mm->add_parameter("x", is);
auto y = mm->add_parameter("y", os);
auto w = mm->add_parameter("w", ws);
auto conv = mm->add_instruction(migraphx::make_op("convolution"), x, w);
auto convb = mm->add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", os.lens()}}), conv);
auto mul = add_pointwise(p1, "main:pointwise0", {convb, y}, single_pointwise("mul"));
mm->add_return({mul});
}
run_pass(p1);
migraphx::program p2;
{
auto* mm = p2.get_main_module();
auto x = mm->add_parameter("x", is);
auto y = mm->add_parameter("y", os);
auto w = mm->add_parameter("w", ws);
auto conv = add_mlir(
p2, "mlir_convolution0", {x, w}, {"y0", "y1"}, [=](auto* pm, const auto& inputs) {
auto c =
pm->add_instruction(migraphx::make_op("convolution"), inputs[0], inputs[1]);
return std::make_tuple(c->get_operator(), c);
});
auto convb = mm->add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", os.lens()}}), conv);
auto mul = add_pointwise(p2, "main:pointwise0", {convb, y}, single_pointwise("mul"));
mm->add_return({mul});
}
EXPECT(p1.sort() == p2.sort());
}

TEST_CASE(multi_use_dot_trans_add_pooling_sub)
{
migraphx::shape s1{migraphx::shape::float_type, {1, 1, 4, 5}};
2 changes: 1 addition & 1 deletion test/onnx/.onnxrt-commit
@@ -1 +1 @@
-d82604e802a91af0798db8fca404b10e56e46f20
+1be64f883190f058256948c8d254c61d1a724008
