From a0760e7964beba046d3c9463e3625e0582931c8d Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Tue, 30 Sep 2025 06:56:30 +0000 Subject: [PATCH 01/22] update paddle --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 2588f489910..208fb7687a4 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 2588f4899106cd27bdfcc84ba4c2f5f7aac570ab +Subproject commit 208fb7687a43f8c4e9e493e545f07b2d560f4749 From 887e7840df02c1bf55589f74c6e6016771b51c4c Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Tue, 30 Sep 2025 13:23:54 +0000 Subject: [PATCH 02/22] fix metax error --- Paddle | 2 +- .../fusion/fused_layernorm_kernel_register.cu | 33 ------------------- 2 files changed, 1 insertion(+), 34 deletions(-) diff --git a/Paddle b/Paddle index 208fb7687a4..b51d97ff7ff 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 208fb7687a43f8c4e9e493e545f07b2d560f4749 +Subproject commit b51d97ff7ff0bdac6a16380ee90100b787979b05 diff --git a/backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu index bdb809a2149..ff6a97ba691 100644 --- a/backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu +++ b/backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu @@ -14,11 +14,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/fused_layernorm_kernel.h" -#include "paddle/phi/kernels/fusion/gpu/attention_layer.norm.h" -#include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h" -#ifndef PADDLE_WITH_HIP -#if CUDNN_VERSION_MIN(8, 1, 0) PD_CUSTOM_KERNEL_REGISTER(fused_bias_residual_layernorm, metax_gpu, ALL_LAYOUT, @@ -32,32 +28,3 @@ PD_CUSTOM_KERNEL_REGISTER(fused_bias_residual_layernorm, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); } -#else -PD_CUSTOM_KERNEL_REGISTER(fused_bias_residual_layernorm, - metax_gpu, - ALL_LAYOUT, - phi::fusion::FusedLayerNormKernel, - float, - phi::dtype::float16) { - kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); - kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); -} -#endif // CUDNN_VERSION_MIN -#else -PD_CUSTOM_KERNEL_REGISTER(fused_bias_residual_layernorm, - metax_gpu, - ALL_LAYOUT, - phi::fusion::FusedLayerNormKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16) { - kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); - kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); -} -#endif // PADDLE_WITH_HIP From b339d227d3b43b75a1828afbbf9b4ea0275db158 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Tue, 30 Sep 2025 14:36:05 +0000 Subject: [PATCH 03/22] fix fusion error --- backends/metax_gpu/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index c80982449ae..de678c4ded2 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -72,6 +72,9 @@ include(zlib) include(protobuf) include(generate_pb) +# Add fusion headers search path for phi::fusion::FusedLayerNormKernel 
+include_directories("${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion") + set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) From a125b76b4f628e09ab8cab50fc2a531b3f553884 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Thu, 9 Oct 2025 03:41:09 +0000 Subject: [PATCH 04/22] fix metax error --- Paddle | 2 +- backends/metax_gpu/cmake/paddle.cmake | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Paddle b/Paddle index b51d97ff7ff..3b674aa2f0d 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit b51d97ff7ff0bdac6a16380ee90100b787979b05 +Subproject commit 3b674aa2f0db8b9cb41c6d0738e3ebfebab993cd diff --git a/backends/metax_gpu/cmake/paddle.cmake b/backends/metax_gpu/cmake/paddle.cmake index 899ffd2dd30..71a0b9a2c90 100755 --- a/backends/metax_gpu/cmake/paddle.cmake +++ b/backends/metax_gpu/cmake/paddle.cmake @@ -83,10 +83,14 @@ else() ABSOLUTE) message( STATUS "Run 'git submodule update --init Paddle' in ${REPO_SOURCE_DIR}") - # execute_process( COMMAND git submodule update --init Paddle - # WORKING_DIRECTORY ${REPO_SOURCE_DIR} RESULT_VARIABLE result_var) if(NOT - # result_var EQUAL 0) message( FATAL_ERROR "Failed to get submodule Paddle', - # please check your network !" ) endif() + execute_process( + COMMAND git submodule update --init Paddle + WORKING_DIRECTORY ${REPO_SOURCE_DIR} + RESULT_VARIABLE result_var) + if(NOT result_var EQUAL 0) + message(FATAL_ERROR "Failed to get submodule Paddle', + please check your network !") + endif() get_filename_component(PADDLE_SOURCE_DIR "${REPO_SOURCE_DIR}/Paddle" ABSOLUTE) message(STATUS "PADDLE_SOURCE_DIR=${PADDLE_SOURCE_DIR}") From a71379ae67ec3e3f75115aec391d935118ba0200 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Thu, 9 Oct 2025 08:00:39 +0000 Subject: [PATCH 05/22] fix cmake --- backends/iluvatar_gpu/CMakeLists.txt | 2 +- backends/metax_gpu/CMakeLists.txt | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt index d71fa59857b..a4692d108b4 100644 --- a/backends/iluvatar_gpu/CMakeLists.txt +++ b/backends/iluvatar_gpu/CMakeLists.txt @@ -949,7 +949,7 @@ list( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/check_numerics_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/dgc_kernel.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index de678c4ded2..c80982449ae 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -72,9 +72,6 @@ include(zlib) include(protobuf) include(generate_pb) -# Add fusion headers search path for phi::fusion::FusedLayerNormKernel -include_directories("${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion") - set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) From dfc4c176a081ce3c8390123a63211752731b3e22 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Mon, 13 Oct 2025 
03:28:52 +0000 Subject: [PATCH 06/22] fix cmake --- backends/metax_gpu/cmake/paddle.cmake | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/backends/metax_gpu/cmake/paddle.cmake b/backends/metax_gpu/cmake/paddle.cmake index 71a0b9a2c90..0620648c7a0 100755 --- a/backends/metax_gpu/cmake/paddle.cmake +++ b/backends/metax_gpu/cmake/paddle.cmake @@ -81,16 +81,11 @@ if(paddle_submodule) else() get_filename_component(REPO_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../" ABSOLUTE) - message( - STATUS "Run 'git submodule update --init Paddle' in ${REPO_SOURCE_DIR}") - execute_process( - COMMAND git submodule update --init Paddle - WORKING_DIRECTORY ${REPO_SOURCE_DIR} - RESULT_VARIABLE result_var) - if(NOT result_var EQUAL 0) - message(FATAL_ERROR "Failed to get submodule Paddle', - please check your network !") - endif() + # message( STATUS "Run 'git submodule update --init Paddle' in + # ${REPO_SOURCE_DIR}") execute_process( COMMAND git submodule update --init + # Paddle WORKING_DIRECTORY ${REPO_SOURCE_DIR} RESULT_VARIABLE result_var) + # if(NOT result_var EQUAL 0) message(FATAL_ERROR "Failed to get submodule + # Paddle', please check your network !") endif() get_filename_component(PADDLE_SOURCE_DIR "${REPO_SOURCE_DIR}/Paddle" ABSOLUTE) message(STATUS "PADDLE_SOURCE_DIR=${PADDLE_SOURCE_DIR}") From 32d45e2af54b72b6c3c5308239db2ba5aa91c4ce Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Mon, 13 Oct 2025 11:23:08 +0000 Subject: [PATCH 07/22] update paddle to 1013 --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 3b674aa2f0d..0ee973079cf 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 3b674aa2f0db8b9cb41c6d0738e3ebfebab993cd +Subproject commit 0ee973079cfa713d0148979f74f62ed5442c1068 From 8fb82d8e8f5bf42d3b2a7b1e6afa2db7f9d05c4e Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Mon, 13 Oct 2025 12:27:55 +0000 Subject: [PATCH 08/22] fix patch --- backends/metax_gpu/patch/paddle.patch | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index beefb730bf7..4ad1074837e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -862,6 +862,7 @@ index e838778952..83e805e75a 100644 +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ + #include "paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" From a534e3a808eea94b76b33cca95208f03978b3276 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Mon, 13 Oct 2025 13:18:53 +0000 Subject: [PATCH 09/22] fix patch --- backends/metax_gpu/patch/paddle.patch | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 88e4cbce910..f2e4f067bb2 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -918,8 +918,7 @@ diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi index b2d15a59f8..f64582e85a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -@@ -14,7 +14,7 @@ - +@@ -15,7 +15,7 @@ #include "paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" From 
ba174e3de4f06db5aa8036ca42f7386fdf6a64f3 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Thu, 27 Nov 2025 10:57:28 +0800 Subject: [PATCH 10/22] update paddle --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 9f6c53e2139..85088198802 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 9f6c53e2139478b386c4110a62ff93314a0ff7ea +Subproject commit 85088198802fc8632bf31d6b3341d1669b24898b From 406649a5952d6d23af013208a26be69e0b57eb34 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Thu, 27 Nov 2025 11:12:38 +0800 Subject: [PATCH 11/22] fix top_p_sampling_kernel --- .../iluvatar_gpu/kernels/ernie_core/top_p_sampling_kernel.cu | 2 +- backends/npu/kernels/fusion/topp_sampling_kernel.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/iluvatar_gpu/kernels/ernie_core/top_p_sampling_kernel.cu b/backends/iluvatar_gpu/kernels/ernie_core/top_p_sampling_kernel.cu index e118d8c0935..b98457a4b25 100644 --- a/backends/iluvatar_gpu/kernels/ernie_core/top_p_sampling_kernel.cu +++ b/backends/iluvatar_gpu/kernels/ernie_core/top_p_sampling_kernel.cu @@ -1054,7 +1054,7 @@ void TopPSamplingKernel(const Context& dev_ctx, const DenseTensor& ps, const paddle::optional& threshold, const paddle::optional& topp_seed, - int seed, + int64_t seed, int k, const std::string& mode, DenseTensor* out, diff --git a/backends/npu/kernels/fusion/topp_sampling_kernel.cc b/backends/npu/kernels/fusion/topp_sampling_kernel.cc index 2e32c9cea3b..9f327ff4335 100644 --- a/backends/npu/kernels/fusion/topp_sampling_kernel.cc +++ b/backends/npu/kernels/fusion/topp_sampling_kernel.cc @@ -23,7 +23,7 @@ void TopPSamplingKernel(const Context& dev_ctx, const phi::DenseTensor& ps, const paddle::optional& threshold, const paddle::optional& topp_seed, - int seed, + int64_t seed, int k, const std::string& mode, phi::DenseTensor* out, From f8079115bf443f15c8cb808ee671a1678f7e71ca Mon Sep 17 00:00:00 2001 From: tianshuo78520a Date: Mon, 1 Dec 2025 22:09:39 +0000 Subject: [PATCH 12/22] Update Paddle submodule to latest develop --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index b9f128491d8..2a757212307 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit b9f128491d8e2d31eddcfec176813a4b674dc650 +Subproject commit 2a7572123075d28ee996f64890c9ee20c27dd4ee From 875a3c35977bf361b324ccb6802cbc8f76aab609 Mon Sep 17 00:00:00 2001 From: zrr1999 <2742392377@qq.com> Date: Tue, 2 Dec 2025 07:17:55 +0000 Subject: [PATCH 13/22] fix scale --- backends/gcu/kernels/interpolate_kernels.cc | 12 ++++++------ backends/mlu/kernels/interpolate_kernel.cc | 12 ++++++------ backends/npu/kernels/interpolate_kernel.cc | 12 ++++++------ backends/sdaa/kernels/interpolate_kernel.cc | 4 ++-- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/backends/gcu/kernels/interpolate_kernels.cc b/backends/gcu/kernels/interpolate_kernels.cc index 532b14834be..62d1ce39387 100644 --- a/backends/gcu/kernels/interpolate_kernels.cc +++ b/backends/gcu/kernels/interpolate_kernels.cc @@ -28,7 +28,7 @@ void InterpolateKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -253,7 +253,7 @@ void InterpolateGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -326,7 +326,7 @@ void BilinearInterpKernel( int out_d, int 
out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -361,7 +361,7 @@ void BilinearInterpGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -400,7 +400,7 @@ void NearestInterpKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -435,7 +435,7 @@ void NearestInterpGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, diff --git a/backends/mlu/kernels/interpolate_kernel.cc b/backends/mlu/kernels/interpolate_kernel.cc index 901a89159e7..f99cede5e54 100644 --- a/backends/mlu/kernels/interpolate_kernel.cc +++ b/backends/mlu/kernels/interpolate_kernel.cc @@ -57,7 +57,7 @@ void InterpolateKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -378,7 +378,7 @@ void InterpolateGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -547,7 +547,7 @@ void BilinearInterpKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -579,7 +579,7 @@ void NearestInterpKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -612,7 +612,7 @@ void BilinearInterpGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -646,7 +646,7 @@ void NearestInterpGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, diff --git a/backends/npu/kernels/interpolate_kernel.cc b/backends/npu/kernels/interpolate_kernel.cc index fe380563ae2..a0ecd50d2af 100644 --- a/backends/npu/kernels/interpolate_kernel.cc +++ b/backends/npu/kernels/interpolate_kernel.cc @@ -808,7 +808,7 @@ void InterpolateKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -996,7 +996,7 @@ void InterpolateGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -1129,7 +1129,7 @@ void BilinearInterpKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -1161,7 +1161,7 @@ void NearestInterpKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -1194,7 +1194,7 @@ void BilinearInterpGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -1228,7 +1228,7 @@ void 
NearestInterpGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, diff --git a/backends/sdaa/kernels/interpolate_kernel.cc b/backends/sdaa/kernels/interpolate_kernel.cc index f106bdb7761..7c48dc3c0c9 100644 --- a/backends/sdaa/kernels/interpolate_kernel.cc +++ b/backends/sdaa/kernels/interpolate_kernel.cc @@ -117,7 +117,7 @@ void NearestInterpKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, @@ -273,7 +273,7 @@ void NearestInterpGradKernel( int out_d, int out_h, int out_w, - const std::vector& scale, + const std::vector& scale, const std::string& interp_method, bool align_corners, int align_mode, From 16f338c7ae7829abdc8de1766a7d0a7485f25a43 Mon Sep 17 00:00:00 2001 From: zrr1999 <2742392377@qq.com> Date: Tue, 2 Dec 2025 07:49:35 +0000 Subject: [PATCH 14/22] fix --- backends/gcu/kernels/interpolate_kernels.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/gcu/kernels/interpolate_kernels.cc b/backends/gcu/kernels/interpolate_kernels.cc index 62d1ce39387..dd5b5360f2e 100644 --- a/backends/gcu/kernels/interpolate_kernels.cc +++ b/backends/gcu/kernels/interpolate_kernels.cc @@ -47,7 +47,7 @@ void InterpolateKernel( float scale_h = -1; float scale_w = -1; - std::vector new_scale(scale); + std::vector new_scale(scale); // Priority: size_tensor > out_size > scale_tensor > scale > out_h & out_w if (size_tensor && size_tensor->size() > 0) { auto tensors = size_tensor.get(); From 17c1485d77720d30d4982d42ed8f3471f5e18703 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Tue, 2 Dec 2025 09:13:17 +0000 Subject: [PATCH 15/22] fix datatype --- Paddle | 2 +- .../cuda_kernels/cross_entropy_kernel.cu | 4 +- .../kernels/gpudnn/conv_grad_kernel.cu | 69 +++++++++---------- .../kernels/gpudnn/conv_kernel.cu | 52 +++++++------- .../gpudnn/conv_transpose_grad_kernel.cu | 46 +++++-------- .../kernels/gpudnn/conv_transpose_kernel.cu | 25 +++---- .../kernels/gpudnn/softmax_gpudnn.h | 5 +- .../kernels/impl/conv_cudnn_impl.h | 6 +- 8 files changed, 89 insertions(+), 120 deletions(-) diff --git a/Paddle b/Paddle index 0ee973079cf..0a80d1cad5e 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 0ee973079cfa713d0148979f74f62ed5442c1068 +Subproject commit 0a80d1cad5e407d4a80eaf7310ea4d64ac641a5d diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/cross_entropy_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/cross_entropy_kernel.cu index 3438e48549e..521c31fa6a1 100644 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/cross_entropy_kernel.cu +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/cross_entropy_kernel.cu @@ -751,7 +751,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; - GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; + DataLayout layout = DataLayout::kNCHW; cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); @@ -1163,7 +1163,7 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; - GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; + DataLayout layout = DataLayout::kNCHW; cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); auto 
handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); diff --git a/backends/iluvatar_gpu/kernels/gpudnn/conv_grad_kernel.cu b/backends/iluvatar_gpu/kernels/gpudnn/conv_grad_kernel.cu index 1d58fb4f6d7..98890033ffd 100644 --- a/backends/iluvatar_gpu/kernels/gpudnn/conv_grad_kernel.cu +++ b/backends/iluvatar_gpu/kernels/gpudnn/conv_grad_kernel.cu @@ -53,8 +53,8 @@ void ConvCudnnGradKernelImplV7( const std::vector& strides, const std::vector& padding_common, const std::vector& dilations, - phi::backends::gpu::DataLayout compute_format, - phi::backends::gpu::DataLayout layout, + DataLayout compute_format, + DataLayout layout, bool use_addto, bool exhaustive_search, bool deterministic, @@ -98,16 +98,16 @@ void ConvCudnnGradKernelImplV7( int i_n, i_c, i_d, i_h, i_w; int o_n, o_c, o_d, o_h, o_w; - if (compute_format == phi::backends::gpu::DataLayout::kNHWC) { + if (compute_format == DataLayout::NHWC) { GetNCDHW(transformed_input->dims(), - phi::backends::gpu::DataLayout::kNHWC, + DataLayout::NHWC, &i_n, &i_c, &i_d, &i_h, &i_w); GetNCDHW(transformed_output_grad_channel->dims(), - phi::backends::gpu::DataLayout::kNHWC, + DataLayout::NHWC, &o_n, &o_c, &o_d, @@ -115,14 +115,14 @@ void ConvCudnnGradKernelImplV7( &o_w); } else { GetNCDHW(transformed_input->dims(), - phi::backends::gpu::DataLayout::kNCHW, + DataLayout::NCHW, &i_n, &i_c, &i_d, &i_h, &i_w); GetNCDHW(transformed_output_grad_channel->dims(), - phi::backends::gpu::DataLayout::kNCHW, + DataLayout::NCHW, &o_n, &o_c, &o_d, @@ -349,7 +349,7 @@ void ConvCudnnGradKernelImplV8( const std::vector& strides, const std::vector& padding_common, const std::vector& dilations, - phi::backends::gpu::DataLayout layout, + DataLayout layout, bool use_addto, bool exhaustive_search, bool deterministic, @@ -469,7 +469,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = phi::backends::gpu::DataLayout::kNCHW; + auto compute_format = DataLayout::NCHW; #else #if CUDNN_VERSION_MIN(8, 1, 0) const bool compute_in_nhwc = @@ -479,14 +479,12 @@ void ConvCudnnGradKernel(const Context& dev_ctx, const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); #endif - auto compute_format = compute_in_nhwc && channel_last - ? phi::backends::gpu::DataLayout::kNHWC - : phi::backends::gpu::DataLayout::kNCHW; + auto compute_format = + compute_in_nhwc && channel_last ? DataLayout::NHWC : DataLayout::NCHW; #endif VLOG(3) << "Compute ConvGradOp with cuDNN:" << " data_format=" << data_format << " compute_format=" - << (compute_format == phi::backends::gpu::DataLayout::kNHWC ? "NHWC" - : "NCHW"); + << (compute_format == DataLayout::NHWC ? 
"NHWC" : "NCHW"); // transform Tensor DenseTensor transformed_input_channel(input.type()); @@ -495,7 +493,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, DenseTensor transformed_filter_channel(filter.type()); DenseTensor transformed_filter_grad_channel(filter.type()); - if (channel_last && compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (channel_last && compute_format == DataLayout::NCHW) { VLOG(3) << "Transform input, output_grad, input_grad and tensor from " "NHWC to NCHW."; ResizeToChannelFirst( @@ -526,7 +524,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, } } - if (compute_format == phi::backends::gpu::DataLayout::kNHWC) { + if (compute_format == DataLayout::NHWC) { VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; ResizeToChannelLast( dev_ctx, &filter, &transformed_filter_channel); @@ -549,7 +547,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, auto filter_dims = transformed_filter_channel.dims(); DDim in_data_dims; DDim filter_data_dims; - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); } else { @@ -574,7 +572,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, std::vector padding_diff(data_dim); std::vector new_input_shape_vec(data_dim + 2); new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { new_input_shape_vec[1] = transformed_input_channel.dims()[1]; } else { new_input_shape_vec[data_dim + 1] = @@ -584,14 +582,14 @@ void ConvCudnnGradKernel(const Context& dev_ctx, for (size_t i = 0; i < data_dim; ++i) { padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { new_input_shape_vec[i + 2] = transformed_input_channel.dims()[i + 2] + padding_diff[i]; } else { new_input_shape_vec[i + 1] = transformed_input_channel.dims()[i + 1] + padding_diff[i]; } - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; } else { @@ -645,14 +643,11 @@ void ConvCudnnGradKernel(const Context& dev_ctx, } } } - phi::backends::gpu::DataLayout layout = - compute_format == phi::backends::gpu::DataLayout::kNHWC - ? phi::backends::gpu::DataLayout::kNHWC - : phi::backends::gpu::DataLayout::kNCHW; + DataLayout layout = + compute_format == DataLayout::NHWC ? DataLayout::NHWC : DataLayout::NCHW; if (transformed_input.dims().size() == 5) { - layout = compute_format == phi::backends::gpu::DataLayout::kNHWC - ? phi::backends::gpu::DataLayout::kNDHWC - : phi::backends::gpu::DataLayout::kNCDHW; + layout = compute_format == DataLayout::NHWC ? 
DataLayout::NDHWC + : DataLayout::NCDHW; } CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_input); CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_filter_channel); @@ -740,15 +735,14 @@ void ConvCudnnGradKernel(const Context& dev_ctx, } } - if (channel_last && - compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (channel_last && compute_format == DataLayout::NCHW) { TransToChannelLast( dev_ctx, &transformed_input_grad_channel, input_grad); } } if (filter_grad) { - if (compute_format == phi::backends::gpu::DataLayout::kNHWC) { + if (compute_format == DataLayout::NHWC) { TransToChannelFirst( dev_ctx, &transformed_filter_grad_channel, filter_grad); } @@ -1011,8 +1005,7 @@ void ConvCudnnGradGradKernel( auto dtype = phi::backends::gpu::CudnnDataType::type; auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); - auto layout = phi::backends::gpu::GetCudnnTensorFormat( - phi::backends::gpu::DataLayout::kNCHW); + auto layout = phi::backends::gpu::GetCudnnTensorFormat(DataLayout::NCHW); ConvArgs args1{handle, &transformed_ddX, @@ -1023,7 +1016,7 @@ void ConvCudnnGradGradKernel( dilations, dtype, groups, - phi::backends::gpu::DataLayout::kNCHW}; + DataLayout::NCHW}; ConvArgs args2{handle, &transformed_X, ddW, @@ -1033,7 +1026,7 @@ void ConvCudnnGradGradKernel( dilations, dtype, groups, - phi::backends::gpu::DataLayout::kNCHW}; + DataLayout::NCHW}; ConvArgs args3{handle, &transformed_ddX, dW, @@ -1043,7 +1036,7 @@ void ConvCudnnGradGradKernel( dilations, dtype, groups, - phi::backends::gpu::DataLayout::kNCHW}; + DataLayout::NCHW}; ConvArgs args4{handle, &transformed_dX, ddW, @@ -1053,7 +1046,7 @@ void ConvCudnnGradGradKernel( dilations, dtype, groups, - phi::backends::gpu::DataLayout::kNCHW}; + DataLayout::NCHW}; #ifdef PADDLE_WITH_HIP SearchResult fwd_result1; @@ -1179,11 +1172,11 @@ void ConvCudnnGradGradKernel( int i_n, i_c, i_d, i_h, i_w; GetNCDHW( - transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + transformed_X.dims(), DataLayout::NCHW, &i_n, &i_c, &i_d, &i_h, &i_w); int o_n, o_c, o_d, o_h, o_w; GetNCDHW(transformed_dO_channel.dims(), - DataLayout::kNCHW, + DataLayout::NCHW, &o_n, &o_c, &o_d, diff --git a/backends/iluvatar_gpu/kernels/gpudnn/conv_kernel.cu b/backends/iluvatar_gpu/kernels/gpudnn/conv_kernel.cu index 0f58e0fc1bc..f1d09a389c5 100644 --- a/backends/iluvatar_gpu/kernels/gpudnn/conv_kernel.cu +++ b/backends/iluvatar_gpu/kernels/gpudnn/conv_kernel.cu @@ -56,8 +56,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, const std::vector& strides, const std::vector& padding_common, const std::vector& dilations, - phi::backends::gpu::DataLayout compute_format, - phi::backends::gpu::DataLayout layout, + DataLayout compute_format, + DataLayout layout, bool exhaustive_search, bool deterministic, int groups, @@ -112,16 +112,16 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, int i_n, i_c, i_d, i_h, i_w; int o_n, o_c, o_d, o_h, o_w; - if (compute_format == phi::backends::gpu::DataLayout::kNHWC) { + if (compute_format == DataLayout::NHWC) { GetNCDHW(transformed_input->dims(), - phi::backends::gpu::DataLayout::kNHWC, + DataLayout::kNHWC, &i_n, &i_c, &i_d, &i_h, &i_w); GetNCDHW(transformed_output->dims(), - phi::backends::gpu::DataLayout::kNHWC, + DataLayout::kNHWC, &o_n, &o_c, &o_d, @@ -129,14 +129,14 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, &o_w); } else { GetNCDHW(transformed_input->dims(), - phi::backends::gpu::DataLayout::kNCHW, + DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); 
GetNCDHW(transformed_output->dims(), - phi::backends::gpu::DataLayout::kNCHW, + DataLayout::kNCHW, &o_n, &o_c, &o_d, @@ -225,7 +225,7 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor, const std::vector& strides, const std::vector& padding_common, const std::vector& dilations, - phi::backends::gpu::DataLayout layout, + DataLayout layout, bool exhaustive_search, bool deterministic, int groups, @@ -342,7 +342,7 @@ void ConvCudnnKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = phi::backends::gpu::DataLayout::kNCHW; + auto compute_format = DataLayout::NCHW; #else #if CUDNN_VERSION_MIN(8, 1, 0) // Tensor Core introduced from Volta GPUs supports more faster conv op @@ -358,21 +358,19 @@ void ConvCudnnKernel(const Context& dev_ctx, #endif // We will only do data format conversion from NHWC to NCHW. // cudnn will convert NCHW to NHWC automatically on Tensor Core. - auto compute_format = compute_in_nhwc && channel_last - ? phi::backends::gpu::DataLayout::kNHWC - : phi::backends::gpu::DataLayout::kNCHW; + auto compute_format = + compute_in_nhwc && channel_last ? DataLayout::NHWC : DataLayout::NCHW; #endif VLOG(3) << "Compute ConvOp with cuDNN:" << " data_format=" << data_format << " compute_format=" - << (compute_format == phi::backends::gpu::DataLayout::kNHWC ? "NHWC" - : "NCHW"); + << (compute_format == DataLayout::NHWC ? "NHWC" : "NCHW"); // ------------ transformed tensor ----------- DenseTensor transformed_input_channel(input.type()); DenseTensor transformed_output(output->type()); DenseTensor transformed_filter_channel(filter.type()); - if (channel_last && compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (channel_last && compute_format == DataLayout::NCHW) { VLOG(3) << "Transform input tensor from NHWC to NCHW."; ResizeToChannelFirst( dev_ctx, &input, &transformed_input_channel); @@ -385,8 +383,7 @@ void ConvCudnnKernel(const Context& dev_ctx, transformed_input_channel.ShareDataWith(input); transformed_output.ShareDataWith(*output); } - if (compute_format == phi::backends::gpu::DataLayout::kNHWC && - !FLAGS_manually_trans_conv_filter) { + if (compute_format == DataLayout::NHWC && !FLAGS_manually_trans_conv_filter) { VLOG(3) << "Transform filter tensor from NCHW to NHWC."; ResizeToChannelLast( dev_ctx, &filter, &transformed_filter_channel); @@ -402,7 +399,7 @@ void ConvCudnnKernel(const Context& dev_ctx, DDim in_data_dims; DDim filter_data_dims; - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); } else { @@ -424,7 +421,7 @@ void ConvCudnnKernel(const Context& dev_ctx, std::vector new_input_shape_vec(data_dim + 2); new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { new_input_shape_vec[1] = transformed_input_channel.dims()[1]; } else { new_input_shape_vec[data_dim + 1] = @@ -435,14 +432,14 @@ void ConvCudnnKernel(const Context& dev_ctx, for (size_t i = 0; i < data_dim; ++i) { padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { new_input_shape_vec[i + 2] = transformed_input_channel.dims()[i + 2] + padding_diff[i]; } else 
{ new_input_shape_vec[i + 1] = transformed_input_channel.dims()[i + 1] + padding_diff[i]; } - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; } else { @@ -489,14 +486,11 @@ void ConvCudnnKernel(const Context& dev_ctx, } } - phi::backends::gpu::DataLayout layout = - compute_format == phi::backends::gpu::DataLayout::kNHWC - ? phi::backends::gpu::DataLayout::kNHWC - : phi::backends::gpu::DataLayout::kNCHW; + DataLayout layout = + compute_format == DataLayout::NHWC ? DataLayout::NHWC : DataLayout::NCHW; if (transformed_input.dims().size() == 5) { - layout = compute_format == phi::backends::gpu::DataLayout::kNHWC - ? phi::backends::gpu::DataLayout::kNDHWC - : phi::backends::gpu::DataLayout::kNCDHW; + layout = compute_format == DataLayout::NHWC ? DataLayout::NDHWC + : DataLayout::NCDHW; } CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_input); @@ -543,7 +537,7 @@ void ConvCudnnKernel(const Context& dev_ctx, &transformed_output); #endif - if (channel_last && compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (channel_last && compute_format == DataLayout::NCHW) { TransToChannelLast(dev_ctx, &transformed_output, output); } } diff --git a/backends/iluvatar_gpu/kernels/gpudnn/conv_transpose_grad_kernel.cu b/backends/iluvatar_gpu/kernels/gpudnn/conv_transpose_grad_kernel.cu index a0c47783a48..d20edb0ce9a 100644 --- a/backends/iluvatar_gpu/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/backends/iluvatar_gpu/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -40,8 +40,6 @@ limitations under the License. */ namespace phi { -using GPUDNNDataLayout = phi::backends::gpu::DataLayout; - template void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx, const DenseTensor& x, @@ -79,16 +77,15 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx, std::vector paddings_ = paddings; std::vector dilations_ = dilations; // cudnn v5 does not support dilations - const GPUDNNDataLayout data_layout = - (data_format != "NHWC" ? GPUDNNDataLayout::kNCHW - : GPUDNNDataLayout::kNHWC); + const DataLayout data_layout = + (data_format != "NHWC" ? 
DataLayout::NCHW : DataLayout::NHWC); // if channel_last, transpose to channel_first DenseTensor x_transpose; DenseTensor dout_transpose; std::vector x_vec = common::vectorize(x.dims()); std::vector out_vec = common::vectorize(dout.dims()); - if (data_layout == GPUDNNDataLayout::kNHWC) { + if (data_layout == DataLayout::NHWC) { if (strides.size() == 2U) { std::vector axis = {0, 3, 1, 2}; for (size_t i = 0; i < axis.size(); ++i) { @@ -184,12 +181,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx, CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(x_transpose); #endif - GPUDNNDataLayout layout; + DataLayout layout; if (strides.size() == 2U) { - layout = GPUDNNDataLayout::kNCHW; + layout = DataLayout::NCHW; } else { - layout = GPUDNNDataLayout::kNCDHW; + layout = DataLayout::NCDHW; } int iwo_groups = groups; @@ -339,7 +336,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx, false); #endif // PADDLE_WITH_HIP - if (data_layout == GPUDNNDataLayout::kNHWC) { + if (data_layout == DataLayout::NHWC) { DenseTensor dx_transpose; DenseTensor dx_nchw; dx_nchw.ShareDataWith(*dx); @@ -662,8 +659,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( // auto handle = dev_ctx.cudnn_handle(); auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); - auto layout = - phi::backends::gpu::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); + auto layout = phi::backends::gpu::GetCudnnTensorFormat(DataLayout::NCHW); ConvArgs args1{handle, &transformed_ddout_channel, @@ -674,7 +670,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( dilations_, dtype, groups, - GPUDNNDataLayout::kNCHW}; + DataLayout::NCHW}; ConvArgs args2{handle, &transformed_ddout_channel, &ddfilter, @@ -684,7 +680,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( dilations_, dtype, groups, - GPUDNNDataLayout::kNCHW}; + DataLayout::NCHW}; ConvArgs args3{handle, &transformed_dout, @@ -695,7 +691,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( dilations_, dtype, groups, - GPUDNNDataLayout::kNCHW}; + DataLayout::NCHW}; ConvArgs args4{handle, &transformed_dout, &ddfilter, @@ -705,7 +701,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( dilations_, dtype, groups, - GPUDNNDataLayout::kNCHW}; + DataLayout::NCHW}; #ifdef PADDLE_WITH_HIP SearchResult bwd_result1; SearchResult bwd_result2; @@ -824,22 +820,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( } int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_x.dims(), - GPUDNNDataLayout::kNCHW, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); + GetNCDHW( + transformed_x.dims(), DataLayout::NCHW, &i_n, &i_c, &i_d, &i_h, &i_w); int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dout.dims(), - GPUDNNDataLayout::kNCHW, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); + GetNCDHW( + transformed_dout.dims(), DataLayout::NCHW, &o_n, &o_c, &o_d, &o_h, &o_w); int group_offset_in = transformed_x.numel() / transformed_x.dims()[0] / groups; diff --git a/backends/iluvatar_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/iluvatar_gpu/kernels/gpudnn/conv_transpose_kernel.cu index 577f014c4d1..e9ca26203fb 100644 --- a/backends/iluvatar_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/iluvatar_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -46,8 +46,6 @@ limitations under the License. 
*/ namespace phi { -using GPUDNNDataLayout = phi::backends::gpu::DataLayout; - template void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, const DenseTensor* filter, @@ -55,8 +53,8 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, const std::vector& strides, const std::vector& padding_common, const std::vector& dilations_, - GPUDNNDataLayout data_layout, - GPUDNNDataLayout layout, + DataLayout data_layout, + DataLayout layout, bool exhaustive_search, bool deterministic, int groups, @@ -169,8 +167,8 @@ void ConvTransposeCudnnKernelImplV8(const DenseTensor* transformed_x, const std::vector& strides, const std::vector& padding_common, const std::vector& dilations_, - GPUDNNDataLayout data_layout, - GPUDNNDataLayout layout, + DataLayout data_layout, + DataLayout layout, bool exhaustive_search, bool deterministic, int groups, @@ -274,14 +272,13 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, std::vector paddings_ = paddings; std::vector dilations_ = dilations; - const GPUDNNDataLayout data_layout = - (data_format != "NHWC" ? GPUDNNDataLayout::kNCHW - : GPUDNNDataLayout::kNHWC); + const DataLayout data_layout = + (data_format != "NHWC" ? DataLayout::NCHW : DataLayout::NHWC); std::vector x_vec = common::vectorize(x.dims()); std::vector out_vec = common::vectorize(out->dims()); // if channel_last, transpose to channel_first DenseTensor x_transpose; - if (data_layout == GPUDNNDataLayout::kNHWC) { + if (data_layout == DataLayout::NHWC) { if (strides.size() == 2U) { std::vector axis = {0, 3, 1, 2}; for (size_t i = 0; i < axis.size(); ++i) { @@ -390,11 +387,11 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, transformed_out.Resize(common::make_ddim(transformed_out_vec)); } - GPUDNNDataLayout layout; + DataLayout layout; if (strides.size() == 2U) { - layout = GPUDNNDataLayout::kNCHW; + layout = DataLayout::NCHW; } else { - layout = GPUDNNDataLayout::kNCDHW; + layout = DataLayout::NCDHW; } #ifdef PADDLE_WITH_CUDNN_FRONTEND @@ -447,7 +444,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, dev_ctx, &transformed_out, out, starts, ends, axes); } - if (data_layout == GPUDNNDataLayout::kNHWC) { + if (data_layout == DataLayout::NHWC) { DenseTensor out_transpose; DenseTensor out_nchw; out_nchw.ShareDataWith(*out); diff --git a/backends/iluvatar_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/iluvatar_gpu/kernels/gpudnn/softmax_gpudnn.h index 559ac826aae..da0722aa3b7 100644 --- a/backends/iluvatar_gpu/kernels/gpudnn/softmax_gpudnn.h +++ b/backends/iluvatar_gpu/kernels/gpudnn/softmax_gpudnn.h @@ -34,7 +34,6 @@ limitations under the License. */ namespace phi { using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor; -using GPUDNNDataLayout = phi::backends::gpu::DataLayout; // Vectorization trait 4 * sizeof(T) template @@ -1031,7 +1030,7 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx, const std::vector& tensor_dims, T* out_data) { auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); - GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; + DataLayout layout = DataLayout::NCHW; ScopedTensorDescriptor scoped_desc; cudnnTensorDescriptor_t desc = scoped_desc.descriptor(layout, tensor_dims); auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE @@ -1084,7 +1083,7 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx, const std::vector& tensor_dims, T* dx_data) { auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); - GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; + DataLayout layout = DataLayout::NCHW; ScopedTensorDescriptor scoped_desc; cudnnTensorDescriptor_t desc = scoped_desc.descriptor(layout, tensor_dims); auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE diff --git a/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h b/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h index 26a106c51a4..6b893012c2d 100644 --- a/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h +++ b/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h @@ -59,15 +59,15 @@ static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) { // } static inline void GetNCDHW(const DDim& dims, - const phi::DataLayout& layout, + const DataLayout& layout, int* N, int* C, int* D, int* H, int* W) { *N = dims[0]; - *C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - int i = layout == phi::DataLayout::kNCHW ? 0 : 1; + *C = layout == DataLayout::NCHW ? dims[1] : dims[dims.size() - 1]; + int i = layout == DataLayout::NCHW ? 0 : 1; if (dims.size() == 5) { *D = dims[2 - i]; *H = dims[3 - i]; From 8c2b20bc14a22da74d9d787899fe9f5e618e1d5c Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Tue, 2 Dec 2025 09:17:01 +0000 Subject: [PATCH 16/22] restore cmake --- backends/metax_gpu/cmake/paddle.cmake | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backends/metax_gpu/cmake/paddle.cmake b/backends/metax_gpu/cmake/paddle.cmake index 5131ccbeb91..70420a00f96 100644 --- a/backends/metax_gpu/cmake/paddle.cmake +++ b/backends/metax_gpu/cmake/paddle.cmake @@ -80,11 +80,12 @@ if(paddle_submodule) else() get_filename_component(REPO_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../" ABSOLUTE) - # message( STATUS "Run 'git submodule update --init Paddle' in - # ${REPO_SOURCE_DIR}") execute_process( COMMAND git submodule update --init - # Paddle WORKING_DIRECTORY ${REPO_SOURCE_DIR} RESULT_VARIABLE result_var) - # if(NOT result_var EQUAL 0) message(FATAL_ERROR "Failed to get submodule - # Paddle', please check your network !") endif() + message( + STATUS "Run 'git submodule update --init Paddle' in ${REPO_SOURCE_DIR}") + # execute_process( COMMAND git submodule update --init Paddle + # WORKING_DIRECTORY ${REPO_SOURCE_DIR} RESULT_VARIABLE result_var) if(NOT + # result_var EQUAL 0) message( FATAL_ERROR "Failed to get submodule Paddle', + # please check your network !" 
) endif() get_filename_component(PADDLE_SOURCE_DIR "${REPO_SOURCE_DIR}/Paddle" ABSOLUTE) message(STATUS "PADDLE_SOURCE_DIR=${PADDLE_SOURCE_DIR}") From 9ca92e93ee69579c6e1ce96f9239a43290068318 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Tue, 2 Dec 2025 10:16:35 +0000 Subject: [PATCH 17/22] fix patch --- backends/iluvatar_gpu/patches/paddle-corex.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/iluvatar_gpu/patches/paddle-corex.patch b/backends/iluvatar_gpu/patches/paddle-corex.patch index 2d5918759a0..db9b747de20 100644 --- a/backends/iluvatar_gpu/patches/paddle-corex.patch +++ b/backends/iluvatar_gpu/patches/paddle-corex.patch @@ -954,7 +954,7 @@ diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kern index 6f656ca7be..db87d15919 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu -@@ -296,16 +296,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in, +@@ -305,16 +305,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in, &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_h); // top_left_index is just input_index. From fd3056c94b4682bd0eaec5367d24c0839686e350 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 2 Dec 2025 18:59:16 +0800 Subject: [PATCH 18/22] [Metax] Fix DNN-related bugs (#194) [Metax] Fix DNN-related bugs (#194) --- backends/metax_gpu/compile.sh | 2 +- .../cuda_kernels/anchor_generator_kernel.cu | 24 +++++++ .../interpolate_grad_kernel_register.cu | 13 ++++ .../gpudnn/conv_grad_kernel_register.cu | 70 +++++++++---------- .../kernels/gpudnn/conv_kernel_register.cu | 51 ++++++-------- .../conv_transpose_grad_kernel_register.cu | 47 +++++-------- .../kernels/gpudnn/conv_transpose_kernel.cu | 25 +++---- .../metax_gpu/kernels/gpudnn/cudnn_helper.h | 17 ++++- .../metax_gpu/kernels/gpudnn/pool_gpudnn.h | 12 ++-- .../kernels/gpudnn/pool_kernel_register.cu | 6 +- .../metax_gpu/kernels/gpudnn/softmax_gpudnn.h | 5 +- .../metax_gpu/kernels/impl/conv_cudnn_impl.h | 40 +++++------ .../cross_entropy_kernel_register.cu | 4 +- 13 files changed, 168 insertions(+), 148 deletions(-) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/anchor_generator_kernel.cu diff --git a/backends/metax_gpu/compile.sh b/backends/metax_gpu/compile.sh index 20e888ef4d4..bdc6269a60b 100644 --- a/backends/metax_gpu/compile.sh +++ b/backends/metax_gpu/compile.sh @@ -35,6 +35,6 @@ make_maca -j18 echo "install whl" -pip install dist/paddle_metax_gpu*.whl --force-reinstall +pip install dist/paddle_metax_gpu-${PADDLE_VERSION}*.whl --force-reinstall cd .. echo "Done!" diff --git a/backends/metax_gpu/kernels/cuda_kernels/anchor_generator_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/anchor_generator_kernel.cu new file mode 100644 index 00000000000..7a467543274 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/anchor_generator_kernel.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/anchor_generator_kernel_impl.h" +#include "paddle/phi/kernels/legacy/gpu/anchor_generator_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(anchor_generator, + metax_gpu, + ALL_LAYOUT, + phi::AnchorGeneratorOpCUDAKernel, + float, + double) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/interpolate_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/interpolate_grad_kernel_register.cu index ce71e380b57..a280ac99d15 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/interpolate_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/interpolate_grad_kernel_register.cu @@ -13,8 +13,21 @@ // limitations under the License. #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/interpolate_grad_kernel.cu" //NOLINT #include "paddle/phi/kernels/interpolate_grad_kernel.h" +PD_CUSTOM_KERNEL_REGISTER(interp_antialias_grad, + metax_gpu, + ALL_LAYOUT, + phi::InterpolateGradKernel, + float, + double, + phi::float16, + phi::bfloat16) { + kernel->InputAt(1).SetBackend(phi::Backend::CPU); + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_CUSTOM_KERNEL_REGISTER(bilinear_interp_grad, metax_gpu, ALL_LAYOUT, diff --git a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index 2da42c7ff8c..2030f8e8d39 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -53,8 +53,8 @@ void ConvCudnnGradKernelImplV7( const std::vector& strides, const std::vector& padding_common, const std::vector& dilations, - phi::backends::gpu::DataLayout compute_format, - phi::backends::gpu::DataLayout layout, + DataLayout compute_format, + DataLayout layout, bool use_addto, bool exhaustive_search, bool deterministic, @@ -99,16 +99,16 @@ void ConvCudnnGradKernelImplV7( int i_n, i_c, i_d, i_h, i_w; int o_n, o_c, o_d, o_h, o_w; - if (compute_format == phi::backends::gpu::DataLayout::kNHWC) { + if (compute_format == DataLayout::NHWC) { GetNCDHW(transformed_input->dims(), - phi::backends::gpu::DataLayout::kNHWC, + DataLayout::NHWC, &i_n, &i_c, &i_d, &i_h, &i_w); GetNCDHW(transformed_output_grad_channel->dims(), - phi::backends::gpu::DataLayout::kNHWC, + DataLayout::NHWC, &o_n, &o_c, &o_d, @@ -116,14 +116,14 @@ void ConvCudnnGradKernelImplV7( &o_w); } else { GetNCDHW(transformed_input->dims(), - phi::backends::gpu::DataLayout::kNCHW, + DataLayout::NHWC, &i_n, &i_c, &i_d, &i_h, &i_w); GetNCDHW(transformed_output_grad_channel->dims(), - phi::backends::gpu::DataLayout::kNCHW, + DataLayout::NHWC, &o_n, &o_c, &o_d, @@ -350,7 +350,7 @@ void ConvCudnnGradKernelImplV8( const std::vector& strides, const std::vector& padding_common, const std::vector& dilations, - phi::backends::gpu::DataLayout layout, + DataLayout layout, bool use_addto, bool exhaustive_search, bool deterministic, @@ -469,7 +469,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = phi::backends::gpu::DataLayout::kNCHW; + auto compute_format = DataLayout::NCHW; #else #if CUDNN_VERSION_MIN(8, 1, 0) const bool compute_in_nhwc = @@ -479,14 +479,12 @@ void ConvCudnnGradKernel(const Context& dev_ctx, const bool 
compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); #endif - auto compute_format = compute_in_nhwc && channel_last - ? phi::backends::gpu::DataLayout::kNHWC - : phi::backends::gpu::DataLayout::kNCHW; + auto compute_format = + compute_in_nhwc && channel_last ? DataLayout::NHWC : DataLayout::NCHW; #endif VLOG(3) << "Compute ConvGradOp with cuDNN:" << " data_format=" << data_format << " compute_format=" - << (compute_format == phi::backends::gpu::DataLayout::kNHWC ? "NHWC" - : "NCHW"); + << (compute_format == DataLayout::NHWC ? "NHWC" : "NCHW"); // transform Tensor DenseTensor transformed_input_channel(input.type()); @@ -495,7 +493,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, DenseTensor transformed_filter_channel(filter.type()); DenseTensor transformed_filter_grad_channel(filter.type()); - if (channel_last && compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (channel_last && compute_format == DataLayout::NCHW) { VLOG(3) << "Transform input, output_grad, input_grad and tensor from " "NHWC to NCHW."; ResizeToChannelFirst( @@ -526,7 +524,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, } } - if (compute_format == phi::backends::gpu::DataLayout::kNHWC) { + if (compute_format == DataLayout::NHWC) { VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; ResizeToChannelLast( dev_ctx, &filter, &transformed_filter_channel); @@ -549,7 +547,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, auto filter_dims = transformed_filter_channel.dims(); DDim in_data_dims; DDim filter_data_dims; - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); } else { @@ -574,7 +572,7 @@ void ConvCudnnGradKernel(const Context& dev_ctx, std::vector padding_diff(data_dim); std::vector new_input_shape_vec(data_dim + 2); new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { new_input_shape_vec[1] = transformed_input_channel.dims()[1]; } else { new_input_shape_vec[data_dim + 1] = @@ -584,14 +582,14 @@ void ConvCudnnGradKernel(const Context& dev_ctx, for (size_t i = 0; i < data_dim; ++i) { padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { new_input_shape_vec[i + 2] = transformed_input_channel.dims()[i + 2] + padding_diff[i]; } else { new_input_shape_vec[i + 1] = transformed_input_channel.dims()[i + 1] + padding_diff[i]; } - if (compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (compute_format == DataLayout::NCHW) { input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; } else { @@ -645,14 +643,12 @@ void ConvCudnnGradKernel(const Context& dev_ctx, } } } - phi::backends::gpu::DataLayout layout = - compute_format == phi::backends::gpu::DataLayout::kNHWC - ? phi::backends::gpu::DataLayout::kNHWC - : phi::backends::gpu::DataLayout::kNCHW; + DataLayout layout = + compute_format == DataLayout::NHWC ? DataLayout::NHWC : DataLayout::NCHW; + if (transformed_input.dims().size() == 5) { - layout = compute_format == phi::backends::gpu::DataLayout::kNHWC - ? 
phi::backends::gpu::DataLayout::kNDHWC - : phi::backends::gpu::DataLayout::kNCDHW; + layout = compute_format == DataLayout::NHWC ? DataLayout::NDHWC + : DataLayout::NCDHW; } CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_input); CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_filter_channel); @@ -740,15 +736,14 @@ void ConvCudnnGradKernel(const Context& dev_ctx, } } - if (channel_last && - compute_format == phi::backends::gpu::DataLayout::kNCHW) { + if (channel_last && compute_format == DataLayout::NCHW) { TransToChannelLast( dev_ctx, &transformed_input_grad_channel, input_grad); } } if (filter_grad) { - if (compute_format == phi::backends::gpu::DataLayout::kNHWC) { + if (compute_format == DataLayout::NHWC) { TransToChannelFirst( dev_ctx, &transformed_filter_grad_channel, filter_grad); } @@ -1011,8 +1006,7 @@ void ConvCudnnGradGradKernel( // auto handle = dev_ctx.cudnn_handle(); auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); - auto layout = phi::backends::gpu::GetCudnnTensorFormat( - phi::backends::gpu::DataLayout::kNCHW); + auto layout = phi::backends::gpu::GetCudnnTensorFormat(DataLayout::NCHW); ConvArgs args1{handle, &transformed_ddX, @@ -1023,7 +1017,7 @@ void ConvCudnnGradGradKernel( dilations, dtype, groups, - phi::backends::gpu::DataLayout::kNCHW}; + DataLayout::NCHW}; ConvArgs args2{handle, &transformed_X, ddW, @@ -1033,7 +1027,7 @@ void ConvCudnnGradGradKernel( dilations, dtype, groups, - phi::backends::gpu::DataLayout::kNCHW}; + DataLayout::NCHW}; ConvArgs args3{handle, &transformed_ddX, dW, @@ -1043,7 +1037,7 @@ void ConvCudnnGradGradKernel( dilations, dtype, groups, - phi::backends::gpu::DataLayout::kNCHW}; + DataLayout::NCHW}; ConvArgs args4{handle, &transformed_dX, ddW, @@ -1053,7 +1047,7 @@ void ConvCudnnGradGradKernel( dilations, dtype, groups, - phi::backends::gpu::DataLayout::kNCHW}; + DataLayout::NCHW}; #ifdef PADDLE_WITH_HIP SearchResult fwd_result1; @@ -1179,11 +1173,11 @@ void ConvCudnnGradGradKernel( int i_n, i_c, i_d, i_h, i_w; GetNCDHW( - transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + transformed_X.dims(), DataLayout::NCHW, &i_n, &i_c, &i_d, &i_h, &i_w); int o_n, o_c, o_d, o_h, o_w; GetNCDHW(transformed_dO_channel.dims(), - DataLayout::kNCHW, + DataLayout::NCHW, &o_n, &o_c, &o_d, diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index d6b243c956c..cabf1343918 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -47,8 +47,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, const std::vector& strides, const std::vector& padding_common, const std::vector& dilations, - phi::backends::gpu::DataLayout compute_format, - phi::backends::gpu::DataLayout layout, + DataLayout compute_format, + DataLayout layout, bool exhaustive_search, bool deterministic, int groups, @@ -103,16 +103,16 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, int i_n, i_c, i_d, i_h, i_w; int o_n, o_c, o_d, o_h, o_w; - if (compute_format == phi::backends::gpu::DataLayout::kNHWC) { + if (compute_format == DataLayout::NHWC) { GetNCDHW(transformed_input->dims(), - phi::backends::gpu::DataLayout::kNHWC, + DataLayout::NHWC, &i_n, &i_c, &i_d, &i_h, &i_w); GetNCDHW(transformed_output->dims(), - phi::backends::gpu::DataLayout::kNHWC, + DataLayout::NHWC, &o_n, &o_c, &o_d, @@ -120,14 +120,14 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, &o_w); } 
diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu
index d6b243c956c..cabf1343918 100644
--- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu
+++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu
@@ -47,8 +47,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
                            const std::vector<int>& strides,
                            const std::vector<int>& padding_common,
                            const std::vector<int>& dilations,
-                           phi::backends::gpu::DataLayout compute_format,
-                           phi::backends::gpu::DataLayout layout,
+                           DataLayout compute_format,
+                           DataLayout layout,
                            bool exhaustive_search,
                            bool deterministic,
                            int groups,
@@ -103,16 +103,16 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
 
   int i_n, i_c, i_d, i_h, i_w;
   int o_n, o_c, o_d, o_h, o_w;
-  if (compute_format == phi::backends::gpu::DataLayout::kNHWC) {
+  if (compute_format == DataLayout::NHWC) {
     GetNCDHW(transformed_input->dims(),
-             phi::backends::gpu::DataLayout::kNHWC,
+             DataLayout::NHWC,
              &i_n,
              &i_c,
              &i_d,
              &i_h,
              &i_w);
     GetNCDHW(transformed_output->dims(),
-             phi::backends::gpu::DataLayout::kNHWC,
+             DataLayout::NHWC,
              &o_n,
              &o_c,
              &o_d,
              &o_h,
              &o_w);
   } else {
     GetNCDHW(transformed_input->dims(),
-             phi::backends::gpu::DataLayout::kNCHW,
+             DataLayout::NCHW,
              &i_n,
              &i_c,
              &i_d,
              &i_h,
              &i_w);
     GetNCDHW(transformed_output->dims(),
-             phi::backends::gpu::DataLayout::kNCHW,
+             DataLayout::NCHW,
              &o_n,
              &o_c,
              &o_d,
@@ -251,7 +251,7 @@ void ConvCudnnKernel(const Context& dev_ctx,
 
 #ifdef PADDLE_WITH_HIP
   // HIP MIOPEN ONLY SUPPORT NCHW format
-  auto compute_format = phi::backends::gpu::DataLayout::kNCHW;
+  auto compute_format = DataLayout::NCHW;
 #else
 #if CUDNN_VERSION_MIN(8, 1, 0)
   // Tensor Core introduced from Volta GPUs supports more faster conv op
@@ -267,21 +267,19 @@ void ConvCudnnKernel(const Context& dev_ctx,
 #endif
   // We will only do data format conversion from NHWC to NCHW.
   // cudnn will convert NCHW to NHWC automatically on Tensor Core.
-  auto compute_format = compute_in_nhwc && channel_last
-                            ? phi::backends::gpu::DataLayout::kNHWC
-                            : phi::backends::gpu::DataLayout::kNCHW;
+  auto compute_format =
+      compute_in_nhwc && channel_last ? DataLayout::NHWC : DataLayout::NCHW;
 #endif
   VLOG(3) << "Compute ConvOp with cuDNN:"
           << " data_format=" << data_format << " compute_format="
-          << (compute_format == phi::backends::gpu::DataLayout::kNHWC ? "NHWC"
-                                                                      : "NCHW");
+          << (compute_format == DataLayout::NHWC ? "NHWC" : "NCHW");
 
   // ------------ transformed tensor -----------
   DenseTensor transformed_input_channel(input.type());
   DenseTensor transformed_output(output->type());
   DenseTensor transformed_filter_channel(filter.type());
-  if (channel_last && compute_format == phi::backends::gpu::DataLayout::kNCHW) {
+  if (channel_last && compute_format == DataLayout::NCHW) {
     VLOG(3) << "Transform input tensor from NHWC to NCHW.";
     ResizeToChannelFirst<Context, T>(
         dev_ctx, &input, &transformed_input_channel);
@@ -294,8 +292,7 @@ void ConvCudnnKernel(const Context& dev_ctx,
     transformed_input_channel.ShareDataWith(input);
     transformed_output.ShareDataWith(*output);
   }
-  if (compute_format == phi::backends::gpu::DataLayout::kNHWC &&
-      !FLAGS_manually_trans_conv_filter) {
+  if (compute_format == DataLayout::NHWC && !FLAGS_manually_trans_conv_filter) {
     VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
     ResizeToChannelLast<Context, T>(
         dev_ctx, &filter, &transformed_filter_channel);
@@ -311,7 +308,7 @@ void ConvCudnnKernel(const Context& dev_ctx,
   DDim in_data_dims;
   DDim filter_data_dims;
 
-  if (compute_format == phi::backends::gpu::DataLayout::kNCHW) {
+  if (compute_format == DataLayout::NCHW) {
     in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
     filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
   } else {
@@ -333,7 +330,7 @@ void ConvCudnnKernel(const Context& dev_ctx,
     std::vector<int> new_input_shape_vec(data_dim + 2);
     new_input_shape_vec[0] = transformed_input_channel.dims()[0];
 
-    if (compute_format == phi::backends::gpu::DataLayout::kNCHW) {
+    if (compute_format == DataLayout::NCHW) {
       new_input_shape_vec[1] = transformed_input_channel.dims()[1];
     } else {
       new_input_shape_vec[data_dim + 1] =
@@ -344,14 +341,14 @@ void ConvCudnnKernel(const Context& dev_ctx,
     for (size_t i = 0; i < data_dim; ++i) {
       padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
       padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-      if (compute_format == phi::backends::gpu::DataLayout::kNCHW) {
+      if (compute_format == DataLayout::NCHW) {
         new_input_shape_vec[i + 2] =
             transformed_input_channel.dims()[i + 2] + padding_diff[i];
       } else {
         new_input_shape_vec[i + 1] =
            transformed_input_channel.dims()[i + 1] + padding_diff[i];
       }
-      if (compute_format == phi::backends::gpu::DataLayout::kNCHW) {
+      if (compute_format == DataLayout::NCHW) {
        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
      } else {
@@ -397,15 +394,11 @@ void ConvCudnnKernel(const Context& dev_ctx,
       }
     }
   }
-
-  phi::backends::gpu::DataLayout layout =
-      compute_format == phi::backends::gpu::DataLayout::kNHWC
-          ? phi::backends::gpu::DataLayout::kNHWC
-          : phi::backends::gpu::DataLayout::kNCHW;
+  DataLayout layout =
+      compute_format == DataLayout::NHWC ? DataLayout::NHWC : DataLayout::NCHW;
   if (transformed_input.dims().size() == 5) {
-    layout = compute_format == phi::backends::gpu::DataLayout::kNHWC
-                 ? phi::backends::gpu::DataLayout::kNDHWC
-                 : phi::backends::gpu::DataLayout::kNCDHW;
+    layout = compute_format == DataLayout::NHWC ? DataLayout::NDHWC
+                                                : DataLayout::NCDHW;
   }
 
   CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_input);
@@ -424,7 +417,7 @@ void ConvCudnnKernel(const Context& dev_ctx,
                              groups,
                              &transformed_output);
 
-  if (channel_last && compute_format == phi::backends::gpu::DataLayout::kNCHW) {
+  if (channel_last && compute_format == DataLayout::NCHW) {
     TransToChannelLast<Context, T>(dev_ctx, &transformed_output, output);
   }
 }
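ConvCudnnKernel above folds possibly asymmetric paddings into a symmetric `padding_common` that cuDNN can consume, and pre-pads the input by the per-dimension difference. A self-contained sketch of that normalization, using plain `std::vector` stand-ins for the tensor shapes:

```cpp
// Sketch of the asymmetric-padding normalization, assuming paddings are
// stored as [before_0, after_0, before_1, after_1, ...] per spatial dim.
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> paddings = {1, 2, 0, 3};  // h: 1/2, w: 0/3 (hypothetical)
  size_t data_dim = paddings.size() / 2;
  std::vector<int> padding_diff(data_dim), padding_common(data_dim);
  for (size_t i = 0; i < data_dim; ++i) {
    padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
    padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
  }
  // padding_common is handed to cuDNN; padding_diff is added to the input
  // shape first, mirroring how new_input_shape_vec is built in the kernel.
  for (size_t i = 0; i < data_dim; ++i)
    std::cout << "dim " << i << ": common=" << padding_common[i]
              << " diff=" << padding_diff[i] << "\n";
  return 0;
}
```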
diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
index b7eebfcee2e..151058f209d 100644
--- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
@@ -35,8 +35,6 @@ limitations under the License. */
 
 namespace phi {
 
-using GPUDNNDataLayout = phi::backends::gpu::DataLayout;
-
 template <typename T, typename Context>
 void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx,
                                       const DenseTensor& x,
@@ -74,16 +72,15 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx,
   std::vector<int> paddings_ = paddings;
   std::vector<int> dilations_ = dilations;  // cudnn v5 does not support dilations
-  const GPUDNNDataLayout data_layout =
-      (data_format != "NHWC" ? GPUDNNDataLayout::kNCHW
-                             : GPUDNNDataLayout::kNHWC);
+  const DataLayout data_layout =
+      (data_format != "NHWC" ? DataLayout::NCHW : DataLayout::NHWC);
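The channel-last branches in these transpose-conv kernels reorder dimensions with the fixed axis maps {0, 3, 1, 2} (NHWC to NCHW) and {0, 4, 1, 2, 3} (NDHWC to NCDHW). A small sketch of that permutation, with hypothetical example shapes:

```cpp
// Sketch of the dimension shuffle performed when data_layout is
// channel-last; the shapes below are illustrative only.
#include <cstdio>
#include <vector>

std::vector<int> Permute(const std::vector<int>& dims,
                         const std::vector<int>& axis) {
  std::vector<int> out(dims.size());
  for (size_t i = 0; i < axis.size(); ++i) out[i] = dims[axis[i]];
  return out;
}

int main() {
  std::vector<int> nhwc = {8, 32, 32, 3};  // N H W C
  std::vector<int> nchw = Permute(nhwc, {0, 3, 1, 2});
  std::printf("%d %d %d %d\n", nchw[0], nchw[1], nchw[2], nchw[3]);  // 8 3 32 32
  std::vector<int> ndhwc = {8, 4, 32, 32, 3};  // N D H W C
  std::vector<int> ncdhw = Permute(ndhwc, {0, 4, 1, 2, 3});
  std::printf("%d %d %d %d %d\n",
              ncdhw[0], ncdhw[1], ncdhw[2], ncdhw[3], ncdhw[4]);  // 8 3 4 32 32
  return 0;
}
```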
 
   // if channel_last, transpose to channel_first
   DenseTensor x_transpose;
   DenseTensor dout_transpose;
   std::vector<int> x_vec = common::vectorize<int>(x.dims());
   std::vector<int> out_vec = common::vectorize<int>(dout.dims());
-  if (data_layout == GPUDNNDataLayout::kNHWC) {
+  if (data_layout == DataLayout::NHWC) {
     if (strides.size() == 2U) {
       std::vector<int> axis = {0, 3, 1, 2};
       for (size_t i = 0; i < axis.size(); ++i) {
@@ -179,12 +176,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx,
   CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(x_transpose);
 #endif
 
-  GPUDNNDataLayout layout;
+  DataLayout layout;
 
   if (strides.size() == 2U) {
-    layout = GPUDNNDataLayout::kNCHW;
+    layout = DataLayout::NCHW;
   } else {
-    layout = GPUDNNDataLayout::kNCDHW;
+    layout = DataLayout::NCDHW;
   }
 
   int iwo_groups = groups;
@@ -333,7 +330,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx,
                 false);
 #endif  // PADDLE_WITH_HIP
 
-  if (data_layout == GPUDNNDataLayout::kNHWC) {
+  if (data_layout == DataLayout::NHWC) {
     DenseTensor dx_transpose;
     DenseTensor dx_nchw;
     dx_nchw.ShareDataWith(*dx);
@@ -655,8 +652,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   // auto handle = dev_ctx.cudnn_handle();
   auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace());
 
-  auto layout =
-      phi::backends::gpu::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
+  auto layout = phi::backends::gpu::GetCudnnTensorFormat(DataLayout::NCHW);
 
   ConvArgs args1{handle,
                  &transformed_ddout_channel,
@@ -667,7 +663,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  dilations_,
                  dtype,
                  groups,
-                 GPUDNNDataLayout::kNCHW};
+                 DataLayout::NCHW};
   ConvArgs args2{handle,
                  &transformed_ddout_channel,
                  &ddfilter,
@@ -677,7 +673,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  dilations_,
                  dtype,
                  groups,
-                 GPUDNNDataLayout::kNCHW};
+                 DataLayout::NCHW};
 
   ConvArgs args3{handle,
                  &transformed_dout,
@@ -688,7 +684,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  dilations_,
                  dtype,
                  groups,
-                 GPUDNNDataLayout::kNCHW};
+                 DataLayout::NCHW};
   ConvArgs args4{handle,
                  &transformed_dout,
                  &ddfilter,
@@ -698,7 +694,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  dilations_,
                  dtype,
                  groups,
-                 GPUDNNDataLayout::kNCHW};
+                 DataLayout::NCHW};
 #ifdef PADDLE_WITH_HIP
   SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result1;
   SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result2;
@@ -817,23 +813,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   }
 
   int i_n, i_c, i_d, i_h, i_w;
-  GetNCDHW(transformed_x.dims(),
-           GPUDNNDataLayout::kNCHW,
-           &i_n,
-           &i_c,
-           &i_d,
-           &i_h,
-           &i_w);
+  GetNCDHW(
+      transformed_x.dims(), DataLayout::NCHW, &i_n, &i_c, &i_d, &i_h, &i_w);
 
   int o_n, o_c, o_d, o_h, o_w;
-  GetNCDHW(transformed_dout.dims(),
-           GPUDNNDataLayout::kNCHW,
-           &o_n,
-           &o_c,
-           &o_d,
-           &o_h,
-           &o_w);
-
+  GetNCDHW(
+      transformed_dout.dims(), DataLayout::NCHW, &o_n, &o_c, &o_d, &o_h, &o_w);
   int group_offset_in =
       transformed_x.numel() / transformed_x.dims()[0] / groups;
   int group_offset_out =
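Both transpose-conv paths pick the layout tag from the stride rank rather than from the tensor itself: two spatial strides mean a 4-D (NCHW) problem, anything else is treated as 5-D (NCDHW). A minimal sketch with a stand-in enum:

```cpp
// Sketch of the stride-rank rule used by the transpose-conv kernels;
// the enum is a local stand-in for phi::DataLayout.
#include <cassert>
#include <vector>

enum class DataLayout { NCHW, NCDHW };

DataLayout PickLayout(const std::vector<int>& strides) {
  return strides.size() == 2U ? DataLayout::NCHW : DataLayout::NCDHW;
}

int main() {
  assert(PickLayout({1, 1}) == DataLayout::NCHW);      // conv2d_transpose
  assert(PickLayout({1, 1, 1}) == DataLayout::NCDHW);  // conv3d_transpose
  return 0;
}
```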
diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu
index 4049d2f3130..02c6dcace09 100644
--- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu
+++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu
@@ -46,8 +46,6 @@ limitations under the License. */
 
 namespace phi {
 
-using GPUDNNDataLayout = phi::backends::gpu::DataLayout;
-
 template <typename T, typename Context>
 void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x,
                                     const DenseTensor* filter,
@@ -55,8 +53,8 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x,
                                     const std::vector<int>& strides,
                                     const std::vector<int>& padding_common,
                                     const std::vector<int>& dilations_,
-                                    GPUDNNDataLayout data_layout,
-                                    GPUDNNDataLayout layout,
+                                    DataLayout data_layout,
+                                    DataLayout layout,
                                     bool exhaustive_search,
                                     bool deterministic,
                                     int groups,
@@ -170,8 +168,8 @@ void ConvTransposeCudnnKernelImplV8(const DenseTensor* transformed_x,
                                     const std::vector<int>& strides,
                                     const std::vector<int>& padding_common,
                                     const std::vector<int>& dilations_,
-                                    GPUDNNDataLayout data_layout,
-                                    GPUDNNDataLayout layout,
+                                    DataLayout data_layout,
+                                    DataLayout layout,
                                     bool exhaustive_search,
                                     bool deterministic,
                                     int groups,
@@ -277,14 +275,13 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx,
   std::vector<int> paddings_ = paddings;
   std::vector<int> dilations_ = dilations;
 
-  const GPUDNNDataLayout data_layout =
-      (data_format != "NHWC" ? GPUDNNDataLayout::kNCHW
-                             : GPUDNNDataLayout::kNHWC);
+  const DataLayout data_layout =
+      (data_format != "NHWC" ? DataLayout::NCHW : DataLayout::NHWC);
   std::vector<int> x_vec = common::vectorize<int>(x.dims());
   std::vector<int> out_vec = common::vectorize<int>(out->dims());
   // if channel_last, transpose to channel_first
   DenseTensor x_transpose;
-  if (data_layout == GPUDNNDataLayout::kNHWC) {
+  if (data_layout == DataLayout::NHWC) {
     if (strides.size() == 2U) {
       std::vector<int> axis = {0, 3, 1, 2};
       for (size_t i = 0; i < axis.size(); ++i) {
@@ -393,11 +390,11 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx,
     transformed_out.Resize(common::make_ddim(transformed_out_vec));
   }
 
-  GPUDNNDataLayout layout;
+  DataLayout layout;
 
   if (strides.size() == 2U) {
-    layout = GPUDNNDataLayout::kNCHW;
+    layout = DataLayout::NCHW;
   } else {
-    layout = GPUDNNDataLayout::kNCDHW;
+    layout = DataLayout::NCDHW;
   }
 
 #ifdef PADDLE_WITH_CUDNN_FRONTEND
@@ -450,7 +447,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx,
         dev_ctx, &transformed_out, out, starts, ends, axes);
   }
 
-  if (data_layout == GPUDNNDataLayout::kNHWC) {
+  if (data_layout == DataLayout::NHWC) {
     DenseTensor out_transpose;
     DenseTensor out_nchw;
     out_nchw.ShareDataWith(*out);
diff --git a/backends/metax_gpu/kernels/gpudnn/cudnn_helper.h b/backends/metax_gpu/kernels/gpudnn/cudnn_helper.h
index 9ca55518572..4e38ea33fc4 100644
--- a/backends/metax_gpu/kernels/gpudnn/cudnn_helper.h
+++ b/backends/metax_gpu/kernels/gpudnn/cudnn_helper.h
@@ -190,7 +190,22 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat(
   }
   return CUDNN_TENSOR_NCHW;
 }
-
+inline cudnnTensorFormat_t GetCudnnTensorFormat(const phi::DataLayout& order) {
+  switch (order) {
+    case phi::DataLayout::NHWC:
+      return CUDNN_TENSOR_NHWC;
+    case phi::DataLayout::NCHW:
+      return CUDNN_TENSOR_NCHW;
+    case phi::DataLayout::NCDHW:
+      return CUDNN_TENSOR_NCHW;  // NOTE: cudnn treat NdTensor as the same
+    case phi::DataLayout::NDHWC:
+      return CUDNN_TENSOR_NHWC;
+    default:
+      PADDLE_THROW(common::errors::Unimplemented(
+          "CUDNN has no equivalent dataLayout for input order."));
+  }
+  return CUDNN_TENSOR_NCHW;
+}
 class ScopedTensorDescriptor {
  public:
  ScopedTensorDescriptor() {
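The `GetCudnnTensorFormat` overload added above collapses the 5-D layouts onto the 4-D cuDNN enumerators, since cuDNN exposes no separate NCDHW/NDHWC tags. A standalone sketch of the same mapping; `cudnnTensorFormat_t` is mocked here so the example builds without the cuDNN headers:

```cpp
// Sketch only: mirrors the layout -> format mapping of the new overload.
#include <cassert>
#include <stdexcept>

enum cudnnTensorFormat_t { CUDNN_TENSOR_NCHW, CUDNN_TENSOR_NHWC };  // mock
enum class DataLayout { NCHW, NHWC, NCDHW, NDHWC, ANY };

cudnnTensorFormat_t GetCudnnTensorFormat(DataLayout order) {
  switch (order) {
    case DataLayout::NHWC:
    case DataLayout::NDHWC:  // cuDNN treats Nd tensors like their 4-D kin
      return CUDNN_TENSOR_NHWC;
    case DataLayout::NCHW:
    case DataLayout::NCDHW:
      return CUDNN_TENSOR_NCHW;
    default:
      throw std::invalid_argument("no cuDNN equivalent for this layout");
  }
  return CUDNN_TENSOR_NCHW;  // unreachable; mirrors the patched helper
}

int main() {
  assert(GetCudnnTensorFormat(DataLayout::NDHWC) == CUDNN_TENSOR_NHWC);
  assert(GetCudnnTensorFormat(DataLayout::NCDHW) == CUDNN_TENSOR_NCHW);
  return 0;
}
```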
under the License. */
 
 namespace phi {
 
-using GPUDNNDataLayout = phi::backends::gpu::DataLayout;
+// using GPUDNNDataLayout = phi::backends::gpu::DataLayout;
 using PoolingMode = phi::backends::gpu::PoolingMode;
 using ScopedPoolingDescriptor = phi::backends::gpu::ScopedPoolingDescriptor;
 using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor;
@@ -52,15 +52,15 @@ class CudnnIndexType {
 #endif
 };
 
-inline GPUDNNDataLayout GetLayoutFromStr(std::string data_format) {
+inline DataLayout GetLayoutFromStr(std::string data_format) {
   if (data_format == "NHWC") {
-    return GPUDNNDataLayout::kNHWC;
+    return DataLayout::NHWC;
   } else if (data_format == "NCHW") {
-    return GPUDNNDataLayout::kNCHW;
+    return DataLayout::NCHW;
   } else if (data_format == "NCDHW") {
-    return GPUDNNDataLayout::kNCDHW;
+    return DataLayout::NCDHW;
   } else {
-    return GPUDNNDataLayout::kNCDHW;
+    return DataLayout::NCDHW;
   }
 }
diff --git a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
index 1c2bfeedf34..d03f9f8238e 100644
--- a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
+++ b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
@@ -107,10 +107,10 @@ void PoolRawGPUDNNKernel(const Context& dev_ctx,
   DenseTensor transformed_input(input->type());
   DenseTensor transformed_output(output->type());
 
-  GPUDNNDataLayout layout;
+  DataLayout layout;
 
   if (data_format == str_NDHWC) {
-    layout = GPUDNNDataLayout::kNCDHW;
+    layout = DataLayout::NCDHW;
     std::vector<int> axis{0, 4, 1, 2, 3};
 
     // input
@@ -139,7 +139,7 @@ void PoolRawGPUDNNKernel(const Context& dev_ctx,
 #ifdef PADDLE_WITH_HIP
   // MIOPEN not support NHWC data layout
   } else if (data_format == str_NHWC) {
-    layout = GPUDNNDataLayout::kNCHW;
+    layout = DataLayout::NCHW;
 
     std::vector<int> axis{0, 3, 1, 2};
diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
index 5844886ad1b..bf7f39a9127 100644
--- a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
+++ b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
@@ -35,7 +35,6 @@ namespace phi {
 
 using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor;
-using GPUDNNDataLayout = phi::backends::gpu::DataLayout;
 
 // Vectorization trait 4 * sizeof(T)
 template <typename T>
@@ -1035,7 +1034,7 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
                                T* out_data) {
   // auto handle = dev_ctx.cudnn_handle();
   auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace());
-  GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+  DataLayout layout = DataLayout::NCHW;
   ScopedTensorDescriptor scoped_desc;
 #ifdef PADDLE_WITH_HIP
@@ -1108,7 +1107,7 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
                                 T* dx_data) {
   // auto handle = dev_ctx.cudnn_handle();
   auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace());
-  GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+  DataLayout layout = DataLayout::NCHW;
   ScopedTensorDescriptor scoped_desc;
 #ifdef PADDLE_WITH_HIP
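`GetLayoutFromStr` in pool_gpudnn.h maps the user-facing format string onto the layout enum, with NCDHW doubling as the fallback for unrecognized strings, matching the pre-rename `kNCDHW` behavior. A minimal sketch under that assumption:

```cpp
// Sketch of the string -> layout helper; the enum is a local stand-in.
#include <cassert>
#include <string>

enum class DataLayout { NCHW, NHWC, NCDHW };

DataLayout GetLayoutFromStr(const std::string& data_format) {
  if (data_format == "NHWC") return DataLayout::NHWC;
  if (data_format == "NCHW") return DataLayout::NCHW;
  return DataLayout::NCDHW;  // "NCDHW" and any unrecognized string
}

int main() {
  assert(GetLayoutFromStr("NHWC") == DataLayout::NHWC);
  assert(GetLayoutFromStr("NCDHW") == DataLayout::NCDHW);
  return 0;
}
```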
layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
-  int i = layout == phi::DataLayout::kNCHW ? 0 : 1;
-  if (dims.size() == 5) {
-    *D = dims[2 - i];
-    *H = dims[3 - i];
-    *W = dims[4 - i];
-  } else {
-    *D = 1;
-    *H = dims[2 - i];
-    *W = dims[3 - i];
-  }
-}
+// static inline void GetNCDHW(const DDim& dims,
+//                             const phi::DataLayout& layout,
+//                             int* N,
+//                             int* C,
+//                             int* D,
+//                             int* H,
+//                             int* W) {
+//   *N = dims[0];
+//   *C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
+//   int i = layout == phi::DataLayout::kNCHW ? 0 : 1;
+//   if (dims.size() == 5) {
+//     *D = dims[2 - i];
+//     *H = dims[3 - i];
+//     *W = dims[4 - i];
+//   } else {
+//     *D = 1;
+//     *H = dims[2 - i];
+//     *W = dims[3 - i];
+//   }
+// }
 
 }  // namespace phi
diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu
index 043a64dc149..ef6c3ddd3ba 100644
--- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu
@@ -776,7 +776,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
   } else {
     ScopedTensorDescriptor desc;
     std::vector<int> tensor_dims = {N, dim, D, 1};
-    GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+    DataLayout layout = DataLayout::NCHW;
 #ifdef PADDLE_WITH_HIP
     miopenTensorDescriptor_t descp = desc.descriptor<T>(layout, tensor_dims);
     auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace());
@@ -1202,7 +1202,7 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx,
   auto* softmax_data = softmax->data<T>();
   ScopedTensorDescriptor desc;
   std::vector<int> tensor_dims = {N, dim, D, 1};
-  GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+  DataLayout layout = DataLayout::NCHW;
 #ifdef PADDLE_WITH_HIP
   miopenTensorDescriptor_t descp = desc.descriptor<T>(layout, tensor_dims);
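Both conv_cudnn_impl.h headers retire their local `GetNCDHW` copy; a standalone version of what that helper computes, taken from the commented-out body above and exercised with illustrative 4-D and 5-D shapes:

```cpp
// Self-contained GetNCDHW: splits a dims vector into N/C/D/H/W given the
// layout, defaulting D to 1 for 4-D tensors.
#include <cassert>
#include <vector>

enum class DataLayout { NCHW, NHWC };

void GetNCDHW(const std::vector<int>& dims, DataLayout layout,
              int* N, int* C, int* D, int* H, int* W) {
  *N = dims[0];
  *C = layout == DataLayout::NCHW ? dims[1] : dims[dims.size() - 1];
  int i = layout == DataLayout::NCHW ? 0 : 1;
  if (dims.size() == 5) {
    *D = dims[2 - i]; *H = dims[3 - i]; *W = dims[4 - i];
  } else {
    *D = 1; *H = dims[2 - i]; *W = dims[3 - i];
  }
}

int main() {
  int n, c, d, h, w;
  GetNCDHW({8, 3, 32, 32}, DataLayout::NCHW, &n, &c, &d, &h, &w);
  assert(n == 8 && c == 3 && d == 1 && h == 32 && w == 32);
  GetNCDHW({8, 4, 32, 32, 3}, DataLayout::NHWC, &n, &c, &d, &h, &w);
  assert(n == 8 && c == 3 && d == 4 && h == 32 && w == 32);
  return 0;
}
```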
-d "build_pip" ]]; then diff --git a/backends/iluvatar_gpu/patches/paddle-corex.patch b/backends/iluvatar_gpu/patches/paddle-corex.patch index db9b747de20..bf816803061 100644 --- a/backends/iluvatar_gpu/patches/paddle-corex.patch +++ b/backends/iluvatar_gpu/patches/paddle-corex.patch @@ -951,7 +951,7 @@ index f3a2874b92..47d2c0b4d7 100644 dim3 grid_size = dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu -index 6f656ca7be..db87d15919 100644 +index d6c105e1b2..6d810ab0e6 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -305,16 +305,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in, @@ -975,9 +975,8 @@ index 6f656ca7be..db87d15919 100644 - int64_t remain = nthreads - (tid & (-static_cast(blockDim.x))); + int64_t remain = nthreads - (tid & (-blockDim.x)); int64_t in_top_max_index = - phi::funcs::BlockReduceMax(top_right_index, FINAL_MASK); + funcs::BlockReduceMax(top_right_index, FINAL_MASK); int64_t in_bot_max_index = - diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index 2645060f4c..6a38e20776 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu From 1fa9987af3c7c0f4dd6bde549427ae89a6292084 Mon Sep 17 00:00:00 2001 From: geyuqiang Date: Tue, 2 Dec 2025 20:54:32 +0800 Subject: [PATCH 20/22] fix compile error --- .../kernels/impl/conv_cudnn_impl.h | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h b/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h index 6b893012c2d..69b3b61ef65 100644 --- a/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h +++ b/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h @@ -58,26 +58,26 @@ static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) { // return CUDNN_TENSOR_NCHW; // } -static inline void GetNCDHW(const DDim& dims, - const DataLayout& layout, - int* N, - int* C, - int* D, - int* H, - int* W) { - *N = dims[0]; - *C = layout == DataLayout::NCHW ? dims[1] : dims[dims.size() - 1]; - int i = layout == DataLayout::NCHW ? 0 : 1; - if (dims.size() == 5) { - *D = dims[2 - i]; - *H = dims[3 - i]; - *W = dims[4 - i]; - } else { - *D = 1; - *H = dims[2 - i]; - *W = dims[3 - i]; - } -} +// static inline void GetNCDHW(const DDim& dims, +// const DataLayout& layout, +// int* N, +// int* C, +// int* D, +// int* H, +// int* W) { +// *N = dims[0]; +// *C = layout == DataLayout::NCHW ? dims[1] : dims[dims.size() - 1]; +// int i = layout == DataLayout::NCHW ? 
From 1fa9987af3c7c0f4dd6bde549427ae89a6292084 Mon Sep 17 00:00:00 2001
From: geyuqiang
Date: Tue, 2 Dec 2025 20:54:32 +0800
Subject: [PATCH 20/22] fix compile error

---
 .../kernels/impl/conv_cudnn_impl.h | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h b/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h
index 6b893012c2d..69b3b61ef65 100644
--- a/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h
+++ b/backends/iluvatar_gpu/kernels/impl/conv_cudnn_impl.h
@@ -58,26 +58,26 @@ static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) {
 //   return CUDNN_TENSOR_NCHW;
 // }
 
-static inline void GetNCDHW(const DDim& dims,
-                            const DataLayout& layout,
-                            int* N,
-                            int* C,
-                            int* D,
-                            int* H,
-                            int* W) {
-  *N = dims[0];
-  *C = layout == DataLayout::NCHW ? dims[1] : dims[dims.size() - 1];
-  int i = layout == DataLayout::NCHW ? 0 : 1;
-  if (dims.size() == 5) {
-    *D = dims[2 - i];
-    *H = dims[3 - i];
-    *W = dims[4 - i];
-  } else {
-    *D = 1;
-    *H = dims[2 - i];
-    *W = dims[3 - i];
-  }
-}
+// static inline void GetNCDHW(const DDim& dims,
+//                             const DataLayout& layout,
+//                             int* N,
+//                             int* C,
+//                             int* D,
+//                             int* H,
+//                             int* W) {
+//   *N = dims[0];
+//   *C = layout == DataLayout::NCHW ? dims[1] : dims[dims.size() - 1];
+//   int i = layout == DataLayout::NCHW ? 0 : 1;
+//   if (dims.size() == 5) {
+//     *D = dims[2 - i];
+//     *H = dims[3 - i];
+//     *W = dims[4 - i];
+//   } else {
+//     *D = 1;
+//     *H = dims[2 - i];
+//     *W = dims[3 - i];
+//   }
+// }
 
 }  // namespace phi

From f776ed08af7ea046d0a9a3be9a9d25407f794785 Mon Sep 17 00:00:00 2001
From: geyuqiang
Date: Wed, 3 Dec 2025 10:45:52 +0800
Subject: [PATCH 21/22] fix scaled_dot_product_attention

---
 .../tests/unittests/test_flash_attention_iluvatar.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/backends/iluvatar_gpu/tests/unittests/test_flash_attention_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_flash_attention_iluvatar.py
index 7a2eeff7647..3426231ccad 100644
--- a/backends/iluvatar_gpu/tests/unittests/test_flash_attention_iluvatar.py
+++ b/backends/iluvatar_gpu/tests/unittests/test_flash_attention_iluvatar.py
@@ -23,6 +23,9 @@
 from paddle.nn.functional.flash_attention import (
     flash_attention,
     flash_attn_unpadded,
+)
+
+from paddle.nn.functional import (
     scaled_dot_product_attention,
 )

From 582bd6bfab52c09dfcc7217aeba05c24596b8b0d Mon Sep 17 00:00:00 2001
From: geyuqiang
Date: Wed, 3 Dec 2025 14:45:56 +0800
Subject: [PATCH 22/22] disable test_tensor_cuda_static

---
 backends/iluvatar_gpu/tests/disabled_test.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backends/iluvatar_gpu/tests/disabled_test.txt b/backends/iluvatar_gpu/tests/disabled_test.txt
index abd682e8541..b94aed2b65a 100644
--- a/backends/iluvatar_gpu/tests/disabled_test.txt
+++ b/backends/iluvatar_gpu/tests/disabled_test.txt
@@ -548,3 +548,4 @@ test_linear_interp_v2_op.py
 test_nearest_interp_v2_op.py
 test_poisson_op.py
 test_rrelu_op.py
+test_tensor_cuda_static.py