diff --git a/parsec b/parsec index 0faa7836..b3e7e24c 160000 --- a/parsec +++ b/parsec @@ -1 +1 @@ -Subproject commit 0faa783695b7137c62a042a1ec81096b53cddde8 +Subproject commit b3e7e24c4ab42076ee39a520f1540a9fe6b553db diff --git a/src/ztrmm_LLN.jdf b/src/ztrmm_LLN.jdf index 757e1f11..b36a9e46 100644 --- a/src/ztrmm_LLN.jdf +++ b/src/ztrmm_LLN.jdf @@ -8,6 +8,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -54,6 +58,9 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + + read_A(m, k) [profile = off] /* Execution Space */ m = 0..(descB->mt-1) @@ -153,6 +160,61 @@ loc_C = %{ return LOC(descB, (descB->mt-1)-m, n); %} type_data = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, LAPACK); %} ] -> ((k+m) < (descB->mt-2)) ? C zgemm(m, n, k+1) /* dep OUT: rely on datacopy dtt for sending */ + +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); + cuDoubleComplex lbeta = make_cuDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = (((descB->mt-1)-m)==(descB->mt-1)) ? (descB->m-(((descB->mt-1)-m)*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); + int ldc = LDA(ddescB, C); + + cublasStatus_t status; + cublasSetKernelStream( parsec_body.stream ); + cublasZgemm( dplasma_lapack_const(trans), 'N', + tempmm, tempnn, descB->mb, + lalpha, (cuDoubleComplex*)A, lda, + (cuDoubleComplex*)B, ldb, + lbeta, (cuDoubleComplex*)C, ldc ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipDoubleComplex lbeta = make_hipDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = (((descB->mt-1)-m)==(descB->mt-1)) ? (descB->m-(((descB->mt-1)-m)*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); + int ldc = LDA(ddescB, C); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, dplasma_hipblas_op(trans), HIPBLAS_OP_N, + tempmm, tempnn, descB->mb, + &lalpha, (hipDoubleComplex*)A, lda, + (hipDoubleComplex*)B, ldb, + &lbeta, (hipDoubleComplex*)C, ldc ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempmm = (((descB->mt-1)-m)==(descB->mt-1)) ? (descB->m-(((descB->mt-1)-m)*descB->mb)) : descB->mb; diff --git a/src/ztrmm_LLT.jdf b/src/ztrmm_LLT.jdf index eb94d7e9..fd25b586 100644 --- a/src/ztrmm_LLT.jdf +++ b/src/ztrmm_LLT.jdf @@ -8,6 +8,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -54,6 +58,8 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + read_A(m, k) [profile = off] /* Execution Space */ m = 0 .. (descB->mt-1) @@ -154,6 +160,62 @@ loc_C = %{ return LOC(descB, m, n); %} type_data = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, LAPACK); %} ] -> (k < (descB->mt-1)) ? C zgemm(m, n, k+1) /* dep OUT: rely on datacopy dtt for sending */ +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); + cuDoubleComplex lbeta = make_cuDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int tempkm = ((k)==(descA->mt-1)) ? (descA->m-(k*descA->mb)) : descA->mb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); + int ldc = LDA(ddescB, C); + + cublasStatus_t status; + cublasSetKernelStream( parsec_body.stream ); + cublasZgemm( dplasma_lapack_const(trans), 'N', + tempmm, tempnn, tempkm, + lalpha, (cuDoubleComplex*)A, lda, + (cuDoubleComplex*)B, ldb, + lbeta, (cuDoubleComplex*)C, ldc ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipDoubleComplex lbeta = make_hipDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int tempkm = ((k)==(descA->mt-1)) ? (descA->m-(k*descA->mb)) : descA->mb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); + int ldc = LDA(ddescB, C); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, dplasma_hipblas_op(trans), HIPBLAS_OP_N, + tempmm, tempnn, tempkm, + &lalpha, (hipDoubleComplex*)A, lda, + (hipDoubleComplex*)B, ldb, + &lbeta, (hipDoubleComplex*)C, ldc ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; diff --git a/src/ztrmm_LUN.jdf b/src/ztrmm_LUN.jdf index 2e57cc66..129f98d6 100644 --- a/src/ztrmm_LUN.jdf +++ b/src/ztrmm_LUN.jdf @@ -8,6 +8,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -54,6 +58,8 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + read_A(m, k) [profile = off] /* Execution Space */ m = 0 .. (descB->mt-1) @@ -154,6 +160,62 @@ loc_C = %{ return LOC(descB, m, n); %} type_data = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, LAPACK); %} ] -> (k < (descB->mt-1)) ? C zgemm(m, n, k+1) /* dep OUT: rely on datacopy dtt for sending */ +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); + cuDoubleComplex lbeta = make_cuDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int tempkn = ((k)==(descA->nt-1)) ? (descA->n-(k*descA->nb)) : descA->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); + int ldc = LDA(ddescB, C); + + cublasStatus_t status; + cublasSetKernelStream( parsec_body.stream ); + cublasZgemm( dplasma_lapack_const(trans), 'N', + tempmm, tempnn, tempkn, + lalpha, (cuDoubleComplex*)A, lda, + (cuDoubleComplex*)B, ldb, + lbeta, (cuDoubleComplex*)C, ldc ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipDoubleComplex lbeta = make_hipDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int tempkn = ((k)==(descA->nt-1)) ? (descA->n-(k*descA->nb)) : descA->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); + int ldc = LDA(ddescB, C); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, dplasma_hipblas_op(trans), HIPBLAS_OP_N, + tempmm, tempnn, tempkn, + &lalpha, (hipDoubleComplex*)A, lda, + (hipDoubleComplex*)B, ldb, + &lbeta, (hipDoubleComplex*)C, ldc ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; diff --git a/src/ztrmm_LUT.jdf b/src/ztrmm_LUT.jdf index 20a37f71..50d123f0 100644 --- a/src/ztrmm_LUT.jdf +++ b/src/ztrmm_LUT.jdf @@ -8,6 +8,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -54,6 +58,8 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + read_A(m, k) [profile = off] /* Execution Space */ m = 0..(descB->mt-1) @@ -153,6 +159,60 @@ loc_C = %{ return LOC(descB, (descB->mt-1)-m, n); %} type_data = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, LAPACK); %} ] -> ((k+m) < (descB->mt-2)) ? C zgemm(m, n, k+1) /* dep OUT: rely on datacopy dtt for sending */ +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); + cuDoubleComplex lbeta = make_cuDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = (((descB->mt-1)-m)==(descB->mt-1)) ? (descB->m-(((descB->mt-1)-m)*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); + int ldc = LDA(ddescB, C); + + cublasStatus_t status; + cublasSetKernelStream( parsec_body.stream ); + cublasZgemm( dplasma_lapack_const(trans), 'N', + tempmm, tempnn, descB->mb, + lalpha, (cuDoubleComplex*)A, lda, + (cuDoubleComplex*)B, ldb, + lbeta, (cuDoubleComplex*)C, ldc ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipDoubleComplex lbeta = make_hipDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = (((descB->mt-1)-m)==(descB->mt-1)) ? (descB->m-(((descB->mt-1)-m)*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int lda = LDA(ddescA, A); + int ldb = LDA(ddescB, B); + int ldc = LDA(ddescB, C); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, dplasma_hipblas_op(trans), HIPBLAS_OP_N, + tempmm, tempnn, descB->mb, + &lalpha, (hipDoubleComplex*)A, lda, + (hipDoubleComplex*)B, ldb, + &lbeta, (hipDoubleComplex*)C, ldc ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempmm = (((descB->mt-1)-m)==(descB->mt-1)) ? (descB->m-(((descB->mt-1)-m)*descB->mb)) : descB->mb; diff --git a/src/ztrmm_RLN.jdf b/src/ztrmm_RLN.jdf index 3ff76adc..b0d6e973 100644 --- a/src/ztrmm_RLN.jdf +++ b/src/ztrmm_RLN.jdf @@ -8,6 +8,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -54,6 +58,8 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + read_A(n, k) [profile = off] /* Execution Space */ n = 0 .. (descB->nt-1) @@ -152,6 +158,62 @@ loc_C = %{ return LOC(descB, m, n); %} type_data = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, LAPACK); %} ] -> (k < (descB->nt-1)) ? C zgemm(n, m, k+1) /* dep OUT: rely on datacopy dtt for sending */ +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); + cuDoubleComplex lbeta = make_cuDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int tempkn = ((k)==(descA->nt-1)) ? (descA->n-(k*descA->nb)) : descA->nb; + int lda = LDA(ddescB, A); + int ldb = LDA(ddescA, B); + int ldc = LDA(ddescB, C); + + cublasStatus_t status; + cublasSetKernelStream( parsec_body.stream ); + cublasZgemm( 'N', dplasma_lapack_const(trans), + tempmm, tempnn, tempkn, + lalpha, (cuDoubleComplex*)A, lda, + (cuDoubleComplex*)B, ldb, + lbeta, (cuDoubleComplex*)C, ldc ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipDoubleComplex lbeta = make_hipDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int tempkn = ((k)==(descA->nt-1)) ? (descA->n-(k*descA->nb)) : descA->nb; + int lda = LDA(ddescB, A); + int ldb = LDA(ddescA, B); + int ldc = LDA(ddescB, C); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, dplasma_hipblas_op(trans), + tempmm, tempnn, tempkn, + &lalpha, (hipDoubleComplex*)A, lda, + (hipDoubleComplex*)B, ldb, + &lbeta, (hipDoubleComplex*)C, ldc ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; diff --git a/src/ztrmm_RLT.jdf b/src/ztrmm_RLT.jdf index 5ade1c6c..9cdd6dde 100644 --- a/src/ztrmm_RLT.jdf +++ b/src/ztrmm_RLT.jdf @@ -8,6 +8,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -54,6 +58,8 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + read_A(n, k) [profile = off] /* Execution Space */ n = 0 .. (descB->nt-1) @@ -153,6 +159,60 @@ loc_C = %{ return LOC(descB, m,(descB->nt-1)-n); %} type_data = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, LAPACK); %} ] -> ((n+k) < (descB->nt-2)) ? C zgemm(n, m, k+1) /* dep OUT: rely on datacopy dtt for sending */ +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); + cuDoubleComplex lbeta = make_cuDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = (((descB->nt-1)-n)==(descB->nt-1)) ? (descB->n-(((descB->nt-1)-n)*descB->nb)) : descB->nb; + int lda = LDA(ddescB, A); + int ldb = LDA(ddescA, B); + int ldc = LDA(ddescB, C); + + cublasStatus_t status; + cublasSetKernelStream( parsec_body.stream ); + cublasZgemm( 'N', dplasma_lapack_const(trans), + tempmm, tempnn, descB->mb, + lalpha, (cuDoubleComplex*)A, lda, + (cuDoubleComplex*)B, ldb, + lbeta, (cuDoubleComplex*)C, ldc ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipDoubleComplex lbeta = make_hipDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = (((descB->nt-1)-n)==(descB->nt-1)) ? (descB->n-(((descB->nt-1)-n)*descB->nb)) : descB->nb; + int lda = LDA(ddescB, A); + int ldb = LDA(ddescA, B); + int ldc = LDA(ddescB, C); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, dplasma_hipblas_op(trans), + tempmm, tempnn, descB->mb, + &lalpha, (hipDoubleComplex*)A, lda, + (hipDoubleComplex*)B, ldb, + &lbeta, (hipDoubleComplex*)C, ldc ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; diff --git a/src/ztrmm_RUN.jdf b/src/ztrmm_RUN.jdf index 209f0ff5..4c4def99 100644 --- a/src/ztrmm_RUN.jdf +++ b/src/ztrmm_RUN.jdf @@ -8,6 +8,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -54,6 +58,8 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + read_A(n, k) [profile = off] /* Execution Space */ n = 0 .. (descB->nt-1) @@ -153,6 +159,60 @@ loc_C = %{ return LOC(descB, m, (descB->nt-1)-n ); %} type_data = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, LAPACK); %} ] -> ((n+k) < (descB->nt-2)) ? C zgemm(n, m, k+1) /* dep OUT: rely on datacopy dtt for sending */ +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); + cuDoubleComplex lbeta = make_cuDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = (((descB->nt-1)-n)==(descB->nt-1)) ? (descB->n-(((descB->nt-1)-n)*descB->nb)) : descB->nb; + int lda = LDA(ddescB, A); + int ldb = LDA(ddescA, B); + int ldc = LDA(ddescB, C); + + cublasStatus_t status; + cublasSetKernelStream( parsec_body.stream ); + cublasZgemm( 'N', dplasma_lapack_const(trans), + tempmm, tempnn, descB->mb, + lalpha, (cuDoubleComplex*)A, lda, + (cuDoubleComplex*)B, ldb, + lbeta, (cuDoubleComplex*)C, ldc ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipDoubleComplex lbeta = make_hipDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = (((descB->nt-1)-n)==(descB->nt-1)) ? (descB->n-(((descB->nt-1)-n)*descB->nb)) : descB->nb; + int lda = LDA(ddescB, A); + int ldb = LDA(ddescA, B); + int ldc = LDA(ddescB, C); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, dplasma_hipblas_op(trans), + tempmm, tempnn, descB->mb, + &lalpha, (hipDoubleComplex*)A, lda, + (hipDoubleComplex*)B, ldb, + &lbeta, (hipDoubleComplex*)C, ldc ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; diff --git a/src/ztrmm_RUT.jdf b/src/ztrmm_RUT.jdf index 9bb2ed23..aab8b5dd 100644 --- a/src/ztrmm_RUT.jdf +++ b/src/ztrmm_RUT.jdf @@ -8,6 +8,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -54,6 +58,8 @@ descA [type = "const parsec_tiled_matrix_t*" hidden = on default = "((dplas ddescB [type = "dplasma_data_collection_t*"] descB [type = "parsec_tiled_matrix_t*" hidden = on default = "((dplasma_data_collection_t*)ddescB)->dc_original" aligned=ddescB] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + read_A(n, k) [profile = off] /* Execution Space */ n = 0 .. (descB->nt-1) @@ -154,6 +160,62 @@ loc_C = %{ return LOC(descB, m, n); %} type_data = %{ return ADTT_DC(ddescB, loc_C, B_SHAPE, LAPACK); %} ] -> (k < (descB->nt-1)) ? C zgemm(n, m, k+1) /* dep OUT: rely on datacopy dtt for sending */ +BODY [type=CUDA] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); + cuDoubleComplex lbeta = make_cuDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int tempkn = ((k)==(descA->nt-1)) ? (descA->n-(k*descA->nb)) : descA->nb; + int lda = LDA(ddescB, A); + int ldb = LDA(ddescA, B); + int ldc = LDA(ddescB, C); + + cublasStatus_t status; + cublasSetKernelStream( parsec_body.stream ); + cublasZgemm( 'N', dplasma_lapack_const(trans), + tempmm, tempnn, tempkn, + lalpha, (cuDoubleComplex*)A, lda, + (cuDoubleComplex*)B, ldb, + lbeta, (cuDoubleComplex*)C, ldc ); + status = cublasGetError(); + PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipDoubleComplex lbeta = make_hipDoubleComplex( 1., 0.); +#else + double lalpha = alpha; + double lbeta = 1.0; +#endif + int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; + int tempnn = ((n)==(descB->nt-1)) ? (descB->n-(n*descB->nb)) : descB->nb; + int tempkn = ((k)==(descA->nt-1)) ? (descA->n-(k*descA->nb)) : descA->nb; + int lda = LDA(ddescB, A); + int ldb = LDA(ddescA, B); + int ldc = LDA(ddescB, C); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, dplasma_hipblas_op(trans), + tempmm, tempnn, tempkn, + &lalpha, (hipDoubleComplex*)A, lda, + (hipDoubleComplex*)B, ldb, + &lbeta, (hipDoubleComplex*)C, ldc ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { int tempmm = ((m)==(descB->mt-1)) ? (descB->m-(m*descB->mb)) : descB->mb; diff --git a/src/ztrmm_wrapper.c b/src/ztrmm_wrapper.c index 1b15ff01..06afd925 100644 --- a/src/ztrmm_wrapper.c +++ b/src/ztrmm_wrapper.c @@ -99,7 +99,7 @@ dplasma_ztrmm_New( dplasma_enum_t side, dplasma_enum_t uplo, const parsec_tiled_matrix_t *A, parsec_tiled_matrix_t *B ) { - parsec_taskpool_t *parsec_trmm = NULL; + parsec_taskpool_t *parsec_tp = NULL; dplasma_data_collection_t * ddc_A = dplasma_wrap_data_collection((parsec_tiled_matrix_t*)A); dplasma_data_collection_t * ddc_B = dplasma_wrap_data_collection((parsec_tiled_matrix_t*)B); @@ -124,45 +124,68 @@ dplasma_ztrmm_New( dplasma_enum_t side, dplasma_enum_t uplo, if ( side == dplasmaLeft ) { if ( uplo == dplasmaLower ) { if ( trans == dplasmaNoTrans ) { - parsec_trmm = (parsec_taskpool_t*)parsec_ztrmm_LLN_new( - side, uplo, trans, diag, alpha, - ddc_A, ddc_B); + parsec_ztrmm_LLN_taskpool_t* parsec_trmm; + parsec_trmm = parsec_ztrmm_LLN_new( + side, uplo, trans, diag, alpha, + ddc_A, ddc_B); +#if defined(DPLASMA_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + parsec_trmm->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + parsec_trmm->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif + parsec_tp = (parsec_taskpool_t*)parsec_trmm; } else { /* trans =! dplasmaNoTrans */ - parsec_trmm = (parsec_taskpool_t*)parsec_ztrmm_LLT_new( + parsec_ztrmm_LLT_taskpool_t* parsec_trmm; + parsec_trmm = parsec_ztrmm_LLT_new( side, uplo, trans, diag, alpha, ddc_A, ddc_B); + parsec_tp = (parsec_taskpool_t*)parsec_trmm; } } else { /* uplo = dplasmaUpper */ if ( trans == dplasmaNoTrans ) { - parsec_trmm = (parsec_taskpool_t*)parsec_ztrmm_LUN_new( + parsec_ztrmm_LUN_taskpool_t* parsec_trmm; + parsec_trmm = parsec_ztrmm_LUN_new( side, uplo, trans, diag, alpha, ddc_A, ddc_B); + parsec_tp = (parsec_taskpool_t*)parsec_trmm; } else { /* trans =! dplasmaNoTrans */ - parsec_trmm = (parsec_taskpool_t*)parsec_ztrmm_LUT_new( + parsec_ztrmm_LUT_taskpool_t* parsec_trmm; + parsec_trmm = parsec_ztrmm_LUT_new( side, uplo, trans, diag, alpha, ddc_A, ddc_B); + parsec_tp = (parsec_taskpool_t*)parsec_trmm; } } } else { /* side == dplasmaRight */ if ( uplo == dplasmaLower ) { if ( trans == dplasmaNoTrans ) { - parsec_trmm = (parsec_taskpool_t*)parsec_ztrmm_RLN_new( + parsec_ztrmm_RLN_taskpool_t* parsec_trmm; + parsec_trmm = parsec_ztrmm_RLN_new( side, uplo, trans, diag, alpha, ddc_A, ddc_B); + parsec_tp = (parsec_taskpool_t*)parsec_trmm; } else { /* trans =! dplasmaNoTrans */ - parsec_trmm = (parsec_taskpool_t*)parsec_ztrmm_RLT_new( + parsec_ztrmm_RLT_taskpool_t* parsec_trmm; + parsec_trmm = parsec_ztrmm_RLT_new( side, uplo, trans, diag, alpha, ddc_A, ddc_B); + parsec_tp = (parsec_taskpool_t*)parsec_trmm; } } else { /* uplo = dplasmaUpper */ if ( trans == dplasmaNoTrans ) { - parsec_trmm = (parsec_taskpool_t*)parsec_ztrmm_RUN_new( + parsec_ztrmm_RUN_taskpool_t* parsec_trmm; + parsec_trmm = parsec_ztrmm_RUN_new( side, uplo, trans, diag, alpha, ddc_A, ddc_B); + parsec_tp = (parsec_taskpool_t*)parsec_trmm; } else { /* trans =! dplasmaNoTrans */ - parsec_trmm = (parsec_taskpool_t*)parsec_ztrmm_RUT_new( + parsec_ztrmm_RUT_taskpool_t* parsec_trmm; + parsec_trmm = parsec_ztrmm_RUT_new( side, uplo, trans, diag, alpha, ddc_A, ddc_B); + parsec_tp = (parsec_taskpool_t*)parsec_trmm; } } } @@ -183,7 +206,7 @@ dplasma_ztrmm_New( dplasma_enum_t side, dplasma_enum_t uplo, assert(shape == MAX_SHAPES); - return parsec_trmm; + return parsec_tp; } /** diff --git a/tests/testing_ztrmm.c b/tests/testing_ztrmm.c index 03f9edb9..41e766ff 100644 --- a/tests/testing_ztrmm.c +++ b/tests/testing_ztrmm.c @@ -34,7 +34,6 @@ int main(int argc, char ** argv) /* Set defaults for non argv iparams */ iparam_default_gemm(iparam); iparam_default_ibnbmb(iparam, 0, 200, 200); - iparam[IPARAM_NGPUS] = DPLASMA_ERR_NOT_SUPPORTED; /* Initialize PaRSEC */ parsec = setup_parsec(argc, argv, iparam); @@ -57,6 +56,7 @@ int main(int argc, char ** argv) dplasma_enum_t side = dplasmaLeft; dplasma_enum_t trans = dplasmaNoTrans; dplasma_enum_t diag = dplasmaUnit; + uplo = dplasmaLower; PASTE_CODE_FLOPS(FLOPS_ZTRMM, (side, (DagDouble_t)M, (DagDouble_t)N));