diff --git a/CMakeLists.txt b/CMakeLists.txt index eb33534f81..d475d0987f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,8 @@ option(Chroma_ENABLE_LAPACK "Enable Lapack Binding in QDPLapack" OFF) option(Chroma_ENABLE_OPENMP "Enable OpenMP" OFF) option(Chroma_ENABLE_SUPERBBLAS "Enable tasks using Superbblas" OFF) option(Chroma_ENABLE_PRIMME "Build with PRIMME" OFF) +option(Chroma_ENABLE_CUDA "Build with CUDA" OFF) +option(Chroma_ENABLE_ROCM "Build with ROCM" OFF) set(BLAS_SUFFIX_LINK_FLAGS "" CACHE STRING "Extra linking flags placed after BLAS") separate_arguments(Chroma_extra_link_flags UNIX_COMMAND ${BLAS_SUFFIX_LINK_FLAGS}) message(STATUS "Chroma extra links: ${Chroma_extra_link_flags}") @@ -77,7 +79,7 @@ endif() list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) find_package(GMP) -if( GMP_FOUND ) +if( GMP_FOUND ) set(BUILD_GMP_REMEZ 1) # For the header file endif() # Create an imported target @@ -93,7 +95,7 @@ if(Chroma_ENABLE_OPENMP) if( NOT OpenMP_FOUND ) find_package(OpenMP REQUIRED) endif() - + if( NOT Threads_FOUND ) find_package(Threads REQUIRED) endif() @@ -116,7 +118,7 @@ endif() # 2nd: Deal with QPhiX ########################## if( Chroma_ENABLE_QPHIX ) -# +# # Deal with QPhix # find_package(QPhiX) endif() @@ -141,8 +143,10 @@ if( Chroma_ENABLE_QUDA ) find_package(QUDA REQUIRED) if( QUDA_TARGET_CUDA ) set(CHROMA_TARGET_CUDA 1) + set(Chroma_ENABLE_CUDA ON) elseif( QUDA_TARGET_HIP ) set(CHROMA_TARGET_HIP 1) + set(Chroma_ENABLE_ROCM ON) endif() if (QDP_IS_QDPJIT) message(STATUS "QDP is QDPJIT so enabling QDPJIT Interfaces") @@ -151,14 +155,17 @@ if( Chroma_ENABLE_QUDA ) set(BUILD_QUDA_DEVIFACE_SPINOR 1) endif() endif() +if ( Chroma_ENABLE_CUDA AND Chroma_ENABLE_ROCM ) + message( FATAL_ERROR "Don't set Chroma_ENABLE_CUDA and Chroma_ENABLE_ROCM" ) +endif() ########################### # 5th: Deal with MDWF ########################### if( Chroma_ENABLE_MDWF ) -# +# # Gonna need to write a find module for this -# +# endif() 
#################################################### @@ -180,7 +187,7 @@ if( Chroma_ENABLE_LAPACK ) set(QDPLapack_BINDING "lapack") endif() add_subdirectory(other_libs/qdp-lapack) - + ########################### # 6th: Deal with superbblas ########################### @@ -189,7 +196,7 @@ if( Chroma_ENABLE_SUPERBBLAS ) include(FindBLAS) find_package(BLAS REQUIRED) - + if( QDP_BACKEND_ROCM AND NOT HIP_CPP_CONFIG ) find_package(HIP REQUIRED) execute_process( @@ -203,10 +210,14 @@ if( Chroma_ENABLE_SUPERBBLAS ) mark_as_advanced(HIP_CPP_CONFIG) endif() - if( QDP_BACKEND_ROCM ) + if( QDP_BACKEND_ROCM OR Chroma_ENABLE_ROCM) find_package(hipBLAS REQUIRED) + find_package(rocBLAS REQUIRED) + find_package(hipSPARSE REQUIRED) + find_package(hipSOLVER REQUIRED) + find_package(rocSOLVER REQUIRED) endif() - if( Chroma_ENABLE_CUDA ) + if( QDP_BACKEND_CUDA OR Chroma_ENABLE_CUDA) find_package(CUDAToolkit REQUIRED) endif() @@ -225,11 +236,11 @@ endif() ##################################### configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lib/chroma_config_internal.h.cmake.in lib/chroma_config_internal.h) - + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/lib/chroma_config_internal.h DESTINATION include) # Now we are ready to build in lib -# Heaven help us. +# Heaven help us. 
add_subdirectory(lib) add_subdirectory(mainprogs/main) @@ -239,7 +250,7 @@ add_subdirectory(mainprogs/tests) # if needed install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindGMP.cmake DESTINATION lib/cmake/Chroma ) - + #install the export description of the targets install(EXPORT ChromaTargets FILE ChromaTargets.cmake diff --git a/configure.ac b/configure.ac index 580fd3a98f..2b71641074 100755 --- a/configure.ac +++ b/configure.ac @@ -1282,7 +1282,7 @@ then AC_SUBST(CUDA_CXXFLAGS, ["-I${CUDA_HOME}/include"] ) AC_SUBST(CUDA_LDFLAGS, ["-L${CUDA_LIBDIR} -Wl,-rpath,${CUDA_LIBDIR}"] ) - AC_SUBST(CUDA_LIBS, ["-lcublas -lcublasLt -lcudart -lcuda"]) + AC_SUBST(CUDA_LIBS, ["-lcusolver -lcusparse -lcublas -lcublasLt -lcudart -lcuda"]) fi ############################################### diff --git a/docs/notes/chroma_gamma_matrices.tex b/docs/notes/chroma_gamma_matrices.tex index 8f82391405..ec9039b2ea 100644 --- a/docs/notes/chroma_gamma_matrices.tex +++ b/docs/notes/chroma_gamma_matrices.tex @@ -50,7 +50,7 @@ 5 & 0101 & b1\_y & $\gamma_1 \gamma_3$ & $- b_1(y)$ & 10\\ 6 & 0110 & b1\_x & $\gamma_2 \gamma_3$ & $b_1(x)$ & 9\\ 7 & 0111 & pion\_2 & $\gamma_1 \gamma_2 \gamma_3 = \gamma_5 \gamma_4$ & $\pi$& 8 \\ -8 & 1000 & a0\_2 & $\gamma_4$ & $a_0$ & 7 \\ +8 & 1000 & b0 & $\gamma_4$ & $b_0$ & 7 \\ 9 & 1001 & rho\_x\_2 & $\gamma_1 \gamma_4$ & $\varrho(x)$ & 6\\ 10 & 1010 & rho\_y\_2 & $\gamma_2 \gamma_4$ & $\varrho(y)$ & 5\\ 11 & 1011 & a1\_z & $\gamma_1 \gamma_2 \gamma_4 = \gamma_3 \gamma_5$ & $a_1(z)$ & 4\\ diff --git a/docs/superb_tasks.md b/docs/superb_tasks.md index 1af97b6d59..47b8b7f973 100644 --- a/docs/superb_tasks.md +++ b/docs/superb_tasks.md @@ -75,7 +75,7 @@ Main options: * Also all superb tasks support spatial phasing on the fly, so the option `phase` isn't usually needed. -* The smearing information is stored in the created file's metadata. 
When a filed created with `write_fingerprint` being `true` is passed as input to another Chroma tasks, the smearing infomation passed as input to the task should be the same as the smearing used for generating the eigenvectors. In case is not, the Chroma task will emmit a runtime error. +* The smearing information is stored in the created file's metadata. When a filed created with `write_fingerprint` being `true` is passed as input to another Chroma tasks, the smearing information passed as input to the task should be the same as the smearing used for generating the eigenvectors. In case is not, the Chroma task will emit a runtime error. # Creation of mesons @@ -844,3 +844,329 @@ Main options: * `Param/eigensolver/verbosity`: (optional, default `nooutput`) one of `nooutput`, `summary`, `detailed`. * `Param/Propagator`: propagator configuration. * `NamedObject/eigs_file`: file to store the results. + +# `mgproton` solver collection + +## Flexible GMRES + +Example: +``` + + MGPROTON + fgmres + 1e-7 + 3 + 20000 + Detailed + +``` + +Main options: + +* `tol`: residual norm tolerance, stopping when $\|Dx-b\|_2 \leq \text{tol}\ \|b\|_2$. +* `max_basis_size`: (optional, default `5`) maximum size of the search subspace. +* `max_its`: (optional, default infinity) maximum number of iterations. +* `error_if_not_converged`: (optional, default `true`) whether to complain if tolerance is not achieved. +* `prec`: (optional, default none) left preconditioning, does not affect residual norm. +* `ortho_each_its`: (optional, default `8` for double and `4` for single precision) orthogonalize the basis every this number of iterations. +* `max_residual_updates`: (optional, default `4` for double and `2` for single precision) recompute the residual vector every this number of iterations. +* `max_simultaneous_rhs`: (optional, default infinity) solver this many right-hand-sides at once. 
+* `verbosity`: (optional, default `nooutput`) level of verbosity, one of `nonoutput`, `summary`, `detailed`. +* `prefix`: (optional, default none) prefix output related with this solver with this string. + +## BiCGstab + +Example: +``` + + MGPROTON + bicgstab + 1e-7 + 20000 + Detailed + +``` + +Main options: + +* `tol`: residual norm tolerance, stopping when $\|Dx-b\|_2 \leq \text{tol}\ \|b\|_2$. +* `max_its`: (optional, default infinity) maximum number of iterations. +* `error_if_not_converged`: (optional, default `true`) whether to complain if tolerance is not achieved. +* `prec`: (optional, default none) left preconditioning, does not affect the residual norm. +* `max_simultaneous_rhs`: (optional, default infinity) solver this many right-hand-sides at once. +* `verbosity`: (optional, default `nooutput`) level of verbosity, one of `nonoutput`, `summary`, `detailed`. +* `prefix`: (optional, default none) prefix output related with this solver with this string. + +## Minimum Residual (MR) + +Example: +``` + + MGPROTON + mr + 1e-7 + 20000 + Detailed + +``` + +Main options: + +* `tol`: residual norm tolerance, stopping when $\|Dx-b\|_2 \leq \text{tol}\ \|b\|_2$. +* `max_its`: (optional, default infinity) maximum number of iterations. +* `error_if_not_converged`: (optional, default `true`) whether to complain if tolerance is not achieved. +* `prec`: (optional, default none) left preconditioning, does not affect residual norm. +* `max_simultaneous_rhs`: (optional, default infinity) solver this many right-hand-sides at once. +* `verbosity`: (optional, default `nooutput`) level of verbosity, one of `nonoutput`, `summary`, `detailed`. +* `prefix`: (optional, default none) prefix output related with this solver with this string. + +## Generalized Conjugate Residual (GCR) + +Example: +``` + + MGPROTON + gcr + 3 + 1e-7 + 20000 + Detailed + +``` + +Main options: + +* `tol`: residual norm tolerance, stopping when $\|Dx-b\|_2 \leq \text{tol}\ \|b\|_2$. 
+* `max_basis_size`: (optional, default `3`) maximum size of the search subspace. +* `max_its`: (optional, default infinity) maximum number of iterations. +* `error_if_not_converged`: (optional, default `true`) whether to complain if tolerance is not achieved. +* `prec`: (optional, default none) left preconditioning, does not affect residual norm. +* `max_simultaneous_rhs`: (optional, default infinity) solver this many right-hand-sides at once. +* `verbosity`: (optional, default `nooutput`) level of verbosity, one of `nonoutput`, `summary`, `detailed`. +* `prefix`: (optional, default none) prefix output related with this solver with this string. + +## Even-odd preconditioning + +Approximate $D^{-1}$ by splitting the sites of $D$ into two colors (red-black, even-odd) and solving the Schur complement iteratively. + +Example: +``` + + MGPROTON + eo + + gcr + 1e-7 + 3 + 20000 + Detailed + + +``` + +Main options: + +* `use_Aee_prec`: (optional, default `false`) whether to preconditioning the Schur complement with $D_{ee}^{-1}$, resulting in solving $(I-D_{eo}D_{oo}^{-1}D_{oe}D_{ee}^{-1})^{-1}$, insted of $(D_{ee}-D_{eo}D_{oo}^{-1}D_{oe})^{-1}$. +* `prec_ee`: (optional, default none): left preconditioning acting on the even sites for solving the Schur complement. +* `solver`: solver for the Schur complement. +* `prefix`: (optional, default none) prefix output related with this solver with this string. + +## Multigrid preconditioner + +Example: +``` + + MGPROTON + eo + + mr + 1e-7 + 20000 + l0 + Detailed + + + mg + 24 + 4 4 4 4 + + + eo + + mr + 1e-3 + 50 + false + nv0 + Detailed + + dd + + mr + 2 + 1e-1 + false + Detailed + nv0_dd + + + + + + + eo + + mr + 1e-1 + Detailed + c0 + + dd + + mr + 2 + 1e-1 + false + Detailed + c0_dd + + + + + + eo + + mr + 1e-1 + 5 + false + s0 + + dd + + mr + 4 + 1e-1 + false + Detailed + s0_dd + + + + + + +``` + +Main options: + +* `num_null_vecs`: number of null vectors. 
+* `blocking`: sites factor reduction in each lattice direction for producing the prolongator $V$. +* `null_vecs/solver`: if `null_vecs/eigensolver` is not given, solver to compute the null vectors approximating solutions of $Dx=0$; otherwise, solver used as the operator for the eigensolver. +* `null_vecs/eigensolver`: (optional) if given, it uses as null vectors the approximated largest singular vectors of $D^{-1}$. +* `null_vecs/tol`: (optional) if `null_vecs/eigensolver` is given, the tolerance for the eigensolver. +* `solver_coarse`: solver for the coarse operator, $V^* D V$, where $V$ is the prolongator. +* `solver_smoother`: solver for the correction (post-smoothing). + +# `mgproton` projection collection + +## Deflation projector + +If $U$ and $V$ and $\Sigma$ are the smallest singular triplets of $D$, that is $DV=\Sigma U$, then this builds the +following oblique projector, $V(U^*D*V)^{-1}U^*D$. + +``` + + MGPROTON + defl + 200 + 1e-1 + + bicgstab + 1e-2 + 20000 + Detailed + eig + + + Detailed + + +``` + +Main options: +* `rank`: number of singular triplets to compute. +* `tol`: (optional, default 0.1): relative error of the singular triplets relative to $\|D^{-1}\|_2$, $\|\gamma_5 D^{-1} v - \sigma v \|_2 \leq \text{tol}\ \|D^{-1}\|_2$. +* `solver`: solver to estimate $D^{-1}$ used by the eigensolver. +* `eigensolver/max_block_size`: (optional, default is `1`) maximum number of vectors expanding the search subspace in each iteration. +* `eigensolver/max_basis_size`: (optional, default is `PRIMME`'s default) maximum rank of the search subspace. +* `eigensolver/verbosity`: (optional, default `nooutput`) one of `nooutput`, `summary`, `detailed`. + +## Multigrid-based deflation projector + +If $U$ and $V$ and $\Sigma$ are the smallest singular triplets of $P^*DP$, that is $DV=\Sigma U$, and $P$ is a multigrid prolongator, then this builds the +following oblique projector, $PV(U^*P^*D*PV)^{-1}U^*P^*D$. 
+ +``` + + MGPROTON + mg + + 24 + 4 4 4 4 + + + eo + + mr + 1e-3 + 50 + false + nv0 + Detailed + + dd + + mr + 2 + 1e-1 + false + Detailed + nv0_dd + + + + + 1e-2 + + Detailed + + + + + defl + 200 + 1e-1 + + bicgstab + 1e-2 + 20000 + Detailed + eig + + + Detailed + + + +``` + +Main options: +* `prolongator/num_null_vecs`: number of null vectors. +* `prolongator/blocking`: sites factor reduction in each lattice direction for producing the prolongator $V$. +* `prolongator/null_vecs/solver`: if `prolongator/null_vecs/eigensolver` is not given, solver to compute the null vectors approximating solutions of $Dx=0$; otherwise, solver used as the operator for the eigensolver. +* `prolongator/null_vecs/eigensolver`: (optional) if given, it uses as null vectors the approximated largest singular vectors of $D^{-1}$. +* `prolongator/null_vecs/tol`: (optional) if `null_vecs/eigensolver` is given, the tolerance for the eigensolver. +* `solver_coarse`: solver for the coarse operator, $V^* D V$, where $V$ is the prolongator. +* `solver_smoother`: solver for the correction (post-smoothing). +* `proj`: projector options onto the coarse operator, $V^* D V$. 
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 10924b2507..82d1128a58 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -549,10 +549,11 @@ list(APPEND ChromaLIB_HEADERS actions/ferm/fermacts/fermact_factory_w.h actions/ferm/fermacts/clover_fermact_params_w.h actions/ferm/fermacts/eoprec_clover_fermact_w.h + actions/ferm/fermacts/eoprec_exp_clover_fermact_w.h actions/ferm/fermacts/seoprec_clover_fermact_w.h actions/ferm/fermacts/eoprec_clover_orbifold_fermact_w.h actions/ferm/fermacts/unprec_clover_fermact_w.h - actions/ferm/fermacts/unprec_exp_clover_fermact_w.h + actions/ferm/fermacts/unprec_exp_clover_fermact_w.h actions/ferm/fermacts/nef_fermact_params_w.h actions/ferm/fermacts/eoprec_slic_fermact_w.h actions/ferm/fermacts/eoprec_slrc_fermact_w.h @@ -654,19 +655,20 @@ list(APPEND ChromaLIB_HEADERS actions/ferm/linop/lwldslash_base_3d_w.h actions/ferm/linop/lwldslash_3d_qdp_w.h actions/ferm/linop/clover_term_w.h - actions/ferm/linop/clov_triang_qdp_w.h + actions/ferm/linop/clov_triang_qdp_w.h actions/ferm/linop/clover_term_base_w.h actions/ferm/linop/exp_clover_term_base_w.h actions/ferm/linop/clover_term_qdp_w.h - actions/ferm/linop/exp_clover_term_w.h + actions/ferm/linop/exp_clover_term_w.h actions/ferm/linop/exp_clover_term_qdp_w.h actions/ferm/linop/eoprec_clover_linop_w.h + actions/ferm/linop/eoprec_exp_clover_linop_w.h actions/ferm/linop/seoprec_clover_linop_w.h actions/ferm/linop/shifted_linop_w.h actions/ferm/linop/eoprec_clover_dumb_linop_w.h actions/ferm/linop/eoprec_clover_orbifold_linop_w.h actions/ferm/linop/unprec_clover_linop_w.h - actions/ferm/linop/unprec_exp_clover_linop_w.h + actions/ferm/linop/unprec_exp_clover_linop_w.h actions/ferm/linop/eoprec_clover_extfield_linop_w.h actions/ferm/linop/eoprec_dwflike_linop_base_array_w.h actions/ferm/linop/eoprec_dwf_linop_array_w.h @@ -1015,6 +1017,7 @@ list(APPEND ChromaLIB_HEADERS meas/inline/hadron/inline_genprop_matelem_colorvec_w.h 
meas/inline/hadron/inline_genprop_matelem_da_colorvec_w.h meas/inline/hadron/inline_genprop_matelem_pt_colorvec_w.h + meas/inline/hadron/inline_inverter_test_w.h meas/inline/hadron/inline_mres_w.h meas/inline/hadron/inline_qpropqio_w.h meas/inline/hadron/inline_qpropadd_w.h @@ -1372,6 +1375,7 @@ target_sources(chromalib PRIVATE util/ferm/subset_vectors.cc util/ferm/block_couplings.cc util/ferm/disp_soln_cache.cc + util/ferm/mgproton.cc util/ft/sftmom.cc util/ft/single_phase.cc util/ft/time_slice_set.cc @@ -1446,6 +1450,7 @@ target_sources(chromalib PRIVATE meas/hadron/dilution_quark_source_const_w.cc util/gauge/cern_gauge_init.cc io/readcern.cc + constant.cc ) ######################################################## @@ -1469,10 +1474,11 @@ target_sources(chromalib PRIVATE actions/ferm/fermacts/eoprec_ovext_fermact_array_w.cc actions/ferm/fermacts/clover_fermact_params_w.cc actions/ferm/fermacts/eoprec_clover_fermact_w.cc + actions/ferm/fermacts/eoprec_exp_clover_fermact_w.cc actions/ferm/fermacts/seoprec_clover_fermact_w.cc actions/ferm/fermacts/eoprec_clover_orbifold_fermact_w.cc actions/ferm/fermacts/unprec_clover_fermact_w.cc - actions/ferm/fermacts/unprec_exp_clover_fermact_w.cc + actions/ferm/fermacts/unprec_exp_clover_fermact_w.cc actions/ferm/fermacts/nef_fermact_params_w.cc actions/ferm/fermacts/eoprec_slic_fermact_w.cc actions/ferm/fermacts/eoprec_slrc_fermact_w.cc @@ -1565,12 +1571,13 @@ target_sources(chromalib PRIVATE actions/ferm/linop/unprec_wilson_linop_w.cc actions/ferm/linop/clover_term_base_w.cc actions/ferm/linop/clover_term_qdp_w.cc - actions/ferm/linop/eoprec_clover_linop_w.cc + actions/ferm/linop/eoprec_clover_linop_w.cc + actions/ferm/linop/eoprec_exp_clover_linop_w.cc actions/ferm/linop/seoprec_clover_linop_w.cc actions/ferm/linop/eoprec_clover_dumb_linop_w.cc actions/ferm/linop/eoprec_clover_orbifold_linop_w.cc actions/ferm/linop/unprec_clover_linop_w.cc - actions/ferm/linop/unprec_exp_clover_linop_w.cc + 
actions/ferm/linop/unprec_exp_clover_linop_w.cc actions/ferm/linop/eoprec_clover_extfield_linop_w.cc actions/ferm/linop/eoprec_slic_linop_w.cc actions/ferm/linop/eoprec_slrc_linop_w.cc @@ -1783,6 +1790,7 @@ target_sources(chromalib PRIVATE meas/inline/hadron/inline_genprop_matelem_colorvec_w.cc meas/inline/hadron/inline_genprop_matelem_da_colorvec_w.cc meas/inline/hadron/inline_genprop_matelem_pt_colorvec_w.cc + meas/inline/hadron/inline_inverter_test_w.cc meas/inline/hadron/inline_mres_w.cc meas/inline/hadron/inline_qpropqio_w.cc meas/inline/hadron/inline_qpropadd_w.cc @@ -1929,7 +1937,9 @@ endif() ####################################################### if( Chroma_ENABLE_JIT_CLOVER ) list(APPEND ChromaLIB_HEADERS actions/ferm/linop/clover_term_jit_w.h) + list(APPEND ChromaLIB_HEADERS actions/ferm/linop/clover_term_jit2_w.h) target_sources(chromalib PRIVATE util/gauge/stout_utils_jit.cc) + target_sources(chromalib PRIVATE util/gauge/stout_utils_jit2.cc) endif() @@ -1946,11 +1956,13 @@ if( Chroma_ENABLE_QUDA ) actions/ferm/invert/quda_solvers/multi_syssolver_quda_clover_params.h actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_w.h actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_multigrid_w.h + actions/ferm/invert/quda_solvers/syssolver_linop_exp_clover_quda_multigrid_w.h actions/ferm/invert/quda_solvers/syssolver_linop_nef_quda_w.h actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_w.h actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_multigrid_w.h actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_w.h actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_multigrid_w.h + actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.h actions/ferm/invert/quda_solvers/syssolver_mdagm_wilson_quda_w.h actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_clover_quda_w.h actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_wilson_quda_w.h @@ -1976,11 +1988,13 @@ 
target_sources(chromalib PRIVATE actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_wilson_params.cc actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_w.cc actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_multigrid_w.cc + actions/ferm/invert/quda_solvers/syssolver_linop_exp_clover_quda_multigrid_w.cc actions/ferm/invert/quda_solvers/syssolver_linop_nef_quda_w.cc actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_w.cc actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_multigrid_w.cc actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_w.cc actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_multigrid_w.cc + actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.cc actions/ferm/invert/quda_solvers/syssolver_mdagm_wilson_quda_w.cc actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_clover_quda_w.cc actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_wilson_quda_w.cc @@ -2011,6 +2025,8 @@ if( Chroma_ENABLE_SUPERBBLAS ) meas/inline/hadron/inline_baryon_matelem_colorvec_superb_w.h meas/inline/hadron/inline_meson_matelem_colorvec_superb_w.h meas/inline/hadron/inline_create_colorvecs_superb.h + meas/inline/hadron/inline_eigenvalues_superb_w.h + meas/inline/hadron/inline_inverter_test_superb_w.h ) target_sources(chromalib PRIVATE meas/inline/hadron/inline_disco_prob_defl_superb_w.cc @@ -2019,6 +2035,8 @@ if( Chroma_ENABLE_SUPERBBLAS ) meas/inline/hadron/inline_baryon_matelem_colorvec_superb_w.cc meas/inline/hadron/inline_meson_matelem_colorvec_superb_w.cc meas/inline/hadron/inline_create_colorvecs_superb.cc + meas/inline/hadron/inline_eigenvalues_superb_w.cc + meas/inline/hadron/inline_inverter_test_superb_w.cc ) target_include_directories(chromalib PUBLIC ${SUPERBBLAS_DIR}/include) add_library(superbblas STATIC IMPORTED ) @@ -2030,7 +2048,7 @@ if( Chroma_ENABLE_SUPERBBLAS ) add_library(magma STATIC IMPORTED ) set_target_properties( magma PROPERTIES IMPORTED_LOCATION 
${MAGMA_DIR}/lib/libmagma.a ) target_link_libraries(chromalib PUBLIC magma ) - if( QDP_BACKEND_CUDA ) + if( QDP_BACKEND_CUDA OR Chroma_ENABLE_CUDA) target_link_libraries(magma INTERFACE CUDA::cublas CUDA::cublasLt CUDA::cusparse CUDA::cudart) endif() target_link_libraries(magma INTERFACE BLAS::BLAS ${Chroma_extra_link_flags}) @@ -2043,19 +2061,19 @@ if( Chroma_ENABLE_SUPERBBLAS ) target_link_libraries(chromalib PUBLIC primme ) if( Chroma_ENABLE_MAGMA ) target_link_libraries(primme INTERFACE magma) - elseif( QDP_BACKEND_CUDA ) + elseif( QDP_BACKEND_CUDA OR Chroma_ENABLE_CUDA) target_link_libraries(primme INTERFACE CUDA::cublas CUDA::cudart BLAS::BLAS ${Chroma_extra_link_flags}) else() target_link_libraries(primme INTERFACE BLAS::BLAS ${Chroma_extra_link_flags}) endif() endif() - if( QDP_BACKEND_ROCM ) + if( QDP_BACKEND_ROCM OR Chroma_ENABLE_ROCM) target_compile_options(chromalib PUBLIC ${HIP_CPP_CONFIG}) - target_link_libraries(superbblas INTERFACE roc::hipblas) + target_link_libraries(superbblas INTERFACE roc::hipblas roc::hipsparse roc::hipsolver roc::rocblas roc::rocsolver) endif() - if( QDP_BACKEND_CUDA ) - target_link_libraries(superbblas INTERFACE CUDA::cublas CUDA::cublasLt CUDA::cusparse CUDA::cudart) + if( QDP_BACKEND_CUDA OR Chroma_ENABLE_CUDA) + target_link_libraries(superbblas INTERFACE CUDA::cublas CUDA::cublasLt CUDA::cusparse CUDA::cusolver CUDA::cudart) endif() target_link_libraries(superbblas INTERFACE BLAS::BLAS ${Chroma_extra_link_flags}) endif() diff --git a/lib/Makefile.am b/lib/Makefile.am index 95376eb4ae..6688c1ca39 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -927,9 +927,12 @@ nobase_include_HEADERS += \ meas/inline/hadron/inline_meson_matelem_colorvec_superb_w.h \ meas/inline/hadron/inline_unsmeared_hadron_node_distillation_w.h \ meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.h \ + meas/inline/hadron/inline_eigenvalues_superb_w.h \ meas/inline/hadron/inline_genprop_matelem_colorvec_w.h \ 
meas/inline/hadron/inline_genprop_matelem_da_colorvec_w.h \ meas/inline/hadron/inline_genprop_matelem_pt_colorvec_w.h \ + meas/inline/hadron/inline_inverter_test_w.h \ + meas/inline/hadron/inline_inverter_test_superb_w.h \ meas/inline/hadron/inline_mres_w.h \ meas/inline/hadron/inline_qpropqio_w.h \ meas/inline/hadron/inline_qpropadd_w.h \ @@ -1080,6 +1083,7 @@ lib_LIBRARIES = libchroma.a # chroma/scripts/build_libchroma_sources.pl # libchroma_a_SOURCES = \ + constant.cc \ actions/boson/operator/klein_gord.cc \ actions/ferm/fermacts/zolotarev_coeffs.cc \ actions/ferm/fermstates/hex_fermstate_params.cc \ @@ -1344,6 +1348,7 @@ libchroma_a_SOURCES = \ util/ferm/subset_vectors.cc \ util/ferm/block_couplings.cc \ util/ferm/disp_soln_cache.cc \ + util/ferm/mgproton.cc \ util/ft/sftmom.cc \ util/ft/single_phase.cc \ util/ft/time_slice_set.cc \ @@ -1849,9 +1854,12 @@ libchroma_a_SOURCES += \ meas/inline/hadron/inline_meson_matelem_colorvec_superb_w.cc \ meas/inline/hadron/inline_unsmeared_hadron_node_distillation_w.cc \ meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.cc \ + meas/inline/hadron/inline_eigenvalues_superb_w.cc \ meas/inline/hadron/inline_genprop_matelem_colorvec_w.cc \ meas/inline/hadron/inline_genprop_matelem_da_colorvec_w.cc \ meas/inline/hadron/inline_genprop_matelem_pt_colorvec_w.cc \ + meas/inline/hadron/inline_inverter_test_w.cc \ + meas/inline/hadron/inline_inverter_test_superb_w.cc \ meas/inline/hadron/inline_mres_w.cc \ meas/inline/hadron/inline_qpropqio_w.cc \ meas/inline/hadron/inline_qpropadd_w.cc \ diff --git a/lib/actions/ferm/fermacts/eo3dprec_s_cprec_t_clover_fermact_w.cc b/lib/actions/ferm/fermacts/eo3dprec_s_cprec_t_clover_fermact_w.cc index e244b71f87..26d1c694ec 100644 --- a/lib/actions/ferm/fermacts/eo3dprec_s_cprec_t_clover_fermact_w.cc +++ b/lib/actions/ferm/fermacts/eo3dprec_s_cprec_t_clover_fermact_w.cc @@ -7,6 +7,8 @@ #if QDP_ND == 4 #if QDP_NC == 3 +#if ! 
defined (QDP_IS_QDPJIT2) + #include "chromabase.h" #include "actions/ferm/fermacts/eo3dprec_s_cprec_t_clover_fermact_w.h" #include "actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.h" @@ -79,6 +81,8 @@ namespace Chroma } +#endif + #endif #endif #endif diff --git a/lib/actions/ferm/fermacts/eo3dprec_s_cprec_t_wilson_fermact_w.cc b/lib/actions/ferm/fermacts/eo3dprec_s_cprec_t_wilson_fermact_w.cc index 16acd42f26..be5f70c6b9 100644 --- a/lib/actions/ferm/fermacts/eo3dprec_s_cprec_t_wilson_fermact_w.cc +++ b/lib/actions/ferm/fermacts/eo3dprec_s_cprec_t_wilson_fermact_w.cc @@ -7,6 +7,8 @@ #if QDP_NC == 3 #if QDP_ND == 4 +#if ! defined (QDP_IS_QDPJIT2) + #include "chromabase.h" #include "actions/ferm/fermacts/eo3dprec_s_cprec_t_wilson_fermact_w.h" #include "actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.h" @@ -79,6 +81,8 @@ namespace Chroma } +#endif + #endif #endif #endif diff --git a/lib/actions/ferm/fermacts/eoprec_exp_clover_fermact_w.cc b/lib/actions/ferm/fermacts/eoprec_exp_clover_fermact_w.cc new file mode 100644 index 0000000000..acc79f4fec --- /dev/null +++ b/lib/actions/ferm/fermacts/eoprec_exp_clover_fermact_w.cc @@ -0,0 +1,91 @@ +/*! \file + * \brief Even-odd preconditioned ExpClover fermion action + */ + +#include "chromabase.h" +#include "actions/ferm/fermacts/fermact_factory_w.h" + +#include "actions/ferm/linop/eoprec_exp_clover_linop_w.h" +#include "actions/ferm/fermacts/eoprec_exp_clover_fermact_w.h" +#include "actions/ferm/invert/syssolver_linop_factory.h" + +//#include "actions/ferm/fermacts/fermact_factory_w.h" +#include "actions/ferm/fermstates/ferm_createstate_reader_w.h" + +namespace Chroma +{ + + //! Hooks to register the class with the fermact factory + namespace EvenOddPrecExpCloverFermActEnv + { + //! 
Callback function + WilsonTypeFermAct, + multi1d >* createFermAct4D(XMLReader& xml_in, + const std::string& path) + { + return new EvenOddPrecExpCloverFermAct(CreateFermStateEnv::reader(xml_in, path), + CloverFermActParams(xml_in, path)); + } + + //! Callback function + /*! Differs in return type */ + FermionAction, + multi1d >* createFermAct(XMLReader& xml_in, + const std::string& path) + { + return createFermAct4D(xml_in, path); + } + + //! Name to be used + const std::string name = "EXP_CLOVER"; + + //! Local registration flag + static bool registered = false; + + //! Register all the factories + bool registerAll() + { + bool success = true; + if (! registered) + { + success &= Chroma::TheFermionActionFactory::Instance().registerObject(name, createFermAct); + success &= Chroma::TheWilsonTypeFermActFactory::Instance().registerObject(name, createFermAct4D); + registered = true; + } + return success; + } + } + + //! Produce a linear operator for this action + /*! + * The operator acts on the odd subset + * + * \param state gauge field (Read) + */ + EvenOddPrecLogDetLinearOperator, + multi1d >* + EvenOddPrecExpCloverFermAct::linOp(Handle< FermState > state) const + { + return new EvenOddPrecExpCloverLinOp(state,param); + } + + //! Return a linear operator solver for this action to solve M*psi=chi + Projector* + EvenOddPrecExpCloverFermAct::projector(Handle< FermState > state, + const GroupXML_t& projParam) const + { + std::istringstream is(projParam.xml); + XMLReader paramtop(is); + + return TheLinOpFermProjectorFactory::Instance().createObject(projParam.id, + paramtop, + projParam.path, + state, + linOp(state)); + } + +} + diff --git a/lib/actions/ferm/fermacts/eoprec_exp_clover_fermact_w.h b/lib/actions/ferm/fermacts/eoprec_exp_clover_fermact_w.h new file mode 100644 index 0000000000..7758b3dbfe --- /dev/null +++ b/lib/actions/ferm/fermacts/eoprec_exp_clover_fermact_w.h @@ -0,0 +1,83 @@ +// -*- C++ -*- +/*! 
\file + * \brief Even-odd preconditioned ExpClover fermion action + */ + +#ifndef __prec_exp_clover_fermact_w_h__ +#define __prec_exp_clover_fermact_w_h__ + +#include "eoprec_logdet_wilstype_fermact_w.h" +#include "actions/ferm/linop/lgherm_w.h" +#include "actions/ferm/fermacts/clover_fermact_params_w.h" + +namespace Chroma +{ + //! Name and registration + /*! \ingroup fermacts */ + namespace EvenOddPrecExpCloverFermActEnv + { + extern const std::string name; + bool registerAll(); + } + + + //! Even-odd preconditioned ExpClover fermion action + /*! \ingroup fermacts + * + * Even-odd preconditioned exponentiated clover fermion action. + * Only defined on odd subset. + */ + + class EvenOddPrecExpCloverFermAct : public EvenOddPrecLogDetWilsonTypeFermAct, multi1d > + { + public: + // Typedefs to save typing + typedef LatticeFermion T; + typedef multi1d P; + typedef multi1d Q; + + //! Partial constructor + EvenOddPrecExpCloverFermAct() {} + + //! General FermState + EvenOddPrecExpCloverFermAct(Handle< CreateFermState > cfs_, + const CloverFermActParams& param_) : + cfs(cfs_), param(param_) {} + + //! Copy constructor + EvenOddPrecExpCloverFermAct(const EvenOddPrecExpCloverFermAct& a) : + cfs(a.cfs), param(a.param) {} + + //! Produce a linear operator for this action + EvenOddPrecLogDetLinearOperator* linOp(Handle< FermState > state) const; + + //! Produce the gamma_5 hermitian operator H_w + LinearOperator* hermitianLinOp(Handle< FermState > state) const + { + return new lgherm(linOp(state)); + } + + //! Return a projector after this action + Projector* projector(Handle< FermState > state, + const GroupXML_t& projParam) const override; + + //! Destructor is automatic + ~EvenOddPrecExpCloverFermAct() {} + + protected: + //! Return the fermion BC object for this action + const CreateFermState& getCreateState() const {return *cfs;} + + //! 
Assignment + void operator=(const EvenOddPrecExpCloverFermAct& a) {} + + private: + Handle< CreateFermState > cfs; + CloverFermActParams param; + }; + +} // End Namespace Chroma + + +#endif diff --git a/lib/actions/ferm/fermacts/fermacts_aggregate_w.cc b/lib/actions/ferm/fermacts/fermacts_aggregate_w.cc index 5b008c052f..cf43831133 100644 --- a/lib/actions/ferm/fermacts/fermacts_aggregate_w.cc +++ b/lib/actions/ferm/fermacts/fermacts_aggregate_w.cc @@ -6,6 +6,7 @@ #include "actions/ferm/fermacts/fermacts_aggregate_w.h" #include "actions/ferm/fermacts/unprec_clover_fermact_w.h" +#include "actions/ferm/fermacts/unprec_exp_clover_fermact_w.h" #include "actions/ferm/fermacts/unprec_wilson_fermact_w.h" #include "actions/ferm/fermacts/unprec_parwilson_fermact_w.h" #include "actions/ferm/fermacts/unprec_graphene_fermact_w.h" @@ -14,6 +15,7 @@ #include "actions/ferm/fermacts/unprec_w12_fermact_w.h" #include "actions/ferm/fermacts/eoprec_clover_fermact_w.h" +#include "actions/ferm/fermacts/eoprec_exp_clover_fermact_w.h" #include "actions/ferm/fermacts/eoprec_clover_orbifold_fermact_w.h" #include "actions/ferm/fermacts/eoprec_clover_extfield_fermact_w.h" #include "actions/ferm/fermacts/eoprec_wilson_fermact_w.h" @@ -100,8 +102,10 @@ namespace Chroma success &= UnprecParWilsonFermActEnv::registerAll(); success &= EvenOddPrecCloverFermActEnv::registerAll(); + success &= EvenOddPrecExpCloverFermActEnv::registerAll(); success &= SymEvenOddPrecCloverFermActEnv::registerAll(); success &= UnprecCloverFermActEnv::registerAll(); + success &= UnprecExpCloverFermActEnv::registerAll(); success &= EvenOddPrecCloverOrbifoldFermActEnv::registerAll(); success &= EvenOddPrecSLICFermActEnv::registerAll(); success &= EvenOddPrecSLRCFermActEnv::registerAll(); @@ -116,6 +120,7 @@ namespace Chroma #if QDP_NS == 4 #if QDP_NC == 3 #if QDP_ND == 4 +#if ! 
defined (QDP_IS_QDPJIT2) success &= UnprecSpaceCentralPrecTimeWilsonFermActEnv::registerAll(); success &= ILUPrecSpaceCentralPrecTimeWilsonFermActEnv::registerAll(); success &= ILUPrecSpaceCentralPrecTimeCloverFermActEnv::registerAll(); @@ -125,6 +130,7 @@ namespace Chroma success &= EO3DPrecSpaceCentralPrecTimeCloverFermActEnv::registerAll(); #endif #endif +#endif #endif registered = true; } diff --git a/lib/actions/ferm/fermacts/ilu2prec_s_cprec_t_clover_fermact_w.cc b/lib/actions/ferm/fermacts/ilu2prec_s_cprec_t_clover_fermact_w.cc index 88f47349aa..71724edde2 100644 --- a/lib/actions/ferm/fermacts/ilu2prec_s_cprec_t_clover_fermact_w.cc +++ b/lib/actions/ferm/fermacts/ilu2prec_s_cprec_t_clover_fermact_w.cc @@ -6,6 +6,8 @@ #if QDP_ND == 4 #if QDP_NC == 3 +#if ! defined (QDP_IS_QDPJIT2) + #include "chromabase.h" #include "actions/ferm/fermacts/ilu2prec_s_cprec_t_clover_fermact_w.h" #include "actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.h" @@ -81,3 +83,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/fermacts/ilu2prec_s_cprec_t_wilson_fermact_w.cc b/lib/actions/ferm/fermacts/ilu2prec_s_cprec_t_wilson_fermact_w.cc index c0e2b1b27a..c1639e2d4a 100644 --- a/lib/actions/ferm/fermacts/ilu2prec_s_cprec_t_wilson_fermact_w.cc +++ b/lib/actions/ferm/fermacts/ilu2prec_s_cprec_t_wilson_fermact_w.cc @@ -6,6 +6,8 @@ #if QDP_ND == 4 #if QDP_NC == 3 +#if ! 
defined (QDP_IS_QDPJIT2) + #include "chromabase.h" #include "actions/ferm/fermacts/ilu2prec_s_cprec_t_wilson_fermact_w.h" #include "actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.h" @@ -81,3 +83,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/fermacts/iluprec_s_cprec_t_clover_fermact_w.cc b/lib/actions/ferm/fermacts/iluprec_s_cprec_t_clover_fermact_w.cc index 4b25148ec8..a95745a8ab 100644 --- a/lib/actions/ferm/fermacts/iluprec_s_cprec_t_clover_fermact_w.cc +++ b/lib/actions/ferm/fermacts/iluprec_s_cprec_t_clover_fermact_w.cc @@ -6,6 +6,8 @@ #if QDP_ND == 4 #if QDP_NC == 3 +#if ! defined (QDP_IS_QDPJIT2) + #include "chromabase.h" #include "actions/ferm/fermacts/iluprec_s_cprec_t_clover_fermact_w.h" #include "actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.h" @@ -81,3 +83,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/fermacts/iluprec_s_cprec_t_wilson_fermact_w.cc b/lib/actions/ferm/fermacts/iluprec_s_cprec_t_wilson_fermact_w.cc index 2b06d244cf..82bc4e97c1 100644 --- a/lib/actions/ferm/fermacts/iluprec_s_cprec_t_wilson_fermact_w.cc +++ b/lib/actions/ferm/fermacts/iluprec_s_cprec_t_wilson_fermact_w.cc @@ -6,6 +6,8 @@ #if QDP_ND == 4 #if QDP_NC == 3 +#if ! 
defined (QDP_IS_QDPJIT2) + #include "chromabase.h" #include "actions/ferm/fermacts/iluprec_s_cprec_t_wilson_fermact_w.h" #include "actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.h" @@ -81,3 +83,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/fermacts/unprec_clover_fermact_w.cc b/lib/actions/ferm/fermacts/unprec_clover_fermact_w.cc index d33ab30f9e..70ec53ea1e 100644 --- a/lib/actions/ferm/fermacts/unprec_clover_fermact_w.cc +++ b/lib/actions/ferm/fermacts/unprec_clover_fermact_w.cc @@ -7,6 +7,7 @@ #include "actions/ferm/linop/unprec_clover_linop_w.h" #include "actions/ferm/fermacts/unprec_clover_fermact_w.h" +#include "actions/ferm/invert/syssolver_linop_factory.h" //#include "actions/ferm/fermacts/fermact_factory_w.h" #include "actions/ferm/fermstates/ferm_createstate_reader_w.h" @@ -72,5 +73,20 @@ namespace Chroma return new UnprecCloverLinOp(state,param); } + + //! Return a linear operator solver for this action to solve M*psi=chi + Projector* + UnprecCloverFermAct::projector(Handle< FermState > state, + const GroupXML_t& projParam) const + { + std::istringstream is(projParam.xml); + XMLReader paramtop(is); + + return TheLinOpFermProjectorFactory::Instance().createObject(projParam.id, + paramtop, + projParam.path, + state, + linOp(state)); + } } diff --git a/lib/actions/ferm/fermacts/unprec_clover_fermact_w.h b/lib/actions/ferm/fermacts/unprec_clover_fermact_w.h index 79e058b8a7..618bda9d05 100644 --- a/lib/actions/ferm/fermacts/unprec_clover_fermact_w.h +++ b/lib/actions/ferm/fermacts/unprec_clover_fermact_w.h @@ -52,6 +52,10 @@ namespace Chroma return new lgherm(linOp(state)); } + //! Return a projector after this action + Projector* projector(Handle< FermState > state, + const GroupXML_t& projParam) const override; + //! 
Destructor is automatic ~UnprecCloverFermAct() {} diff --git a/lib/actions/ferm/fermacts/unprec_s_cprec_t_wilson_fermact_w.cc b/lib/actions/ferm/fermacts/unprec_s_cprec_t_wilson_fermact_w.cc index dabfde0ab3..90c24501de 100644 --- a/lib/actions/ferm/fermacts/unprec_s_cprec_t_wilson_fermact_w.cc +++ b/lib/actions/ferm/fermacts/unprec_s_cprec_t_wilson_fermact_w.cc @@ -8,6 +8,8 @@ #if QDP_NC == 3 #if QDP_ND == 4 +#if ! defined (QDP_IS_QDPJIT2) + #include "chromabase.h" #include "actions/ferm/fermacts/unprec_s_cprec_t_wilson_fermact_w.h" #include "actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.h" @@ -83,3 +85,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/fermbcs/schr_sf_fermbc_w.cc b/lib/actions/ferm/fermbcs/schr_sf_fermbc_w.cc index e4aad651e1..bb5b0eb5e1 100644 --- a/lib/actions/ferm/fermbcs/schr_sf_fermbc_w.cc +++ b/lib/actions/ferm/fermbcs/schr_sf_fermbc_w.cc @@ -90,7 +90,7 @@ namespace Chroma if (mu != j_decay) { - Real ftmp = Chroma::twopi * getTheta()[i] / Real(QDP::Layout::lattSize()[mu]); + Real ftmp = Chroma::constant().twopi * getTheta()[i] / Real(QDP::Layout::lattSize()[mu]); SFBndFld[mu] *= cmplx(cos(ftmp),sin(ftmp)); ++i; } diff --git a/lib/actions/ferm/invert/containers.h b/lib/actions/ferm/invert/containers.h index 2c7e1d9862..3df2139697 100644 --- a/lib/actions/ferm/invert/containers.h +++ b/lib/actions/ferm/invert/containers.h @@ -5,6 +5,9 @@ #include "chromabase.h" +#if ! defined (QDP_IS_QDPJIT2) + + namespace Chroma { namespace LinAlg @@ -489,3 +492,4 @@ namespace Chroma } // namespace Chroma #endif +#endif diff --git a/lib/actions/ferm/invert/inv_eigcg2.cc b/lib/actions/ferm/invert/inv_eigcg2.cc index 44151a9ae9..f08540eca9 100644 --- a/lib/actions/ferm/invert/inv_eigcg2.cc +++ b/lib/actions/ferm/invert/inv_eigcg2.cc @@ -12,6 +12,8 @@ #include "actions/ferm/invert/containers.h" #include "actions/ferm/invert/norm_gram_schm.h" +#if ! 
defined (QDP_IS_QDPJIT2) + //#define DEBUG #define DEBUG_FINAL @@ -1281,3 +1283,4 @@ namespace Chroma }// End Namespace Chroma +#endif diff --git a/lib/actions/ferm/invert/inv_eigcg2.h b/lib/actions/ferm/invert/inv_eigcg2.h index c7a1837ac5..89d911fa48 100644 --- a/lib/actions/ferm/invert/inv_eigcg2.h +++ b/lib/actions/ferm/invert/inv_eigcg2.h @@ -10,6 +10,8 @@ #include "syssolver.h" #include "actions/ferm/invert/containers.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { @@ -116,3 +118,4 @@ namespace Chroma }// End Namespace Chroma #endif +#endif diff --git a/lib/actions/ferm/invert/inv_eigcg2_array.cc b/lib/actions/ferm/invert/inv_eigcg2_array.cc index db0418e2be..c51b7f79c8 100644 --- a/lib/actions/ferm/invert/inv_eigcg2_array.cc +++ b/lib/actions/ferm/invert/inv_eigcg2_array.cc @@ -12,6 +12,8 @@ #include "actions/ferm/invert/containers.h" #include "actions/ferm/invert/norm_gram_schm.h" +#if ! defined (QDP_IS_QDPJIT2) + //#define DEBUG #define DEBUG_FINAL @@ -997,3 +999,4 @@ namespace Chroma }// End Namespace Chroma +#endif diff --git a/lib/actions/ferm/invert/inv_eigcg2_array.h b/lib/actions/ferm/invert/inv_eigcg2_array.h index 9186f3d688..51b6b704ea 100644 --- a/lib/actions/ferm/invert/inv_eigcg2_array.h +++ b/lib/actions/ferm/invert/inv_eigcg2_array.h @@ -10,6 +10,8 @@ #include "syssolver.h" #include "actions/ferm/invert/containers.h" +#if ! 
defined (QDP_IS_QDPJIT2) + namespace Chroma { @@ -104,3 +106,4 @@ namespace Chroma }// End Namespace Chroma #endif +#endif diff --git a/lib/actions/ferm/invert/minv_rel_cg.cc b/lib/actions/ferm/invert/minv_rel_cg.cc index 8343f89c25..cf1c1d8aa6 100644 --- a/lib/actions/ferm/invert/minv_rel_cg.cc +++ b/lib/actions/ferm/invert/minv_rel_cg.cc @@ -116,7 +116,7 @@ void MInvRelCG_a(const LinearOperator& A, Double chi_norm = sqrt(chi_norm_sq); - if( toBool( chi_norm < fuzz )) { + if( toBool( chi_norm < Chroma::constant().fuzz )) { n_count = 0; // The psi are all zero anyway at this point diff --git a/lib/actions/ferm/invert/minvcg.cc b/lib/actions/ferm/invert/minvcg.cc index 48d08ddc17..ab68bb4a7a 100644 --- a/lib/actions/ferm/invert/minvcg.cc +++ b/lib/actions/ferm/invert/minvcg.cc @@ -135,7 +135,7 @@ namespace Chroma Double chi_norm_sq = norm2(chi_internal,sub); flopcount.addSiteFlops(4*Nc*Ns,sub); Double chi_norm = sqrt(chi_norm_sq); - if( toBool( chi_norm < fuzz )) + if( toBool( chi_norm < Chroma::constant().fuzz )) { swatch.stop(); diff --git a/lib/actions/ferm/invert/minvcg2.cc b/lib/actions/ferm/invert/minvcg2.cc index c07d84d09a..d9014cf2f1 100644 --- a/lib/actions/ferm/invert/minvcg2.cc +++ b/lib/actions/ferm/invert/minvcg2.cc @@ -134,7 +134,7 @@ namespace Chroma Double chi_norm_sq = norm2(chi_internal,sub); flopcount.addSiteFlops(4*Nc*Ns,sub); Double chi_norm = sqrt(chi_norm_sq); - if( toBool( chi_norm < fuzz )) + if( toBool( chi_norm < Chroma::constant().fuzz )) { swatch.stop(); diff --git a/lib/actions/ferm/invert/minvcg2_accum.cc b/lib/actions/ferm/invert/minvcg2_accum.cc index c187cfe089..11433b6fde 100644 --- a/lib/actions/ferm/invert/minvcg2_accum.cc +++ b/lib/actions/ferm/invert/minvcg2_accum.cc @@ -132,7 +132,7 @@ namespace Chroma Double chi_norm_sq = norm2(chi_internal,sub); flopcount.addSiteFlops(4*Nc*Ns,sub); Double chi_norm = sqrt(chi_norm_sq); - if( toBool( chi_norm < fuzz )) + if( toBool( chi_norm < Chroma::constant().fuzz )) { swatch.stop(); diff 
--git a/lib/actions/ferm/invert/minvcg_accumulate_array.cc b/lib/actions/ferm/invert/minvcg_accumulate_array.cc index 362e3975c4..3e13fafe82 100644 --- a/lib/actions/ferm/invert/minvcg_accumulate_array.cc +++ b/lib/actions/ferm/invert/minvcg_accumulate_array.cc @@ -183,7 +183,7 @@ namespace Chroma Double chi_norm = sqrt(chi_norm_sq); - if( toBool( chi_norm < fuzz )) { + if( toBool( chi_norm < Chroma::constant().fuzz )) { n_count = 0; swatch.stop(); QDPIO::cout << "MinvCG: Finished. Iters taken = " << n_count << std::endl; diff --git a/lib/actions/ferm/invert/minvcg_array.cc b/lib/actions/ferm/invert/minvcg_array.cc index 87140756a3..6b764feaf5 100644 --- a/lib/actions/ferm/invert/minvcg_array.cc +++ b/lib/actions/ferm/invert/minvcg_array.cc @@ -174,7 +174,7 @@ namespace Chroma Double chi_norm = sqrt(chi_norm_sq); - if( toBool( chi_norm < fuzz )) { + if( toBool( chi_norm < Chroma::constant().fuzz )) { n_count = 0; swatch.stop(); QDPIO::cout << "MinvCG: Finished. Iters taken = " << n_count << std::endl; diff --git a/lib/actions/ferm/invert/projector_null.h b/lib/actions/ferm/invert/projector_null.h new file mode 100644 index 0000000000..f24ad1ab62 --- /dev/null +++ b/lib/actions/ferm/invert/projector_null.h @@ -0,0 +1,118 @@ +// -*- C++ -*- +/*! \file + * \brief Oblique projector to a random vector (useful for testing) + */ + +#ifndef __projector_null_h__ +#define __projector_null_h__ + +#include "chroma_config.h" +#include "handle.h" +#include "syssolver.h" +#include "linearop.h" + +namespace Chroma +{ + + //! Return a null projector + /*! \ingroup invert + */ + template + class ProjectorNull : public Projector + { + public: + using Ts = const std::vector>; + using const_Ts = const std::vector>; + + //! Constructor + /*! + * \param A_ Linear operator ( Read ) + */ + ProjectorNull(Handle> A_) : A(A_) + { + } + + //! Destructor is automatic + ~ProjectorNull() {} + + //! 
Return the subset on which the operator acts + const Subset& subset() const + { + return A->subset(); + } + + //! Apply the oblique projector A*V*inv(U^H*A*V)*U^H + /*! + *! Returns A*V*inv(U^H*A*V)*U^H*chi = psi + */ + void AVUObliqueProjector(Ts& psi, const_Ts&) const override + { + for (int i = 0; i < psi.size(); ++i) + { + *psi[i] = zero; + } + } + + //! Apply the oblique projector V*inv(U^H*A*V)*U^H*A + /*! + * Returns V*inv(U^H*A*V)*U^H*A*chi = psi + */ + void VUAObliqueProjector(Ts& psi, const_Ts&) const override + { + for (int i = 0; i < psi.size(); ++i) + { + *psi[i] = zero; + } + } + + //! Rank of the projector, which is the rank of U and V also + unsigned int rank() const override + { + return 0; + } + + //! Return U[i] + void U(unsigned int, T&) const override + { + throw std::runtime_error("ProjectorNull: rank of the projector is null"); + } + + //! Return V[i] + void V(unsigned int, T&) const override + { + throw std::runtime_error("ProjectorNull: rank of the projector is null"); + } + + //! Return U[i]^H*A*V[i] + void lambda(unsigned int, DComplex&) const override + { + throw std::runtime_error("ProjectorNull: rank of the projector is null"); + } + + private: + Handle> A; + }; + + //! Null projector namespace + namespace ProjectorNullEnv + { + Projector* createProjector( + XMLReader&, const std::string&, + Handle, multi1d>>, + Handle> A) + { + return new ProjectorNull(A); + } + + //! Register the projector + inline bool registerAll() { + static bool registered = false; + if (registered) return true; + registered = true; + return Chroma::TheLinOpFermProjectorFactory::Instance().registerObject("NULL_PROJECTOR", + createProjector); + } + } +} // End namespace + +#endif diff --git a/lib/actions/ferm/invert/projector_random.h b/lib/actions/ferm/invert/projector_random.h new file mode 100644 index 0000000000..4e2f310d84 --- /dev/null +++ b/lib/actions/ferm/invert/projector_random.h @@ -0,0 +1,130 @@ +// -*- C++ -*- +/*! 
\file + * \brief Oblique projector to a random vector (useful for testing) + */ + +#ifndef __projector_random_h__ +#define __projector_random_h__ + +#include "chroma_config.h" +#include "handle.h" +#include "syssolver.h" +#include "linearop.h" + +namespace Chroma +{ + + //! Solve a M*psi=chi linear system by BICGSTAB + /*! \ingroup invert + */ + template + class ProjectorRandom : public Projector + { + public: + using Ts = const std::vector>; + using const_Ts = const std::vector>; + + //! Constructor + /*! + * \param A_ Linear operator ( Read ) + */ + ProjectorRandom(Handle> A_) : A(A_) + { + T u0 = zero, v0 = zero; + random(v0, A->subset()); + v = v0 / sqrt(norm2(v0, A->subset())); + (*A)(u0, v, PLUS); + l = sqrt(norm2(u0, A->subset())); + u = u0 / l; + } + + //! Destructor is automatic + ~ProjectorRandom() {} + + //! Return the subset on which the operator acts + const Subset& subset() const + { + return A->subset(); + } + + //! Apply the oblique projector A*V*inv(U^H*A*V)*U^H + /*! + *! Returns A*V*inv(U^H*A*V)*U^H*chi = psi + */ + void AVUObliqueProjector(Ts& psi, const_Ts& chi) const override + { + for (int i = 0; i < psi.size(); ++i) + { + // Return u * (u^* * chi) + *psi[i] = u * innerProduct(u, *chi[i], A->subset()); + } + } + + //! Apply the oblique projector V*inv(U^H*A*V)*U^H*A + /*! + * Returns V*inv(U^H*A*V)*U^H*A*chi = psi + */ + void VUAObliqueProjector(Ts& psi, const_Ts& chi) const override + { + for (int i = 0; i < psi.size(); ++i) + { + T A_chi = zero; + (*A)(A_chi, *chi[i], PLUS); + // Return v * (u^* A * chi) / l + *psi[i] = v * (innerProduct(u, A_chi, A->subset()) / l); + } + } + + //! Rank of the projector, which is the rank of U and V also + unsigned int rank() const override + { + return 1; + } + + //! Return U[i] + void U(unsigned int i, T& psi) const override + { + psi = u; + } + + //! Return V[i] + void V(unsigned int i, T& psi) const override + { + psi = v; + } + + //! 
Return U[i]^H*A*V[i] + void lambda(unsigned int i, DComplex& lambda) const override + { + lambda = l; + } + + private: + Handle> A; + T u, v; + DComplex l; ///< = u^* * A * v + }; + + //! Random projector namespace + namespace ProjectorRandomEnv + { + Projector* createProjector( + XMLReader&, const std::string&, + Handle, multi1d>>, + Handle> A) + { + return new ProjectorRandom(A); + } + + //! Register the projector + inline bool registerAll() { + static bool registered = false; + if (registered) return true; + registered = true; + return Chroma::TheLinOpFermProjectorFactory::Instance().registerObject("RANDOM_PROJECTOR", + createProjector); + } + } +} // End namespace + +#endif diff --git a/lib/actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_clover_quda_w.h b/lib/actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_clover_quda_w.h index 453db40d56..177011181a 100644 --- a/lib/actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_clover_quda_w.h +++ b/lib/actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_clover_quda_w.h @@ -365,19 +365,6 @@ namespace Chroma #endif - // Autotuning - if( invParam.tuneDslashP ) { - QDPIO::cout << "Enabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_YES; - } - else { - QDPIO::cout << "Disabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_NO; - } - - // PADDING // Setup padding diff --git a/lib/actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_wilson_quda_w.h b/lib/actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_wilson_quda_w.h index 55ca6ec98d..4301c1cb7b 100644 --- a/lib/actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_wilson_quda_w.h +++ b/lib/actions/ferm/invert/quda_solvers/multi_syssolver_mdagm_cg_wilson_quda_w.h @@ -270,17 +270,6 @@ namespace Chroma quda_inv_param.dirac_order = QUDA_DIRAC_ORDER; quda_inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - // Autotuning - if( invParam.tuneDslashP ) { - QDPIO::cout << "Enabling 
Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_YES; - } - else { - QDPIO::cout << "Disabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_NO; - } // PADDING diff --git a/lib/actions/ferm/invert/quda_solvers/multi_syssolver_quda_clover_params.cc b/lib/actions/ferm/invert/quda_solvers/multi_syssolver_quda_clover_params.cc index 06de67f519..e5025a2f6b 100644 --- a/lib/actions/ferm/invert/quda_solvers/multi_syssolver_quda_clover_params.cc +++ b/lib/actions/ferm/invert/quda_solvers/multi_syssolver_quda_clover_params.cc @@ -34,7 +34,7 @@ namespace Chroma { read(paramtop, "AsymmetricLinop", asymmetricP); } else { - asymmetricP = false; // Symmetric is default + asymmetricP = true; // Asymmetric (i.e. CLOVER) is default } if( paramtop.count("CudaPrecision") > 0 ) { @@ -100,14 +100,6 @@ namespace Chroma { RsdToleranceFactor = Real(10); // Tolerate an order of magnitude difference by default. } - if( paramtop.count("AutotuneDslash") > 0 ) { - read(paramtop, "AutotuneDslash", tuneDslashP); - } - else { - tuneDslashP = false; - } - QDPIO::cout << "tuneDslasP = " << tuneDslashP << std::endl; - if( paramtop.count("GCRInnerParams") > 0 ) { innerParams = new GCRInnerSolverParams(paramtop, "./GCRInnerParams"); @@ -160,7 +152,6 @@ namespace Chroma { write(xml, "SilentFail", p.SilentFailP); write(xml, "RsdToleranceFactor", p.RsdToleranceFactor); write(xml, "CheckShifts", p.checkShiftsP); - write(xml, "AutotuneDslash", p.tuneDslashP); write(xml, "Pipeline", p.Pipeline); if( p.innerParamsP ) { diff --git a/lib/actions/ferm/invert/quda_solvers/multi_syssolver_quda_clover_params.h b/lib/actions/ferm/invert/quda_solvers/multi_syssolver_quda_clover_params.h index 7657db3a24..e0b719b18c 100644 --- a/lib/actions/ferm/invert/quda_solvers/multi_syssolver_quda_clover_params.h +++ b/lib/actions/ferm/invert/quda_solvers/multi_syssolver_quda_clover_params.h @@ -22,11 +22,10 @@ namespace Chroma cudaSloppyReconstruct=RECONS_12; 
cudaRefinementPrecision=DEFAULT; cudaRefinementReconstruct=RECONS_12; - asymmetricP = false; //< Use asymmetric version of the linear operator + asymmetricP = true; //< Use asymmetric version of the linear operator axialGaugeP = false; //< Fix Axial Gauge? SilentFailP = false; //< If set to true ignore lack of convergence. Default is 'loud' RsdToleranceFactor = Real(10); //< Tolerate if the solution achived is better (less) than rsdToleranceFactor*RsdTarget - tuneDslashP = false ; //< v0.3 autotune feature verboseP = false; innerParamsP = false; checkShiftsP = true; @@ -50,7 +49,6 @@ namespace Chroma axialGaugeP = p.axialGaugeP; SilentFailP = p.SilentFailP; RsdToleranceFactor = p.RsdToleranceFactor; - tuneDslashP = p.tuneDslashP; innerParamsP = p.innerParamsP; innerParams = p.innerParams; checkShiftsP = p.checkShiftsP; @@ -75,7 +73,6 @@ namespace Chroma bool axialGaugeP; bool SilentFailP; Real RsdToleranceFactor; - bool tuneDslashP; bool innerParamsP; bool checkShiftsP; int Pipeline; diff --git a/lib/actions/ferm/invert/quda_solvers/quda_mg_utils.h b/lib/actions/ferm/invert/quda_solvers/quda_mg_utils.h index e73e17e738..0eb3339c91 100644 --- a/lib/actions/ferm/invert/quda_solvers/quda_mg_utils.h +++ b/lib/actions/ferm/invert/quda_solvers/quda_mg_utils.h @@ -136,15 +136,6 @@ namespace Chroma { mg_inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; // //Done... 
- // Autotuning - if( invParam.tuneDslashP ) { - QDPIO::cout << "Enabling MG Dslash Autotuning" << std::endl; - mg_inv_param.tune = QUDA_TUNE_YES; - } - else { - QDPIO::cout << "Disabling MG Dslash Autotuning" << std::endl; - mg_inv_param.tune = QUDA_TUNE_NO; - } if( invParam.MULTIGRIDParamsP ) { QDPIO::cout << "Setting MULTIGRID solver params" << std::endl; // Dereference handle @@ -255,6 +246,7 @@ namespace Chroma { // FIXME: Elevate ip.nvec, ip.nu_pre, ip.nu_post, ip.tol to arrays in the XML if ( i < mg_param.n_level-1) { mg_param.n_vec[i] = ip.nvec[i]; + mg_param.n_vec_batch[i] = ip.nvec_batch[i]; mg_param.nu_pre[i] = ip.nu_pre[i]; mg_param.nu_post[i] = ip.nu_post[i]; } diff --git a/lib/actions/ferm/invert/quda_solvers/quda_multigrid_params.cc b/lib/actions/ferm/invert/quda_solvers/quda_multigrid_params.cc index e4eefdf7a9..d876cb2c6d 100644 --- a/lib/actions/ferm/invert/quda_solvers/quda_multigrid_params.cc +++ b/lib/actions/ferm/invert/quda_solvers/quda_multigrid_params.cc @@ -62,6 +62,7 @@ namespace Chroma { nvec.resize(mg_levels-1); + nvec_batch.resize(mg_levels-1); nu_pre.resize(mg_levels-1); nu_post.resize(mg_levels-1); maxIterSubspaceCreate.resize(mg_levels-1); @@ -104,7 +105,6 @@ namespace Chroma { << " blockings but only " << nvec.size() << " sets of NullVectors" << std::endl; QDP_abort(1); } - if (nu_pre.size() != mg_levels-1 ) { QDPIO::cout<<"Error. There are "<< (mg_levels-1) @@ -112,6 +112,31 @@ namespace Chroma { QDP_abort(1); } + { + int paramcount = paramtop.count("NullVectorsBatchSize"); + if ( paramcount == 1 ) { + read(paramtop, "NullVectorsBatchSize", nvec_batch); + if (nvec_batch.size() != mg_levels - 1 ) { + QDPIO::cout << "If NullVectorsBatchSize is given, then for " + << mg_levels << " levels, there must be " << mg_levels-1 + << " values in the input. 
Currently the input has " + << nvec_batch.size() << " values \n"; + QDP_abort(1); + } + } + else if ( paramcount > 1 ) { + QDPIO::cout << "NullVectorsBatchSize occurs more than once in this input\n"; + QDP_abort(1); + } + else { + // Not found in output + nvec_batch.resize(mg_levels-1); + for( int i=0; i < mg_levels-1; i++) nvec_batch[i] = 1; + } + } + + + subspaceSolver.resize(mg_levels-1); readArray(paramtop, "SubspaceSolver", subspaceSolver, CG); @@ -258,6 +283,7 @@ namespace Chroma { write(xml, "Reconstruct", p.reconstruct); write(xml, "SchwarzType", p.schwarzType); write(xml, "NullVectors", p.nvec); + write(xml, "NullVectorsBatchSize", p.nvec_batch); write(xml, "MultiGridLevels", p.mg_levels); write(xml, "GenerateNullSpace", p.generate_nullspace); write(xml, "GenerateAllLevels", p.generate_all_levels); diff --git a/lib/actions/ferm/invert/quda_solvers/quda_multigrid_params.h b/lib/actions/ferm/invert/quda_solvers/quda_multigrid_params.h index e95a0aeede..0614d05a02 100644 --- a/lib/actions/ferm/invert/quda_solvers/quda_multigrid_params.h +++ b/lib/actions/ferm/invert/quda_solvers/quda_multigrid_params.h @@ -35,6 +35,7 @@ namespace Chroma multi1d nu_pre; multi1d nu_post; multi1d< multi1d > blocking; + multi1d nvec_batch; int outer_gcr_nkrylov; int precond_gcr_nkrylov; std::string cycle_type; @@ -65,6 +66,7 @@ namespace Chroma } blocking.resize(mg_levels-1); nvec.resize(mg_levels-1); + nvec_batch.resize(mg_levels-1); nu_pre.resize(mg_levels-1); nu_post.resize(mg_levels-1); maxIterSubspaceCreate.resize(mg_levels-1); @@ -87,7 +89,7 @@ namespace Chroma nu_pre[l] = 2; nu_post[l] = 2; nvec[l] = 16; - + nvec_batch[l]=1; // the batch size for Nvec solves is 1 by default // Default params: maxIterSubspaceCreate[l] = 500; rsdTargetSubspaceCreate[l] = 5.0e-6; diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_multigrid_w.cc b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_multigrid_w.cc index 00a2ecba50..58211879fe 100644 --- 
a/lib/actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_multigrid_w.cc +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_multigrid_w.cc @@ -64,14 +64,13 @@ namespace Chroma T& psi_s) const{ SystemSolverResults_t ret; - + const auto& sub = A->subset(); T mod_chi; // Copy source into mod_chi, and zero the off-parity - mod_chi[rb[0]] = zero; - - - if( invParam.asymmetricP ) { + if( is_precond ) { + mod_chi[rb[0]] = zero; + if( invParam.asymmetricP ) { // // symmetric // Solve with M_symm = 1 - A^{-1}_oo D A^{-1}ee D @@ -80,15 +79,18 @@ namespace Chroma // // So M x = b => A_oo (M_symm) x = b // => M_symm x = A^{-1}_oo b = chi_mod - invclov.apply(mod_chi, chi_s, PLUS, 1); - } - else { - mod_chi[rb[1]] = chi_s; - } - + invclov.apply(mod_chi, chi_s, PLUS, 1); + } + else { + mod_chi[rb[1]] = chi_s; + } + } + else { + mod_chi = chi_s; + } #ifndef BUILD_QUDA_DEVIFACE_SPINOR - void* spinorIn =(void *)&(mod_chi.elem(rb[1].start()).elem(0).elem(0).real()); - void* spinorOut =(void *)&(psi_s.elem(rb[1].start()).elem(0).elem(0).real()); + void* spinorIn =(void *)&(mod_chi.elem(sub.start()).elem(0).elem(0).real()); + void* spinorOut =(void *)&(psi_s.elem(sub.start()).elem(0).elem(0).real()); #else // void* spinorIn = GetMemoryPtr( mod_chi.getId() ); // void* spinorOut = GetMemoryPtr( psi_s.getId() ); @@ -111,11 +113,141 @@ namespace Chroma QDPIO::cout << "\tTotal Time (incl. 
load gauge)=" << swatch1.getTimeInSeconds() <<" s"<& invclov, + const std::vector>& psi_s, + const std::vector>& chi_s, + std::vector& res) const { + + + StopWatch source_prep; + source_prep.reset(); + source_prep.start(); + + multi1d mod_chi(chi_s.size()); + const auto& sub = A->subset(); + + for(int i=0; i < chi_s.size(); i++) { + + if( is_precond ) { + // Copy source into mod_chi, and zero the off-parity + mod_chi[i][rb[0]] = zero; + + if( invParam.asymmetricP ) { + // + // symmetric + // Solve with M_symm = 1 - A^{-1}_oo D A^{-1}ee D + // + // Chroma M = A_oo ( M_symm ) + // + // So M x = b => A_oo (M_symm) x = b + // => M_symm x = A^{-1}_oo b = chi_mod + invclov.apply(mod_chi[i], *(chi_s[i]), PLUS, 1); + } + else { + mod_chi[i][rb[1]] = *(chi_s[i]); + } + } + else { + mod_chi[i] = *(chi_s[i]); + } + } + + std::vector spinorIn(chi_s.size()); + std::vector spinorOut(psi_s.size()); + + int N_src = chi_s.size(); +#ifndef BUILD_QUDA_DEVIFACE_SPINOR + // Regular non-qdpjit approach. Just collect the pointers + for(int soln=0; soln < chi_s.size(); soln++) { + spinorIn[soln] = (void *)&(mod_chi[soln].elem(sub.start()).elem(0).elem(0).real()); + spinorOut[soln] = (void *)&(psi_s[soln]->elem(sub.start()).elem(0).elem(0).real()); + } +#else + std::vector ids(2*N_src); + + for(int soln=0; soln < N_src; soln++) { + ids[soln] = mod_chi[soln].getId(); + ids[N_src+soln] = psi_s[soln]->getId(); + } + + // Grab all the keys + auto dev_ptr = QDP_get_global_cache().get_dev_ptrs( multi1d( ids.data(), ids.size()) ); + + + for(int soln=0; soln < N_src; soln++) { + spinorIn[soln] = dev_ptr(soln); + spinorOut[soln] = dev_ptr(N_src+soln); + } + source_prep.stop(); +#endif + + // Local quda_inv_param (?) 
+ // Relies on quda_inv_param being just a dumb/struct and or copyable + QudaInvertParam local_quda_inv_param = quda_inv_param ; + local_quda_inv_param.num_src = mod_chi.size(); + + // No grid splitting for MG yet, so commenting that out +#if 0 + int totalSubgrids=1; + const multi1d& machine_size=QDP::Layout::logicalSize(); + + for (int i = 0; i < Nd; i++) { + local_quda_inv_param.split_grid[i] = invParam.GridSplitDims[i]; + totalSubgrids *= invParam.GridSplitDims[i]; + if ( machine_size[i] % invParam.GridSplitDims[i] != 0 ) { + QDPIO::cerr << "The split-grid-subgrid dimensions must divide the number ranks in each dimension exactly\n"; + QDPIO::cerr << "Currently this is not the case: dim=" << i << " machine_size["< > A_, Handle< FermState > state_, const SysSolverQUDAMULTIGRIDCloverParams& invParam_) : - A(A_), invParam(invParam_), clov(new CloverTermT() ), invclov(new CloverTermT()) + A(A_), is_precond( true ), invParam(invParam_), clov(new CloverTermT() ), invclov(new CloverTermT()) { StopWatch init_swatch; init_swatch.reset(); init_swatch.start(); @@ -85,7 +85,11 @@ namespace Chroma QDPIO::cout << solver_string << "Initializing" << std::endl; // FOLLOWING INITIALIZATION in test QUDA program - + const auto& sub = A->subset(); + if( sub.start() == all.start() && sub.numSiteTable() == all.numSiteTable()) { + is_precond = false; + } + // 1) work out cpu_prec, cuda_prec, cuda_prec_sloppy int s = sizeof( WordType::Type_t ); if (s == 4) { @@ -266,32 +270,9 @@ namespace Chroma // Solution type //quda_inv_param.solution_type = QUDA_MATPC_SOLUTION; //Taken from invert test. - quda_inv_param.solution_type = QUDA_MATPC_SOLUTION; + quda_inv_param.solution_type = is_precond ? 
QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - - // Solve type - /*switch( invParam.solverType ) { - case CG: - quda_inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; - break; - case BICGSTAB: - quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - break; - case GCR: - quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - break; - - case MR: - quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - break; - - default: - quda_inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; - - break; - }*/ - - quda_inv_param.matpc_type = QUDA_MATPC_ODD_ODD; + quda_inv_param.matpc_type = QUDA_MATPC_ODD_ODD; // Always quda_inv_param.dagger = QUDA_DAG_NO; quda_inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; @@ -317,13 +298,6 @@ namespace Chroma quda_inv_param.output_location = QUDA_CUDA_FIELD_LOCATION; #endif - // Autotuning - if( invParam.tuneDslashP ) { - quda_inv_param.tune = QUDA_TUNE_YES; - } - else { - quda_inv_param.tune = QUDA_TUNE_NO; - } // Setup padding @@ -583,13 +557,13 @@ namespace Chroma T g_chi,g_psi; // Gauge Fix source and initial guess - g_chi[ rb[1] ] = GFixMat * chi; - g_psi[ rb[1] ] = GFixMat * psi; + g_chi[ A->subset() ] = GFixMat * chi; + g_psi[ A->subset() ] = GFixMat * psi; res = qudaInvert(*clov, *invclov, g_chi, g_psi); - psi[ rb[1]] = adj(GFixMat)*g_psi; + psi[ A->subset() ] = adj(GFixMat)*g_psi; } else { @@ -600,33 +574,25 @@ namespace Chroma } swatch.stop(); - Double rel_resid; if( invParam.SolutionCheckP ) { - - { T r; r[A->subset()]=chi; T tmp; (*A)(tmp, psi, PLUS); r[A->subset()] -= tmp; - res.resid = sqrt(norm2(r, A->subset())); - } - - rel_resid = res.resid/sqrt(norm2(chi,A->subset())); - - QDPIO::cout << solver_string << res.n_count << " iterations. Rsd = " << res.resid << " Relative Rsd = " << rel_resid << std::endl; + res.resid = sqrt(norm2(r, A->subset()))/sqrt(norm2(chi,A->subset())); } - else { - // just believe the QUDA residuum. 
- // which is always a true reiduum - rel_resid = res.resid; + else { + QDPIO::cout << "Chroma <-> QUDA solution check disabled. Using (trusting) QUDA residuum\n"; } + QDPIO::cout << solver_string << res.n_count << " iterations. Relative Rsd = " << res.resid << std::endl; + // Convergence Check/Blow Up if ( ! invParam.SilentFailP ) { - if ( toBool( rel_resid > invParam.RsdToleranceFactor*invParam.RsdTarget) ) { - QDPIO::cerr << solver_string << "ERROR: Solver residuum is outside tolerance: QUDA resid="<< rel_resid << " Desired =" << invParam.RsdTarget << " Max Tolerated = " << invParam.RsdToleranceFactor*invParam.RsdTarget << std::endl; + if ( toBool( res.resid > invParam.RsdToleranceFactor*invParam.RsdTarget) ) { + QDPIO::cerr << solver_string << "ERROR: Solver residuum is outside tolerance: QUDA resid="<< res.resid << " Desired =" << invParam.RsdTarget << " Max Tolerated = " << invParam.RsdToleranceFactor*invParam.RsdTarget << std::endl; QDP_abort(1); } } @@ -635,6 +601,68 @@ namespace Chroma return res; } + std::vector operator() (const std::vector>& psi, const std::vector>& chi) const override + { + + START_CODE(); + QDPIO::cout << "Entering MRHS solution: N_src = " << chi.size() << "\n"; + + std::vector res(chi.size()); + if( psi.size() != chi.size() ) { + QDPIO::cout << "Number of sources does not match number of solutions\n"; + QDPIO::cout << "psi.size() = " << psi.size() << " but chi.size() = " << chi.size() << "\n"; + QDP_abort(1); + } + + StopWatch swatch; + swatch.start(); + + if ( invParam.axialGaugeP ) { + QDPIO::cerr << "Multi RHS solve in axial gauge not yet implemented\n"; + QDP_abort(1); + } + + qudaInvertMultiSrc(*invclov, psi, chi, res); + + swatch.stop(); + + // Check solutions -- if desired + + if( invParam.SolutionCheckP ) { + for(int soln =0; soln < psi.size(); soln++) { + T r; + r[A->subset()]=*(chi[ soln ]); + T tmp; + (*A)(tmp, *(psi[soln]), PLUS); + r[A->subset()] -= tmp; + res[soln].resid = sqrt(norm2(r, 
A->subset()))/sqrt(norm2(*(chi[soln]),A->subset())); + } + } + else { + QDPIO::cout << "Chroma <-> QUDA solution check disabled. Using (trusting) QUDA residua\n"; + } + + for(int soln=0; soln < psi.size(); soln++ ) { + QDPIO::cout << "QUDA_"<< solver_string <<" solution " << soln << + " : " << res[soln].n_count << " iterations. Relative Rsd = " << res[soln].resid << std::endl; + + // Convergence Check/Blow Up + if ( ! invParam.SilentFailP ) { + if ( toBool( res[soln].resid > invParam.RsdToleranceFactor*invParam.RsdTarget) ) { + QDPIO::cerr << "ERROR: QUDA Solver residuum for solution " << soln + << " is outside tolerance: QUDA resid="<< res[soln].resid << " Desired =" + << invParam.RsdTarget << " Max Tolerated = " + << invParam.RsdToleranceFactor*invParam.RsdTarget << std::endl; + QDP_abort(1); + } + } + } + + + END_CODE(); + return res; + } + private: // Hide default constructor LinOpSysSolverQUDAMULTIGRIDClover() {} @@ -642,6 +670,7 @@ namespace Chroma #if 1 Q links_orig; #endif + bool is_precond; U GFixMat; QudaPrecision_s cpu_prec; @@ -658,11 +687,16 @@ namespace Chroma Handle< CloverTermT > invclov; SystemSolverResults_t qudaInvert(const CloverTermT& clover, - const CloverTermT& inv_clov, + const CloverTermT& invclov, const T& chi_s, T& psi_s )const; + void qudaInvertMultiSrc(const CloverTermT& invclov, + const std::vector>& psi_s, + const std::vector>& chi_s, + std::vector& res) const; + std::string solver_string; }; diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_w.cc b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_w.cc index 45de3adef7..3dc0b552d2 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_w.cc +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_w.cc @@ -19,161 +19,179 @@ namespace Chroma { - namespace LinOpSysSolverQUDACloverEnv - { - - //! Anonymous namespace - namespace - { - //! Name to be used - const std::string name("QUDA_CLOVER_INVERTER"); - - //! 
Local registration flag - bool registered = false; - } - - - - LinOpSystemSolver* createFerm(XMLReader& xml_in, - const std::string& path, - Handle< FermState< LatticeFermion, multi1d, multi1d > > state, - - Handle< LinearOperator > A) + namespace LinOpSysSolverQUDACloverEnv { - return new LinOpSysSolverQUDAClover(A, state,SysSolverQUDACloverParams(xml_in, path)); - } - //! Register all the factories - bool registerAll() - { - bool success = true; - if (! registered) - { - success &= Chroma::TheLinOpFermSystemSolverFactory::Instance().registerObject(name, createFerm); - registered = true; - } - return success; + //! Anonymous namespace + namespace + { + //! Name to be used + const std::string name("QUDA_CLOVER_INVERTER"); + + //! Local registration flag + bool registered = false; + } + + + + LinOpSystemSolver* createFerm(XMLReader& xml_in, + const std::string& path, + Handle< FermState< LatticeFermion, multi1d, multi1d > > state, + + Handle< LinearOperator > A) + { + return new LinOpSysSolverQUDAClover(A, state,SysSolverQUDACloverParams(xml_in, path)); + } + + //! Register all the factories + bool registerAll() + { + bool success = true; + if (! 
registered) + { + success &= Chroma::TheLinOpFermSystemSolverFactory::Instance().registerObject(name, createFerm); + registered = true; + } + return success; + } } - } - SystemSolverResults_t - LinOpSysSolverQUDAClover::qudaInvert(const CloverTermT& clover, - const CloverTermT& invclov, - const T& chi_s, - T& psi_s) const{ + SystemSolverResults_t LinOpSysSolverQUDAClover::qudaInvert( const T& chi_s, T& psi_s) const { + SystemSolverResults_t ret; - SystemSolverResults_t ret; - - void *spinorIn; - void *spinorOut; + void *spinorIn; + void *spinorOut; + auto sub = A->subset(); #ifdef BUILD_QUDA_DEVIFACE_SPINOR - std::vector ids; + std::vector ids; #endif - - // No need to transform source -#ifndef BUILD_QUDA_DEVIFACE_SPINOR - spinorIn =(void *)&(chi_s.elem(rb[1].start()).elem(0).elem(0).real()); -#else - //spinorIn = GetMemoryPtr( chi_s.); - //QDPIO::cout << "MDAGM spinor in = " << spinorIn << "\n"; - ids.push_back(chi_s.getId()); -#endif - + // No need to transform source #ifndef BUILD_QUDA_DEVIFACE_SPINOR - spinorOut =(void *)&(psi_s.elem(rb[1].start()).elem(0).elem(0).real()); + spinorIn =(void *)&(chi_s.elem(sub.start()).elem(0).elem(0).real()); #else - ids.push_back(psi_s.getId()); - auto dev_ptr = GetMemoryPtr(ids); - spinorIn = dev_ptr[0]; - spinorOut = dev_ptr[1]; - + //spinorIn = GetMemoryPtr( chi_s.); + //QDPIO::cout << "MDAGM spinor in = " << spinorIn << "\n"; + ids.push_back(chi_s.getId()); #endif - // Do the solve here - StopWatch swatch1; - swatch1.reset(); - swatch1.start(); - invertQuda(spinorOut, spinorIn, (QudaInvertParam*)&quda_inv_param); - swatch1.stop(); - - QDPIO::cout << "QUDA_"<>& psi_s, + const std::vector>& chi_s, + std::vector& res) const { + std::vector spinorIn(chi_s.size()); + std::vector spinorOut(psi_s.size()); + auto sub = A->subset(); - // Need to create a simple ferm state from the links_single... 
- Handle< FermState > pstate(new PeriodicFermState(links_orig)); - const AnisoParam_t& aniso = invParam.CloverParams.anisoParam; - QDPWilsonDslashT qdp_dslash(pstate, aniso); - - T tmp,psi2; - tmp=zero; - psi2=zero; - // qdp_dslash.apply(psi2, mod_chi, PLUS, 0); - qdp_dslash.apply(tmp, mod_chi, PLUS, 0); - invclov.apply(psi2,tmp, PLUS, 0); + int N_src = chi_s.size(); +#ifndef BUILD_QUDA_DEVIFACE_SPINOR + // Regular non-qdpjit approach. Just collect the pointers + for(int soln=0; soln < chi_s.size(); soln++) { + spinorIn[soln] = (void *)&(chi_s[soln]->elem(sub.start()).elem(0).elem(0).real()); + spinorOut[soln] = (void *)&(psi_s[soln]->elem(sub.start()).elem(0).elem(0).real()); + } +#else + std::vector ids(2*N_src); + for(int soln=0; soln < N_src; soln++) { + ids[soln] = chi_s[soln]->getId(); + ids[N_src+soln] = psi_s[soln]->getId(); + } + + // Grab all the keys + auto dev_ptr = QDP_get_global_cache().get_dev_ptrs( multi1d( ids.data(), ids.size()) ); - T r=zero; - r = psi2 - psi_s; - QDPIO::cout << "CB=0" << std::endl; - QDPIO::cout << "Dslash Test: || r || = " << sqrt(norm2(r,rb[0])) << std::endl; - // QDPIO::cout << "Dslash Test: || r ||/|| psi || = " << sqrt(norm2(r,rb[0])/norm2(psi_s, rb[0])) << std::endl; + for(int soln=0; soln < N_src; soln++) { + spinorIn[soln] = dev_ptr(soln); + spinorOut[soln] = dev_ptr(N_src+soln); + } +#endif - QDPIO::cout << "CB=1: Should be zero" << std::endl; - QDPIO::cout << "Dslash Test: || r || = " << sqrt(norm2(r,rb[1])) << std::endl; - //QDPIO::cout << "Dslash Test: || r ||/|| psi || = " << sqrt(norm2(r,rb[1])/norm2(psi_s, rb[1])) << std::endl; - - const int* tab = rb[0].siteTable().slice(); - for(int i=0; i < rb[0].numSiteTable(); i++) { - int j = tab[i]; - bool printSite=false; - - for(int spin=0; spin < 4; spin++) { - for(int col=0; col < 3; col++) { - if( (fabs(r.elem(j).elem(spin).elem(col).real()) > 1.0e-5 ) - || (fabs(r.elem(j).elem(spin).elem(col).imag()) > 1.0e-5 )) { - printSite=true; - } - } - } - if( printSite ) { - - 
for(int spin=0; spin < 4; spin++) { - for(int col=0; col < 3; col++) { - QDPIO::cout << "Site= " << j << " Spin= "<< spin << " Col= " << col << " spinor = ( " - << psi2.elem(j).elem(spin).elem(col).real() << " , " - << psi2.elem(j).elem(spin).elem(col).imag() << " )" << std::endl; - } - } - QDPIO::cout << std::endl; - } + // Local quda_inv_param (?) + // Relies on quda_inv_param being just a dumb/struct and or copyable + QudaInvertParam local_quda_inv_param = quda_inv_param ; + + int totalSubgrids=1; + const multi1d& machine_size=QDP::Layout::logicalSize(); + + for (int i = 0; i < Nd; i++) { + local_quda_inv_param.split_grid[i] = invParam.GridSplitDims[i]; + totalSubgrids *= invParam.GridSplitDims[i]; + if ( machine_size[i] % invParam.GridSplitDims[i] != 0 ) { + QDPIO::cerr << "The split-grid-subgrid dimensions must divide the number ranks in each dimension exactly\n"; + QDPIO::cerr << "Currently this is not the case: dim=" << i << " machine_size["< +#include #include "util/gauge/reunit.h" #ifdef QDP_IS_QDPJIT @@ -33,650 +34,694 @@ namespace Chroma { -//! Richardson system solver namespace -namespace LinOpSysSolverQUDACloverEnv -{ -//! Register the syssolver -bool registerAll(); -} - - - -//! Solve a Clover Fermion System using the QUDA inverter -/*! \ingroup invert - *** WARNING THIS SOLVER WORKS FOR Clover FERMIONS ONLY *** - */ - -class LinOpSysSolverQUDAClover : public LinOpSystemSolver -{ -public: - typedef LatticeFermion T; - typedef LatticeColorMatrix U; - typedef multi1d Q; - - typedef LatticeFermionF TF; - typedef LatticeColorMatrixF UF; - typedef multi1d QF; - - typedef LatticeFermionF TD; - typedef LatticeColorMatrixF UD; - typedef multi1d QD; - - typedef WordType::Type_t REALT; - //! Constructor - /*! 
- * \param M_ Linear operator ( Read ) - * \param invParam inverter parameters ( Read ) - */ - LinOpSysSolverQUDAClover(Handle< LinearOperator > A_, - Handle< FermState > state_, - const SysSolverQUDACloverParams& invParam_) : - A(A_), invParam(invParam_), clov(new CloverTermT() ), invclov(new CloverTermT()) - { - QDPIO::cout << "LinOpSysSolverQUDAClover:" << std::endl; - - // FOLLOWING INITIALIZATION in test QUDA program - - // 1) work out cpu_prec, cuda_prec, cuda_prec_sloppy - int s = sizeof( WordType::Type_t ); - if (s == 4) { - cpu_prec = QUDA_SINGLE_PRECISION; - } - else { - cpu_prec = QUDA_DOUBLE_PRECISION; - } - - - // Work out GPU precision - switch( invParam.cudaPrecision ) { - case HALF: - gpu_prec = QUDA_HALF_PRECISION; - break; - case SINGLE: - gpu_prec = QUDA_SINGLE_PRECISION; - break; - case DOUBLE: - gpu_prec = QUDA_DOUBLE_PRECISION; - break; - default: - gpu_prec = cpu_prec; - break; - } - - // Work out GPU Sloppy precision - // Default: No Sloppy - switch( invParam.cudaSloppyPrecision ) { - case HALF: - gpu_half_prec = QUDA_HALF_PRECISION; - break; - case SINGLE: - gpu_half_prec = QUDA_SINGLE_PRECISION; - break; - case DOUBLE: - gpu_half_prec = QUDA_DOUBLE_PRECISION; - break; - default: - gpu_half_prec = gpu_prec; - break; - } - - // 2) pull 'new; GAUGE and Invert params - q_gauge_param = newQudaGaugeParam(); - quda_inv_param = newQudaInvertParam(); - - // 3) set lattice size - const multi1d& latdims = Layout::subgridLattSize(); - - q_gauge_param.X[0] = latdims[0]; - q_gauge_param.X[1] = latdims[1]; - q_gauge_param.X[2] = latdims[2]; - q_gauge_param.X[3] = latdims[3]; - - // 4) - deferred (anisotropy) - - // 5) - set QUDA_WILSON_LINKS, QUDA_GAUGE_ORDER - q_gauge_param.type = QUDA_WILSON_LINKS; + //! Richardson system solver namespace + namespace LinOpSysSolverQUDACloverEnv + { + //! Register the syssolver + bool registerAll(); + } + + + + //! Solve a Clover Fermion System using the QUDA inverter + /*! 
\ingroup invert + *** WARNING THIS SOLVER WORKS FOR Clover FERMIONS ONLY *** + */ + + class LinOpSysSolverQUDAClover : public LinOpSystemSolver + { + public: + typedef LatticeFermion T; + typedef LatticeColorMatrix U; + typedef multi1d Q; + + typedef LatticeFermionF TF; + typedef LatticeColorMatrixF UF; + typedef multi1d QF; + + typedef LatticeFermionF TD; + typedef LatticeColorMatrixF UD; + typedef multi1d QD; + + typedef WordType::Type_t REALT; + //! Constructor + /*! + * \param M_ Linear operator ( Read ) + * \param invParam inverter parameters ( Read ) + */ + LinOpSysSolverQUDAClover(Handle< LinearOperator > A_, + Handle< FermState > state_, + const SysSolverQUDACloverParams& invParam_) : + A(A_), invParam(invParam_) + { + + QDPIO::cout << "LinOpSysSolverQUDAClover:" << std::endl; + + bool is_precond = true; + auto sub = A->subset(); + + if ( sub.start() == all.start() && sub.numSiteTable() == all.numSiteTable() ) is_precond = false; + // FOLLOWING INITIALIZATION in test QUDA program + + // 1) work out cpu_prec, cuda_prec, cuda_prec_sloppy + int s = sizeof( WordType::Type_t ); + if (s == 4) { + cpu_prec = QUDA_SINGLE_PRECISION; + } + else { + cpu_prec = QUDA_DOUBLE_PRECISION; + } + + + // Work out GPU precision + switch( invParam.cudaPrecision ) { + case HALF: + gpu_prec = QUDA_HALF_PRECISION; + break; + case SINGLE: + gpu_prec = QUDA_SINGLE_PRECISION; + break; + case DOUBLE: + gpu_prec = QUDA_DOUBLE_PRECISION; + break; + default: + gpu_prec = cpu_prec; + break; + } + + // Work out GPU Sloppy precision + // Default: No Sloppy + switch( invParam.cudaSloppyPrecision ) { + case HALF: + gpu_half_prec = QUDA_HALF_PRECISION; + break; + case SINGLE: + gpu_half_prec = QUDA_SINGLE_PRECISION; + break; + case DOUBLE: + gpu_half_prec = QUDA_DOUBLE_PRECISION; + break; + default: + gpu_half_prec = gpu_prec; + break; + } + + // 2) pull 'new; GAUGE and Invert params + q_gauge_param = newQudaGaugeParam(); + quda_inv_param = newQudaInvertParam(); + + // 3) set lattice size + 
const multi1d& latdims = Layout::subgridLattSize(); + + q_gauge_param.X[0] = latdims[0]; + q_gauge_param.X[1] = latdims[1]; + q_gauge_param.X[2] = latdims[2]; + q_gauge_param.X[3] = latdims[3]; + + // 4) - deferred (anisotropy) + + // 5) - set QUDA_WILSON_LINKS, QUDA_GAUGE_ORDER + q_gauge_param.type = QUDA_WILSON_LINKS; #ifndef BUILD_QUDA_DEVIFACE_GAUGE - q_gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; // gauge[mu], p + q_gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; // gauge[mu], p #else - QDPIO::cout << "MDAGM Using QDP-JIT gauge order" << std::endl; - q_gauge_param.location = QUDA_CUDA_FIELD_LOCATION; - q_gauge_param.gauge_order = QUDA_QDPJIT_GAUGE_ORDER; + QDPIO::cout << "MDAGM Using QDP-JIT gauge order" << std::endl; + q_gauge_param.location = QUDA_CUDA_FIELD_LOCATION; + q_gauge_param.gauge_order = QUDA_QDPJIT_GAUGE_ORDER; #endif - // 6) - set t_boundary - // Convention: BC has to be applied already - // This flag just tells QUDA that this is so, - // so that QUDA can take care in the reconstruct - if( invParam.AntiPeriodicT ) { - q_gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; - } - else { - q_gauge_param.t_boundary = QUDA_PERIODIC_T; - } - - // Set cpu_prec, cuda_prec, reconstruct and sloppy versions - q_gauge_param.cpu_prec = cpu_prec; - q_gauge_param.cuda_prec = gpu_prec; - - - switch( invParam.cudaReconstruct ) { - case RECONS_NONE: - q_gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; - break; - case RECONS_8: - q_gauge_param.reconstruct = QUDA_RECONSTRUCT_8; - break; - case RECONS_12: - q_gauge_param.reconstruct = QUDA_RECONSTRUCT_12; - break; - default: - q_gauge_param.reconstruct = QUDA_RECONSTRUCT_12; - break; - }; - - q_gauge_param.cuda_prec_sloppy = gpu_half_prec; - - // Default for no preconditioner -- may be overwritten based - // on innerParams - q_gauge_param.cuda_prec_precondition = gpu_half_prec; - - switch( invParam.cudaSloppyReconstruct ) { - case RECONS_NONE: - q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; - break; - case 
RECONS_8: - q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_8; - break; - case RECONS_12: - q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; - break; - default: - q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; - break; - }; - - // Default. This may be overrridden later. - q_gauge_param.reconstruct_precondition = q_gauge_param.reconstruct_sloppy; - // Gauge fixing: - - // These are the links - // They may be smeared and the BC's may be applied - Q links_single(Nd); - - // Now downcast to single prec fields. - for(int mu=0; mu < Nd; mu++) { - links_single[mu] = (state_->getLinks())[mu]; - } - - // GaugeFix - if( invParam.axialGaugeP ) { - QDPIO::cout << "Fixing Temporal Gauge" << std::endl; - temporalGauge(links_single, GFixMat, Nd-1); - for(int mu=0; mu < Nd; mu++){ - links_single[mu] = GFixMat*(state_->getLinks())[mu]*adj(shift(GFixMat, FORWARD, mu)); - } - q_gauge_param.gauge_fix = QUDA_GAUGE_FIXED_YES; - } - else { - // No GaugeFix - q_gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; // No Gfix yet - } - - // deferred 4) Gauge Anisotropy - const AnisoParam_t& aniso = invParam.CloverParams.anisoParam; - if( aniso.anisoP ) { // Anisotropic case - Real gamma_f = aniso.xi_0 / aniso.nu; - q_gauge_param.anisotropy = toDouble(gamma_f); - } - else { - q_gauge_param.anisotropy = 1.0; - } - - // MAKE FSTATE BEFORE RESCALING links_single - // Because the clover term expects the unrescaled links... 
- Handle > fstate( new PeriodicFermState(links_single)); - - if( aniso.anisoP ) { // Anisotropic case - multi1d cf=makeFermCoeffs(aniso); - for(int mu=0; mu < Nd; mu++) { - links_single[mu] *= cf[mu]; - } - } - - // Now onto the inv param: - // Dslash type - quda_inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; - - // Invert type: - switch( invParam.solverType ) { - case CG: - quda_inv_param.inv_type = QUDA_CG_INVERTER; - solver_string = "CG"; - break; - case BICGSTAB: - quda_inv_param.inv_type = QUDA_BICGSTAB_INVERTER; - solver_string = "BICGSTAB"; - break; - case GCR: - quda_inv_param.inv_type = QUDA_GCR_INVERTER; - solver_string = "GCR"; - break; - default: - QDPIO::cerr << "Unknown SOlver type" << std::endl; - QDP_abort(1); - break; - } - - // Mass - - // Fiendish idea from Ron. Set the kappa=1/2 and use - // unmodified clover term, and ask for Kappa normalization - // This should give us A - (1/2)D as the unpreconditioned operator - // and probabl 1 - {1/4} A^{-1} D A^{-1} D as the preconditioned - // op. Apart from the A_oo stuff on the antisymmetric we have - // nothing to do... - quda_inv_param.kappa = 0.5; - - // FIXME: If we want QUDA to compute the clover coeff, we need to be able to deal - // with awfuless of anisotropy - // The value below is a dummy one. - quda_inv_param.clover_coeff = 1.0; // Dummy tree level value. 
Not used - quda_inv_param.tol = toDouble(invParam.RsdTarget); - quda_inv_param.maxiter = invParam.MaxIter; - quda_inv_param.reliable_delta = toDouble(invParam.Delta); - quda_inv_param.pipeline = invParam.Pipeline; - - // Solution type - quda_inv_param.solution_type = QUDA_MATPC_SOLUTION; - - // Solve type - switch( invParam.solverType ) { - case CG: - quda_inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; - break; - case BICGSTAB: - quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - break; - case GCR: - quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - break; - case CA_GCR: - quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - break; - case MR: - quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; - break; - - default: - quda_inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; - - break; - } - - if( invParam.asymmetricP ) { - QDPIO::cout << "Using Asymmetric Linop: A_oo - D A^{-1}_ee D" << std::endl; - quda_inv_param.matpc_type = QUDA_MATPC_ODD_ODD_ASYMMETRIC; - } - else { - QDPIO::cout << "Using Symmetric Linop: 1 - A^{-1}_oo D A^{-1}_ee D" << std::endl; - quda_inv_param.matpc_type = QUDA_MATPC_ODD_ODD; - } - - quda_inv_param.dagger = QUDA_DAG_NO; - quda_inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; - - quda_inv_param.cpu_prec = cpu_prec; - quda_inv_param.cuda_prec = gpu_prec; - quda_inv_param.cuda_prec_sloppy = gpu_half_prec; - - // Default. 
May be overridden by inner params - quda_inv_param.cuda_prec_precondition = gpu_half_prec; - - - quda_inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; - quda_inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + // 6) - set t_boundary + // Convention: BC has to be applied already + // This flag just tells QUDA that this is so, + // so that QUDA can take care in the reconstruct + if( invParam.AntiPeriodicT ) { + q_gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; + } + else { + q_gauge_param.t_boundary = QUDA_PERIODIC_T; + } + + // Set cpu_prec, cuda_prec, reconstruct and sloppy versions + q_gauge_param.cpu_prec = cpu_prec; + q_gauge_param.cuda_prec = gpu_prec; + + + switch( invParam.cudaReconstruct ) { + case RECONS_NONE: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; + break; + case RECONS_8: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_8; + break; + case RECONS_12: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_12; + break; + default: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_12; + break; + }; + + q_gauge_param.cuda_prec_sloppy = gpu_half_prec; + + // Default for no preconditioner -- may be overwritten based + // on innerParams + q_gauge_param.cuda_prec_precondition = gpu_half_prec; + + switch( invParam.cudaSloppyReconstruct ) { + case RECONS_NONE: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; + break; + case RECONS_8: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_8; + break; + case RECONS_12: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; + break; + default: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; + break; + }; + + // Default. This may be overrridden later. + q_gauge_param.reconstruct_precondition = q_gauge_param.reconstruct_sloppy; + // Gauge fixing: + + // These are the links + // They may be smeared and the BC's may be applied + + Q links_single(Nd); + + // Now downcast to single prec fields. 
+ for(int mu=0; mu < Nd; mu++) { + links_single[mu] = (state_->getLinks())[mu]; + } + + // GaugeFix + if( invParam.axialGaugeP ) { + QDPIO::cout << "Fixing Temporal Gauge" << std::endl; + temporalGauge(links_single, GFixMat, Nd-1); + for(int mu=0; mu < Nd; mu++){ + links_single[mu] = GFixMat*(state_->getLinks())[mu]*adj(shift(GFixMat, FORWARD, mu)); + } + q_gauge_param.gauge_fix = QUDA_GAUGE_FIXED_YES; + } + else { + // No GaugeFix + q_gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; // No Gfix yet + } + + // deferred 4) Gauge Anisotropy + const AnisoParam_t& aniso = invParam.CloverParams.anisoParam; + if( aniso.anisoP ) { // Anisotropic case + Real gamma_f = aniso.xi_0 / aniso.nu; + q_gauge_param.anisotropy = toDouble(gamma_f); + } + else { + q_gauge_param.anisotropy = 1.0; + } + + // MAKE FSTATE BEFORE RESCALING links_single + // Because the clover term expects the unrescaled links... + Handle > fstate( new PeriodicFermState(links_single)); + + if( aniso.anisoP ) { // Anisotropic case + multi1d cf=makeFermCoeffs(aniso); + for(int mu=0; mu < Nd; mu++) { + links_single[mu] *= cf[mu]; + } + } + + // Now onto the inv param: + // Dslash type + quda_inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + // Invert type: + switch( invParam.solverType ) { + case CG: + quda_inv_param.inv_type = QUDA_CG_INVERTER; + solver_string = "CG"; + break; + case BICGSTAB: + quda_inv_param.inv_type = QUDA_BICGSTAB_INVERTER; + solver_string = "BICGSTAB"; + break; + case GCR: + quda_inv_param.inv_type = QUDA_GCR_INVERTER; + solver_string = "GCR"; + break; + default: + QDPIO::cerr << "Unknown SOlver type" << std::endl; + QDP_abort(1); + break; + } + + // Mass + + // Fiendish idea from Ron. Set the kappa=1/2 and use + // unmodified clover term, and ask for Kappa normalization + // This should give us A - (1/2)D as the unpreconditioned operator + // and probabl 1 - {1/4} A^{-1} D A^{-1} D as the preconditioned + // op. 
Apart from the A_oo stuff on the antisymmetric we have + // nothing to do... + quda_inv_param.kappa = 0.5; + + // FIXME: If we want QUDA to compute the clover coeff, we need to be able to deal + // with awfuless of anisotropy + // The value below is a dummy one. + quda_inv_param.clover_coeff = 1.0; // Dummy tree level value. Not used + quda_inv_param.tol = toDouble(invParam.RsdTarget); + quda_inv_param.maxiter = invParam.MaxIter; + quda_inv_param.reliable_delta = toDouble(invParam.Delta); + quda_inv_param.pipeline = invParam.Pipeline; + + // Solve type: We always solve with a PC solve (even for a MAT solution) + // because PC solves have better conditioning. QUDA needs to do the EO reconstructs + switch( invParam.solverType ) { + case CG: + quda_inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; + break; + case BICGSTAB: + quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; + break; + case GCR: + quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; + break; + case CA_GCR: + quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; + break; + case MR: + quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; + break; + + default: + quda_inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; + break; + } + + if( is_precond ) { + + // Solution type + quda_inv_param.solution_type = QUDA_MATPC_SOLUTION; + + // Preconditioned system + if( invParam.asymmetricP ) { + QDPIO::cout << "Using Asymmetric Linop: A_oo - D A^{-1}_ee D" << std::endl; + quda_inv_param.matpc_type = QUDA_MATPC_ODD_ODD_ASYMMETRIC; + } + else { + QDPIO::cout << "Using Symmetric Linop: 1 - A^{-1}_oo D A^{-1}_ee D" << std::endl; + quda_inv_param.matpc_type = QUDA_MATPC_ODD_ODD; + } + } + else { + // Solution type + quda_inv_param.solution_type = QUDA_MAT_SOLUTION; + quda_inv_param.matpc_type = QUDA_MATPC_ODD_ODD; //QUDA's native preconditioning when using QUDA_DIRECT_PC solution + } + + quda_inv_param.dagger = QUDA_DAG_NO; + quda_inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; + + quda_inv_param.cpu_prec = cpu_prec; + 
quda_inv_param.cuda_prec = gpu_prec; + quda_inv_param.cuda_prec_sloppy = gpu_half_prec; + + // Default. May be overridden by inner params + quda_inv_param.cuda_prec_precondition = gpu_half_prec; + + + quda_inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; + quda_inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; #ifndef BUILD_QUDA_DEVIFACE_SPINOR - quda_inv_param.dirac_order = QUDA_DIRAC_ORDER; + quda_inv_param.dirac_order = QUDA_DIRAC_ORDER; #else - QDPIO::cout << "MDAGM Using QDP-JIT spinor order" << std::endl; - quda_inv_param.dirac_order = QUDA_QDPJIT_DIRAC_ORDER; - quda_inv_param.input_location = QUDA_CUDA_FIELD_LOCATION; - quda_inv_param.output_location = QUDA_CUDA_FIELD_LOCATION; + QDPIO::cout << "MDAGM Using QDP-JIT spinor order" << std::endl; + quda_inv_param.dirac_order = QUDA_QDPJIT_DIRAC_ORDER; + quda_inv_param.input_location = QUDA_CUDA_FIELD_LOCATION; + quda_inv_param.output_location = QUDA_CUDA_FIELD_LOCATION; #endif - // Clover precision and order - quda_inv_param.clover_cpu_prec = cpu_prec; - quda_inv_param.clover_cuda_prec = gpu_prec; - quda_inv_param.clover_cuda_prec_sloppy = gpu_half_prec; + // Clover precision and order + quda_inv_param.clover_cpu_prec = cpu_prec; + quda_inv_param.clover_cuda_prec = gpu_prec; + quda_inv_param.clover_cuda_prec_sloppy = gpu_half_prec; - // Default. may be overrridden by inner params - quda_inv_param.clover_cuda_prec_precondition = gpu_half_prec; + // Default. 
may be overrridden by inner params + quda_inv_param.clover_cuda_prec_precondition = gpu_half_prec; #ifndef BUILD_QUDA_DEVIFACE_CLOVER #warning "NOT USING QUDA DEVICE IFACE" - quda_inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; + quda_inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; #else #warning "USING QUDA DEVICE IFACE" - QDPIO::cout << "MDAGM clover CUDA location\n"; - quda_inv_param.clover_location = QUDA_CUDA_FIELD_LOCATION; - quda_inv_param.clover_order = QUDA_QDPJIT_CLOVER_ORDER; + QDPIO::cout << "MDAGM clover CUDA location\n"; + quda_inv_param.clover_location = QUDA_CUDA_FIELD_LOCATION; + quda_inv_param.clover_order = QUDA_QDPJIT_CLOVER_ORDER; #endif - // Autotuning - if( invParam.tuneDslashP ) { - QDPIO::cout << "Enabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_YES; - } - else { - QDPIO::cout << "Disabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_NO; - } - - - // Setup padding - multi1d face_size(4); - face_size[0] = latdims[1]*latdims[2]*latdims[3]/2; - face_size[1] = latdims[0]*latdims[2]*latdims[3]/2; - face_size[2] = latdims[0]*latdims[1]*latdims[3]/2; - face_size[3] = latdims[0]*latdims[1]*latdims[2]/2; - - int max_face = face_size[0]; - for(int i=1; i <=3; i++) { - if ( face_size[i] > max_face ) { - max_face = face_size[i]; - } - } - - - q_gauge_param.ga_pad = max_face; - - if( invParam.innerParamsP ) { - QDPIO::cout << "Setting inner solver params" << std::endl; - // Dereference handle - const GCRInnerSolverParams& ip = *(invParam.innerParams); - - // Set preconditioner precision - switch( ip.precPrecondition ) { - case HALF: - quda_inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION; - quda_inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION; - q_gauge_param.cuda_prec_precondition = QUDA_HALF_PRECISION; - break; - - case SINGLE: - quda_inv_param.cuda_prec_precondition = QUDA_SINGLE_PRECISION; - quda_inv_param.clover_cuda_prec_precondition = QUDA_SINGLE_PRECISION; - 
q_gauge_param.cuda_prec_precondition = QUDA_SINGLE_PRECISION; - break; - - case DOUBLE: - quda_inv_param.cuda_prec_precondition = QUDA_DOUBLE_PRECISION; - quda_inv_param.clover_cuda_prec_precondition = QUDA_DOUBLE_PRECISION; - q_gauge_param.cuda_prec_precondition = QUDA_DOUBLE_PRECISION; - break; - default: - quda_inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION; - quda_inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION; - q_gauge_param.cuda_prec_precondition = QUDA_HALF_PRECISION; - break; - } - - switch( ip.reconstructPrecondition ) { - case RECONS_NONE: - q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_NO; - break; - case RECONS_8: - q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_8; - break; - case RECONS_12: - q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_12; - break; - default: - q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_12; - break; - }; - - quda_inv_param.tol_precondition = toDouble(ip.tolPrecondition); - quda_inv_param.maxiter_precondition = ip.maxIterPrecondition; - quda_inv_param.gcrNkrylov = ip.gcrNkrylov; - switch( ip.schwarzType ) { - case ADDITIVE_SCHWARZ : - quda_inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; - break; - case MULTIPLICATIVE_SCHWARZ : - quda_inv_param.schwarz_type = QUDA_MULTIPLICATIVE_SCHWARZ; - break; - default: - quda_inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; - break; - } - quda_inv_param.precondition_cycle = ip.preconditionCycle; - - if( ip.verboseInner ) { - quda_inv_param.verbosity_precondition = QUDA_VERBOSE; - } - else { - quda_inv_param.verbosity_precondition = QUDA_SILENT; - } - - switch( ip.invTypePrecondition ) { - case CG: - quda_inv_param.inv_type_precondition = QUDA_CG_INVERTER; - break; - case BICGSTAB: - quda_inv_param.inv_type_precondition = QUDA_BICGSTAB_INVERTER; - - break; - - case CA_GCR: - quda_inv_param.inv_type_precondition= QUDA_CA_GCR_INVERTER; - - case MR: - quda_inv_param.inv_type_precondition= QUDA_MR_INVERTER; - break; - - default: - 
quda_inv_param.inv_type_precondition = QUDA_MR_INVERTER; - break; - } - } - else { - QDPIO::cout << "Setting Precondition stuff to defaults for not using" << std::endl; - quda_inv_param.inv_type_precondition= QUDA_INVALID_INVERTER; - quda_inv_param.tol_precondition = 1.0e-1; - quda_inv_param.maxiter_precondition = 1000; - quda_inv_param.verbosity_precondition = QUDA_SILENT; - q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_NO; - quda_inv_param.gcrNkrylov = 1; - } - - - if( invParam.verboseP ) { - quda_inv_param.verbosity = QUDA_VERBOSE; - } - else { - quda_inv_param.verbosity = QUDA_SUMMARIZE; - } - - // Set up the links - void* gauge[4]; - + // Setup padding + multi1d face_size(4); + face_size[0] = latdims[1]*latdims[2]*latdims[3]/2; + face_size[1] = latdims[0]*latdims[2]*latdims[3]/2; + face_size[2] = latdims[0]*latdims[1]*latdims[3]/2; + face_size[3] = latdims[0]*latdims[1]*latdims[2]/2; + + int max_face = face_size[0]; + for(int i=1; i <=3; i++) { + if ( face_size[i] > max_face ) { + max_face = face_size[i]; + } + } + + + q_gauge_param.ga_pad = max_face; + + if( invParam.innerParamsP ) { + QDPIO::cout << "Setting inner solver params" << std::endl; + // Dereference handle + const GCRInnerSolverParams& ip = *(invParam.innerParams); + + // Set preconditioner precision + switch( ip.precPrecondition ) { + case HALF: + quda_inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + break; + + case SINGLE: + quda_inv_param.cuda_prec_precondition = QUDA_SINGLE_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_SINGLE_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_SINGLE_PRECISION; + break; + + case DOUBLE: + quda_inv_param.cuda_prec_precondition = QUDA_DOUBLE_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_DOUBLE_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_DOUBLE_PRECISION; + 
break; + default: + quda_inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + break; + } + + switch( ip.reconstructPrecondition ) { + case RECONS_NONE: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_NO; + break; + case RECONS_8: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_8; + break; + case RECONS_12: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_12; + break; + default: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_12; + break; + }; + + quda_inv_param.tol_precondition = toDouble(ip.tolPrecondition); + quda_inv_param.maxiter_precondition = ip.maxIterPrecondition; + quda_inv_param.gcrNkrylov = ip.gcrNkrylov; + switch( ip.schwarzType ) { + case ADDITIVE_SCHWARZ : + quda_inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; + break; + case MULTIPLICATIVE_SCHWARZ : + quda_inv_param.schwarz_type = QUDA_MULTIPLICATIVE_SCHWARZ; + break; + default: + quda_inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; + break; + } + quda_inv_param.precondition_cycle = ip.preconditionCycle; + + if( ip.verboseInner ) { + quda_inv_param.verbosity_precondition = QUDA_VERBOSE; + } + else { + quda_inv_param.verbosity_precondition = QUDA_SILENT; + } + + switch( ip.invTypePrecondition ) { + case CG: + quda_inv_param.inv_type_precondition = QUDA_CG_INVERTER; + break; + case BICGSTAB: + quda_inv_param.inv_type_precondition = QUDA_BICGSTAB_INVERTER; + + break; + + case CA_GCR: + quda_inv_param.inv_type_precondition= QUDA_CA_GCR_INVERTER; + + case MR: + quda_inv_param.inv_type_precondition= QUDA_MR_INVERTER; + break; + + default: + quda_inv_param.inv_type_precondition = QUDA_MR_INVERTER; + break; + } + } + else { + QDPIO::cout << "Setting Precondition stuff to defaults for not using" << std::endl; + quda_inv_param.inv_type_precondition= QUDA_INVALID_INVERTER; + quda_inv_param.tol_precondition = 1.0e-1; + 
quda_inv_param.maxiter_precondition = 1000; + quda_inv_param.verbosity_precondition = QUDA_SILENT; + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_NO; + quda_inv_param.gcrNkrylov = 1; + } + + + if( invParam.verboseP ) { + quda_inv_param.verbosity = QUDA_VERBOSE; + } + else { + quda_inv_param.verbosity = QUDA_SUMMARIZE; + } + + void* gauge[4]= { nullptr, nullptr, nullptr, nullptr }; + #ifndef BUILD_QUDA_DEVIFACE_GAUGE - for(int mu=0; mu < Nd; mu++) { - gauge[mu] = (void *)&(links_single[mu].elem(all.start()).elem().elem(0,0).real()); - } + for(int mu=0; mu < Nd; mu++) { + gauge[mu] = (void *)&(links_single[mu].elem(all.start()).elem().elem(0,0).real()); + } #else - GetMemoryPtrGauge(gauge,links_single); - // gauge[mu] = GetMemoryPtr( links_single[mu].getId() ); - // QDPIO::cout << "MDAGM CUDA gauge[" << mu << "] in = " << gauge[mu] << "\n"; + GetMemoryPtrGauge(gauge,links_single); + // gauge[mu] = GetMemoryPtr( links_single[mu].getId() ); + // QDPIO::cout << "MDAGM CUDA gauge[" << mu << "] in = " << gauge[mu] << "\n"; #endif - loadGaugeQuda((void *)gauge, &q_gauge_param); + loadGaugeQuda((void *)gauge, &q_gauge_param); - // Setup the clover term... - QDPIO::cout << "Creating CloverTerm" << std::endl; - clov->create(fstate, invParam_.CloverParams); - // Don't recompute, just copy - invclov->create(fstate, invParam_.CloverParams); + // Setup the clover term... 
+ CloverTermT clov; + CloverTermT invclov; - QDPIO::cout << "Inverting CloverTerm" << std::endl; - invclov->choles(0); - invclov->choles(1); + QDPIO::cout << "Creating CloverTerm" << std::endl; + clov.create(fstate, invParam_.CloverParams); + // Don't recompute, just copy + invclov.create(fstate, invParam_.CloverParams); + QDPIO::cout << "Inverting CloverTerm" << std::endl; + invclov.choles(0); + invclov.choles(1); -#ifndef BUILD_QUDA_DEVIFACE_CLOVER - multi1d > packed_clov; - - - packed_clov.resize(all.siteTable().size()); - clov->packForQUDA(packed_clov, 0); - clov->packForQUDA(packed_clov, 1); - +#ifndef BUILD_QUDA_DEVIFACE_CLOVER + multi1d > packed_clov(all.siteTable().size()); + multi1d > packed_invclov(all.siteTable().size());; + + clov.packForQUDA(packed_clov, 0); + clov.packForQUDA(packed_clov, 1); - // Always need inverse - multi1d > packed_invclov(all.siteTable().size()); - invclov->packForQUDA(packed_invclov, 0); - invclov->packForQUDA(packed_invclov, 1); + invclov.packForQUDA(packed_invclov, 0); + invclov.packForQUDA(packed_invclov, 1); - - loadCloverQuda(&(packed_clov[0]), &(packed_invclov[0]),&quda_inv_param); - + loadCloverQuda(&(packed_clov[0]), &(packed_invclov[0]),&quda_inv_param); #else - void *clover[2]; - void *cloverInv[2]; - - GetMemoryPtrClover(clov->getOffId(),clov->getDiaId(),invclov->getOffId(),invclov->getDiaId()); - - loadCloverQuda( (void*)(clover) , (void*)(cloverInv) ,&quda_inv_param); - -#endif - + void *clover[2]; + void *cloverInv[2]; + // This is a yucky macro and needs the existence of 'clover' and 'cloverInv' to work + GetMemoryPtrClover(clov.getOffId(),clov.getDiaId(),invclov.getOffId(),invclov.getDiaId()); - } - - - //! Destructor is automatic - ~LinOpSysSolverQUDAClover() - { - QDPIO::cout << "Destructing" << std::endl; - freeGaugeQuda(); - freeCloverQuda(); - } - - //! Return the subset on which the operator acts - const Subset& subset() const {return A->subset();} - - //! Solver the linear system - /*! 
- * \param psi solution ( Modify ) - * \param chi source ( Read ) - * \return syssolver results - */ - SystemSolverResults_t operator() (T& psi, const T& chi) const - { - SystemSolverResults_t res; - - START_CODE(); - StopWatch swatch; - swatch.start(); - - // T MdagChi; - - // This is a CGNE. So create new RHS - // (*A)(MdagChi, chi, MINUS); - // Handle< LinearOperator > MM(new MdagMLinOp(A)); - if ( invParam.axialGaugeP ) { - T g_chi,g_psi; - - // Gauge Fix source and initial guess - QDPIO::cout << "Gauge Fixing source and initial guess" << std::endl; - g_chi[ rb[1] ] = GFixMat * chi; - g_psi[ rb[1] ] = GFixMat * psi; - QDPIO::cout << "Solving" << std::endl; - res = qudaInvert(*clov, - *invclov, - g_chi, - g_psi); - QDPIO::cout << "Untransforming solution." << std::endl; - psi[ rb[1]] = adj(GFixMat)*g_psi; - - } - else { - res = qudaInvert(*clov, - *invclov, - chi, - psi); - } - - swatch.stop(); - - - { - T r; - r[A->subset()]=chi; - T tmp; - (*A)(tmp, psi, PLUS); - r[A->subset()] -= tmp; - res.resid = sqrt(norm2(r, A->subset())); - } - - Double rel_resid = res.resid/sqrt(norm2(chi,A->subset())); - - QDPIO::cout << "QUDA_"<< solver_string <<"_CLOVER_SOLVER: " << res.n_count << " iterations. Rsd = " << res.resid << " Relative Rsd = " << rel_resid << std::endl; - - // Convergence Check/Blow Up - if ( ! 
invParam.SilentFailP ) { - if ( toBool( rel_resid > invParam.RsdToleranceFactor*invParam.RsdTarget) ) { - QDPIO::cerr << "ERROR: QUDA Solver residuum is outside tolerance: QUDA resid="<< rel_resid << " Desired =" << invParam.RsdTarget << " Max Tolerated = " << invParam.RsdToleranceFactor*invParam.RsdTarget << std::endl; - QDP_abort(1); - } - } - - END_CODE(); - return res; - } - - -private: - // Hide default constructor - LinOpSysSolverQUDAClover() {} - -#if 1 - Q links_orig; + loadCloverQuda( (void*)(clover) , (void*)(cloverInv) ,&quda_inv_param); #endif - U GFixMat; - QudaPrecision_s cpu_prec; - QudaPrecision_s gpu_prec; - QudaPrecision_s gpu_half_prec; - - Handle< LinearOperator > A; - const SysSolverQUDACloverParams invParam; - QudaGaugeParam q_gauge_param; - QudaInvertParam quda_inv_param; - - Handle< CloverTermT > clov; - Handle< CloverTermT > invclov; - - SystemSolverResults_t qudaInvert(const CloverTermT& clover, - const CloverTermT& inv_clov, - const T& chi_s, - T& psi_s - )const ; - - std::string solver_string; -}; + } + + + //! Destructor is automatic + ~LinOpSysSolverQUDAClover() + { + QDPIO::cout << "Destructing" << std::endl; + freeGaugeQuda(); + freeCloverQuda(); + } + + //! Return the subset on which the operator acts + const Subset& subset() const {return A->subset();} + + //! Solver the linear system + /*! + * \param psi solution ( Modify ) + * \param chi source ( Read ) + * \return syssolver results + */ + SystemSolverResults_t operator() (T& psi, const T& chi) const + { + SystemSolverResults_t res; + + START_CODE(); + StopWatch swatch; + swatch.start(); + + // T MdagChi; + + // This is a CGNE. 
So create new RHS + // (*A)(MdagChi, chi, MINUS); + // Handle< LinearOperator > MM(new MdagMLinOp(A)); + if ( invParam.axialGaugeP ) { + T g_chi,g_psi; + + // Gauge Fix source and initial guess + QDPIO::cout << "Gauge Fixing source and initial guess" << std::endl; + g_chi[ A->subset() ] = GFixMat * chi; + g_psi[ A->subset() ] = GFixMat * psi; + QDPIO::cout << "Solving" << std::endl; + res = qudaInvert( g_chi, g_psi); + QDPIO::cout << "Untransforming solution." << std::endl; + psi[ A->subset() ] = adj(GFixMat)*g_psi; + + } + else { + res = qudaInvert( chi, psi); + } + + swatch.stop(); + + // If required, check the solutions + if( invParam.SolutionCheckP ) { + T r; + r[A->subset()]=chi; + T tmp; + (*A)(tmp, psi, PLUS); + r[A->subset()] -= tmp; + res.resid = sqrt(norm2(r, A->subset())) / sqrt(norm2(chi,A->subset())); + } + else { + QDPIO::cout << "Chroma <-> QUDA Solution Check disabled. Using (trusting) QUDA Residuum\n"; + } + + QDPIO::cout << "QUDA_"<< solver_string <<"_CLOVER_SOLVER: " << res.n_count << " iterations. Relative Rsd = " << res.resid << std::endl; + + // Convergence Check/Blow Up + if ( ! 
invParam.SilentFailP ) { + if ( toBool( res.resid > invParam.RsdToleranceFactor*invParam.RsdTarget) ) { + QDPIO::cerr << "ERROR: QUDA Solver residuum is outside tolerance: QUDA resid="<< res.resid << " Desired =" << invParam.RsdTarget << " Max Tolerated = " << invParam.RsdToleranceFactor*invParam.RsdTarget << std::endl; + QDP_abort(1); + } + } + + END_CODE(); + return res; + } + + + + std::vector operator() (const std::vector>& psi, const std::vector>& chi) const override + { + + START_CODE(); + QDPIO::cout << "Entering MRHS solution: N_src = " << chi.size() << "\n"; + + std::vector res(chi.size()); + if( psi.size() != chi.size() ) { + QDPIO::cout << "Number of sources does not match number of solutions\n"; + QDPIO::cout << "psi.size() = " << psi.size() << " but chi.size() = " << chi.size() << "\n"; + QDP_abort(1); + } + + StopWatch swatch; + swatch.start(); + + if ( invParam.axialGaugeP ) { + QDPIO::cerr << "Multi RHS solve in axial gauge not yet implemented\n"; + QDP_abort(1); + } + + qudaInvertMultiSrc( psi, chi, res); + + swatch.stop(); + + // Check solutions + if( invParam.SolutionCheckP ) { + T r; + T tmp; + for(int soln =0; soln < psi.size(); soln++) { + r[A->subset()]=*(chi[ soln ]); + (*A)(tmp, *(psi[soln]), PLUS); + r[A->subset()] -= tmp; + res[soln].resid = sqrt(norm2(r, A->subset())) / sqrt(norm2(*(chi[soln]),A->subset())); + } + } + else { + QDPIO::cout << "Chroma <-> QUDA Solution Check disabled. Using (trusting) QUDA Residua\n"; + } + + + for(int soln=0; soln < psi.size(); soln++) { + QDPIO::cout << "QUDA_"<< solver_string <<"_CLOVER_SOLVER: solution " << soln << + " : " << res[soln].n_count << " iterations. Relative Rsd = " << res[soln].resid << std::endl; + + // Convergence Check/Blow Up + if ( ! 
invParam.SilentFailP ) { + if ( toBool( res[soln].resid > invParam.RsdToleranceFactor*invParam.RsdTarget) ) { + QDPIO::cerr << "ERROR: QUDA Solver residuum for solution " << soln + << " is outside tolerance: QUDA resid="<< res[soln].resid << " Desired =" + << invParam.RsdTarget << " Max Tolerated = " + << invParam.RsdToleranceFactor*invParam.RsdTarget << std::endl; + QDP_abort(1); + } + } + } + + END_CODE(); + return res; + } + + private: + // Hide default constructor + LinOpSysSolverQUDAClover() {} + + U GFixMat; + QudaPrecision_s cpu_prec; + QudaPrecision_s gpu_prec; + QudaPrecision_s gpu_half_prec; + + Handle< LinearOperator > A; + const SysSolverQUDACloverParams invParam; + QudaGaugeParam q_gauge_param; + QudaInvertParam quda_inv_param; + + SystemSolverResults_t qudaInvert( const T& chi_s, T& psi_s) const ; + + void qudaInvertMultiSrc( const std::vector>& psi, + const std::vector>& chi, + std::vector& res) const; + + std::string solver_string; + }; } // End namespace - #endif // BUILD_QUDA #endif diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_linop_exp_clover_quda_multigrid_w.cc b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_exp_clover_quda_multigrid_w.cc new file mode 100644 index 0000000000..5f75968bf3 --- /dev/null +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_exp_clover_quda_multigrid_w.cc @@ -0,0 +1,253 @@ +/*! \file + * \QUDA MULTIGRID ExpClover solver. 
+ */ +// comment +#include "actions/ferm/invert/syssolver_linop_factory.h" +#include "actions/ferm/invert/syssolver_linop_aggregate.h" +#include "actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.h" +#include "actions/ferm/invert/quda_solvers/syssolver_linop_exp_clover_quda_multigrid_w.h" +#include "io/aniso_io.h" + + +#include "handle.h" +#include "actions/ferm/fermstates/periodic_fermstate.h" +#include "actions/ferm/linop/lwldslash_w.h" +#include "meas/glue/mesplq.h" +// QUDA Headers +#include +// #include +#include "actions/ferm/invert/quda_solvers/quda_mg_utils.h" + +namespace Chroma +{ + namespace LinOpSysSolverQUDAMULTIGRIDExpCloverEnv + { + + //! Anonymous namespace + namespace + { + //! Name to be used + const std::string name("QUDA_MULTIGRID_EXP_CLOVER_INVERTER"); + + //! Local registration flag + bool registered = false; + } + + + + LinOpSystemSolver* createFerm(XMLReader& xml_in, + const std::string& path, + Handle< FermState< LatticeFermion, multi1d, multi1d > > state, + + Handle< LinearOperator > A) + { + return new LinOpSysSolverQUDAMULTIGRIDExpClover(A, state,SysSolverQUDAMULTIGRIDCloverParams(xml_in, path)); + } + + //! Register all the factories + bool registerAll() + { + bool success = true; + if (! 
registered) + { + success &= Chroma::TheLinOpFermSystemSolverFactory::Instance().registerObject(name, createFerm); + registered = true; + } + return success; + } + } + + SystemSolverResults_t + LinOpSysSolverQUDAMULTIGRIDExpClover::qudaInvert(const ExpCloverTermT& clover, + const ExpCloverTermT& invclov, + const T& chi_s, + T& psi_s) const{ + + SystemSolverResults_t ret; + const auto& sub = A->subset(); + T mod_chi; + + // Copy source into mod_chi, and zero the off-parity + if( is_precond ) { + mod_chi[rb[0]] = zero; + if( invParam.asymmetricP ) { + // + // symmetric + // Solve with M_symm = 1 - A^{-1}_oo D A^{-1}ee D + // + // Chroma M = A_oo ( M_symm ) + // + // So M x = b => A_oo (M_symm) x = b + // => M_symm x = A^{-1}_oo b = chi_mod + invclov.apply(mod_chi, chi_s, PLUS, 1); + } + else { + mod_chi[rb[1]] = chi_s; + } + } + else { + mod_chi = chi_s; + } +#ifndef BUILD_QUDA_DEVIFACE_SPINOR + void* spinorIn =(void *)&(mod_chi.elem(sub.start()).elem(0).elem(0).real()); + void* spinorOut =(void *)&(psi_s.elem(sub.start()).elem(0).elem(0).real()); +#else + // void* spinorIn = GetMemoryPtr( mod_chi.getId() ); + // void* spinorOut = GetMemoryPtr( psi_s.getId() ); + void* spinorIn; + void* spinorOut; + GetMemoryPtr2(spinorIn,spinorOut,mod_chi.getId(),psi_s.getId()) + +#endif + + // Do the solve here + StopWatch swatch1; + swatch1.reset(); + swatch1.start(); + invertQuda(spinorOut, spinorIn, (QudaInvertParam*)&quda_inv_param); + swatch1.stop(); + + + QDPIO::cout << solver_string<< "time="<< quda_inv_param.secs <<" s" ; + QDPIO::cout << "\tPerformance="<< quda_inv_param.gflops/quda_inv_param.secs<<" GFLOPS" ; + QDPIO::cout << "\tTotal Time (incl. 
load gauge)=" << swatch1.getTimeInSeconds() <<" s"<& invclov, + const std::vector>& psi_s, + const std::vector>& chi_s, + std::vector& res) const { + + + StopWatch source_prep; + source_prep.reset(); + source_prep.start(); + + multi1d mod_chi(chi_s.size()); + const auto& sub = A->subset(); + + for(int i=0; i < chi_s.size(); i++) { + + if( is_precond ) { + // Copy source into mod_chi, and zero the off-parity + mod_chi[i][rb[0]] = zero; + + if( invParam.asymmetricP ) { + // + // symmetric + // Solve with M_symm = 1 - A^{-1}_oo D A^{-1}ee D + // + // Chroma M = A_oo ( M_symm ) + // + // So M x = b => A_oo (M_symm) x = b + // => M_symm x = A^{-1}_oo b = chi_mod + invclov.apply(mod_chi[i], *(chi_s[i]), PLUS, 1); + } + else { + mod_chi[i][rb[1]] = *(chi_s[i]); + } + } + else { + mod_chi[i] = *(chi_s[i]); + } + } + + std::vector spinorIn(chi_s.size()); + std::vector spinorOut(psi_s.size()); + + int N_src = chi_s.size(); +#ifndef BUILD_QUDA_DEVIFACE_SPINOR + // Regular non-qdpjit approach. Just collect the pointers + for(int soln=0; soln < chi_s.size(); soln++) { + spinorIn[soln] = (void *)&(mod_chi[soln].elem(sub.start()).elem(0).elem(0).real()); + spinorOut[soln] = (void *)&(psi_s[soln]->elem(sub.start()).elem(0).elem(0).real()); + } +#else + std::vector ids(2*N_src); + + for(int soln=0; soln < N_src; soln++) { + ids[soln] = mod_chi[soln].getId(); + ids[N_src+soln] = psi_s[soln]->getId(); + } + + // Grab all the keys + auto dev_ptr = QDP_get_global_cache().get_dev_ptrs( multi1d( ids.data(), ids.size()) ); + + + for(int soln=0; soln < N_src; soln++) { + spinorIn[soln] = dev_ptr(soln); + spinorOut[soln] = dev_ptr(N_src+soln); + } + source_prep.stop(); +#endif + + // Local quda_inv_param (?) 
+ // Relies on quda_inv_param being just a dumb/struct and or copyable + QudaInvertParam local_quda_inv_param = quda_inv_param ; + local_quda_inv_param.num_src = mod_chi.size(); + + // No grid splitting for MG yet, so commenting that out +#if 0 + int totalSubgrids=1; + const multi1d& machine_size=QDP::Layout::logicalSize(); + + for (int i = 0; i < Nd; i++) { + local_quda_inv_param.split_grid[i] = invParam.GridSplitDims[i]; + totalSubgrids *= invParam.GridSplitDims[i]; + if ( machine_size[i] % invParam.GridSplitDims[i] != 0 ) { + QDPIO::cerr << "The split-grid-subgrid dimensions must divide the number ranks in each dimension exactly\n"; + QDPIO::cerr << "Currently this is not the case: dim=" << i << " machine_size["< + +#include "handle.h" +#include "state.h" +#include "syssolver.h" +#include "linearop.h" +#include "actions/ferm/fermbcs/simple_fermbc.h" +#include "actions/ferm/fermstates/periodic_fermstate.h" +#include "actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.h" +#include "actions/ferm/linop/exp_clover_term_w.h" +#include "meas/gfix/temporal_gauge.h" +#include "io/aniso_io.h" +#include "quda_mg_utils.h" +#include +#include +#include "util/gauge/reunit.h" +#ifdef QDP_IS_QDPJIT +#include "actions/ferm/invert/quda_solvers/qdpjit_memory_wrapper.h" +#endif + +//#include + +namespace Chroma +{ + + //! Richardson system solver namespace + namespace LinOpSysSolverQUDAMULTIGRIDExpCloverEnv + { + //! Register the syssolver + bool registerAll(); + } + + //! Solve a ExpClover Fermion System using the QUDA inverter + /*! 
\ingroup invert + *** WARNING THIS SOLVER WORKS FOR ExpClover FERMIONS ONLY *** + */ + + class LinOpSysSolverQUDAMULTIGRIDExpClover : public LinOpSystemSolver + { + public: + typedef LatticeFermion T; + typedef LatticeColorMatrix U; + typedef multi1d Q; + + typedef LatticeFermionF TF; + typedef LatticeColorMatrixF UF; + typedef multi1d QF; + + typedef LatticeFermionF TD; + typedef LatticeColorMatrixF UD; + typedef multi1d QD; + + typedef WordType::Type_t REALT; + //! Constructor + /*! + * \param M_ Linear operator ( Read ) + * \param invParam inverter parameters ( Read ) + */ + LinOpSysSolverQUDAMULTIGRIDExpClover(Handle< LinearOperator > A_, + Handle< FermState > state_, + const SysSolverQUDAMULTIGRIDCloverParams& invParam_) : + A(A_), is_precond( true ), invParam(invParam_), clov(new ExpCloverTermT() ), invclov(new ExpCloverTermT()) + { + StopWatch init_swatch; + init_swatch.reset(); init_swatch.start(); + // Set the solver string + { + std::ostringstream solver_string_stream; + solver_string_stream << "QUDA_MULTIGRID_EXP_CLOVER_LINOP_SOLVER( " + << invParam.SaveSubspaceID << " ): "; + solver_string = solver_string_stream.str(); + + } + QDPIO::cout << solver_string << "Initializing" << std::endl; + + // FOLLOWING INITIALIZATION in test QUDA program + const auto& sub = A->subset(); + if( sub.start() == all.start() && sub.numSiteTable() == all.numSiteTable()) { + is_precond = false; + } + + // 1) work out cpu_prec, cuda_prec, cuda_prec_sloppy + int s = sizeof( WordType::Type_t ); + if (s == 4) { + cpu_prec = QUDA_SINGLE_PRECISION; + } + else { + cpu_prec = QUDA_DOUBLE_PRECISION; + } + + // Work out GPU precision + switch( invParam.cudaPrecision ) { + case HALF: + gpu_prec = QUDA_HALF_PRECISION; + break; + case SINGLE: + gpu_prec = QUDA_SINGLE_PRECISION; + break; + case DOUBLE: + gpu_prec = QUDA_DOUBLE_PRECISION; + break; + default: + gpu_prec = cpu_prec; + break; + } + + // Work out GPU Sloppy precision + // Default: No Sloppy + switch( invParam.cudaSloppyPrecision 
) { + case HALF: + gpu_half_prec = QUDA_HALF_PRECISION; + break; + case SINGLE: + gpu_half_prec = QUDA_SINGLE_PRECISION; + break; + case DOUBLE: + gpu_half_prec = QUDA_DOUBLE_PRECISION; + break; + default: + gpu_half_prec = gpu_prec; + break; + } + + // 2) pull 'new; GAUGE and Invert params + q_gauge_param = newQudaGaugeParam(); + quda_inv_param = newQudaInvertParam(); + + // 3) set lattice size + const multi1d& latdims = Layout::subgridLattSize(); + + q_gauge_param.X[0] = latdims[0]; + q_gauge_param.X[1] = latdims[1]; + q_gauge_param.X[2] = latdims[2]; + q_gauge_param.X[3] = latdims[3]; + + // 4) - deferred (anisotropy) + + // 5) - set QUDA_WILSON_LINKS, QUDA_GAUGE_ORDER + q_gauge_param.type = QUDA_WILSON_LINKS; +#ifndef BUILD_QUDA_DEVIFACE_GAUGE + q_gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; // gauge[mu], p +#else + q_gauge_param.location = QUDA_CUDA_FIELD_LOCATION; + q_gauge_param.gauge_order = QUDA_QDPJIT_GAUGE_ORDER; +#endif + + // 6) - set t_boundary + // Convention: BC has to be applied already + // This flag just tells QUDA that this is so, + // so that QUDA can take care in the reconstruct + if( invParam.AntiPeriodicT ) { + q_gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; + } + else { + q_gauge_param.t_boundary = QUDA_PERIODIC_T; + } + + // Set cpu_prec, cuda_prec, reconstruct and sloppy versions + q_gauge_param.cpu_prec = cpu_prec; + q_gauge_param.cuda_prec = gpu_prec; + + switch( invParam.cudaReconstruct ) { + case RECONS_NONE: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; + break; + case RECONS_8: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_8; + break; + case RECONS_12: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_12; + break; + default: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_12; + break; + }; + + q_gauge_param.cuda_prec_sloppy = gpu_half_prec; + + // Default. 
This may be overwritten by inner params + q_gauge_param.cuda_prec_precondition = gpu_half_prec; + + switch( invParam.cudaSloppyReconstruct ) { + case RECONS_NONE: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; + break; + case RECONS_8: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_8; + break; + case RECONS_12: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; + break; + default: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; + break; + }; + q_gauge_param.reconstruct_precondition = q_gauge_param.reconstruct_sloppy; + // Gauge fixing: + + // These are the links + // They may be smeared and the BC's may be applied + Q links_single(Nd); + + // Now downcast to single prec fields. + for(int mu=0; mu < Nd; mu++) { + links_single[mu] = (state_->getLinks())[mu]; + } + + // GaugeFix + if( invParam.axialGaugeP ) { + temporalGauge(links_single, GFixMat, Nd-1); + for(int mu=0; mu < Nd; mu++) { + links_single[mu] = GFixMat*(state_->getLinks())[mu]*adj(shift(GFixMat, FORWARD, mu)); + } + q_gauge_param.gauge_fix = QUDA_GAUGE_FIXED_YES; + } + else { + // No GaugeFix + q_gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;// No Gfix yet + } + + // deferred 4) Gauge Anisotropy + const AnisoParam_t& aniso = invParam.CloverParams.anisoParam; + if( aniso.anisoP ) { // Anisotropic case + Real gamma_f = aniso.xi_0 / aniso.nu; + q_gauge_param.anisotropy = toDouble(gamma_f); + } + else { + q_gauge_param.anisotropy = 1.0; + } + + // MAKE FSTATE BEFORE RESCALING links_single + // Because the clover term expects the unrescaled links... 
+ Handle > fstate( new PeriodicFermState(links_single)); + + if( aniso.anisoP ) { // Anisotropic case + multi1d cf=makeFermCoeffs(aniso); + for(int mu=0; mu < Nd; mu++) { + links_single[mu] *= cf[mu]; + } + } + + // Now onto the inv param: + // Dslash type + quda_inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + // Hardwire to GCR + quda_inv_param.inv_type = QUDA_GCR_INVERTER; + + + quda_inv_param.kappa = 0.5; + quda_inv_param.clover_coeff = 1.0; // Dummy, not used + quda_inv_param.Ls = 1; + + quda_inv_param.tol = toDouble(invParam.RsdTarget); + quda_inv_param.maxiter = invParam.MaxIter; + quda_inv_param.reliable_delta = toDouble(invParam.Delta); + quda_inv_param.pipeline = invParam.Pipeline; + + // Solution type + //quda_inv_param.solution_type = QUDA_MATPC_SOLUTION; + //Taken from invert test. + quda_inv_param.solution_type = is_precond ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; + quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; + quda_inv_param.matpc_type = QUDA_MATPC_ODD_ODD; // Always + + quda_inv_param.dagger = QUDA_DAG_NO; + quda_inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; + + quda_inv_param.cpu_prec = cpu_prec; + quda_inv_param.cuda_prec = gpu_prec; + quda_inv_param.cuda_prec_sloppy = gpu_half_prec; + quda_inv_param.cuda_prec_precondition = gpu_half_prec; + + // + //Done... 
+ quda_inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; + quda_inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + +#ifndef BUILD_QUDA_DEVIFACE_SPINOR + quda_inv_param.dirac_order = QUDA_DIRAC_ORDER; + quda_inv_param.input_location = QUDA_CPU_FIELD_LOCATION; + quda_inv_param.output_location = QUDA_CPU_FIELD_LOCATION; + +#else + quda_inv_param.dirac_order = QUDA_QDPJIT_DIRAC_ORDER; + quda_inv_param.input_location = QUDA_CUDA_FIELD_LOCATION; + quda_inv_param.output_location = QUDA_CUDA_FIELD_LOCATION; +#endif + + + // Setup padding + + multi1d face_size(4); + face_size[0] = latdims[1]*latdims[2]*latdims[3]/2; + face_size[1] = latdims[0]*latdims[2]*latdims[3]/2; + face_size[2] = latdims[0]*latdims[1]*latdims[3]/2; + face_size[3] = latdims[0]*latdims[1]*latdims[2]/2; + + int max_face = face_size[0]; + for(int i=1; i <=3; i++) { + if ( face_size[i] > max_face ) { + max_face = face_size[i]; + } + } + + q_gauge_param.ga_pad = max_face; + + // ExpClover precision and order + quda_inv_param.clover_cpu_prec = cpu_prec; + quda_inv_param.clover_cuda_prec = gpu_prec; + quda_inv_param.clover_cuda_prec_sloppy = gpu_half_prec; + quda_inv_param.clover_cuda_prec_precondition = gpu_half_prec; + if( invParam.MULTIGRIDParamsP ) { + const MULTIGRIDSolverParams& ip = *(invParam.MULTIGRIDParams); + + // Set preconditioner precision + switch( ip.prec ) { + case HALF: + quda_inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + break; + + case SINGLE: + quda_inv_param.cuda_prec_precondition = QUDA_SINGLE_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_SINGLE_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_SINGLE_PRECISION; + break; + + case DOUBLE: + quda_inv_param.cuda_prec_precondition = QUDA_DOUBLE_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_DOUBLE_PRECISION; + q_gauge_param.cuda_prec_precondition 
= QUDA_DOUBLE_PRECISION; + break; + default: + quda_inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + break; + } + + switch( ip.reconstruct ) { + case RECONS_NONE: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_NO; + break; + case RECONS_8: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_8; + break; + case RECONS_12: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_12; + break; + default: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_12; + break; + }; + } + // Set up the links + void* gauge[4]; + +#ifndef BUILD_QUDA_DEVIFACE_GAUGE + for(int mu=0; mu < Nd; mu++) { + gauge[mu] = (void *)&(links_single[mu].elem(all.start()).elem().elem(0,0).real()); + } +#else + //gauge[mu] = GetMemoryPtr( links_single[mu].getId() ); + GetMemoryPtrGauge(gauge,links_single); +#endif + + loadGaugeQuda((void *)gauge, &q_gauge_param); + + MULTIGRIDSolverParams ip = *(invParam.MULTIGRIDParams); + // + quda_inv_param.tol_precondition = toDouble(ip.tol[0]); + quda_inv_param.maxiter_precondition = ip.maxIterations[0]; + quda_inv_param.gcrNkrylov = ip.outer_gcr_nkrylov; + quda_inv_param.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); + + + //Replacing above with what's in the invert test. + switch( ip.schwarzType ) { + case ADDITIVE_SCHWARZ : + quda_inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; + break; + case MULTIPLICATIVE_SCHWARZ : + quda_inv_param.schwarz_type = QUDA_MULTIPLICATIVE_SCHWARZ; + break; + default: + quda_inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; + break; + } + quda_inv_param.precondition_cycle = 1; + //Invert test always sets this to 1. 
+ + + if( invParam.verboseP ) { + quda_inv_param.verbosity = QUDA_VERBOSE; + } + else { + quda_inv_param.verbosity = QUDA_SUMMARIZE; + } + + quda_inv_param.verbosity_precondition = QUDA_SILENT; + + quda_inv_param.inv_type_precondition = QUDA_MG_INVERTER; + + QDPIO::cout<< solver_string << "Basic MULTIGRID params copied."<create(fstate, invParam_.CloverParams); + + // Don't recompute, just copy + invclov->create(fstate, invParam_.CloverParams); + + QDPIO::cout << solver_string << "Inverting ExpCloverTerm" << std::endl; + invclov->choles(0); + invclov->choles(1); + +#ifndef BUILD_QUDA_DEVIFACE_CLOVER +#warning "NOT USING QUDA DEVICE IFACE" + quda_inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; + + multi1d > packed_clov; + + + packed_clov.resize(all.siteTable().size()); + + clov->packForQUDA(packed_clov, 0); + clov->packForQUDA(packed_clov, 1); + + // Always need inverse + multi1d > packed_invclov(all.siteTable().size()); + invclov->packForQUDA(packed_invclov, 0); + invclov->packForQUDA(packed_invclov, 1); + + loadCloverQuda(&(packed_clov[0]), &(packed_invclov[0]), &quda_inv_param); + +#else + +#warning "USING QUDA DEVICE IFACE" + quda_inv_param.clover_location = QUDA_CUDA_FIELD_LOCATION; + quda_inv_param.clover_order = QUDA_QDPJIT_CLOVER_ORDER; + + void *clover[2]; + void *cloverInv[2]; + + GetMemoryPtrClover(clov->getOffId(),clov->getDiaId(),invclov->getOffId(),invclov->getDiaId()); + + loadCloverQuda( (void*)(clover), (void *)(cloverInv), &quda_inv_param); + +#endif + + quda_inv_param.omega = toDouble(ip.relaxationOmegaOuter); + + +// merged from mdgam_clover_quda_multigrid, begin + if(TheNamedObjMap::Instance().check(invParam.SaveSubspaceID)) + { + StopWatch update_swatch; + update_swatch.reset(); update_swatch.start(); + // Subspace ID exists add it to mg_state + QDPIO::cout<< solver_string <<"Recovering subspace..."<(invParam.SaveSubspaceID); + for(int j=0; j < ip.mg_levels-1;++j) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = 0; + } + 
updateMultigridQuda(subspace_pointers->preconditioner, &(subspace_pointers->mg_param)); + update_swatch.stop(); + + QDPIO::cout << solver_string << " subspace_update_time = " + << update_swatch.getTimeInSeconds() << " sec. " << std::endl; + } + else + { + // Create the subspace. + StopWatch create_swatch; + create_swatch.reset(); create_swatch.start(); + QDPIO::cout << solver_string << "Creating Subspace" << std::endl; + subspace_pointers = QUDAMGUtils::create_subspace(invParam); + XMLBufferWriter file_xml; + push(file_xml, "FileXML"); + pop(file_xml); + + int foo = 5; + + XMLBufferWriter record_xml; + push(record_xml, "RecordXML"); + write(record_xml, "foo", foo); + pop(record_xml); + + + TheNamedObjMap::Instance().create< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setFileXML(file_xml); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setRecordXML(record_xml); + + TheNamedObjMap::Instance().getData< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID) = subspace_pointers; + create_swatch.stop(); + QDPIO::cout << solver_string << " subspace_create_time = " + << create_swatch.getTimeInSeconds() << " sec. " << std::endl; + + } + quda_inv_param.preconditioner = subspace_pointers->preconditioner; +// merged from mdgam_clover_quda_multigrid, end + + init_swatch.stop(); + QDPIO::cout << solver_string << "init_time = " + << init_swatch.getTimeInSeconds() << " sec. " << std::endl; + + } + + //! Destructor is automatic + ~LinOpSysSolverQUDAMULTIGRIDExpClover() + { + QDPIO::cout << solver_string << "Destructing" << std::endl; + quda_inv_param.preconditioner = nullptr; + subspace_pointers = nullptr; + freeGaugeQuda(); + freeCloverQuda(); +// destroyMultigridQuda(quda_inv_param.preconditioner); + } + + //! Return the subset on which the operator acts + const Subset& subset() const {return A->subset();} + + //! Solver the linear system + /*! 
+ * \param psi solution ( Modify ) + * \param chi source ( Read ) + * \return syssolver results + */ + SystemSolverResults_t operator() (T& psi, const T& chi) const + { + SystemSolverResults_t res; + + START_CODE(); + StopWatch swatch; + swatch.start(); + + psi = zero; // Zero initial guess + // T MdagChi; + + // This is a CGNE. So create new RHS + // (*A)(MdagChi, chi, MINUS); + // Handle< LinearOperator > MM(new MdagMLinOp(A)); + if ( invParam.axialGaugeP ) { + T g_chi,g_psi; + + // Gauge Fix source and initial guess + g_chi[ A->subset() ] = GFixMat * chi; + g_psi[ A->subset() ] = GFixMat * psi; + res = qudaInvert(*clov, + *invclov, + g_chi, + g_psi); + psi[ A->subset() ] = adj(GFixMat)*g_psi; + + } + else { + res = qudaInvert(*clov, + *invclov, + chi, + psi); + } + + swatch.stop(); + + if( invParam.SolutionCheckP ) { + T r; + r[A->subset()]=chi; + T tmp; + (*A)(tmp, psi, PLUS); + r[A->subset()] -= tmp; + res.resid = sqrt(norm2(r, A->subset()))/sqrt(norm2(chi,A->subset())); + } + else { + QDPIO::cout << "Chroma <-> QUDA solution check disabled. Using (trusting) QUDA residuum\n"; + } + + QDPIO::cout << solver_string << res.n_count << " iterations. Relative Rsd = " << res.resid << std::endl; + + // Convergence Check/Blow Up + if ( ! 
invParam.SilentFailP ) { + if ( toBool( res.resid > invParam.RsdToleranceFactor*invParam.RsdTarget) ) { + QDPIO::cerr << solver_string << "ERROR: Solver residuum is outside tolerance: QUDA resid="<< res.resid << " Desired =" << invParam.RsdTarget << " Max Tolerated = " << invParam.RsdToleranceFactor*invParam.RsdTarget << std::endl; + QDP_abort(1); + } + } + + END_CODE(); + return res; + } + + std::vector operator() (const std::vector>& psi, const std::vector>& chi) const override + { + + START_CODE(); + QDPIO::cout << "Entering MRHS solution: N_src = " << chi.size() << "\n"; + + std::vector res(chi.size()); + if( psi.size() != chi.size() ) { + QDPIO::cout << "Number of sources does not match number of solutions\n"; + QDPIO::cout << "psi.size() = " << psi.size() << " but chi.size() = " << chi.size() << "\n"; + QDP_abort(1); + } + + StopWatch swatch; + swatch.start(); + + if ( invParam.axialGaugeP ) { + QDPIO::cerr << "Multi RHS solve in axial gauge not yet implemented\n"; + QDP_abort(1); + } + + qudaInvertMultiSrc(*invclov, psi, chi, res); + + swatch.stop(); + + // Check solutions -- if desired + + if( invParam.SolutionCheckP ) { + for(int soln =0; soln < psi.size(); soln++) { + T r; + r[A->subset()]=*(chi[ soln ]); + T tmp; + (*A)(tmp, *(psi[soln]), PLUS); + r[A->subset()] -= tmp; + res[soln].resid = sqrt(norm2(r, A->subset()))/sqrt(norm2(*(chi[soln]),A->subset())); + } + } + else { + QDPIO::cout << "Chroma <-> QUDA solution check disabled. Using (trusting) QUDA residua\n"; + } + + for(int soln=0; soln < psi.size(); soln++ ) { + QDPIO::cout << "QUDA_"<< solver_string <<" solution " << soln << + " : " << res[soln].n_count << " iterations. Relative Rsd = " << res[soln].resid << std::endl; + + // Convergence Check/Blow Up + if ( ! 
invParam.SilentFailP ) { + if ( toBool( res[soln].resid > invParam.RsdToleranceFactor*invParam.RsdTarget) ) { + QDPIO::cerr << "ERROR: QUDA Solver residuum for solution " << soln + << " is outside tolerance: QUDA resid="<< res[soln].resid << " Desired =" + << invParam.RsdTarget << " Max Tolerated = " + << invParam.RsdToleranceFactor*invParam.RsdTarget << std::endl; + QDP_abort(1); + } + } + } + + + END_CODE(); + return res; + } + + private: + // Hide default constructor + LinOpSysSolverQUDAMULTIGRIDExpClover() {} + +#if 1 + Q links_orig; +#endif + bool is_precond; + + U GFixMat; + QudaPrecision_s cpu_prec; + QudaPrecision_s gpu_prec; + QudaPrecision_s gpu_half_prec; + + Handle< LinearOperator > A; + const SysSolverQUDAMULTIGRIDCloverParams invParam; + QudaGaugeParam q_gauge_param; + QudaInvertParam quda_inv_param; + mutable QUDAMGUtils::MGSubspacePointers* subspace_pointers; + + Handle< ExpCloverTermT > clov; + Handle< ExpCloverTermT > invclov; + + SystemSolverResults_t qudaInvert(const ExpCloverTermT& clover, + const ExpCloverTermT& invclov, + const T& chi_s, + T& psi_s + )const; + + void qudaInvertMultiSrc(const ExpCloverTermT& invclov, + const std::vector>& psi_s, + const std::vector>& chi_s, + std::vector& res) const; + + std::string solver_string; + }; + +} // End namespace + +#endif // BUILD_QUDA +#endif + diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_linop_nef_quda_w.h b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_nef_quda_w.h index 3e6c830d7d..669a4807cf 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_linop_nef_quda_w.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_nef_quda_w.h @@ -319,16 +319,6 @@ namespace Chroma quda_inv_param.output_location = QUDA_CUDA_FIELD_LOCATION; #endif - // Autotuning - if( invParam.tuneDslashP ) { - QDPIO::cout << "Enabling Dslash Autotuning" << std::endl; - quda_inv_param.tune = QUDA_TUNE_YES; - } - else { - QDPIO::cout << "Disabling Dslash Autotuning" << std::endl; - 
quda_inv_param.tune = QUDA_TUNE_NO; - } - // Setup padding multi1d face_size(4); diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_multigrid_w.h b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_multigrid_w.h index c8a7fa436f..85eedcfaed 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_multigrid_w.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_multigrid_w.h @@ -340,19 +340,6 @@ class LinOpSysSolverQUDAMULTIGRIDWilson : public LinOpSystemSolver quda_inv_param.use_init_guess = QUDA_USE_INIT_GUESS_NO; quda_inv_param.dirac_order = QUDA_DIRAC_ORDER; quda_inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - // Autotuning - if( invParam.tuneDslashP ) { - QDPIO::cout << "Enabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_YES; - } - else { - QDPIO::cout << "Disabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_NO; - } - - + // Setup padding multi1d face_size(4); face_size[0] = latdims[1]*latdims[2]*latdims[3]/2; diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_multigrid_w.h b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_multigrid_w.h index 693f6b2dea..85d280a829 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_multigrid_w.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_multigrid_w.h @@ -302,14 +302,6 @@ class MdagMSysSolverQUDAMULTIGRIDClover : public MdagMSystemSolver face_size(4); face_size[0] = latdims[1]*latdims[2]*latdims[3]/2; diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_w.h b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_w.h index 60a73c14fe..0bcbe673ee 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_w.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_w.h @@ -382,19 +382,6 @@ class MdagMSysSolverQUDAClover : 
public MdagMSystemSolver #endif - // Autotuning - if( invParam.tuneDslashP ) { - QDPIO::cout << "Enabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_YES; - } - else { - QDPIO::cout << "Disabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_NO; - } - - // Setup padding multi1d face_size(4); face_size[0] = latdims[1]*latdims[2]*latdims[3]/2; diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.cc b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.cc new file mode 100644 index 0000000000..99c6528b9f --- /dev/null +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.cc @@ -0,0 +1,368 @@ +/*! \file + * \QUDA MULTIGRID MdagM Clover solver. + */ +// comment +#include "actions/ferm/invert/syssolver_mdagm_factory.h" +#include "actions/ferm/invert/syssolver_mdagm_aggregate.h" +#include "actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.h" +#include "actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.h" +#include "io/aniso_io.h" + + +#include "handle.h" +#include "actions/ferm/fermstates/periodic_fermstate.h" +#include "actions/ferm/linop/lwldslash_w.h" +#include "meas/glue/mesplq.h" +// QUDA Headers +#include +// #include + +#include +#include +#include +#include +namespace Chroma +{ + namespace MdagMSysSolverQUDAMULTIGRIDExpCloverEnv + { + + //! Anonymous namespace + namespace + { + //! Name to be used + const std::string name("QUDA_MULTIGRID_EXP_CLOVER_INVERTER"); + + //! Local registration flag + bool registered = false; + } + + + + MdagMSystemSolver* createFerm(XMLReader& xml_in, + const std::string& path, + Handle< FermState< LatticeFermion, multi1d, multi1d > > state, + + Handle< LinearOperator > A) + { + return new MdagMSysSolverQUDAMULTIGRIDExpClover(A, state,SysSolverQUDAMULTIGRIDCloverParams(xml_in, path)); + } + + //! 
Register all the factories + bool registerAll() + { + bool success = true; + if (! registered) + { + success &= Chroma::TheMdagMFermSystemSolverFactory::Instance().registerObject(name, createFerm); + registered = true; + } + return success; + } + } + + SystemSolverResults_t + MdagMSysSolverQUDAMULTIGRIDExpClover::qudaInvert(const ExpCloverTermT& clover, + const ExpCloverTermT& invclov, + const T& chi_s, + T& psi_s) const{ + + SystemSolverResults_t ret; + + T mod_chi; + + // Copy source into mod_chi, and zero the off-parity + mod_chi[rb[0]] = zero; + + + // This solver always solves with the SYMMETRIC preconditioned + // Operator. If we are working with Asymmetric preconditioning + // Then we must apply a clover inverse. + if( invParam.asymmetricP) { + + // + // symmetric + // Solve with M_symm = 1 - A^{-1}_oo D A^{-1}ee D + // + // Chroma M = A_oo ( M_symm ) + // + // So M x = b => A_oo (M_symm) x = b + // => M_symm x = A^{-1}_oo b = chi_mod + invclov.apply(mod_chi, chi_s, PLUS, 1); + } + else { + // If we work with symmetric preconditioning nothing else needs done + mod_chi[rb[1]] = chi_s; + } + +#ifndef BUILD_QUDA_DEVIFACE_SPINOR + void* spinorIn =(void *)&(mod_chi.elem(rb[1].start()).elem(0).elem(0).real()); + void* spinorOut =(void *)&(psi_s.elem(rb[1].start()).elem(0).elem(0).real()); +#else + // void* spinorIn = GetMemoryPtr( mod_chi.getId() ); + // void* spinorOut = GetMemoryPtr( psi_s.getId() ); + void* spinorIn; + void* spinorOut; + GetMemoryPtr2(spinorIn,spinorOut,mod_chi.getId(),psi_s.getId()); +#endif + + // Do the solve here + StopWatch swatch1; + swatch1.reset(); + swatch1.start(); + invertQuda(spinorOut, spinorIn, (QudaInvertParam*)&quda_inv_param); + swatch1.stop(); + + + + QDPIO::cout << solver_string<<"time="<< quda_inv_param.secs <<" s" ; + QDPIO::cout << "\tPerformance="<< quda_inv_param.gflops/quda_inv_param.secs<<" GFLOPS" ; + QDPIO::cout << "\tTotal Time (incl. 
load gauge)=" << swatch1.getTimeInSeconds() <<" s"<getLinks()); + writer.close(); + } + + // Dump chi + { + XMLBufferWriter filebuf; + XMLBufferWriter recbuf; + push( filebuf, "ChiFile" ); + write( filebuf, "FilePrefix", file_prefix); + pop( filebuf); + + push( recbuf, "ChiRecord" ); + write( recbuf, "FilePrefix", file_prefix); + pop( recbuf ); + + QDPIO::cout << "Dumping chi (original source) vector to " << chi_filename << std::endl; + + QDPFileWriter writer(filebuf, chi_filename, QDPIO_SINGLEFILE, QDPIO_PARALLEL); + write(writer, recbuf, chi); + writer.close(); + + + } + + // Dump Y + { + XMLBufferWriter filebuf; + XMLBufferWriter recbuf; + push( filebuf, "YFile" ); + write( filebuf, "FilePrefix", file_prefix); + pop( filebuf); + + push( recbuf, "YRecord" ); + write( recbuf, "FilePrefix", file_prefix); + pop( recbuf ); + + QDPIO::cout << "Dumping Y (source) vector to " << Y_filename << std::endl; + + QDPFileWriter writer(filebuf, Y_filename, QDPIO_SINGLEFILE, QDPIO_PARALLEL); + write(writer, recbuf, Y); + writer.close(); + } + + // Dump MG state + { + auto mg_params = *(invParam.MULTIGRIDParams); + for(int l = 0; l < mg_params.mg_levels; ++l) { + std::ostringstream subspace_prefix; + subspace_prefix << file_prefix << "_subspace_l" << l; + + // Up to the length of the buffer (256) padded with zeros + std::strncpy((subspace_pointers->mg_param).vec_outfile[l], (subspace_prefix.str()).c_str(), 256); + // If source string is too long it will be truncated and not null terminated, so null terminate + if( subspace_prefix.str().size() > 255 ) { (subspace_pointers->mg_param).vec_outfile[l][255] = '\0'; } + + } + // Make sure everyone has thei varibles set before calling dump Multigrid + // Strictly speaking I am using a sum as a barrier; + double i=10; + QDPInternal::globalSum(i); + +#ifdef QUDA_MG_DUMP_ENABLED + dumpMultigridQuda(subspace_pointers->preconditioner, &(subspace_pointers->mg_param)); +#endif + + for(int l = 0; l < mg_params.mg_levels; ++l) { + 
(subspace_pointers->mg_param).vec_outfile[l][0] ='\0'; + } + QDPInternal::globalSum(i); + } + } + + + unsigned long MdagMSysSolverQUDAMULTIGRIDExpClover::seqno = 0; + + void MdagMSysSolverQUDAMULTIGRIDExpClover::dumpXSolver(const LatticeFermion& chi, + const LatticeFermion& Y, + const LatticeFermion& X) const + + { + // Grab the time - took this C++ way from stackoverflow + auto time_value = std::time(nullptr); + auto local_time = std::localtime(&time_value); + std::ostringstream time_strstream; + time_strstream << "./failed_X_solve_" << std::put_time(local_time, "%d-%m-%Y-%H-%M-%S"); + + + std::string file_prefix( time_strstream.str() ); + + std::string gauge_filename = file_prefix + "_gauge_field.lime"; + std::string chi_filename = file_prefix + "_chi.lime"; + std::string Y_filename = file_prefix + "_Y.lime"; + std::string X_filename = file_prefix + "_X.lime"; + + int foo = 5; // Some rubbish for the XML Files + // Dump gauge field + { + XMLBufferWriter filebuf; + XMLBufferWriter recbuf; + push( filebuf, "GaugeFile" ); + write( filebuf, "FilePrefix", file_prefix); + pop( filebuf); + + push( recbuf, "GaugeRecord" ); + write( recbuf, "FilePrefix", file_prefix); + pop( recbuf ); + + QDPIO::cout << "Dumping gauge links to " << gauge_filename << std::endl; + + QDPFileWriter writer(filebuf,gauge_filename, QDPIO_SINGLEFILE, QDPIO_PARALLEL); + write(writer, recbuf, gstate->getLinks()); + writer.close(); + } + + // Dump chi + { + XMLBufferWriter filebuf; + XMLBufferWriter recbuf; + push( filebuf, "ChiFile" ); + write( filebuf, "FilePrefix", file_prefix); + pop( filebuf); + + push( recbuf, "ChiRecord" ); + write( recbuf, "FilePrefix", file_prefix); + pop( recbuf ); + + QDPIO::cout << "Dumping chi (original source) vector to " << chi_filename << std::endl; + + QDPFileWriter writer(filebuf, chi_filename, QDPIO_SINGLEFILE, QDPIO_PARALLEL); + write(writer, recbuf, chi); + writer.close(); + + + } + + // Dump Y + { + XMLBufferWriter filebuf; + XMLBufferWriter recbuf; + push( 
filebuf, "YFile" ); + write( filebuf, "FilePrefix", file_prefix); + pop( filebuf); + + push( recbuf, "YRecord" ); + write( recbuf, "FilePrefix", file_prefix); + pop( recbuf ); + + QDPIO::cout << "Dumping Y (source) vector to " << Y_filename << std::endl; + + QDPFileWriter writer(filebuf, Y_filename, QDPIO_SINGLEFILE, QDPIO_PARALLEL); + write(writer, recbuf, Y); + writer.close(); + } + + // Dump final X + { + XMLBufferWriter filebuf; + XMLBufferWriter recbuf; + push( filebuf, "XFile" ); + write( filebuf, "FilePrefix", file_prefix); + pop( filebuf); + + push( recbuf, "XRecord" ); + write( recbuf, "FilePrefix", file_prefix); + pop( recbuf ); + + QDPIO::cout << "Dumping X (solution) vector to " << X_filename << std::endl; + + QDPFileWriter writer(filebuf, X_filename, QDPIO_SINGLEFILE, QDPIO_PARALLEL); + write(writer, recbuf, X); + writer.close(); + } + + + // Dump MG state + { + auto mg_params = *(invParam.MULTIGRIDParams); + for(int l = 0; l < mg_params.mg_levels; ++l) { + std::ostringstream subspace_prefix; + subspace_prefix << file_prefix << "_subspace_l" << l; + + // Up to the length of the buffer (256) padded with zeros + std::strncpy((subspace_pointers->mg_param).vec_outfile[l], (subspace_prefix.str()).c_str(), 256); + // If source string is too long it will be truncated and not null terminated, so null terminate + if( subspace_prefix.str().size() > 255 ) { (subspace_pointers->mg_param).vec_outfile[l][255] = '\0'; } + + } + // Make sure everyone has thei varibles set before calling dump Multigrid + // I use a global sum as a barrier here + double i=10; + QDPInternal::globalSum(i); + + +#ifdef QUDA_MG_DUMP_ENABLED + dumpMultigridQuda(subspace_pointers->preconditioner, &(subspace_pointers->mg_param)); +#endif + + for(int l = 0; l < mg_params.mg_levels; ++l) { + (subspace_pointers->mg_param).vec_outfile[l][0] ='\0'; + } + // I use a global sum as a weak barrier here. 
+ QDPInternal::globalSum(i); // Make sure everyone is done + } + } + + + +} // namespace + diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.h b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.h new file mode 100644 index 0000000000..138600b1b3 --- /dev/null +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.h @@ -0,0 +1,1540 @@ +// -*- C++ -*- +/*! \file + * \QUDA MULTIGRID MdagM Clover solver. + */ + +#ifndef __syssolver_mdagm_quda_multigrid_exp_clover_h__ +#define __syssolver_mdagm_quda_multigrid_exp_clover_h__ + +#include "chroma_config.h" +#include "chromabase.h" +#include +#include + +using namespace QDP; + + + +#include "handle.h" +#include "state.h" +#include "syssolver.h" +#include "linearop.h" +#include "actions/ferm/fermbcs/simple_fermbc.h" +#include "actions/ferm/fermstates/periodic_fermstate.h" +#include "actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.h" +#include "actions/ferm/linop/exp_clover_term_w.h" +#include "meas/gfix/temporal_gauge.h" +#include "io/aniso_io.h" +#include +#include + +#include "lmdagm.h" +#include "util/gauge/reunit.h" +#include "actions/ferm/invert/quda_solvers/quda_mg_utils.h" +#include "actions/ferm/invert/mg_solver_exception.h" + +//#include +#ifdef BUILD_QUDA +#include +#ifdef QDP_IS_QDPJIT +#include "actions/ferm/invert/quda_solvers/qdpjit_memory_wrapper.h" +#endif + +#include "update/molecdyn/predictor/zero_guess_predictor.h" +#include "update/molecdyn/predictor/quda_predictor.h" +#include "meas/inline/io/named_objmap.h" + +namespace Chroma +{ + +namespace MdagMSysSolverQUDAMULTIGRIDExpCloverEnv +{ +//! 
Register the syssolver +bool registerAll(); + +} + +class MdagMSysSolverQUDAMULTIGRIDExpClover : public MdagMSystemSolver +{ +public: + typedef LatticeFermion T; + typedef LatticeColorMatrix U; + typedef multi1d Q; + + typedef LatticeFermionF TF; + typedef LatticeColorMatrixF UF; + typedef multi1d QF; + + typedef LatticeFermionF TD; + typedef LatticeColorMatrixF UD; + typedef multi1d QD; + + typedef WordType::Type_t REALT; + + + + MdagMSysSolverQUDAMULTIGRIDExpClover(Handle< LinearOperator > A_, + Handle< FermState > state_, + const SysSolverQUDAMULTIGRIDCloverParams& invParam_) : + A(A_), gstate(state_), invParam(invParam_), clov(new ExpCloverTermT() ), invclov(new ExpCloverTermT()) + { + StopWatch init_swatch; + init_swatch.reset(); init_swatch.start(); + + // Set the solver string + { + std::ostringstream solver_string_stream; + solver_string_stream << "QUDA_MULTIGRID_CLOVER_MDAGM_SOLVER( Mass = " << invParam.CloverParams.Mass <<" , Id = " + << invParam.SaveSubspaceID << " ): "; + solver_string = solver_string_stream.str(); + + } + QDPIO::cout << solver_string << "Initializing" << std::endl; + + // Check free mem +#if 0 + size_t free_mem = QUDAMGUtils::getCUDAFreeMem(); + QDPIO::cout << solver_string << "MEMCHECK: free mem = " << free_mem << std::endl; +#endif + // FOLLOWING INITIALIZATION in test QUDA program + + // 1) work out cpu_prec, cuda_prec, cuda_prec_sloppy + int s = sizeof( WordType::Type_t ); + if (s == 4) { + cpu_prec = QUDA_SINGLE_PRECISION; + } + else { + cpu_prec = QUDA_DOUBLE_PRECISION; + } + + // Work out GPU precision + switch( invParam.cudaPrecision ) { + case HALF: + gpu_prec = QUDA_HALF_PRECISION; + break; + case SINGLE: + gpu_prec = QUDA_SINGLE_PRECISION; + break; + case DOUBLE: + gpu_prec = QUDA_DOUBLE_PRECISION; + break; + default: + gpu_prec = cpu_prec; + break; + } + + // Work out GPU Sloppy precision + // Default: No Sloppy + switch( invParam.cudaSloppyPrecision ) { + case HALF: + gpu_half_prec = QUDA_HALF_PRECISION; + break; + case 
SINGLE: + gpu_half_prec = QUDA_SINGLE_PRECISION; + break; + case DOUBLE: + gpu_half_prec = QUDA_DOUBLE_PRECISION; + break; + default: + gpu_half_prec = gpu_prec; + break; + } + + // 2) pull 'new; GAUGE and Invert params + q_gauge_param = newQudaGaugeParam(); + quda_inv_param = newQudaInvertParam(); + + // 3) set lattice size + const multi1d& latdims = Layout::subgridLattSize(); + + q_gauge_param.X[0] = latdims[0]; + q_gauge_param.X[1] = latdims[1]; + q_gauge_param.X[2] = latdims[2]; + q_gauge_param.X[3] = latdims[3]; + + // 4) - deferred (anisotropy) + + // 5) - set QUDA_WILSON_LINKS, QUDA_GAUGE_ORDER + q_gauge_param.type = QUDA_WILSON_LINKS; +#ifndef BUILD_QUDA_DEVIFACE_GAUGE + q_gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; // gauge[mu], p +#else + q_gauge_param.location = QUDA_CUDA_FIELD_LOCATION; + q_gauge_param.gauge_order = QUDA_QDPJIT_GAUGE_ORDER; +#endif + + // 6) - set t_boundary + // Convention: BC has to be applied already + // This flag just tells QUDA that this is so, + // so that QUDA can take care in the reconstruct + if( invParam.AntiPeriodicT ) { + q_gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; + } + else { + q_gauge_param.t_boundary = QUDA_PERIODIC_T; + } + + // Set cpu_prec, cuda_prec, reconstruct and sloppy versions + q_gauge_param.cpu_prec = cpu_prec; + q_gauge_param.cuda_prec = gpu_prec; + + switch( invParam.cudaReconstruct ) { + case RECONS_NONE: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; + break; + case RECONS_8: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_8; + break; + case RECONS_12: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_12; + break; + default: + q_gauge_param.reconstruct = QUDA_RECONSTRUCT_12; + break; + }; + + q_gauge_param.cuda_prec_sloppy = gpu_half_prec; + q_gauge_param.cuda_prec_precondition = gpu_half_prec; + + switch( invParam.cudaSloppyReconstruct ) { + case RECONS_NONE: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; + break; + case RECONS_8: + q_gauge_param.reconstruct_sloppy = 
QUDA_RECONSTRUCT_8; + break; + case RECONS_12: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; + break; + default: + q_gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; + break; + }; + + // This may be overridden later + q_gauge_param.reconstruct_precondition=q_gauge_param.reconstruct_sloppy; + + // Gauge fixing: + + // These are the links + // They may be smeared and the BC's may be applied + Q links_single(Nd); + + // Now downcast to single prec fields. + for(int mu=0; mu < Nd; mu++) { + links_single[mu] = (state_->getLinks())[mu]; + } + + // GaugeFix + if( invParam.axialGaugeP ) { + temporalGauge(links_single, GFixMat, Nd-1); + for(int mu=0; mu < Nd; mu++) { + links_single[mu] = GFixMat*(state_->getLinks())[mu]*adj(shift(GFixMat, FORWARD, mu)); + } + q_gauge_param.gauge_fix = QUDA_GAUGE_FIXED_YES; + } + else { + // No GaugeFix + q_gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;// No Gfix yet + } + + // deferred 4) Gauge Anisotropy + const AnisoParam_t& aniso = invParam.CloverParams.anisoParam; + if( aniso.anisoP ) { // Anisotropic case + Real gamma_f = aniso.xi_0 / aniso.nu; + q_gauge_param.anisotropy = toDouble(gamma_f); + } + else { + q_gauge_param.anisotropy = 1.0; + } + + // MAKE FSTATE BEFORE RESCALING links_single + // Because the clover term expects the unrescaled links... 
+ Handle > fstate( new PeriodicFermState(links_single)); + + if( aniso.anisoP ) { // Anisotropic case + multi1d cf=makeFermCoeffs(aniso); + for(int mu=0; mu < Nd; mu++) { + links_single[mu] *= cf[mu]; + } + } + + // Now onto the inv param: + // Dslash type + quda_inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + // Hardwire to GCR + quda_inv_param.inv_type = QUDA_GCR_INVERTER; + quda_inv_param.compute_true_res = 0; + + quda_inv_param.kappa = 0.5; + quda_inv_param.clover_coeff = 1.0; // Dummy, not used + quda_inv_param.Ls=1; + quda_inv_param.tol = toDouble(invParam.RsdTarget); + quda_inv_param.maxiter = invParam.MaxIter; + quda_inv_param.reliable_delta = toDouble(invParam.Delta); + quda_inv_param.pipeline = invParam.Pipeline; + + quda_inv_param.solution_type = QUDA_MATPC_SOLUTION; + quda_inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; + + + quda_inv_param.matpc_type = QUDA_MATPC_ODD_ODD; + + quda_inv_param.dagger = QUDA_DAG_NO; + quda_inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; + + quda_inv_param.cpu_prec = cpu_prec; + quda_inv_param.cuda_prec = gpu_prec; + quda_inv_param.cuda_prec_sloppy = gpu_half_prec; + quda_inv_param.cuda_prec_precondition = gpu_half_prec; + quda_inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; + quda_inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + +#ifndef BUILD_QUDA_DEVIFACE_SPINOR + quda_inv_param.dirac_order = QUDA_DIRAC_ORDER; + quda_inv_param.input_location = QUDA_CPU_FIELD_LOCATION; + quda_inv_param.output_location = QUDA_CPU_FIELD_LOCATION; + +#else + quda_inv_param.dirac_order = QUDA_QDPJIT_DIRAC_ORDER; + quda_inv_param.input_location = QUDA_CUDA_FIELD_LOCATION; + quda_inv_param.output_location = QUDA_CUDA_FIELD_LOCATION; +#endif + + // Setup padding + multi1d face_size(4); + face_size[0] = latdims[1]*latdims[2]*latdims[3]/2; + face_size[1] = latdims[0]*latdims[2]*latdims[3]/2; + face_size[2] = latdims[0]*latdims[1]*latdims[3]/2; + face_size[3] = latdims[0]*latdims[1]*latdims[2]/2; + + int max_face = 
face_size[0]; + for(int i=1; i <=3; i++) { + if ( face_size[i] > max_face ) { + max_face = face_size[i]; + } + } + + q_gauge_param.ga_pad = max_face; + + // Clover precision and order + quda_inv_param.clover_cpu_prec = cpu_prec; + quda_inv_param.clover_cuda_prec = gpu_prec; + quda_inv_param.clover_cuda_prec_sloppy = gpu_half_prec; + quda_inv_param.clover_cuda_prec_precondition = gpu_half_prec; + + if( !invParam.MULTIGRIDParamsP ) { + QDPIO::cout << solver_string << "ERROR: MG Solver had MULTIGRIDParamsP set to false" << std::endl; + QDP_abort(1); + } + + // Dereference handle + const MULTIGRIDSolverParams& ip = *(invParam.MULTIGRIDParams); + + + // Set preconditioner precision + switch( ip.prec ) { + case HALF: + quda_inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + break; + + case SINGLE: + quda_inv_param.cuda_prec_precondition = QUDA_SINGLE_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_SINGLE_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_SINGLE_PRECISION; + break; + + case DOUBLE: + quda_inv_param.cuda_prec_precondition = QUDA_DOUBLE_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_DOUBLE_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_DOUBLE_PRECISION; + break; + default: + quda_inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + quda_inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION; + q_gauge_param.cuda_prec_precondition = QUDA_HALF_PRECISION; + break; + } + + switch( ip.reconstruct ) { + case RECONS_NONE: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_NO; + break; + case RECONS_8: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_8; + break; + case RECONS_12: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_12; + break; + default: + q_gauge_param.reconstruct_precondition = QUDA_RECONSTRUCT_12; + break; + }; + + // 
Set up the links + void* gauge[4]; + +#ifndef BUILD_QUDA_DEVIFACE_GAUGE + for(int mu=0; mu < Nd; mu++) { + gauge[mu] = (void *)&(links_single[mu].elem(all.start()).elem().elem(0,0).real()); + } +#else + GetMemoryPtrGauge(gauge,links_single); + // std::vector ids; + // for(int mu=0; mu < Nd; mu++) + // ids.push_back( links_single[mu].getId() ); + // std::vector dev_ptr = GetMemoryPtr( ids ); + // for(int mu=0; mu < Nd; mu++) + // gauge[mu] = dev_ptr[mu]; +#endif + + loadGaugeQuda((void *)gauge, &q_gauge_param); + + + quda_inv_param.tol_precondition = toDouble(ip.tol[0]); + quda_inv_param.maxiter_precondition = ip.maxIterations[0]; + quda_inv_param.gcrNkrylov = ip.outer_gcr_nkrylov; + quda_inv_param.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); + + //Replacing above with what's in the invert test. + switch( ip.schwarzType ) { + case ADDITIVE_SCHWARZ : + quda_inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; + break; + case MULTIPLICATIVE_SCHWARZ : + quda_inv_param.schwarz_type = QUDA_MULTIPLICATIVE_SCHWARZ; + break; + default: + quda_inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; + break; + } + quda_inv_param.precondition_cycle = 1; + //Invert test always sets this to 1. + + + if( invParam.verboseP ) { + quda_inv_param.verbosity = QUDA_VERBOSE; + } + else { + quda_inv_param.verbosity = QUDA_SUMMARIZE; + } + + quda_inv_param.verbosity_precondition = QUDA_SILENT; + + quda_inv_param.inv_type_precondition = QUDA_MG_INVERTER; + + // Setup the clover term... 
+ QDPIO::cout <create(fstate, invParam_.CloverParams); + + // Don't recompute, just copy + invclov->create(fstate, invParam_.CloverParams); + + QDPIO::cout <choles(0); + invclov->choles(1); + +#ifndef BUILD_QUDA_DEVIFACE_CLOVER +#warning "NOT USING QUDA DEVICE IFACE" + quda_inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; + + multi1d > packed_clov; + + packed_clov.resize(all.siteTable().size()); + + clov->packForQUDA(packed_clov, 0); + clov->packForQUDA(packed_clov, 1); + + // Always need inverse + multi1d > packed_invclov(all.siteTable().size()); + invclov->packForQUDA(packed_invclov, 0); + invclov->packForQUDA(packed_invclov, 1); + + loadCloverQuda(&(packed_clov[0]), &(packed_invclov[0]), &quda_inv_param); + +#else + +#warning "USING QUDA DEVICE IFACE" + + quda_inv_param.clover_location = QUDA_CUDA_FIELD_LOCATION; + quda_inv_param.clover_order = QUDA_QDPJIT_CLOVER_ORDER; + + void *clover[2]; + void *cloverInv[2]; + + GetMemoryPtrClover(clov->getOffId(),clov->getDiaId(),invclov->getOffId(),invclov->getDiaId()); + + loadCloverQuda( (void*)(clover), (void *)(cloverInv), &quda_inv_param); +#endif + + quda_inv_param.omega = toDouble(ip.relaxationOmegaOuter); + +// Copy ThresholdCount from invParams into threshold_counts. +threshold_counts = invParam.ThresholdCount; + +if(TheNamedObjMap::Instance().check(invParam.SaveSubspaceID)) +{ + StopWatch update_swatch; + update_swatch.reset(); update_swatch.start(); + // Subspace ID exists add it to mg_state + QDPIO::cout<< solver_string <<"Recovering subspace..."<(invParam.SaveSubspaceID); + for(int j=0; j < ip.mg_levels-1;++j) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = 0; + } + updateMultigridQuda(subspace_pointers->preconditioner, &(subspace_pointers->mg_param)); + update_swatch.stop(); + + QDPIO::cout << solver_string << " subspace_update_time = " + << update_swatch.getTimeInSeconds() << " sec. " << std::endl; +} +else +{ + // Create the subspace. 
+ StopWatch create_swatch; + create_swatch.reset(); create_swatch.start(); + QDPIO::cout << solver_string << "Creating Subspace" << std::endl; + subspace_pointers = QUDAMGUtils::create_subspace(invParam); + XMLBufferWriter file_xml; + push(file_xml, "FileXML"); + pop(file_xml); + + int foo = 5; + + XMLBufferWriter record_xml; + push(record_xml, "RecordXML"); + write(record_xml, "foo", foo); + pop(record_xml); + + + TheNamedObjMap::Instance().create< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setFileXML(file_xml); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setRecordXML(record_xml); + + TheNamedObjMap::Instance().getData< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID) = subspace_pointers; + create_swatch.stop(); + QDPIO::cout << solver_string << " subspace_create_time = " + << create_swatch.getTimeInSeconds() << " sec. " << std::endl; + +} +quda_inv_param.preconditioner = subspace_pointers->preconditioner; + +init_swatch.stop(); +QDPIO::cout << solver_string << " init_time = " + << init_swatch.getTimeInSeconds() << " sec. " + << std::endl; + + } + + //! Destructor is not automatic + ~MdagMSysSolverQUDAMULTIGRIDExpClover() + { + + quda_inv_param.preconditioner = nullptr; + subspace_pointers = nullptr; + freeGaugeQuda(); + freeCloverQuda(); + } + + //! Return the subset on which the operator acts + const Subset& subset() const {return A->subset();} + + //! Solver the linear system + /*! + * \param psi solution ( Modify ) + * \param chi source ( Read ) + * \return syssolver results + */ + SystemSolverResults_t operator() (T& psi, const T& chi) const + { + SystemSolverResults_t res1; + SystemSolverResults_t res2; + SystemSolverResults_t res; + + START_CODE(); + StopWatch swatch; + swatch.start(); + + // I want to use the predictor versions of the code as they have been made robust. + // So I should use either a null predictor or a zero guess predictor here. 
+ // The MG two step solve logic is quite complicated and may need to reinit the fields. + // I don't want to triplicate that logic so I'll just use a dummy predictor and call through. + ZeroGuess4DChronoPredictor dummy_predictor; + res = (*this)(psi, chi, dummy_predictor); + + + END_CODE(); + return res; + } + + + + SystemSolverResults_t operator() (T& psi, const T& chi, Chroma::AbsTwoStepChronologicalPredictor4D& predictor ) const + { + + START_CODE(); + + StopWatch swatch; + swatch.start(); + + MULTIGRIDSolverParams& ip = *(invParam.MULTIGRIDParams); + // Use this in residuum checks. + Double norm2chi=sqrt(norm2(chi, A->subset())); + + // Allow QUDA to use initial guess + QudaUseInitGuess old_guess_policy = quda_inv_param.use_init_guess; + quda_inv_param.use_init_guess = QUDA_USE_INIT_GUESS_YES; + + + SystemSolverResults_t res; + SystemSolverResults_t res1; + SystemSolverResults_t res2; + + // Create MdagM op + Handle< LinearOperator > MdagM( new MdagMLinOp(A) ); + + + QDPIO::cout << solver_string <<"Two Step Solve" << std::endl; + + + // Try to cast the predictor to a two step predictor + StopWatch X_prediction_timer; X_prediction_timer.reset(); + StopWatch Y_prediction_timer; Y_prediction_timer.reset(); + StopWatch Y_solve_timer; Y_solve_timer.reset(); + StopWatch X_solve_timer; X_solve_timer.reset(); + StopWatch Y_predictor_add_timer; Y_predictor_add_timer.reset(); + StopWatch X_predictor_add_timer; X_predictor_add_timer.reset(); + StopWatch X_refresh_timer; X_refresh_timer.reset(); + StopWatch Y_refresh_timer; Y_refresh_timer.reset(); + + QDPIO::cout << solver_string << "Predicting Y" << std::endl; + Y_prediction_timer.start(); + T Y_prime = zero; + { + T tmp_vec = psi; + predictor.predictY(tmp_vec, *A, chi); // Predicts for M^\dagger Y = chi + + // We are going to solve M \gamma + Y_prime = Gamma(Nd*Nd-1)*tmp_vec; + } + Y_prediction_timer.stop(); + + // Y solve: M^\dagger Y = chi + // g_5 M g_5 Y = chi + // => M Y' = chi' with chi' = gamma_5*chi + + 
Y_solve_timer.start(); + + + T g5chi = zero; + T Y = zero; + g5chi[rb[1]]= Gamma(Nd*Nd-1)*chi; + + // Y solve at 0.5 * Target Residuum -- Evan's bound + quda_inv_param.tol = toDouble(Real(0.5)*invParam.RsdTarget); + if( invParam.asymmetricP == true ) { + res1 = qudaInvert(*clov, + *invclov, + g5chi, + Y_prime); + Y[rb[1]] = Gamma(Nd*Nd -1)*Y_prime; + } + else { + T tmp = zero; + invclov->apply(tmp,g5chi,MINUS,1); + + res1 = qudaInvert(*clov, + *invclov, + tmp, + Y_prime); +#ifdef QUDA_DEBUG + { + char Y_prime_norm[256]; + char Y_prime_norm_full[256]; + std::sprintf(Y_prime_norm, "%.*e", DECIMAL_DIG, toDouble(norm2(Y_prime, A->subset()))); + std::sprintf(Y_prime_norm_full, "%.*e", DECIMAL_DIG, toDouble(norm2(Y_prime))); + QDPIO::cout << "Y solution: norm2(subset) = " << Y_prime_norm << " norm(full) = " << Y_prime_norm_full << std::endl; + } +#endif + tmp[rb[1]] = Gamma(Nd*Nd-1)*Y_prime; + clov->apply(Y,tmp,MINUS,1); + + } + + + bool solution_good = true; + + // Check solution + { + T r=zero; + r[A->subset()]=chi; + T tmp; + (*A)(tmp, Y, MINUS); + r[A->subset()] -= tmp; + + res1.resid = sqrt(norm2(r, A->subset())); + QDPIO::cout << "Y-solve: ||r||=" << res1.resid << " ||r||/||b||=" + << res1.resid/sqrt(norm2(chi,rb[1])) << std::endl; + if ( toBool( res1.resid/norm2chi > invParam.RsdToleranceFactor * invParam.RsdTarget ) ) { + solution_good = false; + } + } + + if ( solution_good ) { + if( res1.n_count >= threshold_counts ) { + QDPIO::cout << solver_string << "Iteration Threshold Exceeded! 
Y Solver iters = " << res1.n_count << " Threshold=" << threshold_counts << std::endl; + QDPIO::cout << solver_string << "Refreshing Subspace" << std::endl; + + Y_refresh_timer.start(); + // refresh the subspace + // Setup the number of subspace Iterations + for(int j=0; j < ip.mg_levels-1; j++) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = ip.maxIterSubspaceRefresh[j]; + } + updateMultigridQuda(subspace_pointers->preconditioner, &(subspace_pointers->mg_param)); + for(int j=0; j < ip.mg_levels-1; j++) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = 0; + } + Y_refresh_timer.stop(); + QDPIO::cout << solver_string << "Subspace Refresh Time = " << Y_refresh_timer.getTimeInSeconds() << " secs\n"; + } + } + else { + QDPIO::cout << solver_string << "Y-Solve failed (seq: "<< seqno <<"). Blowing away and reiniting subspace" << std::endl; + StopWatch reinit_timer; reinit_timer.reset(); + reinit_timer.start(); + + // Delete the saved subspace completely + QUDAMGUtils::delete_subspace(invParam.SaveSubspaceID); + + // Recreate the subspace + bool saved_value = ip.check_multigrid_setup; + ip.check_multigrid_setup = true; + subspace_pointers = QUDAMGUtils::create_subspace(invParam); + ip.check_multigrid_setup = saved_value; + + // Make subspace XML snippets + XMLBufferWriter file_xml; + push(file_xml, "FileXML"); + pop(file_xml); + + int foo = 5; + XMLBufferWriter record_xml; + push(record_xml, "RecordXML"); + write(record_xml, "foo", foo); + pop(record_xml); + + + // Create named object entry. 
+ TheNamedObjMap::Instance().create< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setFileXML(file_xml); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setRecordXML(record_xml); + + // Assign the pointer into the named object + TheNamedObjMap::Instance().getData< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID) = subspace_pointers; + quda_inv_param.preconditioner = subspace_pointers->preconditioner; + + reinit_timer.stop(); + QDPIO::cout << solver_string << "Subspace Reinit Time: " << reinit_timer.getTimeInSeconds() << " sec." << std::endl; + + // Re-solve + QDPIO::cout << solver_string << "Re-Solving for Y with zero guess" << std::endl; + SystemSolverResults_t res_tmp; + + Y_prime = zero; + if( invParam.asymmetricP == true ) { + res_tmp = qudaInvert(*clov, + *invclov, + g5chi, + Y_prime); + Y[rb[1]] = Gamma(Nd*Nd -1)*Y_prime; + } + else { + T tmp = zero; + invclov->apply(tmp,g5chi,MINUS,1); + res_tmp = qudaInvert(*clov, + *invclov, + tmp, + Y_prime); + +#ifdef QUDA_DEBUG + { + char Y_prime_norm[256]; + char Y_prime_norm_full[256]; + std::sprintf(Y_prime_norm, "%.*e", DECIMAL_DIG, toDouble(norm2(Y_prime, A->subset()))); + std::sprintf(Y_prime_norm_full, "%.*e", DECIMAL_DIG, toDouble(norm2(Y_prime))); + QDPIO::cout << "Y solution: norm2(subset) = " << Y_prime_norm << " norm(full) = " << Y_prime_norm_full << std::endl; + } +#endif + + tmp[rb[1]] = Gamma(Nd*Nd-1)*Y_prime; + clov->apply(Y,tmp,MINUS,1); + + } + + // Check solution + { + T r=zero; + r[A->subset()]=chi; + T tmp; + (*A)(tmp, Y,MINUS); + r[A->subset()] -= tmp; + + res_tmp.resid = sqrt(norm2(r, A->subset())); + if ( toBool( res_tmp.resid/sqrt(norm2(chi)) > invParam.RsdToleranceFactor * invParam.RsdTarget ) ) { + QDPIO::cout << solver_string << "Re Solve for Y Failed (seq: " << seqno << " ) Rsd = " << res_tmp.resid/norm2chi << " RsdTarget = " << invParam.RsdTarget << std::endl; + QDPIO::cout << solver_string << 
"Throwing Exception! This will ABORT" << std::endl; + + dumpYSolver(g5chi,Y_prime); + + MGSolverException convergence_fail(invParam.CloverParams.Mass, + invParam.SaveSubspaceID, + res_tmp.n_count, + Real(res_tmp.resid/norm2chi), + invParam.RsdTarget*invParam.RsdToleranceFactor); + throw convergence_fail; + } + } // Check solution + + // threhold count is good, and solution is good + res1.n_count += res_tmp.n_count; // Add resolve iterations + res1.resid = res_tmp.resid; // Copy new residuum. + + } + Y_solve_timer.stop(); + + // At this point we should have a good solution. + Y_predictor_add_timer.start(); + predictor.newYVector(Y); + Y_predictor_add_timer.stop(); + + X_prediction_timer.start(); + // Can predict psi in the usual way without reference to Y + predictor.predictX(psi, (*MdagM), chi); + X_prediction_timer.stop(); + + // Restore resid target for X solve + quda_inv_param.tol = toDouble(invParam.RsdTarget); + X_solve_timer.start(); + // Solve for psi + res2 = qudaInvert(*clov, + *invclov, + Y, + psi); +#ifdef QUDA_DEBUG + { + char X_prime_norm[256]; + char X_prime_norm_full[256]; + std::sprintf(X_prime_norm, "%.*e", DECIMAL_DIG, toDouble(norm2(psi, A->subset()))); + std::sprintf(X_prime_norm_full, "%.*e", DECIMAL_DIG, toDouble(norm2(psi))); + QDPIO::cout << "X solution: norm2(subset) = " << X_prime_norm << " norm(full) = " << X_prime_norm_full << std::endl; + } +#endif + solution_good = true; + + // Check solution + { + T r; + r[A->subset()]=chi; + T tmp; + (*MdagM)(tmp, psi, PLUS); + r[A->subset()] -= tmp; + + res2.resid = sqrt(norm2(r, A->subset())); + if ( toBool( res2.resid/norm2chi > invParam.RsdToleranceFactor * invParam.RsdTarget ) ) { + solution_good = false; + } + } + + if( solution_good ) { + if( res2.n_count >= threshold_counts ) { + QDPIO::cout << solver_string <<"Threshold Reached! 
X Solver iters = " << res2.n_count << " Threshold=" << threshold_counts << std::endl; + QDPIO::cout << solver_string << "Refreshing Subspace" << std::endl; + + X_refresh_timer.start(); + // refresh the subspace + // Regenerate space. Destroy and recreate + // Setup the number of subspace Iterations + for(int j=0; j < ip.mg_levels-1; j++) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = ip.maxIterSubspaceRefresh[j]; + } + updateMultigridQuda(subspace_pointers->preconditioner, &(subspace_pointers->mg_param)); + for(int j=0; j < ip.mg_levels-1; j++) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = 0; + } + X_refresh_timer.stop(); + + QDPIO::cout << solver_string << "X Subspace Refresh Time = " << X_refresh_timer.getTimeInSeconds() << " secs\n"; + } + } + else { + + QDPIO::cout << solver_string << "X-Solve failed (seq: "<(invParam); + ip.check_multigrid_setup = saved_value; + + + // Make subspace XML snippets + XMLBufferWriter file_xml; + push(file_xml, "FileXML"); + pop(file_xml); + + int foo = 5; + XMLBufferWriter record_xml; + push(record_xml, "RecordXML"); + write(record_xml, "foo", foo); + pop(record_xml); + + + // Create named object entry. + TheNamedObjMap::Instance().create< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setFileXML(file_xml); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setRecordXML(record_xml); + + // Assign the pointer into the named object + TheNamedObjMap::Instance().getData< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID) = subspace_pointers; + quda_inv_param.preconditioner = subspace_pointers->preconditioner; + reinit_timer.stop(); + QDPIO::cout << solver_string << "Subspace Reinit Time: " << reinit_timer.getTimeInSeconds() << " sec." 
<< std::endl; + + // Re-solve + QDPIO::cout << solver_string << "Re-Solving for X with zero guess" << std::endl; + SystemSolverResults_t res_tmp; + psi = zero; + res_tmp = qudaInvert(*clov, + *invclov, + Y, + psi); +#ifdef QUDA_DEBUG + { + char X_prime_norm[256]; + char X_prime_norm_full[256]; + std::sprintf(X_prime_norm, "%.*e", DECIMAL_DIG, toDouble(norm2(psi, A->subset()))); + std::sprintf(X_prime_norm_full, "%.*e", DECIMAL_DIG, toDouble(norm2(psi))); + QDPIO::cout << "X solution: norm2(subset) = " << X_prime_norm << " norm(full) = " << X_prime_norm_full << std::endl; + } +#endif + + + // Check solution + { + T r; + r[A->subset()]=chi; + T tmp; + (*MdagM)(tmp, psi, PLUS); + r[A->subset()] -= tmp; + + res_tmp.resid = sqrt(norm2(r, A->subset())); + if ( toBool( res_tmp.resid/norm2chi > invParam.RsdToleranceFactor * invParam.RsdTarget ) ) { + QDPIO::cout << solver_string << "Re Solve for X Failed (seq: " << seqno << " ) Rsd = " << res_tmp.resid/norm2chi << " RsdTarget = " << invParam.RsdTarget << std::endl; + QDPIO::cout << solver_string << "Throwing Exception! 
This will ABORT" << std::endl; + + dumpXSolver(chi,Y,psi); + + MGSolverException convergence_fail(invParam.CloverParams.Mass, + invParam.SaveSubspaceID, + res_tmp.n_count, + Real(res_tmp.resid/norm2chi), + invParam.RsdTarget*invParam.RsdToleranceFactor); + throw convergence_fail; + + QDP_abort(1); + } + } + // At this point the solution is good + res2.n_count += res_tmp.n_count; + res2.resid = res_tmp.resid; + + } + X_solve_timer.stop(); + + X_predictor_add_timer.start(); + predictor.newXVector(psi); + X_predictor_add_timer.stop(); + swatch.stop(); + double time = swatch.getTimeInSeconds(); + + res.n_count = res1.n_count + res2.n_count; + res.resid = res2.resid; + + Double rel_resid = res.resid/norm2chi; + + QDPIO::cout << solver_string << " seq: " << (seqno++) << " iterations: " << res1.n_count << " + " + << res2.n_count << " = " << res.n_count + << " Rsd = " << res.resid << " Relative Rsd = " << rel_resid << std::endl; + + QDPIO::cout <subset())); + + // Allow QUDA to use initial guess + QudaUseInitGuess old_guess_policy = quda_inv_param.use_init_guess; + quda_inv_param.use_init_guess = QUDA_USE_INIT_GUESS_YES; + + + SystemSolverResults_t res; + SystemSolverResults_t res1; + SystemSolverResults_t res2; + + // Create MdagM op + Handle< LinearOperator > MdagM( new MdagMLinOp(A) ); + + + QDPIO::cout << solver_string <<"Two Step Solve" << std::endl; + + + // Try to cast the predictor to a two step predictor + StopWatch Y_solve_timer; Y_solve_timer.reset(); + StopWatch X_solve_timer; X_solve_timer.reset(); + StopWatch Y_refresh_timer; Y_refresh_timer.reset(); + StopWatch X_refresh_timer; X_refresh_timer.reset(); + int X_index=predictor.getXIndex(); + int Y_index=predictor.getYIndex(); + + QDPIO::cout << "Two Step Solve using QUDA predictor: (Y_index,X_index) = ( " << Y_index << " , " << X_index << " ) \n"; + + + // Select the channel for QUDA's predictor here. 
+ // + // + + quda_inv_param.chrono_max_dim = predictor.getMaxChrono(); + quda_inv_param.chrono_index = Y_index; + quda_inv_param.chrono_make_resident = true; + quda_inv_param.chrono_use_resident = true; + quda_inv_param.chrono_replace_last = false; + + // Y solve is at 0.5*RsdTarget -- Evan's analysis + quda_inv_param.tol = toDouble(Real(0.5)*invParam.RsdTarget); + if ( predictor.getChronoPrecision() == DEFAULT ) { + QDPIO::cout << "Setting Default Chrono precision of " << cpu_prec << std::endl; + quda_inv_param.chrono_precision = cpu_prec; + } + else { + quda_inv_param.chrono_precision = theChromaToQudaPrecisionTypeMap::Instance()[ predictor.getChronoPrecision() ]; + QDPIO::cout << "Setting Chrono precision of " << quda_inv_param.chrono_precision << std::endl; + } + + /// channel set done + T Y_prime = zero; + T Y = zero; + // Y solve: M^\dagger Y = chi + // g_5 M g_5 Y = chi + // => M Y' = chi' with chi' = gamma_5*chi + Y_solve_timer.start(); + T g5chi = zero; + g5chi[rb[1]]= Gamma(Nd*Nd-1)*chi; + if( invParam.asymmetricP == true ) { + res1 = qudaInvert(*clov, + *invclov, + g5chi, + Y_prime); + Y[rb[1]] = Gamma(Nd*Nd -1)*Y_prime; + } + else { + T tmp = zero; + invclov->apply(tmp,g5chi,MINUS,1); + + res1 = qudaInvert(*clov, + *invclov, + tmp, + Y_prime); + +#ifdef QUDA_DEBUG + { + char Y_prime_norm[256]; + char Y_prime_norm_full[256]; + std::sprintf(Y_prime_norm, "%.*e", DECIMAL_DIG, toDouble(norm2(Y_prime, A->subset()))); + std::sprintf(Y_prime_norm_full, "%.*e", DECIMAL_DIG, toDouble(norm2(Y_prime))); + QDPIO::cout << "Y solution: norm2(subset) = " << Y_prime_norm << " norm(full) = " << Y_prime_norm_full << std::endl; + } +#endif + + tmp[rb[1]] = Gamma(Nd*Nd-1)*Y_prime; + clov->apply(Y,tmp,MINUS,1); + + } + + bool solution_good = true; + + // Check solution + { + T r; + r[A->subset()]=chi; + T tmp; + (*A)(tmp, Y, MINUS); + r[A->subset()] -= tmp; + + res1.resid = sqrt(norm2(r, A->subset())); + if ( toBool( res1.resid/norm2chi > invParam.RsdToleranceFactor * 
invParam.RsdTarget ) ) { + solution_good = false; + } + } + + if ( solution_good ) { + if( res1.n_count >= threshold_counts ) { + QDPIO::cout << solver_string << "Iteration Threshold Exceeded:Y Solver iters = " << res1.n_count << " Threshold=" << threshold_counts << std::endl; + QDPIO::cout << solver_string << "Refreshing Subspace" << std::endl; + + Y_refresh_timer.start(); + // refresh the subspace + // Setup the number of subspace Iterations + for(int j=0; j < ip.mg_levels-1; j++) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = ip.maxIterSubspaceRefresh[j]; + } + updateMultigridQuda(subspace_pointers->preconditioner, &(subspace_pointers->mg_param)); + for(int j=0; j < ip.mg_levels-1; j++) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = 0; + } + Y_refresh_timer.stop(); + QDPIO::cout << solver_string << "Y Subspace Refresh Time = " << Y_refresh_timer.getTimeInSeconds() << " secs\n"; + } + } + else { + QDPIO::cout << solver_string << "Y-Solve failed (seq: "<(invParam); + ip.check_multigrid_setup = saved_value; + + + // Make subspace XML snippets + XMLBufferWriter file_xml; + push(file_xml, "FileXML"); + pop(file_xml); + + int foo = 5; + XMLBufferWriter record_xml; + push(record_xml, "RecordXML"); + write(record_xml, "foo", foo); + pop(record_xml); + + + // Create named object entry. + TheNamedObjMap::Instance().create< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setFileXML(file_xml); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setRecordXML(record_xml); + + // Assign the pointer into the named object + TheNamedObjMap::Instance().getData< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID) = subspace_pointers; + quda_inv_param.preconditioner = subspace_pointers->preconditioner; + reinit_timer.stop(); + QDPIO::cout << solver_string << "Subspace Reinit Time: " << reinit_timer.getTimeInSeconds() << " sec." 
<< std::endl; + + // Re-solve + // This is a re-solve. So use_resident=false means used my initial guess + // (do not repredict) + quda_inv_param.chrono_use_resident = false; + + // The last solve, stored a chrono vector. We will overwrite this + // thanks to the setting below + quda_inv_param.chrono_replace_last = true; + + QDPIO::cout << solver_string << "Re-Solving for Y (zero guess)" << std::endl; + SystemSolverResults_t res_tmp; + Y_prime = zero; + + if( invParam.asymmetricP == true ) { + res_tmp = qudaInvert(*clov, + *invclov, + g5chi, + Y_prime); + Y[rb[1]] = Gamma(Nd*Nd -1)*Y_prime; + } + else { + T tmp = zero; + invclov->apply(tmp,g5chi,MINUS,1); + + res_tmp = qudaInvert(*clov, + *invclov, + tmp, + Y_prime); + +#ifdef QUDA_DEBUG + { + char Y_prime_norm[256]; + char Y_prime_norm_full[256]; + std::sprintf(Y_prime_norm, "%.*e", DECIMAL_DIG, toDouble(norm2(Y_prime, A->subset()))); + std::sprintf(Y_prime_norm_full, "%.*e", DECIMAL_DIG, toDouble(norm2(Y_prime))); + QDPIO::cout << "Y solution: norm2(subset) = " << Y_prime_norm << " norm(full) = " << Y_prime_norm_full << std::endl; + } +#endif + tmp[rb[1]] = Gamma(Nd*Nd-1)*Y_prime; + clov->apply(Y,tmp,MINUS,1); + } + + // Check solution + { + T r; + r[A->subset()]=chi; + T tmp; + (*A)(tmp, Y, MINUS); + r[A->subset()] -= tmp; + + res_tmp.resid = sqrt(norm2(r, A->subset())); + if ( toBool( res_tmp.resid/norm2chi > invParam.RsdToleranceFactor * invParam.RsdTarget ) ) { + // If we fail on the resolve then barf + QDPIO::cout << solver_string << "Re Solve for Y Failed (seq: " << seqno << " ) Rsd = " << res_tmp.resid/norm2chi << " RsdTarget = " << invParam.RsdTarget << std::endl; + + dumpYSolver(g5chi,Y_prime); + + QDPIO::cout << solver_string << "Throwing Exception! 
This will ABORT" << std::endl; + + MGSolverException convergence_fail(invParam.CloverParams.Mass, + invParam.SaveSubspaceID, + res_tmp.n_count, + Real(res_tmp.resid/norm2chi), + invParam.RsdTarget*invParam.RsdToleranceFactor); + throw convergence_fail; + } + } + + // At this point solution should be good again and subspace should be reinited + res1.n_count += res_tmp.n_count; // Add resolve iterations + res1.resid = res_tmp.resid; // Copy new residuum. + + } + Y_solve_timer.stop(); + + // At this point we should have a good solution. + // After the good solve, solution will be added to the right channel + // by QUDA + // Some diagnostics would be nice + + + // Now select QUDA Chrono Index here + quda_inv_param.chrono_max_dim = predictor.getMaxChrono(); + quda_inv_param.chrono_index = X_index; + quda_inv_param.chrono_make_resident = true; + quda_inv_param.chrono_use_resident = true; + quda_inv_param.chrono_replace_last = false; + + // Reset Target Residuum for X solve + quda_inv_param.tol = toDouble(invParam.RsdTarget); + X_solve_timer.start(); + //psi[A->subset()]=zero; + psi = zero; + // Solve for psi + res2 = qudaInvert(*clov, + *invclov, + Y, + psi); +#ifdef QUDA_DEBUG + { + char X_prime_norm[256]; + char X_prime_norm_full[256]; + std::sprintf(X_prime_norm, "%.*e", DECIMAL_DIG, toDouble(norm2(psi, A->subset()))); + std::sprintf(X_prime_norm_full, "%.*e", DECIMAL_DIG, toDouble(norm2(psi))); + QDPIO::cout << "X solution: norm2(subset) = " << X_prime_norm << " norm(full) = " << X_prime_norm_full << std::endl; + } +#endif + + + solution_good = true; + // Check solution + { + T r=zero; + r[A->subset()]=Y; + T tmp=zero; + // Checkin MX = Y solve + (*A)(tmp, psi, PLUS); + r[ A->subset() ] -= tmp; + Double resid_MXY = sqrt(norm2(r,A->subset())); + Double normY = sqrt(norm2(Y,A->subset())); + QDPIO::cout << "X solve: || Y - MX || / || Y || = " << resid_MXY/normY << std::endl; + r[A->subset()]=chi; + (*MdagM)(tmp, psi, PLUS); + r[A->subset()] -= tmp; + + res2.resid = 
sqrt(norm2(r, A->subset())); + if ( toBool( res2.resid/norm2chi > invParam.RsdToleranceFactor * invParam.RsdTarget ) ) { + solution_good = false; + } + } + + + if( solution_good ) { + if( res2.n_count >= threshold_counts ) { + QDPIO::cout << solver_string <<"Threshold Reached: X Solver iters = " << res2.n_count << " Threshold=" << threshold_counts << std::endl; + QDPIO::cout << solver_string << "Refreshing Subspace" << std::endl; + + X_refresh_timer.start(); + // refresh the subspace + // Regenerate space. Destroy and recreate + // Setup the number of subspace Iterations + for(int j=0; j < ip.mg_levels-1; j++) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = ip.maxIterSubspaceRefresh[j]; + } + updateMultigridQuda(subspace_pointers->preconditioner, &(subspace_pointers->mg_param)); + for(int j=0; j < ip.mg_levels-1; j++) { + (subspace_pointers->mg_param).setup_maxiter_refresh[j] = 0; + } + X_refresh_timer.stop(); + + QDPIO::cout << solver_string << "Subspace Refresh Time = " << X_refresh_timer.getTimeInSeconds() << " secs\n"; + } + } + else { + + QDPIO::cout << solver_string << "X-Solve failed (seq: "<(invParam); + ip.check_multigrid_setup = saved_value; + + // Make subspace XML snippets + XMLBufferWriter file_xml; + push(file_xml, "FileXML"); + pop(file_xml); + + int foo = 5; + XMLBufferWriter record_xml; + push(record_xml, "RecordXML"); + write(record_xml, "foo", foo); + pop(record_xml); + + + // Create named object entry. 
+ TheNamedObjMap::Instance().create< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setFileXML(file_xml); + TheNamedObjMap::Instance().get(invParam.SaveSubspaceID).setRecordXML(record_xml); + + // Assign the pointer into the named object + TheNamedObjMap::Instance().getData< QUDAMGUtils::MGSubspacePointers* >(invParam.SaveSubspaceID) = subspace_pointers; + quda_inv_param.preconditioner = subspace_pointers->preconditioner; + reinit_timer.stop(); + QDPIO::cout << solver_string << "Subspace Reinit Time: " << reinit_timer.getTimeInSeconds() << " sec." << std::endl; + + // Re-solve + // This is a re-solve. So use_resident=false means used my initial guess + // (do not repredict) + quda_inv_param.chrono_use_resident = false; + + // The last solve, stored a chrono vector. We will overwrite this + // thanks to the setting below + quda_inv_param.chrono_replace_last = true; + + QDPIO::cout << solver_string << "Re-Solving for X (zero guess)" << std::endl; + + SystemSolverResults_t res_tmp; + + // psi[rb[1]] = zero; + psi = zero; + res_tmp = qudaInvert(*clov, + *invclov, + Y, + psi); + +#ifdef QUDA_DEBUG + { + char X_prime_norm[256]; + char X_prime_norm_full[256]; + std::sprintf(X_prime_norm, "%.*e", DECIMAL_DIG, toDouble(norm2(psi, A->subset()))); + std::sprintf(X_prime_norm_full, "%.*e", DECIMAL_DIG, toDouble(norm2(psi))); + QDPIO::cout << "X solution: norm2(subset) = " << X_prime_norm << " norm(full) = " << X_prime_norm_full << std::endl; + } +#endif + // Check solution + { + T r=zero; + r[A->subset()]=Y; + T tmp=zero; + // Checkin MX = Y solve + (*A)(tmp, psi, PLUS); + r[ A->subset() ] -= tmp; + Double resid_MXY = sqrt(norm2(r,A->subset())); + Double normY = sqrt(norm2(Y,A->subset())); + QDPIO::cout << "X re-solve: || Y - MX || / || Y || = " << resid_MXY/normY << std::endl; + r[A->subset()]=chi; + (*MdagM)(tmp, psi, PLUS); + r[A->subset()] -= tmp; + + res_tmp.resid = sqrt(norm2(r, A->subset())); + if ( 
toBool( res_tmp.resid/norm2chi > invParam.RsdToleranceFactor * invParam.RsdTarget ) ) { + QDPIO::cout << solver_string << "Re Solve for X Failed (seq: " << seqno << " ) Rsd = " << res_tmp.resid/norm2chi << " RsdTarget = " << invParam.RsdTarget << std::endl; + + QDPIO::cout << "Dumping state (solve seqno : " << seqno << " ) " << std::endl; + dumpXSolver(chi,Y,psi); + + + QDPIO::cout << solver_string << "Throwing Exception! This will ABORT" << std::endl; + MGSolverException convergence_fail(invParam.CloverParams.Mass, + invParam.SaveSubspaceID, + res_tmp.n_count, + Real(res_tmp.resid/norm2chi), + invParam.RsdTarget*invParam.RsdToleranceFactor); + throw convergence_fail; + } + } + // At this point the solution is good + res2.n_count += res_tmp.n_count; + res2.resid = res_tmp.resid; + + } + X_solve_timer.stop(); + swatch.stop(); + double time = swatch.getTimeInSeconds(); + + + + // Stats and done + res.n_count = res1.n_count + res2.n_count; + res.resid = res2.resid; + + Double rel_resid = res.resid/norm2chi; + + QDPIO::cout << solver_string << " seq: " << (seqno++) << " iterations: " << res1.n_count << " + " + << res2.n_count << " = " << res.n_count + << " Rsd = " << res.resid << " Relative Rsd = " << rel_resid << std::endl; + + QDPIO::cout << "Y_solve: " << Y_solve_timer.getTimeInSeconds() << " (s) " + << "X_solve: " << X_solve_timer.getTimeInSeconds() << " (s) " + << "Total time: " << time << "(s)" << std::endl; + + quda_inv_param.use_init_guess = old_guess_policy; + + // Turn off chrono. 
Next solve can turn it on again + quda_inv_param.chrono_make_resident = false; + quda_inv_param.chrono_use_resident = false; + quda_inv_param.chrono_replace_last = false; + + + return res; + } + + + SystemSolverResults_t operator() (T& psi, const T& chi, Chroma::AbsChronologicalPredictor4D& predictor ) const + { + SystemSolverResults_t res; + + // Try using QUDA predictor + try { + Chroma::QUDA4DChronoPredictor& quda_pred = + dynamic_cast(predictor); + + res = (*this)(psi,chi,quda_pred); + return res; + } + catch(MGSolverException &e) { + throw; + } + catch(...) { + QDPIO::cout << "Failed to cast predictor to QUDA predictor" + << std::endl; + } + + // QUDA Predictor failed -- Try abs 2 step + try { + Chroma::AbsTwoStepChronologicalPredictor4D& two_step_pred = + dynamic_cast< Chroma::AbsTwoStepChronologicalPredictor4D&>(predictor); + + res = (*this)(psi,chi,two_step_pred); + return res; + } + catch(MGSolverException &e) { + throw; + } + catch(...) { + QDPIO::cout << "Failed to cast predictor to QUDA or Two Step predictor" + << std::endl; + QDP_abort(1); + } + } + + +private: + // Hide default constructor + MdagMSysSolverQUDAMULTIGRIDExpClover() {} + +#if 1 + Q links_orig; +#endif + + U GFixMat; + QudaPrecision_s cpu_prec; + QudaPrecision_s gpu_prec; + QudaPrecision_s gpu_half_prec; + + Handle< LinearOperator > A; + Handle< FermState > gstate; + mutable SysSolverQUDAMULTIGRIDCloverParams invParam; + QudaGaugeParam q_gauge_param; + mutable QudaInvertParam quda_inv_param; + mutable QUDAMGUtils::MGSubspacePointers* subspace_pointers; + + + Handle< ExpCloverTermT > clov; + Handle< ExpCloverTermT > invclov; + + SystemSolverResults_t qudaInvert(const ExpCloverTermT& clover, + const ExpCloverTermT& inv_clov, + const T& chi_s, + T& psi_s + )const; + + std::string solver_string; + int threshold_counts; + + void dumpYSolver(const LatticeFermion& chi, + const LatticeFermion& Y) const; + + void dumpXSolver(const LatticeFermion& chi, + const LatticeFermion& Y, + const 
LatticeFermion& X) const; + + static unsigned long seqno; + +}; + +} // End namespace + +#endif // BUILD_QUDA +#endif + diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_wilson_quda_w.h b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_wilson_quda_w.h index d3dcfc4173..8f119e68df 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_wilson_quda_w.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_mdagm_wilson_quda_w.h @@ -315,18 +315,6 @@ namespace Chroma quda_inv_param.dirac_order = QUDA_DIRAC_ORDER; quda_inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - // Autotuning - if( invParam.tuneDslashP ) { - QDPIO::cout << "Enabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_YES; - } - else { - QDPIO::cout << "Disabling Dslash Autotuning" << std::endl; - - quda_inv_param.tune = QUDA_TUNE_NO; - } - // Setup padding multi1d face_size(4); diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_clover_params.cc b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_clover_params.cc index a32fe32690..b73e90ee29 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_clover_params.cc +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_clover_params.cc @@ -34,7 +34,7 @@ namespace Chroma { read(paramtop, "AsymmetricLinop", asymmetricP); } else { - asymmetricP = false; // Symmetric is default + asymmetricP = true; // Asymmetric (i.e. CLOVER) is default } if( paramtop.count("CudaPrecision") > 0 ) { @@ -86,14 +86,6 @@ namespace Chroma { RsdToleranceFactor = Real(10); // Tolerate an order of magnitude difference by default. 
} - if( paramtop.count("AutotuneDslash") > 0 ) { - read(paramtop, "AutotuneDslash", tuneDslashP); - } - else { - tuneDslashP = false; - } - QDPIO::cout << "tuneDslasP = " << tuneDslashP << std::endl; - if( paramtop.count("GCRInnerParams") > 0 ) { innerParams = new GCRInnerSolverParams(paramtop, "./GCRInnerParams"); @@ -128,6 +120,25 @@ namespace Chroma { // Default Pipeline = 0 Pipeline=0; } + + + if ( paramtop.count("SolutionCheckP") > 0 ) { + read(paramtop, "SolutionCheckP", SolutionCheckP); + } + else { + SolutionCheckP = true; // default solution check is on + } + + if ( paramtop.count("GridSplitDims") > 0) { + read(paramtop, "GridSplitDims", GridSplitDims); + } + else { + GridSplitDims.resize(Nd); + GridSplitDims[0]=1; + GridSplitDims[1]=1; + GridSplitDims[2]=1; + GridSplitDims[3]=1; + } } void read(XMLReader& xml, const std::string& path, @@ -156,13 +167,13 @@ namespace Chroma { write(xml, "SilentFail", p.SilentFailP); write(xml, "RsdToleranceFactor", p.RsdToleranceFactor); - write(xml, "AutotuneDslash", p.tuneDslashP); if( p.innerParamsP ) { write(xml, "GCRInnerParams", *(p.innerParams)); } write(xml, "DumpOnFail", p.dump_on_failP); write(xml, "Pipeline", p.Pipeline); + write(xml, "GridSplitDims", p.GridSplitDims); if( p.backup_invP ) { // Need to dump out the XML for the back up solver here... 
diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_clover_params.h b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_clover_params.h index 2133bc4915..cb5621b6d4 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_clover_params.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_clover_params.h @@ -14,22 +14,26 @@ namespace Chroma { struct SysSolverQUDACloverParams { SysSolverQUDACloverParams(XMLReader& xml, const std::string& path); - SysSolverQUDACloverParams() { + SysSolverQUDACloverParams() : GridSplitDims(Nd) { solverType=CG; cudaPrecision=DEFAULT; cudaReconstruct=RECONS_12; cudaSloppyPrecision=DEFAULT; cudaSloppyReconstruct=RECONS_12; - asymmetricP = false; //< Use asymmetric version of the linear operator + asymmetricP = true; //< Use asymmetric version of the linear operator axialGaugeP = false; //< Fix Axial Gauge? SilentFailP = false; //< If set to true ignore lack of convergence. Default is 'loud' RsdToleranceFactor = Real(10); //< Tolerate if the solution achived is better (less) than rsdToleranceFactor*RsdTarget - tuneDslashP = false ; //< v0.3 autotune feature verboseP = false; innerParamsP = false; backup_invP = false; dump_on_failP = false; Pipeline = 1; + SolutionCheckP = true; + GridSplitDims[0] = 1; + GridSplitDims[1] = 1; + GridSplitDims[2] = 1; + GridSplitDims[3] = 1; }; SysSolverQUDACloverParams( const SysSolverQUDACloverParams& p) { @@ -48,13 +52,15 @@ namespace Chroma axialGaugeP = p.axialGaugeP; SilentFailP = p.SilentFailP; RsdToleranceFactor = p.RsdToleranceFactor; - tuneDslashP = p.tuneDslashP; innerParamsP = p.innerParamsP; innerParams = p.innerParams; backup_invP = p.backup_invP; backup_inv_param = p.backup_inv_param; dump_on_failP = p.dump_on_failP; Pipeline = p.Pipeline; + SolutionCheckP = p.SolutionCheckP; + GridSplitDims.resize(Nd); + for(int i=0; i < Nd; i++) GridSplitDims[i] = p.GridSplitDims[i]; } @@ -73,7 +79,6 @@ namespace Chroma bool axialGaugeP; bool SilentFailP; Real RsdToleranceFactor; 
- bool tuneDslashP; bool innerParamsP; // GCR Specific params @@ -87,6 +92,8 @@ namespace Chroma // Pipeline depth int Pipeline; + bool SolutionCheckP; + multi1d GridSplitDims; }; void read(XMLReader& xml, const std::string& path, SysSolverQUDACloverParams& p); diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.cc b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.cc index 561cdd5710..62e6993b96 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.cc +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.cc @@ -34,7 +34,7 @@ namespace Chroma { read(paramtop, "AsymmetricLinop", asymmetricP); } else { - asymmetricP = false; // Symmetric is default + asymmetricP = true; // Asymmetric (i.e. CLOVER) is default } if( paramtop.count("CudaPrecision") > 0 ) { @@ -86,14 +86,6 @@ namespace Chroma { RsdToleranceFactor = Real(10); // Tolerate an order of magnitude difference by default. } - if( paramtop.count("AutotuneDslash") > 0 ) { - read(paramtop, "AutotuneDslash", tuneDslashP); - } - else { - tuneDslashP = false; - } - QDPIO::cout << "tuneDslasP = " << tuneDslashP << std::endl; - read(paramtop, "SubspaceID", SaveSubspaceID); if( paramtop.count("ThresholdCount") == 1 ) { @@ -143,6 +135,16 @@ namespace Chroma { SolutionCheckP = true; // default solution check is on } + if ( paramtop.count("GridSplitDims") > 0) { + read(paramtop, "GridSplitDims", GridSplitDims); + } + else { + GridSplitDims.resize(Nd); + GridSplitDims[0]=1; + GridSplitDims[1]=1; + GridSplitDims[2]=1; + GridSplitDims[3]=1; + } } void read(XMLReader& xml, const std::string& path, @@ -171,8 +173,6 @@ namespace Chroma { write(xml, "SilentFail", p.SilentFailP); write(xml, "RsdToleranceFactor", p.RsdToleranceFactor); - write(xml, "AutotuneDslash", p.tuneDslashP); - //Write the MG persistence params. 
write(xml, "SubspaceID", p.SaveSubspaceID); write(xml, "ThresholdCount", p.ThresholdCount); @@ -184,6 +184,7 @@ namespace Chroma { write(xml, "DumpOnFail", p.dump_on_failP); write(xml, "SolutionCheckP", p.SolutionCheckP); + write(xml, "GridSplitDims", p.GridSplitDims); if( p.backup_invP ) { // Need to dump out the XML for the back up solver here... diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.h b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.h index 8227165f20..4d9258308d 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_clover_params.h @@ -14,26 +14,30 @@ namespace Chroma { struct SysSolverQUDAMULTIGRIDCloverParams { SysSolverQUDAMULTIGRIDCloverParams(XMLReader& xml, const std::string& path); - SysSolverQUDAMULTIGRIDCloverParams() { + SysSolverQUDAMULTIGRIDCloverParams() : GridSplitDims(Nd) { solverType=CG; cudaPrecision=DEFAULT; cudaReconstruct=RECONS_12; cudaSloppyPrecision=DEFAULT; cudaSloppyReconstruct=RECONS_12; - asymmetricP = false; //< Use asymmetric version of the linear operator + asymmetricP = true; //< Use asymmetric version of the linear operator axialGaugeP = false; //< Fix Axial Gauge? SilentFailP = false; //< If set to true ignore lack of convergence. 
Default is 'loud' RsdToleranceFactor = Real(10); //< Tolerate if the solution achived is better (less) than rsdToleranceFactor*RsdTarget - tuneDslashP = false ; //< v0.3 autotune feature verboseP = false; MULTIGRIDParamsP = false; backup_invP = false; dump_on_failP = false; Pipeline = 1; SolutionCheckP = true; + GridSplitDims[0] = 1; + GridSplitDims[1] = 1; + GridSplitDims[2] = 1; + GridSplitDims[3] = 1; + }; - SysSolverQUDAMULTIGRIDCloverParams( const SysSolverQUDAMULTIGRIDCloverParams& p) { + SysSolverQUDAMULTIGRIDCloverParams( const SysSolverQUDAMULTIGRIDCloverParams& p) : GridSplitDims(Nd){ CloverParams = p.CloverParams; AntiPeriodicT = p.AntiPeriodicT; MaxIter = p.MaxIter; @@ -49,7 +53,6 @@ namespace Chroma axialGaugeP = p.axialGaugeP; SilentFailP = p.SilentFailP; RsdToleranceFactor = p.RsdToleranceFactor; - tuneDslashP = p.tuneDslashP; MULTIGRIDParamsP = p.MULTIGRIDParamsP; MULTIGRIDParams = p.MULTIGRIDParams; backup_invP = p.backup_invP; @@ -59,6 +62,10 @@ namespace Chroma ThresholdCount = p.ThresholdCount; Pipeline = p.Pipeline; SolutionCheckP = p.SolutionCheckP; + GridSplitDims[0] = p.GridSplitDims[0]; + GridSplitDims[1] = p.GridSplitDims[1]; + GridSplitDims[2] = p.GridSplitDims[2]; + GridSplitDims[3] = p.GridSplitDims[3]; } @@ -77,7 +84,6 @@ namespace Chroma bool axialGaugeP; bool SilentFailP; Real RsdToleranceFactor; - bool tuneDslashP; bool MULTIGRIDParamsP; //New params for MG subspace persistence within NamedObject Storage. 
@@ -92,7 +98,7 @@ namespace Chroma GroupXML_t backup_inv_param; bool dump_on_failP; bool SolutionCheckP; - + multi1d GridSplitDims; }; diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_wilson_params.cc b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_wilson_params.cc index 4d5f66b5d4..a41146375f 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_wilson_params.cc +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_wilson_params.cc @@ -34,7 +34,7 @@ namespace Chroma { read(paramtop, "AsymmetricLinop", asymmetricP); } else { - asymmetricP = false; // Symmetric is default + asymmetricP = true; // Asymmetric is default although it doesn't matter here } if( paramtop.count("CudaPrecision") > 0 ) { @@ -85,15 +85,6 @@ namespace Chroma { else { RsdToleranceFactor = Real(10); // Tolerate an order of magnitude difference by default. } - - if( paramtop.count("AutotuneDslash") > 0 ) { - read(paramtop, "AutotuneDslash", tuneDslashP); - } - else { - tuneDslashP = false; - } - QDPIO::cout << "tuneDslasP = " << tuneDslashP << std::endl; - if( paramtop.count("Pipeline") > 0 ) { read(paramtop, "Pipeline", Pipeline); } @@ -136,7 +127,6 @@ namespace Chroma { write(xml, "AxialGaugeFix", p.axialGaugeP); write(xml, "SilentFail", p.SilentFailP); write(xml, "RsdToleranceFactor", p.RsdToleranceFactor); - write(xml, "AutotuneDslash", p.tuneDslashP); write(xml, "Pipeline", p.Pipeline); if( p.MULTIGRIDParamsP ) { diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_wilson_params.h b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_wilson_params.h index 73bb2bd7af..66902cc001 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_wilson_params.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_multigrid_wilson_params.h @@ -26,7 +26,6 @@ namespace Chroma axialGaugeP = false; //< Fix Axial Gauge? 
SilentFailP = false; //< If set to true ignore lack of convergence. Default is 'loud' RsdToleranceFactor = Real(10); //< Tolerate if the solution achived is better (less) than rsdToleranceFactor*RsdTarget - tuneDslashP = false ; //< v0.3 autotune feature verboseP = false; MULTIGRIDParamsP = false; Pipeline = 1; @@ -48,7 +47,6 @@ namespace Chroma axialGaugeP = p.axialGaugeP; SilentFailP = p.SilentFailP; RsdToleranceFactor = p.RsdToleranceFactor; - tuneDslashP = p.tuneDslashP; Pipeline=1; MULTIGRIDParamsP = p.MULTIGRIDParamsP; MULTIGRIDParams = p.MULTIGRIDParams; @@ -71,7 +69,6 @@ namespace Chroma bool axialGaugeP; bool SilentFailP; Real RsdToleranceFactor; - bool tuneDslashP; int Pipeline; bool MULTIGRIDParamsP; diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_nef_params.cc b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_nef_params.cc index a1fd863384..cd24572f42 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_nef_params.cc +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_nef_params.cc @@ -34,7 +34,7 @@ namespace Chroma { read(paramtop, "AsymmetricLinop", asymmetricP); } else { - asymmetricP = false; // Symmetric is default + asymmetricP = true; // Asymmetric is default -- although it doesn't matter here I don't think } if( paramtop.count("CudaPrecision") > 0 ) { @@ -86,15 +86,6 @@ namespace Chroma { RsdToleranceFactor = Real(10); // Tolerate an order of magnitude difference by default. 
} - if( paramtop.count("AutotuneDslash") > 0 ) { - read(paramtop, "AutotuneDslash", tuneDslashP); - } - else { - tuneDslashP = false; - } - QDPIO::cout << "tuneDslasP = " << tuneDslashP << std::endl; - - if( paramtop.count("GCRInnerParams") > 0 ) { innerParams = new GCRInnerSolverParams(paramtop, "./GCRInnerParams"); innerParamsP = true; @@ -162,7 +153,6 @@ namespace Chroma { write(xml, "SilentFail", p.SilentFailP); write(xml, "RsdToleranceFactor", p.RsdToleranceFactor); - write(xml, "AutotuneDslash", p.tuneDslashP); if( p.innerParamsP ) { write(xml, "GCRInnerParams", *(p.innerParams)); } diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_nef_params.h b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_nef_params.h index 004bb4dd95..546cd8ae82 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_nef_params.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_nef_params.h @@ -20,11 +20,10 @@ namespace Chroma cudaReconstruct=RECONS_12; cudaSloppyPrecision=DEFAULT; cudaSloppyReconstruct=RECONS_12; - asymmetricP = false; //< Use asymmetric version of the linear operator + asymmetricP = true; //< Use asymmetric version of the linear operator (DOES THIS MATTER HERE)? axialGaugeP = false; //< Fix Axial Gauge? SilentFailP = false; //< If set to true ignore lack of convergence. 
Default is 'loud' RsdToleranceFactor = Real(10); //< Tolerate if the solution achived is better (less) than rsdToleranceFactor*RsdTarget - tuneDslashP = false ; //< v0.3 autotune feature verboseP = false; innerParamsP = false; backup_invP = false; @@ -49,7 +48,6 @@ namespace Chroma axialGaugeP = p.axialGaugeP; SilentFailP = p.SilentFailP; RsdToleranceFactor = p.RsdToleranceFactor; - tuneDslashP = p.tuneDslashP; innerParamsP = p.innerParamsP; innerParams = p.innerParams; backup_invP = p.backup_invP; @@ -75,7 +73,6 @@ namespace Chroma bool axialGaugeP; bool SilentFailP; Real RsdToleranceFactor; - bool tuneDslashP; bool innerParamsP; // GCR Specific params diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_wilson_params.cc b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_wilson_params.cc index 003c8cc278..cec6968796 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_wilson_params.cc +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_wilson_params.cc @@ -34,7 +34,7 @@ namespace Chroma { read(paramtop, "AsymmetricLinop", asymmetricP); } else { - asymmetricP = false; // Symmetric is default + asymmetricP = true; // Asymmetric is default -- although I don't think it matters for Wilson } if( paramtop.count("CudaPrecision") > 0 ) { @@ -86,15 +86,6 @@ namespace Chroma { RsdToleranceFactor = Real(10); // Tolerate an order of magnitude difference by default. 
} - if( paramtop.count("AutotuneDslash") > 0 ) { - read(paramtop, "AutotuneDslash", tuneDslashP); - } - else { - tuneDslashP = false; - } - QDPIO::cout << "tuneDslasP = " << tuneDslashP << std::endl; - - if( paramtop.count("Pipeline") > 0 ) { read(paramtop, "Pipeline", Pipeline); } @@ -137,7 +128,6 @@ namespace Chroma { write(xml, "AxialGaugeFix", p.axialGaugeP); write(xml, "SilentFail", p.SilentFailP); write(xml, "RsdToleranceFactor", p.RsdToleranceFactor); - write(xml, "AutotuneDslash", p.tuneDslashP); write(xml, "Pipeline", p.Pipeline); if( p.innerParamsP ) { write(xml, "GCRInnerParams", *(p.innerParams)); diff --git a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_wilson_params.h b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_wilson_params.h index 4196ba78f2..6847f04d63 100644 --- a/lib/actions/ferm/invert/quda_solvers/syssolver_quda_wilson_params.h +++ b/lib/actions/ferm/invert/quda_solvers/syssolver_quda_wilson_params.h @@ -22,11 +22,10 @@ namespace Chroma cudaReconstruct=RECONS_12; cudaSloppyPrecision=DEFAULT; cudaSloppyReconstruct=RECONS_12; - asymmetricP = false; //< Use asymmetric version of the linear operator + asymmetricP = true; //< Use asymmetric version of the linear operator axialGaugeP = false; //< Fix Axial Gauge? SilentFailP = false; //< If set to true ignore lack of convergence. 
Default is 'loud' RsdToleranceFactor = Real(10); //< Tolerate if the solution achived is better (less) than rsdToleranceFactor*RsdTarget - tuneDslashP = false ; //< v0.3 autotune feature verboseP = false; innerParamsP = false; Pipeline = 1; @@ -48,7 +47,6 @@ namespace Chroma axialGaugeP = p.axialGaugeP; SilentFailP = p.SilentFailP; RsdToleranceFactor = p.RsdToleranceFactor; - tuneDslashP = p.tuneDslashP; innerParamsP = p.innerParamsP; innerParams = p.innerParams; Pipeline = p.Pipeline; @@ -70,7 +68,6 @@ namespace Chroma bool axialGaugeP; bool SilentFailP; Real RsdToleranceFactor; - bool tuneDslashP; bool innerParamsP; int Pipeline; diff --git a/lib/actions/ferm/invert/syssolver_linop_aggregate.cc b/lib/actions/ferm/invert/syssolver_linop_aggregate.cc index 77c73aaefa..4e59a02de9 100644 --- a/lib/actions/ferm/invert/syssolver_linop_aggregate.cc +++ b/lib/actions/ferm/invert/syssolver_linop_aggregate.cc @@ -18,12 +18,15 @@ #include "actions/ferm/invert/syssolver_linop_rel_ibicgstab_clover.h" #include "actions/ferm/invert/syssolver_linop_rel_cg_clover.h" #include "actions/ferm/invert/syssolver_linop_fgmres_dr.h" +#include "actions/ferm/invert/projector_random.h" +#include "actions/ferm/invert/projector_null.h" #include "chroma_config.h" #ifdef BUILD_QUDA #include "actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_w.h" #include "actions/ferm/invert/quda_solvers/syssolver_linop_clover_quda_multigrid_w.h" +#include "actions/ferm/invert/quda_solvers/syssolver_linop_exp_clover_quda_multigrid_w.h" #include "actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_w.h" #include "actions/ferm/invert/quda_solvers/syssolver_linop_wilson_quda_multigrid_w.h" #include "actions/ferm/invert/quda_solvers/syssolver_linop_nef_quda_w.h" @@ -84,10 +87,13 @@ namespace Chroma success &= LinOpSysSolverReliableIBiCGStabCloverEnv::registerAll(); success &= LinOpSysSolverReliableCGCloverEnv::registerAll(); success &= LinOpSysSolverFGMRESDREnv::registerAll(); + success &= 
ProjectorRandomEnv::registerAll(); + success &= ProjectorNullEnv::registerAll(); #ifdef BUILD_QUDA success &= LinOpSysSolverQUDACloverEnv::registerAll(); success &= LinOpSysSolverQUDAMULTIGRIDCloverEnv::registerAll(); + success &= LinOpSysSolverQUDAMULTIGRIDExpCloverEnv::registerAll(); success &= LinOpSysSolverQUDAWilsonEnv::registerAll(); success &= LinOpSysSolverQUDAMULTIGRIDWilsonEnv::registerAll(); success &= LinOpSysSolverQUDANEFEnv::registerAll(); @@ -135,7 +141,9 @@ namespace Chroma #ifdef BUILD_MDWF success &= LinOpSysSolverMDWFArrayEnv::registerAll(); #endif +#if ! defined (QDP_IS_QDPJIT2) success &= LinOpSysSolverEigCGArrayEnv::registerAll(); +#endif registered = true; } return success; diff --git a/lib/actions/ferm/invert/syssolver_linop_eigcg_array.cc b/lib/actions/ferm/invert/syssolver_linop_eigcg_array.cc index 7dbb18d47d..202a513757 100644 --- a/lib/actions/ferm/invert/syssolver_linop_eigcg_array.cc +++ b/lib/actions/ferm/invert/syssolver_linop_eigcg_array.cc @@ -11,6 +11,8 @@ #include "actions/ferm/invert/inv_eigcg2_array.h" #include "actions/ferm/invert/norm_gram_schm.h" +#if ! defined (QDP_IS_QDPJIT2) + //for debugging //#include "octave.h" #define TEST_ALGORITHM @@ -246,3 +248,5 @@ namespace Chroma } + +#endif diff --git a/lib/actions/ferm/invert/syssolver_linop_eigcg_array.h b/lib/actions/ferm/invert/syssolver_linop_eigcg_array.h index a1e1930f69..64cbe183da 100644 --- a/lib/actions/ferm/invert/syssolver_linop_eigcg_array.h +++ b/lib/actions/ferm/invert/syssolver_linop_eigcg_array.h @@ -17,6 +17,8 @@ #include "actions/ferm/invert/syssolver_eigcg_params.h" #include "actions/ferm/invert/containers.h" +#if ! 
defined (QDP_IS_QDPJIT2) + namespace Chroma { @@ -97,4 +99,5 @@ namespace Chroma } // End namespace #endif +#endif diff --git a/lib/actions/ferm/invert/syssolver_mdagm_OPTeigcg.cc b/lib/actions/ferm/invert/syssolver_mdagm_OPTeigcg.cc index 418862d3a1..79aea30b71 100644 --- a/lib/actions/ferm/invert/syssolver_mdagm_OPTeigcg.cc +++ b/lib/actions/ferm/invert/syssolver_mdagm_OPTeigcg.cc @@ -14,6 +14,7 @@ #include "actions/ferm/invert/syssolver_mdagm_OPTeigcg.h" #include "containers.h" +#if ! defined (QDP_IS_QDPJIT2) namespace Chroma { @@ -315,3 +316,5 @@ namespace Chroma #endif } + +#endif diff --git a/lib/actions/ferm/invert/syssolver_mdagm_OPTeigcg.h b/lib/actions/ferm/invert/syssolver_mdagm_OPTeigcg.h index d3d528c0e5..f2ed455106 100644 --- a/lib/actions/ferm/invert/syssolver_mdagm_OPTeigcg.h +++ b/lib/actions/ferm/invert/syssolver_mdagm_OPTeigcg.h @@ -20,6 +20,8 @@ #include "util/info/unique_id.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { @@ -282,5 +284,6 @@ namespace Chroma +#endif #endif diff --git a/lib/actions/ferm/invert/syssolver_mdagm_aggregate.cc b/lib/actions/ferm/invert/syssolver_mdagm_aggregate.cc index 0658a4c056..f8277038e6 100644 --- a/lib/actions/ferm/invert/syssolver_mdagm_aggregate.cc +++ b/lib/actions/ferm/invert/syssolver_mdagm_aggregate.cc @@ -25,6 +25,7 @@ #ifdef BUILD_QUDA #include "actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_w.h" #include "actions/ferm/invert/quda_solvers/syssolver_mdagm_clover_quda_multigrid_w.h" +#include "actions/ferm/invert/quda_solvers/syssolver_mdagm_exp_clover_quda_multigrid_w.h" #include "actions/ferm/invert/quda_solvers/syssolver_mdagm_wilson_quda_w.h" #endif @@ -53,7 +54,9 @@ namespace Chroma success &= MdagMSysSolverCGTimingsEnv::registerAll(); success &= MdagMSysSolverBiCGStabEnv::registerAll(); success &= MdagMSysSolverIBiCGStabEnv::registerAll(); +#if ! 
defined (QDP_IS_QDPJIT2) success &= MdagMSysSolverEigCGEnv::registerAll(); +#endif success &= MdagMSysSolverRichardsonCloverEnv::registerAll(); success &= MdagMSysSolverReliableBiCGStabCloverEnv::registerAll(); success &= MdagMSysSolverReliableIBiCGStabCloverEnv::registerAll(); @@ -65,6 +68,7 @@ namespace Chroma #ifdef BUILD_QUDA success &= MdagMSysSolverQUDACloverEnv::registerAll(); success &= MdagMSysSolverQUDAMULTIGRIDCloverEnv::registerAll(); + success &= MdagMSysSolverQUDAMULTIGRIDExpCloverEnv::registerAll(); success &= MdagMSysSolverQUDAWilsonEnv::registerAll(); #endif diff --git a/lib/actions/ferm/invert/syssolver_mdagm_eigcg.h b/lib/actions/ferm/invert/syssolver_mdagm_eigcg.h index 8d8ce0d106..9d0630ae1e 100644 --- a/lib/actions/ferm/invert/syssolver_mdagm_eigcg.h +++ b/lib/actions/ferm/invert/syssolver_mdagm_eigcg.h @@ -16,7 +16,11 @@ namespace Chroma namespace MdagMSysSolverEigCGEnv { //! Register the syssolver - inline bool registerAll() {return MdagMSysSolverOptEigCGEnv::registerAll();} + inline bool registerAll() { +#if ! defined (QDP_IS_QDPJIT2) + return MdagMSysSolverOptEigCGEnv::registerAll(); +#endif + } } } // end namespace Chroma @@ -32,7 +36,11 @@ namespace Chroma namespace MdagMSysSolverEigCGEnv { //! Register the syssolver - inline bool registerAll() {return MdagMSysSolverQDPEigCGEnv::registerAll();} + inline bool registerAll() { +#if ! defined (QDP_IS_QDPJIT2) + return MdagMSysSolverQDPEigCGEnv::registerAll(); +#endif + } } } // end namespace Chroma @@ -40,3 +48,4 @@ namespace Chroma #endif + diff --git a/lib/actions/ferm/invert/syssolver_mdagm_eigcg_qdp.cc b/lib/actions/ferm/invert/syssolver_mdagm_eigcg_qdp.cc index 9d1ba2a1a9..80637b2fa8 100644 --- a/lib/actions/ferm/invert/syssolver_mdagm_eigcg_qdp.cc +++ b/lib/actions/ferm/invert/syssolver_mdagm_eigcg_qdp.cc @@ -12,6 +12,8 @@ #include "actions/ferm/invert/norm_gram_schm.h" #include "actions/ferm/invert/invcg2.h" +#if ! 
defined (QDP_IS_QDPJIT2) + //for debugging //#include "octave.h" #define TEST_ALGORITHM @@ -272,3 +274,5 @@ namespace Chroma #endif } + +#endif diff --git a/lib/actions/ferm/invert/syssolver_mdagm_eigcg_qdp.h b/lib/actions/ferm/invert/syssolver_mdagm_eigcg_qdp.h index 4d5047338e..bbc93011d7 100644 --- a/lib/actions/ferm/invert/syssolver_mdagm_eigcg_qdp.h +++ b/lib/actions/ferm/invert/syssolver_mdagm_eigcg_qdp.h @@ -17,6 +17,8 @@ #include "actions/ferm/invert/syssolver_eigcg_params.h" #include "actions/ferm/invert/containers.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { @@ -124,4 +126,5 @@ namespace Chroma } // End namespace #endif +#endif diff --git a/lib/actions/ferm/linop/central_tprec_nospin_utils.h b/lib/actions/ferm/linop/central_tprec_nospin_utils.h index c1e2a9ec93..5012025181 100644 --- a/lib/actions/ferm/linop/central_tprec_nospin_utils.h +++ b/lib/actions/ferm/linop/central_tprec_nospin_utils.h @@ -12,6 +12,8 @@ #if QDP_ND == 4 #include "chromabase.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { @@ -636,6 +638,8 @@ namespace Chroma } // Namespace chroma +#endif + #endif #endif #endif diff --git a/lib/actions/ferm/linop/clover_term_jit2_w.h b/lib/actions/ferm/linop/clover_term_jit2_w.h new file mode 100644 index 0000000000..c43c1345f6 --- /dev/null +++ b/lib/actions/ferm/linop/clover_term_jit2_w.h @@ -0,0 +1,1975 @@ +// -*- C++ -*- +/*! 
\file + * \brief Clover term linear operator + */ + +#ifndef __clover_term_jit2_w_h__ +#define __clover_term_jit2_w_h__ + +//#warning "Using QDP-JIT clover term" + +#include "state.h" +#include "actions/ferm/fermacts/clover_fermact_params_w.h" +#include "actions/ferm/linop/clover_term_base_w.h" +#include "meas/glue/mesfield.h" + +#if defined (QDP_IS_QDPJIT2) + +namespace QDP +{ + class PackForQUDATimer { + double acc_time; + PackForQUDATimer(): acc_time(0.0) {} + public: + static PackForQUDATimer& Instance() { + static PackForQUDATimer singleton; + return singleton; + } + + double& get() { return acc_time; } + const double& get() const { return acc_time; } + }; + + + template + struct PComp + { + typedef T Sub_t; + T comp[2]; + }; + + template + class WordSize< PComp > + { + public: + static int value( const QV& ret ) + { + return 2 * WordSize::value( ret ); + } + }; + + template + struct GetLimit,0> + { + static int limit(const QV& var) + { + return 2; + } + }; + + + template + struct PCompJIT: public BaseJIT + { + using BaseJIT::BaseJIT; + int getN() const { return 2; } + + int this_size() const { return 2; } + template + PCompJIT operator=( const PCompJIT& rhs) { + this->elem(0) = rhs.elem(0); + this->elem(1) = rhs.elem(1); + return *this; + } + + PCompJIT operator=( const PCompJIT& rhs) { + this->elem(0) = rhs.elem(0); + this->elem(1) = rhs.elem(1); + return *this; + } + }; + + + template + struct ScalarType > + { + typedef PComp::Type_t> Type_t; + }; + + template + struct ScalarType > + { + typedef PCompJIT::Type_t> Type_t; + }; + + template + struct JITType > + { + typedef PCompJIT::Type_t> Type_t; + }; + + template + struct WordType > + { + typedef typename WordType::Type_t Type_t; + }; + + template + struct WordType > + { + typedef typename WordType::Type_t Type_t; + }; + + + + + + + template + struct PTriDia + { + typedef T Sub_t; + T diag[2*Nc]; + }; + + template + class WordSize< PTriDia > + { + public: + static int value( const QV& ret ) + { + return 2 
* Nc * WordSize::value( ret ); + } + }; + + template + struct GetLimit,0> + { + static int limit(const QV& var) + { + return 2*Nc; + } + }; + + + + + template + struct PTriDiaJIT: public BaseJIT + { + using BaseJIT::BaseJIT; + + int getN() const { return 2 * Nc; } + int this_size() const { return 2 * Nc; } + + template + PTriDiaJIT operator=( const PTriDiaJIT& rhs) + { + JitForLoop i( 0 , 2 * Nc ); + { + this->elem( i.index() ) = rhs.elem( i.index() ); + } + i.end(); + return *this; + } + + PTriDiaJIT operator=( const PTriDiaJIT& rhs) + { + JitForLoop i( 0 , 2 * Nc ); + { + this->elem( i.index() ) = rhs.elem( i.index() ); + } + i.end(); + return *this; + } + }; + + + + + template + struct ScalarType > + { + typedef PTriDia::Type_t> Type_t; + }; + + template + struct ScalarType > + { + typedef PTriDiaJIT::Type_t> Type_t; + }; + + + template + struct JITType > + { + typedef PTriDiaJIT::Type_t> Type_t; + }; + + + template + struct WordType > + { + typedef typename WordType::Type_t Type_t; + }; + + template + struct WordType > + { + typedef typename WordType::Type_t Type_t; + }; + + + + + + + template + struct PTriOff + { + typedef T Sub_t; + T offd[2*Nc*Nc-Nc]; + }; + + + template + class WordSize< PTriOff > + { + public: + static int value( const QV& ret ) + { + return ( 2 * Nc * Nc - Nc ) * WordSize::value( ret ); + } + }; + + template + struct GetLimit,0> + { + static int limit(const QV& var) + { + return 2*Nc*Nc-Nc; + } + }; + + + + template + struct PTriOffJIT: public BaseJIT + { + using BaseJIT::BaseJIT; + + int getN() const { return this_size(); } + int this_size() const { return 2 * Nc * Nc - Nc; } + + template + PTriOffJIT operator=( const PTriOffJIT& rhs) { + JitForLoop i( 0 , 2*Nc*Nc-Nc ); + { + this->elem( i.index() ) = rhs.elem( i.index() ); + } + i.end(); + return *this; + } + + PTriOffJIT operator=( const PTriOffJIT& rhs) { + JitForLoop i( 0 , 2*Nc*Nc-Nc ); + { + this->elem( i.index() ) = rhs.elem( i.index() ); + } + i.end(); + return *this; + } + }; + 
+ + + template + struct ScalarType > + { + typedef PTriOff::Type_t> Type_t; + }; + + template + struct ScalarType > + { + typedef PTriOffJIT::Type_t> Type_t; + }; + + + template + struct JITType > + { + typedef PTriOffJIT::Type_t> Type_t; + }; + + + template + struct WordType > + { + typedef typename WordType::Type_t Type_t; + }; + + template + struct WordType > + { + typedef typename WordType::Type_t Type_t; + }; + + + + template + struct LeafFunctor, PrintTag> + { + typedef int Type_t; + static int apply(const PrintTag &f) + { + f.os_m << "PComp<"; + LeafFunctor::apply(f); + f.os_m << ">"; + return 0; + } + }; + + template + struct LeafFunctor, PrintTag> + { + typedef int Type_t; + static int apply(const PrintTag &f) + { + f.os_m << "PTriDia<"; + LeafFunctor::apply(f); + f.os_m << ">"; + return 0; + } + }; + + template + struct LeafFunctor, PrintTag> + { + typedef int Type_t; + static int apply(const PrintTag &f) + { + f.os_m << "PTriOff<"; + LeafFunctor::apply(f); + f.os_m << ">"; + return 0; + } + }; +} // QDP + + + + +namespace Chroma +{ + template + struct QUDAPackedClovSite { + R diag1[6]; + R offDiag1[15][2]; + R diag2[6]; + R offDiag2[15][2]; + }; + + + template + class JITCloverTermT : public CloverTermBase + { + public: + // Typedefs to save typing + typedef typename WordType::Type_t REALT; + + typedef OLattice< PScalar< PScalar< RScalar< Word< REALT> > > > > LatticeREAL; + typedef OScalar< PScalar< PScalar< RScalar< Word< REALT> > > > > RealT; + + //! Empty constructor. Must use create later + JITCloverTermT(); + + //! No real need for cleanup here + ~JITCloverTermT() {} + + //! Creation routine + void create(Handle< FermState, multi1d > > fs, + const CloverFermActParams& param_); + + virtual void create(Handle< FermState, multi1d > > fs, + const CloverFermActParams& param_, + const JITCloverTermT& from_); + + //! Computes the inverse of the term on cb using Cholesky + /*! + * \param cb checkerboard of work (Read) + */ + void choles(int cb); + + //! 
Computes the inverse of the term on cb using Cholesky + /*! + * \param cb checkerboard of work (Read) + * \return logarithm of the determinant + */ + Double cholesDet(int cb) const ; + + /** + * Apply a dslash + * + * Performs the operation + * + * chi <- (L + D + L^dag) . psi + * + * where + * L is a lower triangular matrix + * D is the real diagonal. (stored together in type TRIANG) + * + * Arguments: + * \param chi result (Write) + * \param psi source (Read) + * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read) + * \param cb Checkerboard of OUTPUT std::vector (Read) + */ + void apply (T& chi, const T& psi, enum PlusMinus isign, int cb) const; + + + void applySite(T& chi, const T& psi, enum PlusMinus isign, int site) const; + + //! Calculates Tr_D ( Gamma_mat L ) + void triacntr(U& B, int mat, int cb) const; + + //! Return the fermion BC object for this linear operator + const FermBC, multi1d >& getFermBC() const {return *fbc;} + + //! PACK UP the Clover term for QUDA library: + void packForQUDA(multi1d >& quda_pack, int cb) const; + + int getDiaId() const { return tri_dia.getId(); } + int getOffId() const { return tri_off.getId(); } + + + protected: + //! Create the clover term on cb + /*! + * \param f field strength tensor F(mu,nu) (Read) + * \param cb checkerboard (Read) + */ + void makeClov(const multi1d& f, const RealT& diag_mass); + + //! Invert the clover term on cb + //void chlclovms(LatticeREAL& log_diag, int cb); + void ldagdlinv(LatticeREAL& tr_log_diag, int cb); + + //! Get the u field + const multi1d& getU() const {return u;} + + //! Calculates Tr_D ( Gamma_mat L ) + Real getCloverCoeff(int mu, int nu) const; + + + private: + Handle< FermBC,multi1d > > fbc; + multi1d u; + CloverFermActParams param; + LatticeREAL tr_log_diag_; // Fill this out during create + // but save the global sum until needed. + multi1d choles_done; // Keep note of whether the decomposition has been done + // on a particular checkerboard. 
+ + OLattice > > > > tri_dia; + OLattice > > > > tri_off; + }; + + + + + + // Empty constructor. Must use create later + template + JITCloverTermT::JITCloverTermT() {} + + // Now copy + template + void JITCloverTermT::create(Handle< FermState,multi1d > > fs, + const CloverFermActParams& param_, + const JITCloverTermT& from) + { + START_CODE(); + + //std::cout << "PTX Clover create from other " << (void*)this << "\n"; + + u.resize(Nd); + + u = fs->getLinks(); + fbc = fs->getFermBC(); + param = param_; + + // Sanity check + if (fbc.operator->() == 0) { + QDPIO::cerr << "JITCloverTerm: error: fbc is null" << std::endl; + QDP_abort(1); + } + + { + RealT ff = param.anisoParam.anisoP ? Real(1) / param.anisoParam.xi_0 : Real(1); + param.clovCoeffR *= Real(0.5) * ff; + param.clovCoeffT *= Real(0.5); + } + + // + // Yuk. Some bits of knowledge of the dslash term are buried in the + // effective mass term. They show up here. If I wanted some more + // complicated dslash then this will have to be fixed/adjusted. + // + RealT diag_mass; + { + RealT ff = param.anisoParam.anisoP ? param.anisoParam.nu / param.anisoParam.xi_0 : Real(1); + diag_mass = 1 + (Nd-1)*ff + param.Mass; + } + + + /* Calculate F(mu,nu) */ + //multi1d f; + //mesField(f, u); + //makeClov(f, diag_mass); + + choles_done.resize(rb.numSubsets()); + for(int i=0; i < rb.numSubsets(); i++) { + choles_done[i] = from.choles_done[i]; + } + + tr_log_diag_ = from.tr_log_diag_; + + tri_dia = from.tri_dia; + tri_off = from.tri_off; + + END_CODE(); + } + + + //! 
Creation routine + template + void JITCloverTermT::create(Handle< FermState,multi1d > > fs, + const CloverFermActParams& param_) + { + START_CODE(); + + //std::cout << "PTX Clover create " << (void*)this << "\n"; + + u.resize(Nd); + + u = fs->getLinks(); + fbc = fs->getFermBC(); + param = param_; + + // Sanity check + if (fbc.operator->() == 0) { + QDPIO::cerr << "JITCloverTerm: error: fbc is null" << std::endl; + QDP_abort(1); + } + + { + RealT ff = param.anisoParam.anisoP ? Real(1) / param.anisoParam.xi_0 : Real(1); + param.clovCoeffR *= RealT(0.5) * ff; + param.clovCoeffT *= RealT(0.5); + } + + // + // Yuk. Some bits of knowledge of the dslash term are buried in the + // effective mass term. They show up here. If I wanted some more + // complicated dslash then this will have to be fixed/adjusted. + // + RealT diag_mass; + { + RealT ff = param.anisoParam.anisoP ? param.anisoParam.nu / param.anisoParam.xi_0 : Real(1); + diag_mass = 1 + (Nd-1)*ff + param.Mass; + } + + + /* Calculate F(mu,nu) */ + multi1d f; + mesField(f, u); + makeClov(f, diag_mass); + + choles_done.resize(rb.numSubsets()); + for(int i=0; i < rb.numSubsets(); i++) { + choles_done[i] = false; + } + + END_CODE(); + } + + + /* + * MAKCLOV + * + * In this routine, MAKCLOV calculates + + * 1 - (1/4)*sigma(mu,nu) F(mu,nu) + + * using F from mesfield + + * F(mu,nu) = (1/4) sum_p (1/2) [ U_p(x) - U^dag_p(x) ] + + * using basis of SPPROD and stores in a lower triangular matrix + * (no diagonal) plus real diagonal + + * where + * U_1 = u(x,mu)*u(x+mu,nu)*u_dag(x+nu,mu)*u_dag(x,nu) + * U_2 = u(x,nu)*u_dag(x-mu+nu,mu)*u_dag(x-mu,nu)*u(x-mu,mu) + * U_3 = u_dag(x-mu,mu)*u_dag(x-mu-nu,nu)*u(x-mu-nu,mu)*u(x-nu,nu) + * U_4 = u_dag(x-nu,nu)*u(x-nu,mu)*u(x-nu+mu,nu)*u_dag(x,mu) + + * and + + * | sigF(1) sigF(3) 0 0 | + * sigF = | sigF(5) -sigF(1) 0 0 | + * | 0 0 -sigF(0) -sigF(2) | + * | 0 0 -sigF(4) sigF(0) | + * where + * sigF(i) is a color matrix + + * sigF(0) = i*(ClovT*E_z + ClovR*B_z) + * = i*(ClovT*F(3,2) + 
ClovR*F(1,0)) + * sigF(1) = i*(ClovT*E_z - ClovR*B_z) + * = i*(ClovT*F(3,2) - ClovR*F(1,0)) + * sigF(2) = i*(E_+ + B_+) + * sigF(3) = i*(E_+ - B_+) + * sigF(4) = i*(E_- + B_-) + * sigF(5) = i*(E_- - B_-) + * i*E_+ = (i*ClovT*E_x - ClovT*E_y) + * = (i*ClovT*F(3,0) - ClovT*F(3,1)) + * i*E_- = (i*ClovT*E_x + ClovT*E_y) + * = (i*ClovT*F(3,0) + ClovT*F(3,1)) + * i*B_+ = (i*ClovR*B_x - ClovR*B_y) + * = (i*ClovR*F(2,1) + ClovR*F(2,0)) + * i*B_- = (i*ClovR*B_x + ClovR*B_y) + * = (i*ClovR*F(2,1) - ClovR*F(2,0)) + + * NOTE: I am using i*F of the usual F defined by UKQCD, Heatlie et.al. + + * NOTE: the above definitions assume that the time direction, t_dir, + * is 3. In general F(k,j) is multiplied with ClovT if either + * k=t_dir or j=t_dir, and with ClovR otherwise. + + *+++ + * Here are some notes on the origin of this routine. NOTE, ClovCoeff or u0 + * are not actually used in MAKCLOV. + * + * The clover mass term is suppose to act on a std::vector like + * + * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi + + * Definitions used here (NOTE: no "i") + * sigma(mu,nu) = gamma(mu)*gamma(nu) - gamma(nu)*gamma(mu) + * = 2*gamma(mu)*gamma(nu) for mu != nu + * + * chi = sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu < nu + * = (1/2) * sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu != nu + * = (1/4) * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)*psi + * + * + * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi + * = psi - (ClovCoeff/u0^3) * kappa * chi + * == psi - kappa * chi + * + * We have absorbed ClovCoeff/u0^3 into kappa. A u0 was previously absorbed into kappa + * for compatibility to ancient conventions. 
+ *--- + + * Arguments: + * \param f field strength tensor F(cb,mu,nu) (Read) + * \param diag_mass effective mass term (Read) + */ + + template + void function_make_clov_exec(JitFunction& function, + const RealT& diag_mass, + const U& f0, + const U& f1, + const U& f2, + const U& f3, + const U& f4, + const U& f5, + X& tri_dia, + Y& tri_off) + { +#ifdef QDP_DEEP_LOG + function.type_W = typeid(REAL).name(); + //function.set_dest_id( tri_dia.getId() ); + function.set_dest_id( tri_off.getId() ); + function.set_is_lat(true); +#endif + + AddressLeaf addr_leaf(all); + + forEach(diag_mass, addr_leaf, NullCombine()); + forEach(f0, addr_leaf, NullCombine()); + forEach(f1, addr_leaf, NullCombine()); + forEach(f2, addr_leaf, NullCombine()); + forEach(f3, addr_leaf, NullCombine()); + forEach(f4, addr_leaf, NullCombine()); + forEach(f5, addr_leaf, NullCombine()); + forEach(tri_dia, addr_leaf, NullCombine()); + forEach(tri_off, addr_leaf, NullCombine()); + + int th_count = Layout::sitesOnNode(); + + WorkgroupGuardExec workgroupGuardExec(th_count , MG::get(f0.get_layout_ref()).sitesOnNode()); + + std::vector ids; + workgroupGuardExec.check(ids); + ids.push_back( all.getIdSiteTable() ); + for(unsigned i=0; i < addr_leaf.ids.size(); ++i) + ids.push_back( addr_leaf.ids[i] ); + jit_launch(function,th_count,ids); + } + + + + template + void function_make_clov_build(JitFunction& function, + const RealT& diag_mass, + const U& f0, + const U& f1, + const U& f2, + const U& f3, + const U& f4, + const U& f5, + const X& tri_dia, + const Y& tri_off) + { + //std::cout << __PRETTY_FUNCTION__ << ": entering\n"; + + typedef typename WordType::Type_t REALT; + + llvm_start_new_function("make_clov",__PRETTY_FUNCTION__ ); + + WorkgroupGuard workgroupGuard; + ParamRef p_site_table = llvm_add_param(); + + ParamLeaf param_leaf(workgroupGuard); + + typedef typename LeafFunctor::Type_t RealTJIT; + RealTJIT diag_mass_jit(forEach(diag_mass, param_leaf, TreeCombine())); + + typedef typename LeafFunctor::Type_t 
UJIT; + UJIT f0_jit(forEach(f0, param_leaf, TreeCombine())); + UJIT f1_jit(forEach(f1, param_leaf, TreeCombine())); + UJIT f2_jit(forEach(f2, param_leaf, TreeCombine())); + UJIT f3_jit(forEach(f3, param_leaf, TreeCombine())); + UJIT f4_jit(forEach(f4, param_leaf, TreeCombine())); + UJIT f5_jit(forEach(f5, param_leaf, TreeCombine())); + + typedef typename LeafFunctor::Type_t XJIT; + XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine())); + + typedef typename LeafFunctor::Type_t YJIT; + YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine())); + + llvm::Value* r_idx_thread = llvm_thread_idx(); + + workgroupGuard.check(r_idx_thread); + llvm::Value* r_idx = llvm_array_type_indirection( p_site_table , r_idx_thread ); + + auto f0_j = f0_jit.elem(JitDeviceLayout::Coalesced , r_idx ); + auto f1_j = f1_jit.elem(JitDeviceLayout::Coalesced , r_idx ); + auto f2_j = f2_jit.elem(JitDeviceLayout::Coalesced , r_idx ); + auto f3_j = f3_jit.elem(JitDeviceLayout::Coalesced , r_idx ); + auto f4_j = f4_jit.elem(JitDeviceLayout::Coalesced , r_idx ); + auto f5_j = f5_jit.elem(JitDeviceLayout::Coalesced , r_idx ); + + auto tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced , r_idx ); + auto tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced , r_idx ); + + for(int jj = 0; jj < 2; jj++) { + for(int ii = 0; ii < 2*Nc; ii++) { + tri_dia_j.elem(jj).elem(ii) = diag_mass_jit.elem().elem().elem(); + //tri[site].diag[jj][ii] = diag_mass.elem().elem().elem(); + } + } + + + // RComplexREG > E_minus; + // RComplexREG > B_minus; + // RComplexREG > ctmp_0; + // RComplexREG > ctmp_1; + // RScalarREG > rtmp_0; + // RScalarREG > rtmp_1; + + typedef RComplexJIT > C_t; + typedef RScalarJIT > R_t; + + C_t E_minus = StackAllocJit< C_t >::alloc(); + C_t B_minus = StackAllocJit< C_t >::alloc(); + C_t ctmp_0 = StackAllocJit< C_t >::alloc(); + C_t ctmp_1 = StackAllocJit< C_t >::alloc(); + R_t rtmp_0 = StackAllocJit< R_t >::alloc(); + R_t rtmp_1 = StackAllocJit< R_t >::alloc(); + + + 
JitForLoop i( 0 , Nc ); + { + ctmp_0 = f5_j.elem().elem(i,i); + ctmp_0 -= f0_j.elem().elem(i,i); + rtmp_0 = imag(ctmp_0); + tri_dia_j.elem(0).elem(i) += rtmp_0; + + tri_dia_j.elem(0).elem(i+Nc) -= rtmp_0; + + ctmp_1 = f5_j.elem().elem(i,i); + ctmp_1 += f0_j.elem().elem(i,i); + rtmp_1 = imag(ctmp_1); + tri_dia_j.elem(1).elem(i) -= rtmp_1; + + tri_dia_j.elem(1).elem(i+Nc) += rtmp_1; + } + i.end(); + + for(int i = 1; i < Nc; ++i) { + for(int j = 0; j < i; ++j) { + + int elem_ij = i*(i-1)/2 + j; + int elem_tmp = (i+Nc)*(i+Nc-1)/2 + j+Nc; + + ctmp_0 = f0_j.elem().elem(i,j); + ctmp_0 -= f5_j.elem().elem(i,j); + tri_off_j.elem(0).elem(elem_ij) = timesI(ctmp_0); + + zero_rep( tri_off_j.elem(0).elem(elem_tmp) ); + tri_off_j.elem(0).elem(elem_tmp) -= tri_off_j.elem(0).elem(elem_ij);// * -1.0; + + ctmp_1 = f5_j.elem().elem(i,j); + ctmp_1 += f0_j.elem().elem(i,j); + tri_off_j.elem(1).elem(elem_ij) = timesI(ctmp_1); + + zero_rep( tri_off_j.elem(1).elem(elem_tmp) ); + tri_off_j.elem(1).elem(elem_tmp) -= tri_off_j.elem(1).elem(elem_ij); + } + } + + for(int i = 0; i < Nc; ++i) { + for(int j = 0; j < Nc; ++j) { + + int elem_ij = (i+Nc)*(i+Nc-1)/2 + j; + + //E_minus = timesI(f2_j.elem().elem(i,j)); + E_minus = f2_j.elem().elem(i,j); + E_minus = timesI( E_minus ); + + E_minus += f4_j.elem().elem(i,j); + + //B_minus = timesI(f3_j.elem().elem(i,j)); + B_minus = f3_j.elem().elem(i,j); + B_minus = timesI( B_minus ); + + B_minus -= f1_j.elem().elem(i,j); + + tri_off_j.elem(0).elem(elem_ij) = B_minus - E_minus; + + tri_off_j.elem(1).elem(elem_ij) = E_minus + B_minus; + } + } + + // std::cout << __PRETTY_FUNCTION__ << ": leaving\n"; + + jit_get_function(function); + } + + + + + /* This now just sets up and dispatches... 
*/ + template + void JITCloverTermT::makeClov(const multi1d& f, const RealT& diag_mass) + { + START_CODE(); + + if ( Nd != 4 ){ + QDPIO::cerr << __func__ << ": expecting Nd==4" << std::endl; + QDP_abort(1); + } + + if ( Ns != 4 ){ + QDPIO::cerr << __func__ << ": expecting Ns==4" << std::endl; + QDP_abort(1); + } + + U f0 = f[0] * getCloverCoeff(0,1); + U f1 = f[1] * getCloverCoeff(0,2); + U f2 = f[2] * getCloverCoeff(0,3); + U f3 = f[3] * getCloverCoeff(1,2); + U f4 = f[4] * getCloverCoeff(1,3); + U f5 = f[5] * getCloverCoeff(2,3); + + + //QDPIO::cout << "PTX Clover make " << (void*)this << "\n"; + //std::cout << "PTX Clover make " << (void*)this << "\n"; + static JitFunction function; + + if (function.empty()) + function_make_clov_build(function, diag_mass, f0,f1,f2,f3,f4,f5, tri_dia , tri_off ); + + // Execute the function + function_make_clov_exec(function, diag_mass, f0,f1,f2,f3,f4,f5,tri_dia, tri_off); + + END_CODE(); + } + + + //! Invert + /*! + * Computes the inverse of the term on cb using Cholesky + */ + template + void JITCloverTermT::choles(int cb) + { + START_CODE(); + + // When you are doing the cholesky - also fill out the trace_log_diag piece) + // chlclovms(tr_log_diag_, cb); + // Switch to LDL^\dag inversion + ldagdlinv(tr_log_diag_,cb); + + END_CODE(); + } + + + //! Invert + /*! + * Computes the inverse of the term on cb using Cholesky + * + * \return logarithm of the determinant + */ + template + Double JITCloverTermT::cholesDet(int cb) const + { + START_CODE(); + + if( choles_done[cb] == false ) + { + QDPIO::cout << __func__ << ": Error: you have not done the Cholesky.on this operator on this subset" << std::endl; + QDPIO::cout << "You sure you should not be asking invclov?" << std::endl; + QDP_abort(1); + } + + LatticeREAL ff=tr_log_diag_; + + + END_CODE(); + + // Need to thread generic sums in QDP++? + // Need to thread generic norm2() in QDP++? 
+ return sum(tr_log_diag_, rb[cb]); + } + + + template + void function_ldagdlinv_exec( JitFunction& function, + T& tr_log_diag, + X& tri_dia, + Y& tri_off, + const Subset& s) + { +#ifdef QDP_DEEP_LOG + function.type_W = typeid(REAL).name(); + function.set_dest_id( tr_log_diag.getId() ); + function.set_is_lat(true); +#endif + + AddressLeaf addr_leaf(s); + + forEach(tr_log_diag, addr_leaf, NullCombine()); + forEach(tri_dia, addr_leaf, NullCombine()); + forEach(tri_off, addr_leaf, NullCombine()); + + int th_count = s.numSiteTable(); + + WorkgroupGuardExec workgroupGuardExec(th_count,MG::get(tr_log_diag.get_layout_ref()).sitesOnNode()); + + std::vector ids; + workgroupGuardExec.check(ids); + ids.push_back( s.getIdSiteTable() ); + for(unsigned i=0; i < addr_leaf.ids.size(); ++i) + ids.push_back( addr_leaf.ids[i] ); + jit_launch(function,th_count,ids); + } + + + + + + template + void function_ldagdlinv_build(JitFunction& function, + const T& tr_log_diag, + const X& tri_dia, + const Y& tri_off, + const Subset& s) + { + typedef typename WordType::Type_t REALT; + + //std::cout << __PRETTY_FUNCTION__ << " entering\n"; + + llvm_start_new_function("ldagdlinv",__PRETTY_FUNCTION__); + + WorkgroupGuard workgroupGuard; + ParamRef p_site_table = llvm_add_param(); + + ParamLeaf param_leaf(workgroupGuard); + + typedef typename LeafFunctor::Type_t TJIT; + TJIT tr_log_diag_jit(forEach(tr_log_diag, param_leaf, TreeCombine())); + + typedef typename LeafFunctor::Type_t XJIT; + XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine())); + + typedef typename LeafFunctor::Type_t YJIT; + YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine())); + + llvm::Value* r_idx_thread = llvm_thread_idx(); + + workgroupGuard.check(r_idx_thread); + + llvm::Value* r_idx = llvm_array_type_indirection( p_site_table , r_idx_thread ); + + auto tr_log_diag_j = tr_log_diag_jit.elem(JitDeviceLayout::Coalesced,r_idx); + auto tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced,r_idx); + auto tri_off_j = 
tri_off_jit.elem(JitDeviceLayout::Coalesced,r_idx); + + //typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r; + //typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r; + + // tri_dia_r.setup( tri_dia_j ); + // tri_off_r.setup( tri_off_j ); + + //RScalarREG > zip; + typedef RScalarJIT > R_t; + typedef RComplexJIT > C_t; + + R_t zip = StackAllocJit< R_t >::alloc(); + zero_rep(zip); + int N = 2*Nc; + + //int site_neg_logdet=0; + + auto inv_d = stack_alloc_array_jit(6); + auto inv_offd = stack_alloc_array_jit(15); + auto v = stack_alloc_array_jit(6); + auto diag_g = stack_alloc_array_jit(6); + + C_t A_ii = StackAllocJit< C_t >::alloc(); + //C_t sum = StackAllocJit< C_t >::alloc(); + R_t one = StackAllocJit< R_t >::alloc(); + one = 1.0; + + for(int block=0; block < 2; block++) { + + // RScalarREG > inv_d[6] ; + // RComplexREG > inv_offd[15] ; + // RComplexREG > v[6] ; + // RScalarREG > diag_g[6] ; + + for(int i=0; i < N; i++) { + inv_d[i] = tri_dia_j.elem(block).elem(i); + } + + for(int i=0; i < 15; i++) { + inv_offd[i] = tri_off_j.elem(block).elem(i); + } + + + for(int j=0; j < N; ++j) { + + for(int i=0; i < j; i++) { + int elem_ji = j*(j-1)/2 + i; + + //RComplexREG > A_ii = cmplx( inv_d[i], zip ); + A_ii = cmplx( inv_d[i], zip ); + v[i] = A_ii*adj(inv_offd[elem_ji]); + } + + + v[j] = cmplx(inv_d[j],zip); + + for(int k=0; k < j; k++) { + int elem_jk = j*(j-1)/2 + k; + v[j] -= inv_offd[elem_jk]*v[k]; + } + + inv_d[j] = real( v[j] ); + + for(int k=j+1; k < N; k++) { + int elem_kj = k*(k-1)/2 + j; + for(int l=0; l < j; l++) { + int elem_kl = k*(k-1)/2 + l; + inv_offd[elem_kj] -= inv_offd[elem_kl] * v[l]; + } + inv_offd[elem_kj] /= v[j]; + } + } + + + // Now fix up the inverse + //RScalarREG > one(1.0); + //one.elem() = (REALT)1; + + for(int i=0; i < N; i++) { + diag_g[i] = one/inv_d[i]; + + // Compute the trace log + // NB we are always doing trace log | A | + // (because we are always working with actually A^\dagger A + // even in one flavour case 
where we square root) + tr_log_diag_j.elem().elem() += log(fabs(inv_d[i])); + // However, it is worth counting just the no of negative logdets + // on site +#if 0 + if( inv_d[i].elem() < 0 ) { + site_neg_logdet++; + } +#endif + } + + // Now we need to invert the L D L^\dagger + // We can do this by solving: + // + // L D L^\dagger M^{-1} = 1 + // + // This can be done by solving L D X = 1 (X = L^\dagger M^{-1}) + // + // Then solving L^\dagger M^{-1} = X + // + // LD is lower diagonal and so X will also be lower diagonal. + // LD X = 1 can be solved by forward substitution. + // + // Likewise L^\dagger is strictly upper triagonal and so + // L^\dagger M^{-1} = X can be solved by forward substitution. + + //RComplexREG > sum; + for(int k = 0; k < N; ++k) { + + for(int i = 0; i < k; ++i) { + zero_rep(v[i]); + } + + /*# Forward substitution */ + + // The first element is the inverse of the diagonal + v[k] = cmplx(diag_g[k],zip); + + for(int i = k+1; i < N; ++i) { + zero_rep(v[i]); + + for(int j = k; j < i; ++j) { + int elem_ij = i*(i-1)/2+j; + + // subtract l_ij*d_j*x_{kj} + v[i] -= inv_offd[elem_ij] *inv_d[j]*v[j]; + + } + + // scale out by 1/d_i + v[i] *= diag_g[i]; + } + + /*# Backward substitution */ + // V[N-1] remains unchanged + // Start from V[N-2] + + for(int i = N-2; (int)i >= (int)k; --i) { + for(int j = i+1; j < N; ++j) { + int elem_ji = j*(j-1)/2 + i; + // Subtract terms of typ (l_ji)*x_kj + v[i] -= adj(inv_offd[elem_ji]) * v[j]; + } + } + + /*# Overwrite column k of invcl.offd */ + inv_d[k] = real(v[k]); + for(int i = k+1; i < N; ++i) { + + int elem_ik = i*(i-1)/2+k; + inv_offd[elem_ik] = v[i]; + } + } + + // Overwrite original data + for(int i=0; i < N; i++) { + tri_dia_j.elem(block).elem(i) = inv_d[i]; + } + for(int i=0; i < 15; i++) { + tri_off_j.elem(block).elem(i) = inv_offd[i]; + } + } + + // std::cout << __PRETTY_FUNCTION__ << " leaving\n"; + + jit_get_function(function); + } + + + + + + /*! An LDL^\dag decomposition and inversion? 
*/ + template + void JITCloverTermT::ldagdlinv(LatticeREAL& tr_log_diag, int cb) + { + START_CODE(); + + if ( 2*Nc < 3 ) + { + QDPIO::cerr << __func__ << ": Matrix is too small" << std::endl; + QDP_abort(1); + } + + // Zero trace log + tr_log_diag[rb[cb]] = zero; + + //QDPIO::cout << "PTX Clover ldagdlinv " << (void*)this << "\n"; + //std::cout << "PTX Clover ldagdlinv " << (void*)this << "\n"; + static JitFunction function; + + if (function.empty()) + function_ldagdlinv_build(function, tr_log_diag, tri_dia, tri_off, rb[cb] ); + + // Execute the function + function_ldagdlinv_exec(function, tr_log_diag, tri_dia, tri_off, rb[cb] ); + + // This comes from the days when we used to do Cholesky + choles_done[cb] = true; + END_CODE(); + } + + /*! CHLCLOVMS - Cholesky decompose the clover mass term and uses it to + * compute lower(A^-1) = lower((L.L^dag)^-1) + * Adapted from Golub and Van Loan, Matrix Computations, 2nd, Sec 4.2.4 + * + * Arguments: + * + * \param DetP flag whether to compute determinant (Read) + * \param logdet logarithm of the determinant (Write) + * \param cb checkerboard of work (Read) + */ + + + + + + + //! TRIACNTR + /*! + * \ingroup linop + * + * Calculates + * Tr_D ( Gamma_mat L ) + * + * This routine is specific to Wilson fermions! + * + * the trace over the Dirac indices for one of the 16 Gamma matrices + * and a hermitian color x spin matrix A, stored as a block diagonal + * complex lower triangular matrix L and a real diagonal diag_L. 
+ + * Here 0 <= mat <= 15 and + * if mat = mat_1 + mat_2 * 2 + mat_3 * 4 + mat_4 * 8 + * + * Gamma(mat) = gamma(1)^(mat_1) * gamma(2)^(mat_2) * gamma(3)^(mat_3) + * * gamma(4)^(mat_4) + * + * Further, in basis for the Gamma matrices used, A is of the form + * + * | A_0 | 0 | + * A = | --------- | + * | 0 | A_1 | + * + * + * Arguments: + * + * \param B the resulting SU(N) color matrix (Write) + * \param clov clover term (Read) + * \param mat label of the Gamma matrix (Read) + */ + + + template + void function_triacntr_exec( JitFunction& function, + U& B, + const X& tri_dia, + const Y& tri_off, + int mat, + const Subset& s) + { +#ifdef QDP_DEEP_LOG + function.type_W = typeid(REAL).name(); + function.set_dest_id( B.getId() ); + function.set_is_lat(true); +#endif + + AddressLeaf addr_leaf(s); + + forEach(B, addr_leaf, NullCombine()); + forEach(tri_dia, addr_leaf, NullCombine()); + forEach(tri_off, addr_leaf, NullCombine()); + + int th_count = s.numSiteTable(); + + WorkgroupGuardExec workgroupGuardExec(th_count,MG::get(tri_dia.get_layout_ref()).sitesOnNode()); + + JitParam jit_mat( QDP_get_global_cache().addJitParamInt( mat ) ); + + std::vector ids; + workgroupGuardExec.check(ids); + ids.push_back( s.getIdSiteTable() ); + ids.push_back( jit_mat.get_id() ); + for(unsigned i=0; i < addr_leaf.ids.size(); ++i) + ids.push_back( addr_leaf.ids[i] ); + jit_launch(function,th_count,ids); + } + + + + + template + void function_triacntr_build( JitFunction& function, + const U& B, + const X& tri_dia, + const Y& tri_off, + int mat, + const Subset& s) + { + //std::cout << __PRETTY_FUNCTION__ << ": entering\n"; + + typedef typename WordType::Type_t REALT; + + llvm_start_new_function( "triacntr" , __PRETTY_FUNCTION__ ); + + WorkgroupGuard workgroupGuard; + ParamRef p_site_table = llvm_add_param(); + + ParamRef p_mat = llvm_add_param(); + + ParamLeaf param_leaf(workgroupGuard); + + typedef typename LeafFunctor::Type_t UJIT; + UJIT B_jit(forEach(B, param_leaf, TreeCombine())); + + 
typedef typename LeafFunctor::Type_t XJIT; + XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine())); + + typedef typename LeafFunctor::Type_t YJIT; + YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine())); + + llvm::Value* r_idx_thread = llvm_thread_idx(); + + workgroupGuard.check(r_idx_thread); + + llvm::Value* r_idx = llvm_array_type_indirection( p_site_table , r_idx_thread ); + + llvm::Value * r_mat = llvm_derefParam( p_mat ); + + auto B_j = B_jit.elem(JitDeviceLayout::Coalesced,r_idx); + auto tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced,r_idx); + auto tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced,r_idx); + + typedef RScalarJIT > R_t; + typedef RComplexJIT > C_t; + + JitSwitch sw(r_mat); + { + /*# gamma( 0) 1 0 0 0 # ( 0000 ) --> 0 */ + /*# 0 1 0 0 */ + /*# 0 0 1 0 */ + /*# 0 0 0 1 */ + /*# From diagonal part */ + sw.case_begin(0); + { + // RComplexREG > lctmp0; + // RScalarREG< WordREG > lr_zero0; + // RScalarREG< WordREG > lrtmp0; + + C_t lctmp0 = StackAllocJit< C_t >::alloc(); + R_t lr_zero0 = StackAllocJit< R_t >::alloc(); + R_t lrtmp0 = StackAllocJit< R_t >::alloc(); + + zero_rep(lr_zero0); + + for(int i0 = 0; i0 < Nc; ++i0) + { + lrtmp0 = tri_dia_j.elem(0).elem(i0); + lrtmp0 += tri_dia_j.elem(0).elem(i0+Nc); + lrtmp0 += tri_dia_j.elem(1).elem(i0); + lrtmp0 += tri_dia_j.elem(1).elem(i0+Nc); + B_j.elem().elem(i0,i0) = cmplx(lrtmp0,lr_zero0); + } + + /*# From lower triangular portion */ + int elem_ij0 = 0; + for(int i0 = 1; i0 < Nc; ++i0) { + + int elem_ijb0 = (i0+Nc)*(i0+Nc-1)/2 + Nc; + + for(int j0 = 0; j0 < i0; ++j0) { + + lctmp0 = tri_off_j.elem(0).elem(elem_ij0); + lctmp0 += tri_off_j.elem(0).elem(elem_ijb0); + lctmp0 += tri_off_j.elem(1).elem(elem_ij0); + lctmp0 += tri_off_j.elem(1).elem(elem_ijb0); + + B_j.elem().elem(j0,i0) = lctmp0; + B_j.elem().elem(i0,j0) = adj(lctmp0); + + elem_ij0++; + elem_ijb0++; + } + } + } + sw.case_end(); + + + /*# gamma( 12) -i 0 0 0 # ( 0011 ) --> 3 */ + /*# 0 i 0 0 */ + /*# 0 0 -i 0 */ + 
/*# 0 0 0 i */ + /*# From diagonal part */ + sw.case_begin( 3 ); + { + // RComplexREG > lctmp3; + // RScalarREG > lr_zero3; + // RScalarREG > lrtmp3; + + C_t lctmp3 = StackAllocJit< C_t >::alloc(); + R_t lr_zero3 = StackAllocJit< R_t >::alloc(); + R_t lrtmp3 = StackAllocJit< R_t >::alloc(); + + lr_zero3 = 0; + + for(int i3 = 0; i3 < Nc; ++i3) { + + lrtmp3 = tri_dia_j.elem(0).elem(i3+Nc); + lrtmp3 -= tri_dia_j.elem(0).elem(i3); + lrtmp3 -= tri_dia_j.elem(1).elem(i3); + lrtmp3 += tri_dia_j.elem(1).elem(i3+Nc); + B_j.elem().elem(i3,i3) = cmplx(lr_zero3,lrtmp3); + } + + /*# From lower triangular portion */ + int elem_ij3 = 0; + for(int i3 = 1; i3 < Nc; ++i3) { + + int elem_ijb3 = (i3+Nc)*(i3+Nc-1)/2 + Nc; + + for(int j3 = 0; j3 < i3; ++j3) { + + lctmp3 = tri_off_j.elem(0).elem(elem_ijb3); + lctmp3 -= tri_off_j.elem(0).elem(elem_ij3); + lctmp3 -= tri_off_j.elem(1).elem(elem_ij3); + lctmp3 += tri_off_j.elem(1).elem(elem_ijb3); + + B_j.elem().elem(j3,i3) = timesI(adj(lctmp3)); + B_j.elem().elem(i3,j3) = timesI(lctmp3); + + elem_ij3++; + elem_ijb3++; + } + } + } + sw.case_end(); + + /*# gamma( 13) 0 -1 0 0 # ( 0101 ) --> 5 */ + /*# 1 0 0 0 */ + /*# 0 0 0 -1 */ + /*# 0 0 1 0 */ + sw.case_begin( 5 ); + { + // RComplexREG > lctmp5; + // RScalarREG > lrtmp5; + + C_t lctmp5 = StackAllocJit< C_t >::alloc(); + R_t lrtmp5 = StackAllocJit< R_t >::alloc(); + + for(int i5 = 0; i5 < Nc; ++i5) { + + int elem_ij5 = (i5+Nc)*(i5+Nc-1)/2; + + for(int j5 = 0; j5 < Nc; ++j5) { + + int elem_ji5 = (j5+Nc)*(j5+Nc-1)/2 + i5; + + lctmp5 = adj(tri_off_j.elem(0).elem(elem_ji5)); + lctmp5 -= tri_off_j.elem(0).elem(elem_ij5); + lctmp5 += adj(tri_off_j.elem(1).elem(elem_ji5)); + lctmp5 -= tri_off_j.elem(1).elem(elem_ij5); + + B_j.elem().elem(i5,j5) = lctmp5; + + elem_ij5++; + } + } + } + sw.case_end(); + + /*# gamma( 23) 0 -i 0 0 # ( 0110 ) --> 6 */ + /*# -i 0 0 0 */ + /*# 0 0 0 -i */ + /*# 0 0 -i 0 */ + sw.case_begin( 6 ); + { + // RComplexREG > lctmp6; + // RScalarREG > lrtmp6; + + C_t lctmp6 = 
StackAllocJit< C_t >::alloc(); + R_t lrtmp6 = StackAllocJit< R_t >::alloc(); + + for(int i6 = 0; i6 < Nc; ++i6) { + + int elem_ij6 = (i6+Nc)*(i6+Nc-1)/2; + + for(int j6 = 0; j6 < Nc; ++j6) { + + int elem_ji6 = (j6+Nc)*(j6+Nc-1)/2 + i6; + + lctmp6 = adj(tri_off_j.elem(0).elem(elem_ji6)); + lctmp6 += tri_off_j.elem(0).elem(elem_ij6); + lctmp6 += adj(tri_off_j.elem(1).elem(elem_ji6)); + lctmp6 += tri_off_j.elem(1).elem(elem_ij6); + + B_j.elem().elem(i6,j6) = timesMinusI(lctmp6); + + elem_ij6++; + } + } + } + sw.case_end(); + + /*# gamma( 14) 0 i 0 0 # ( 1001 ) --> 9 */ + /*# i 0 0 0 */ + /*# 0 0 0 -i */ + /*# 0 0 -i 0 */ + sw.case_begin( 9 ); + { + // RComplexREG > lctmp9; + // RScalarREG > lrtmp9; + + C_t lctmp9 = StackAllocJit< C_t >::alloc(); + R_t lrtmp9 = StackAllocJit< R_t >::alloc(); + + for(int i9 = 0; i9 < Nc; ++i9) { + + int elem_ij9 = (i9+Nc)*(i9+Nc-1)/2; + + for(int j9 = 0; j9 < Nc; ++j9) { + + int elem_ji9 = (j9+Nc)*(j9+Nc-1)/2 + i9; + + lctmp9 = adj(tri_off_j.elem(0).elem(elem_ji9)); + lctmp9 += tri_off_j.elem(0).elem(elem_ij9); + lctmp9 -= adj(tri_off_j.elem(1).elem(elem_ji9)); + lctmp9 -= tri_off_j.elem(1).elem(elem_ij9); + + B_j.elem().elem(i9,j9) = timesI(lctmp9); + + elem_ij9++; + } + } + } + sw.case_end(); + + + /*# gamma( 24) 0 -1 0 0 # ( 1010 ) --> 10 */ + /*# 1 0 0 0 */ + /*# 0 0 0 1 */ + /*# 0 0 -1 0 */ + sw.case_begin( 10 ); + { + // RComplexREG > lctmp10; + // RScalarREG > lrtmp10; + + C_t lctmp10 = StackAllocJit< C_t >::alloc(); + R_t lrtmp10 = StackAllocJit< R_t >::alloc(); + + for(int i10 = 0; i10 < Nc; ++i10) { + + int elem_ij10 = (i10+Nc)*(i10+Nc-1)/2; + + for(int j10 = 0; j10 < Nc; ++j10) { + + int elem_ji10 = (j10+Nc)*(j10+Nc-1)/2 + i10; + + lctmp10 = adj(tri_off_j.elem(0).elem(elem_ji10)); + lctmp10 -= tri_off_j.elem(0).elem(elem_ij10); + lctmp10 -= adj(tri_off_j.elem(1).elem(elem_ji10)); + lctmp10 += tri_off_j.elem(1).elem(elem_ij10); + + B_j.elem().elem(i10,j10) = lctmp10; + + elem_ij10++; + } + } + } + sw.case_end(); + + + /*# 
gamma( 34) i 0 0 0 # ( 1100 ) --> 12 */ + /*# 0 -i 0 0 */ + /*# 0 0 -i 0 */ + /*# 0 0 0 i */ + /*# From diagonal part */ + sw.case_begin( 12 ); + { + // RComplexREG > lctmp12; + // RScalarREG > lr_zero12; + // RScalarREG > lrtmp12; + + C_t lctmp12 = StackAllocJit< C_t >::alloc(); + R_t lr_zero12 = StackAllocJit< R_t >::alloc(); + R_t lrtmp12 = StackAllocJit< R_t >::alloc(); + + lr_zero12 = 0; + + for(int i12 = 0; i12 < Nc; ++i12) { + + lrtmp12 = tri_dia_j.elem(0).elem(i12); + lrtmp12 -= tri_dia_j.elem(0).elem(i12+Nc); + lrtmp12 -= tri_dia_j.elem(1).elem(i12); + lrtmp12 += tri_dia_j.elem(1).elem(i12+Nc); + B_j.elem().elem(i12,i12) = cmplx(lr_zero12,lrtmp12); + } + + /*# From lower triangular portion */ + int elem_ij12 = 0; + for(int i12 = 1; i12 < Nc; ++i12) { + + int elem_ijb12 = (i12+Nc)*(i12+Nc-1)/2 + Nc; + + for(int j12 = 0; j12 < i12; ++j12) { + + lctmp12 = tri_off_j.elem(0).elem(elem_ij12); + lctmp12 -= tri_off_j.elem(0).elem(elem_ijb12); + lctmp12 -= tri_off_j.elem(1).elem(elem_ij12); + lctmp12 += tri_off_j.elem(1).elem(elem_ijb12); + + B_j.elem().elem(i12,j12) = timesI(lctmp12); + B_j.elem().elem(j12,i12) = timesI(adj(lctmp12)); + + elem_ij12++; + elem_ijb12++; + } + } + } + sw.case_end(); + + sw.case_default(); + { + } + sw.case_end(); + } + + jit_get_function(function); + } + + + + + template + void JITCloverTermT::triacntr(U& B, int mat, int cb) const + { + START_CODE(); + + B = zero; + + if ( mat < 0 || mat > 15 ) + { + QDPIO::cerr << __func__ << ": Gamma out of range: mat = " << mat << std::endl; + QDP_abort(1); + } + + //QDPIO::cout << "PTX Clover triacntr " << (void*)this << "\n"; + //std::cout << "PTX Clover triacntr " << (void*)this << "\n"; + static JitFunction function; + + if (function.empty()) + function_triacntr_build( function, B, tri_dia, tri_off, mat, rb[cb] ); + + // Execute the function + function_triacntr_exec(function, B, tri_dia, tri_off, mat, rb[cb] ); + + END_CODE(); + } + + //! 
Returns the appropriate clover coefficient for indices mu and nu + template + Real + JITCloverTermT::getCloverCoeff(int mu, int nu) const + { + START_CODE(); + + if( param.anisoParam.anisoP ) { + if (mu==param.anisoParam.t_dir || nu == param.anisoParam.t_dir) { + return param.clovCoeffT; + } + else { + // Otherwise return the spatial coeff + return param.clovCoeffR; + } + } + else { + // If there is no anisotropy just return the spatial one, it will + // be the same as the temporal one + return param.clovCoeffR; + } + + END_CODE(); + } + + + + template + void function_apply_clov_exec(JitFunction& function, + T& chi, + const T& psi, + const X& tri_dia, + const Y& tri_off, + const Subset& s) + { +#ifdef QDP_DEEP_LOG + function.type_W = typeid(REAL).name(); + function.set_dest_id( chi.getId() ); + function.set_is_lat(true); +#endif + + AddressLeaf addr_leaf(s); + + forEach(chi, addr_leaf, NullCombine()); + forEach(psi, addr_leaf, NullCombine()); + forEach(tri_dia, addr_leaf, NullCombine()); + forEach(tri_off, addr_leaf, NullCombine()); + + int th_count = s.numSiteTable(); + WorkgroupGuardExec workgroupGuardExec(th_count , MG::get(chi.get_layout_ref()).sitesOnNode() ); + + std::vector ids; + workgroupGuardExec.check(ids); + ids.push_back( s.getIdSiteTable() ); + for(unsigned i=0; i < addr_leaf.ids.size(); ++i) + ids.push_back( addr_leaf.ids[i] ); + jit_launch(function,th_count,ids); + } + + + + + template + void function_apply_clov_build( JitFunction& function, + const T& chi, + const T& psi, + const X& tri_dia, + const Y& tri_off, + const Subset& s) + { + llvm_start_new_function("apply_clov",__PRETTY_FUNCTION__); + + WorkgroupGuard workgroupGuard; + ParamRef p_site_table = llvm_add_param(); + + ParamLeaf param_leaf(workgroupGuard); + + typedef typename LeafFunctor::Type_t TJIT; + TJIT chi_jit(forEach(chi, param_leaf, TreeCombine())); + TJIT psi_jit(forEach(psi, param_leaf, TreeCombine())); + // typename REGType< typename ScalarType::Type_t >::Type_t psi_r; + // 
typename REGType< typename ScalarType::Type_t >::Type_t chi_r; + + typedef typename LeafFunctor::Type_t XJIT; + XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine())); + // typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r; + + typedef typename LeafFunctor::Type_t YJIT; + YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine())); + // typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r; + + llvm::Value* r_idx_thread = llvm_thread_idx(); + + workgroupGuard.check(r_idx_thread); + + llvm::Value* r_idx = llvm_array_type_indirection( p_site_table , r_idx_thread ); + + auto chi_j = chi_jit.elem(JitDeviceLayout::Coalesced,r_idx); + auto psi_j = psi_jit.elem(JitDeviceLayout::Coalesced,r_idx); + auto tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced,r_idx); + auto tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced,r_idx); + + auto chi_s = stack_alloc_jit< decltype(chi_j) >( chi_jit.get_var() ); + + int n = 2*Nc; + + for(int i = 0; i < n; ++i) + { + chi_s.elem((0*n+i)/3).elem((0*n+i)%3) = tri_dia_j.elem(0).elem(i) * psi_j.elem((0*n+i)/3).elem((0*n+i)%3); + // cchi[0*n+i] = tri[site].diag[0][i] * ppsi[0*n+i]; + + chi_s.elem((1*n+i)/3).elem((1*n+i)%3) = tri_dia_j.elem(1).elem(i) * psi_j.elem((1*n+i)/3).elem((1*n+i)%3); + // cchi[1*n+i] = tri[site].diag[1][i] * ppsi[1*n+i]; + } + + int kij = 0; + for(int i = 0; i < n; ++i) + { + for(int j = 0; j < i; j++) + { + chi_s.elem((0*n+i)/3).elem((0*n+i)%3) += tri_off_j.elem(0).elem(kij) * psi_j.elem((0*n+j)/3).elem((0*n+j)%3); + // cchi[0*n+i] += tri[site].offd[0][kij] * ppsi[0*n+j]; + + chi_s.elem((0*n+j)/3).elem((0*n+j)%3) += conj(tri_off_j.elem(0).elem(kij)) * psi_j.elem((0*n+i)/3).elem((0*n+i)%3); + // cchi[0*n+j] += conj(tri[site].offd[0][kij]) * ppsi[0*n+i]; + + chi_s.elem((1*n+i)/3).elem((1*n+i)%3) += tri_off_j.elem(1).elem(kij) * psi_j.elem((1*n+j)/3).elem((1*n+j)%3); + // cchi[1*n+i] += tri[site].offd[1][kij] * ppsi[1*n+j]; + + chi_s.elem((1*n+j)/3).elem((1*n+j)%3) += 
conj(tri_off_j.elem(1).elem(kij)) * psi_j.elem((1*n+i)/3).elem((1*n+i)%3); + // cchi[1*n+j] += conj(tri[site].offd[1][kij]) * ppsi[1*n+i]; + + kij++; + } + } + + chi_j = chi_s; + + jit_get_function(function); + } + + + + + + + + /** + * Apply a dslash + * + * Performs the operation + * + * chi <- (L + D + L^dag) . psi + * + * where + * L is a lower triangular matrix + * D is the real diagonal. (stored together in type TRIANG) + * + * Arguments: + * \param chi result (Write) + * \param psi source (Read) + * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read) + * \param cb Checkerboard of OUTPUT std::vector (Read) + */ + template + void JITCloverTermT::apply(T& chi, const T& psi, + enum PlusMinus isign, int cb) const + { + START_CODE(); + + if ( Ns != 4 ) { + QDPIO::cerr << __func__ << ": CloverTerm::apply requires Ns==4" << std::endl; + QDP_abort(1); + } + + //QDPIO::cout << "PTX Clover apply" << (void*)this << "\n"; + //std::cout << "PTX Clover apply" << (void*)this << "\n"; + static JitFunction function; + + if (function.empty()) + function_apply_clov_build( function, chi, psi, tri_dia, tri_off, rb[cb] ); + + // Execute the function + function_apply_clov_exec(function, chi, psi, tri_dia, tri_off, rb[cb] ); + + (*this).getFermBC().modifyF(chi, QDP::rb[cb]); + + END_CODE(); + } + + + +#ifndef BUILD_QUDA_DEVIFACE_CLOVER + namespace QDPCloverEnv { + template + struct QUDAPackArgs { + int cb; + multi1d >& quda_array; + const TD& tri_dia; + const TO& tri_off; + }; + + template + void qudaPackSiteLoop(int lo, int hi, int myId, QUDAPackArgs* a) { + int cb = a->cb; + int Ns2 = Ns/2; + + multi1d >& quda_array = a->quda_array; + + const TD& tri_dia = a->tri_dia; + const TO& tri_off = a->tri_off; + + const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14}; + + for(int ssite=lo; ssite < hi; ++ssite) { + int site = rb[cb].siteTable()[ssite]; + // First Chiral Block + for(int i=0; i < 6; i++) { + quda_array[site].diag1[i] = tri_dia.elem(site).comp[0].diag[i].elem().elem(); 
+ } + + int target_index=0; + + for(int col=0; col < Nc*Ns2-1; col++) { + for(int row=col+1; row < Nc*Ns2; row++) { + + int source_index = row*(row-1)/2 + col; + + quda_array[site].offDiag1[target_index][0] = tri_off.elem(site).comp[0].offd[source_index].real().elem(); + quda_array[site].offDiag1[target_index][1] = tri_off.elem(site).comp[0].offd[source_index].imag().elem(); + target_index++; + } + } + // Second Chiral Block + for(int i=0; i < 6; i++) { + quda_array[site].diag2[i] = tri_dia.elem(site).comp[1].diag[i].elem().elem(); + } + + target_index=0; + for(int col=0; col < Nc*Ns2-1; col++) { + for(int row=col+1; row < Nc*Ns2; row++) { + + int source_index = row*(row-1)/2 + col; + + quda_array[site].offDiag2[target_index][0] = tri_off.elem(site).comp[1].offd[source_index].real().elem(); + quda_array[site].offDiag2[target_index][1] = tri_off.elem(site).comp[1].offd[source_index].imag().elem(); + target_index++; + } + } + } + QDPIO::cout << "\n"; + } + } + + template + void JITCloverTermT::packForQUDA(multi1d::Type_t> >& quda_array, int cb) const + { + typedef typename WordType::Type_t REALT; + int num_sites = rb[cb].siteTable().size(); + + typedef OLattice > > > > TD; + typedef OLattice > > > > TO; + + StopWatch watch; + watch.start(); + + QDPCloverEnv::QUDAPackArgs args = { cb, quda_array , tri_dia , tri_off }; + dispatch_to_threads(num_sites, args, QDPCloverEnv::qudaPackSiteLoop); + + watch.stop(); + PackForQUDATimer::Instance().get() += watch.getTimeInMicroseconds(); + } + +#endif + + template + void JITCloverTermT::applySite(T& chi, const T& psi, + enum PlusMinus isign, int site) const + { + QDP_error_exit("JITCloverTermT::applySite(T& chi, const T& psi,..) 
not implemented "); + } + + typedef JITCloverTermT JITCloverTerm; + typedef JITCloverTermT JITCloverTermF; + typedef JITCloverTermT JITCloverTermD; +} // End Namespace Chroma + + + +#endif + +#endif diff --git a/lib/actions/ferm/linop/clover_term_jit_w.h b/lib/actions/ferm/linop/clover_term_jit_w.h index d62ebb3ee1..7c4c3dc631 100644 --- a/lib/actions/ferm/linop/clover_term_jit_w.h +++ b/lib/actions/ferm/linop/clover_term_jit_w.h @@ -13,7 +13,7 @@ #include "actions/ferm/linop/clover_term_base_w.h" #include "meas/glue/mesfield.h" - +#if ! defined (QDP_IS_QDPJIT2) namespace QDP { @@ -1911,4 +1911,5 @@ namespace Chroma +#endif #endif diff --git a/lib/actions/ferm/linop/clover_term_w.h b/lib/actions/ferm/linop/clover_term_w.h index 7823344885..ed8151ec44 100644 --- a/lib/actions/ferm/linop/clover_term_w.h +++ b/lib/actions/ferm/linop/clover_term_w.h @@ -47,7 +47,11 @@ namespace Chroma { } #elif defined(BUILD_JIT_CLOVER_TERM) +#if ! defined (QDP_IS_QDPJIT2) #include "clover_term_jit_w.h" +#else +#include "clover_term_jit2_w.h" +#endif namespace Chroma { using CloverTerm = JITCloverTerm; using CloverTermF = JITCloverTermF; diff --git a/lib/actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.cc b/lib/actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.cc index ff66df312c..90f334874a 100644 --- a/lib/actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.cc +++ b/lib/actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.cc @@ -10,6 +10,8 @@ #include "actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! 
defined (QDP_IS_QDPJIT2) + using namespace QDP::Hints; namespace Chroma @@ -318,6 +320,8 @@ namespace Chroma } // End Namespace Chroma +#endif + #endif #endif #endif diff --git a/lib/actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.h b/lib/actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.h index 6496b20f52..b7bbc9336a 100644 --- a/lib/actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.h +++ b/lib/actions/ferm/linop/eo3dprec_s_cprec_t_clover_linop_w.h @@ -13,6 +13,9 @@ #include "actions/ferm/linop/central_tprec_nospin_utils.h" #include "actions/ferm/linop/clover_term_w.h" #include "actions/ferm/invert/invcg2.h" + +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { @@ -301,6 +304,8 @@ namespace Chroma } // End Namespace Chroma +#endif + #endif #endif #endif diff --git a/lib/actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.cc b/lib/actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.cc index 5c59fbda64..6d3598f43d 100644 --- a/lib/actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.cc +++ b/lib/actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.cc @@ -11,6 +11,8 @@ #include "actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + using namespace QDP::Hints; namespace Chroma @@ -321,6 +323,8 @@ namespace Chroma } // End Namespace Chroma +#endif + #endif #endif #endif diff --git a/lib/actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.h b/lib/actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.h index 373b1cc2ce..1aab09dfd3 100644 --- a/lib/actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.h +++ b/lib/actions/ferm/linop/eo3dprec_s_cprec_t_wilson_linop_w.h @@ -13,6 +13,8 @@ #include "actions/ferm/linop/dslash_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { //! 
Wilson Dirac Operator - Unpreconditioned in Space, Centrally Preconditioned in time @@ -252,6 +254,7 @@ namespace Chroma } // End Namespace Chroma +#endif #endif #endif diff --git a/lib/actions/ferm/linop/eoprec_exp_clover_linop_w.cc b/lib/actions/ferm/linop/eoprec_exp_clover_linop_w.cc new file mode 100644 index 0000000000..907f3b22e8 --- /dev/null +++ b/lib/actions/ferm/linop/eoprec_exp_clover_linop_w.cc @@ -0,0 +1,351 @@ +/*! \file + * \brief Even-odd preconditioned exponentiated clover linear operator + */ + +#include "actions/ferm/linop/eoprec_exp_clover_linop_w.h" + + + +namespace Chroma +{ + + using namespace QDP::Hints; + + //! Creation routine with Anisotropy + /*! + * \param u_ gauge field (Read) + * \param param_ fermion kappa (Read) + */ + void EvenOddPrecExpCloverLinOp::create(Handle< FermState > fs, + const CloverFermActParams& param_) + { + START_CODE(); + // QDPIO::cout << __PRETTY_FUNCTION__ << ": enter" << std::endl; + + QDPIO::cout << "Using even-odd preconditioned exponentiated clover\n"; + + param = param_; + + clov.create(fs, param); + + invclov.create(fs,param,clov); // make a copy + invclov.choles(0); // invert the cb=0 part + + D.create(fs, param.anisoParam); + + clov_deriv_time = 0; + clov_apply_time = 0; + + moveToFastMemoryHint(tmp1); + moveToFastMemoryHint(tmp2); + + // QDPIO::cout << __PRETTY_FUNCTION__ << ": exit" << std::endl; + END_CODE(); + } + + //! Apply the the odd-odd block onto a source std::vector + void + EvenOddPrecExpCloverLinOp::oddOddLinOp(LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + + swatch.reset(); swatch.start(); + clov.apply(chi, psi, isign, 1); + chi *= (Real(Nd) + param.Mass); + + swatch.stop(); + clov_apply_time += swatch.getTimeInSeconds(); + + END_CODE(); + } + + + //! 
Apply the the even-even block onto a source std::vector + void + EvenOddPrecExpCloverLinOp::evenEvenLinOp(LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + + // Nuke for testing + swatch.reset(); swatch.start(); + clov.apply(chi, psi, isign, 0); + chi *= (Real(Nd) + param.Mass); + + swatch.stop(); + clov_apply_time += swatch.getTimeInSeconds(); + + END_CODE(); + } + + //! Apply the inverse of the even-even block onto a source std::vector + void + EvenOddPrecExpCloverLinOp::evenEvenInvLinOp(LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + + swatch.reset(); swatch.start(); + + clov.applyInv(chi, psi, isign, 0); + chi /= (Real(Nd) + param.Mass); + + swatch.stop(); + clov_apply_time += swatch.getTimeInSeconds(); + + END_CODE(); + } + + + //! Apply even-odd linop component + /*! + * The operator acts on the entire even sublattice + * + * \param chi Pseudofermion field (Write) + * \param psi Pseudofermion field (Read) + * \param isign Flag ( PLUS | MINUS ) (Read) + */ + void + EvenOddPrecExpCloverLinOp::evenOddLinOp(LatticeFermion& chi, + const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + + Real mhalf = -0.5; + + D.apply(chi, psi, isign, 0); + chi[rb[0]] *= mhalf; + + END_CODE(); + } + + //! Apply odd-even linop component + /*! + * The operator acts on the entire odd sublattice + * + * \param chi Pseudofermion field (Write) + * \param psi Pseudofermion field (Read) + * \param isign Flag ( PLUS | MINUS ) (Read) + */ + void + EvenOddPrecExpCloverLinOp::oddEvenLinOp(LatticeFermion& chi, + const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + + Real mhalf = -0.5; + + D.apply(chi, psi, isign, 1); + chi[rb[1]] *= mhalf; + + END_CODE(); + } + + + //! Apply even-odd preconditioned ExpClover fermion linear operator + /*! 
+ * \param chi Pseudofermion field (Write) + * \param psi Pseudofermion field (Read) + * \param isign Flag ( PLUS | MINUS ) (Read) + */ + void EvenOddPrecExpCloverLinOp::operator()(LatticeFermion & chi, + const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + + Real mquarter = -0.25; + + // tmp1_o = D_oe A^(-1)_ee D_eo psi_o + D.apply(tmp1, psi, isign, 0); + + swatch.reset(); swatch.start(); + clov.applyInv(tmp2, tmp1, isign, 0); + tmp2 /= (Real(Nd) + param.Mass); + + swatch.stop(); + clov_apply_time += swatch.getTimeInSeconds(); + + D.apply(tmp1, tmp2, isign, 1); + + // chi_o = A_oo psi_o - tmp1_o + swatch.reset(); swatch.start(); + clov.apply(chi, psi, isign, 1); + chi *= (Real(Nd) + param.Mass); + + swatch.stop(); + clov_apply_time += swatch.getTimeInSeconds(); + + chi[rb[1]] += mquarter*tmp1; + + // Twisted Term? + if( param.twisted_m_usedP ){ + // tmp1 = i mu gamma_5 tmp1 + tmp1[rb[1]] = (Gamma(15) * timesI(psi)); + + if( isign == PLUS ) { + chi[rb[1]] += param.twisted_m * tmp1; + } + else { + chi[rb[1]] -= param.twisted_m * tmp1; + } + } + + END_CODE(); + } + + + //! Apply the even-even block onto a source std::vector + void + EvenOddPrecExpCloverLinOp::derivEvenEvenLinOp(multi1d& ds_u, + const LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + + swatch.reset(); swatch.start(); + clov.deriv(ds_u, chi, psi, isign, 0); + for (int mu = 0; mu < Nd; mu++) + { + ds_u[mu] *= (Real(Nd) + param.Mass); + } + + swatch.stop(); + clov_deriv_time += swatch.getTimeInSeconds(); + + END_CODE(); + } + + //! 
Apply the even-even block onto a source std::vector + void + EvenOddPrecExpCloverLinOp::derivEvenEvenLinOpMP(multi1d& ds_u, + const multi1d& chi, const multi1d& psi, + enum PlusMinus isign) const + { + START_CODE(); + + swatch.reset(); swatch.start(); + clov.derivMultipole(ds_u, chi, psi, isign, 0); + + for (int mu = 0; mu < Nd; mu++) + { + ds_u[mu] *= (Real(Nd) + param.Mass); + } + + swatch.stop(); + clov_deriv_time += swatch.getTimeInSeconds(); + + END_CODE(); + } + + //! Apply the even-even block onto a source std::vector + void + EvenOddPrecExpCloverLinOp::derivLogDetEvenEvenLinOp(multi1d& ds_u, + enum PlusMinus isign) const + { + START_CODE(); + + //invclov.derivTrLn(ds_u, isign, 0); + // Testing Odd Odd Term - get nothing from even even term + clov.derivTrLn(ds_u, isign, 0); + for (int mu = 0; mu < Nd; mu++) + { + ds_u[mu] *= (Real(Nd) + param.Mass); + } + + + END_CODE(); + } + + //! Apply the the even-odd block onto a source std::vector + void + EvenOddPrecExpCloverLinOp::derivEvenOddLinOp(multi1d& ds_u, + const LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + ds_u.resize(Nd); + D.deriv(ds_u, chi, psi, isign, 0); + for(int mu=0; mu < Nd; mu++) { + ds_u[mu] *= Real(-0.5); + } + END_CODE(); + } + + //! Apply the the odd-even block onto a source std::vector + void + EvenOddPrecExpCloverLinOp::derivOddEvenLinOp(multi1d& ds_u, + const LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + ds_u.resize(Nd); + + D.deriv(ds_u, chi, psi, isign, 1); + for(int mu=0; mu < Nd; mu++) { + ds_u[mu] *= Real(-0.5); + } + END_CODE(); + } + + // Inherit this + //! 
Apply the the odd-odd block onto a source std::vector + void + EvenOddPrecExpCloverLinOp::derivOddOddLinOp(multi1d& ds_u, + const LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const + { + START_CODE(); + + swatch.reset(); swatch.start(); + clov.deriv(ds_u, chi, psi, isign, 1); + for (int mu = 0; mu < Nd; mu++) + { + ds_u[mu] *= (Real(Nd) + param.Mass); + } + + swatch.stop(); + clov_deriv_time += swatch.getTimeInSeconds(); + + END_CODE(); + } + + void + EvenOddPrecExpCloverLinOp::derivOddOddLinOpMP(multi1d& ds_u, + const multi1d& chi, const multi1d& psi, + enum PlusMinus isign) const + { + START_CODE(); + + swatch.reset(); swatch.start(); + clov.derivMultipole(ds_u, chi, psi, isign, 1); + for (int mu = 0; mu < Nd; mu++) + { + ds_u[mu] *= (Real(Nd) + param.Mass); + } + + swatch.stop(); + clov_deriv_time += swatch.getTimeInSeconds(); + + END_CODE(); + } + + //! Return flops performed by the operator() + unsigned long EvenOddPrecExpCloverLinOp::nFlops() const + { + unsigned long cbsite_flops = 2*D.nFlops()+2*clov.nFlops()+4*Nc*Ns; + if( param.twisted_m_usedP ) { + cbsite_flops += 4*Nc*Ns; // a + mu*b : a = chi, b = g_5 I psi + } + return cbsite_flops*(Layout::sitesOnNode()/2); + } + + //! Get the log det of the even even part + // BUt for now, return zero for testing. + Double EvenOddPrecExpCloverLinOp::logDetEvenEvenLinOp(void) const { + return invclov.cholesDet(0); + } +} // End Namespace Chroma diff --git a/lib/actions/ferm/linop/eoprec_exp_clover_linop_w.h b/lib/actions/ferm/linop/eoprec_exp_clover_linop_w.h new file mode 100644 index 0000000000..f4789d34d4 --- /dev/null +++ b/lib/actions/ferm/linop/eoprec_exp_clover_linop_w.h @@ -0,0 +1,141 @@ +// -*- C++ -*- +/*! 
\file + * \brief Even-odd preconditioned ExpClover fermion linear operator + */ + +#ifndef __prec_exp_clover_linop_w_h__ +#define __prec_exp_clover_linop_w_h__ + +#include "state.h" +#include "fermbc.h" +#include "eoprec_logdet_linop.h" +#include "actions/ferm/fermacts/clover_fermact_params_w.h" +#include "actions/ferm/linop/dslash_w.h" +#include "actions/ferm/linop/exp_clover_term_w.h" + + +namespace Chroma +{ + //! Even-odd preconditioned ExpClover-Dirac operator + /*! + * \ingroup linop + * + * This routine is specific to Wilson fermions! + * + * The kernel for ExpClover fermions is + * + * M = A + (d+M) - (1/2) D' + */ + class EvenOddPrecExpCloverLinOp : public EvenOddPrecLogDetLinearOperator, multi1d > + { + public: + // Typedefs to save typing + typedef LatticeFermion T; + typedef multi1d P; + typedef multi1d Q; + + //! Partial constructor + EvenOddPrecExpCloverLinOp() {} + + //! Full constructor + EvenOddPrecExpCloverLinOp(Handle< FermState > fs, + const CloverFermActParams& param_) + { + create(fs,param_); + } + + //! Destructor is automatic + ~EvenOddPrecExpCloverLinOp() { + QDPIO::cout << "EXP_CLOV_LINOP: Time spent in clov deriv (total) = " << clov_deriv_time << std::endl; + QDPIO::cout << "EXP_CLOV_LINOP: Time spent in clov apply/invapply (total) = " << clov_apply_time << std::endl; + + } + + //! Return the fermion BC object for this linear operator + const FermBC& getFermBC() const {return D.getFermBC();} + + //! Creation routine + void create(Handle< FermState > fs, + const CloverFermActParams& param_); + + //! Apply the the even-even block onto a source std::vector + void evenEvenLinOp(LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + //! Apply the inverse of the even-even block onto a source std::vector + void evenEvenInvLinOp(LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + //! 
Apply the the even-odd block onto a source std::vector + void evenOddLinOp(LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + //! Apply the the odd-even block onto a source std::vector + void oddEvenLinOp(LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + //! Apply the the odd-odd block onto a source std::vector + void oddOddLinOp(LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + // Override inherited one with a few more funkies + void operator()(LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + //! Apply the even-even block onto a source std::vector + void derivEvenEvenLinOp(multi1d& ds_u, + const LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + //! Apply the even-even block onto a source std::vector + void derivEvenEvenLinOpMP(multi1d& ds_u, + const multi1d& chi, const multi1d& psi, + enum PlusMinus isign) const; + + void derivLogDetEvenEvenLinOp(multi1d& ds_u, + enum PlusMinus isign) const; + + //! Apply the the even-odd block onto a source std::vector + void derivEvenOddLinOp(multi1d& ds_u, + const LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + //! Apply the the odd-even block onto a source std::vector + void derivOddEvenLinOp(multi1d& ds_u, + const LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + //! Apply the the odd-odd block onto a source std::vector + void derivOddOddLinOp(multi1d& ds_u, + const LatticeFermion& chi, const LatticeFermion& psi, + enum PlusMinus isign) const; + + void derivOddOddLinOpMP(multi1d& ds_u, + const multi1d& chi, const multi1d& psi, + enum PlusMinus isign) const; + + //! Return flops performed by the operator() + unsigned long nFlops() const; + + //! 
Get the log det of the even even part + Double logDetEvenEvenLinOp(void) const; + + private: + mutable LatticeFermion tmp1; + mutable LatticeFermion tmp2; + CloverFermActParams param; + WilsonDslash D; + ExpCloverTerm clov; + ExpCloverTerm invclov; // uggh, only needed for evenEvenLinOp + mutable double clov_apply_time; + mutable double clov_deriv_time; + mutable StopWatch swatch; + }; + + + +} // End Namespace Chroma + + +#endif diff --git a/lib/actions/ferm/linop/exp_clover_term_base_w.h b/lib/actions/ferm/linop/exp_clover_term_base_w.h index 397624265b..dcf80cccbb 100644 --- a/lib/actions/ferm/linop/exp_clover_term_base_w.h +++ b/lib/actions/ferm/linop/exp_clover_term_base_w.h @@ -8,7 +8,7 @@ #include "chroma_config.h" #include "linearop.h" - +#include "actions/ferm/linop/clover_term_base_w.h" namespace Chroma { @@ -19,9 +19,7 @@ namespace Chroma */ template - class ExpCloverTermBase : public DslashLinearOperator, - multi1d > + class ExpCloverTermBase : public CloverTermBase< T, U> { public: //! No real need for cleanup here @@ -847,7 +845,9 @@ namespace Chroma // Get weight*Tr_spin gamma_mu gamma_nu A^{-1} piece triacntr(sigma_XY_dag, mu_nu_index, cb); - sigma_XY_dag[rb[cb]] *= factor; + //sigma_XY_dag[rb[cb]] *= factor; + + sigma_XY_dag[rb[cb]] *= factor*0; // These will be overwritten so no need to initialize to zero U ds_tmp_mu; diff --git a/lib/actions/ferm/linop/exp_clover_term_qdp_w.h b/lib/actions/ferm/linop/exp_clover_term_qdp_w.h index b16c545a0b..5fac4fb168 100644 --- a/lib/actions/ferm/linop/exp_clover_term_qdp_w.h +++ b/lib/actions/ferm/linop/exp_clover_term_qdp_w.h @@ -309,6 +309,10 @@ namespace Chroma #endif #if 1 + //Set the highest power of A^n for the exp sum. 
This allows for N_exp_default < 5 to compare with clover + int pow_max=5; + if (N_exp_default <5) + pow_max=N_exp_default; // Accumulate exponential from only A RComplex tmp[12]; @@ -320,7 +324,7 @@ namespace Chroma } // Main loop: chi = psi + q[i]/q[i-1] A chi - for (int pow = 5; pow > 0; --pow) + for (int pow = pow_max; pow > 0; --pow) { siteApplicationBlock(tmp, tri_in.A, cchi); for (int cspin = 0; cspin < 6; cspin++) @@ -399,6 +403,23 @@ namespace Chroma #endif } + //Apply coefficient to site + template + inline void siteApplicationCoeff(RComplex* __restrict__ cchi, const ExpClovTriang& tri_in, + int pow_i,int pow_j, + const RComplex* const __restrict__ ppsi) + { + + // Top block + for (int cspin = 0; cspin < 6; cspin++) + cchi[cspin] = tri_in.C[0][pow_i][pow_j] * ppsi[cspin]; + + // Second Block + for (int cspin = 6; cspin < 12; cspin++) + cchi[cspin] = tri_in.C[1][pow_i][pow_j]* ppsi[cspin]; + } + + template inline void siteApplicationPower(RComplex* __restrict__ cchi, const ExpClovTriang& tri_in, @@ -501,17 +522,73 @@ namespace Chroma * \param cb Checkerboard of OUTPUT std::vector (Read) */ + //! Take deriv of D^power + /*! + * \param chi left std::vector on cb (Read) + * \param psi right std::vector on 1-cb (Read) + * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read) + * \param cb Checkerboard of chi std::vector (Read) + * + * \return Computes \f$chi^\dag * \dot(D} * psi\f$ + */ + //! Take deriv of D + /*! + * \param chi left std::vector (Read) + * \param psi right std::vector (Read) + * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read) + * + * \return Computes \f$chi^\dag * \dot(D} * psi\f$ + */ + void deriv(multi1d& ds_u, + const T& chi, const T& psi, + enum PlusMinus isign) const;//{ExpCloverTermBase::deriv(ds_u,chi,psi,isign);} + + void deriv(multi1d& ds_u, + const T& chi, const T& psi, + enum PlusMinus isign, int cb) const; + + //! Take deriv of D + /*! 
+ * \param chi left vectors (Read) + * \param psi right vectors (Read) + * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read) + * \param cb Checkerboard of chi std::vector (Read) + * + * \return Computes \f$chi^\dag * \dot(D} * psi\f$ + */ + void derivMultipole(multi1d& ds_u, + const multi1d& chi, const multi1d& psi, + enum PlusMinus isign) const; + + //! Take deriv of D + /*! + * \param chi left vectors on cb (Read) + * \param psi right vectors on cb (Read) + * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read) + * \param cb Checkerboard of chi std::vector (Read) + * + * \return Computes \f$chi^\dag * \dot(D} * psi\f$ + */ + + void derivMultipole(multi1d& ds_u, + const multi1d& chi, const multi1d& psi, + enum PlusMinus isign, int cb) const; + + void fillRefDiag(Real diag); // Reference exponential using old fashioned taylor expansion void applyRef(T& chi, const T& psi, enum PlusMinus isign, int N = N_exp) const; - // Appl;y a power of a matrix from A^0 to A^5 + // Apply a power of a matrix from A^0 to A^5 void applyPowerSite(T& chi, const T& psi, enum PlusMinus isign, int site, int power = 1) const; - // Appl;y a power of a matrix from A^0 to A^5 + // Apply a power of a matrix from A^0 to A^5 void applyPower(T& chi, const T& psi, enum PlusMinus isign, int cb, int power = 1) const; + // Apply coefficients to powers of a matrix A + void applyCoeff(T& chi, const T& psi, enum PlusMinus isign,int cb, int pow_i, int pow_j) const; + // Apply exponential operator void apply(T& chi, const T& psi, enum PlusMinus isign, int cb) const override; @@ -1021,6 +1098,169 @@ namespace Chroma #endif } + //! Take deriv of D + /*! + * \param chi left std::vector (Read) + * \param psi right std::vector (Read) + * \param isign D'^dag or D' ( MINUS | PLUS ) resp. 
(Read) + * + * \return Computes \f$\chi^\dag * \dot(D} * \psi\f$ + */ + template + void QDPExpCloverTermT::deriv(multi1d& ds_u, + const T& chi, const T& psi, + enum PlusMinus isign) const + { + START_CODE(); + + // base deriv resizes. + // Even even checkerboard + deriv(ds_u, chi, psi, isign,0); + + // Odd Odd checkerboard + multi1d ds_tmp; + deriv(ds_tmp, chi, psi, isign,1); + + ds_u += ds_tmp; + + END_CODE(); + } + + //! Take deriv of D + /*! + * \param chi left std::vector on cb (Read) + * \param psi right std::vector on 1-cb (Read) + * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read) + * \param cb Checkerboard of chi std::vector (Read) + * + * \return Computes \f$\chi^\dag * \dot(D} * \psi\f$ + */ + + template + void QDPExpCloverTermT::deriv(multi1d& ds_u, + const T& chi, const T& psi, + enum PlusMinus isign, int cb) const + { + START_CODE(); + + // Do I still need to do this? + if( ds_u.size() != Nd ) { + ds_u.resize(Nd); + } + + ds_u = zero; + multi1d ds_u_tmp; + ds_u_tmp.resize(Nd); + + // Get the links + //const multi1d& u = getU(); + + T ppsi= zero; + T cchi= zero; + T f_chi= zero; + f_chi=chi; + + // The exp derivative is computed as + // A'+AA'/2+A'A/2+A'AA/6+AA'A/6+AAA'/6 = Sum A^i A' A^j + // applyCoeff multiplies the chi by the exponential term factor + // and the factors from using the Caley Hamilton for A^n, for n>5 + + for(int i=0;i<=5;i++){ + for(int j=0;j<=5;j++){ + (*this).applyCoeff(f_chi, chi, isign,cb,i,j); + (*this).applyPower(ppsi, psi, PLUS, cb, j); + (*this).applyPower(cchi, f_chi, PLUS, cb,i); + + CloverTermBase::deriv(ds_u_tmp,cchi,ppsi,isign,cb); + + for(int i=0;i + void QDPExpCloverTermT::derivMultipole(multi1d& ds_u, + const multi1d& chi, const multi1d& psi, + enum PlusMinus isign) const + { + START_CODE(); + + // base deriv resizes. 
+ // Even even checkerboard + derivMultipole(ds_u, chi, psi, isign,0); + + // Odd Odd checkerboard + multi1d ds_tmp; + derivMultipole(ds_tmp, chi, psi, isign,1); + + ds_u += ds_tmp; + + END_CODE(); + } + + template + void QDPExpCloverTermT::derivMultipole(multi1d& ds_u, + const multi1d& chi, const multi1d& psi, + enum PlusMinus isign, int cb) const + { + START_CODE(); + + + // Do I still need to do this? + if( ds_u.size() != Nd ) { + ds_u.resize(Nd); + } + + ds_u = zero; + multi1d ds_u_tmp; + ds_u_tmp.resize(Nd); + + // Get the links + //const multi1d& u = getU(); + + multi1d ppsi,cchi,f_chi; + + f_chi.resize(chi.size()); + cchi.resize(chi.size()); + ppsi.resize(chi.size()); + + for(int i=0;i5 + + for(int i=0;i<=5;i++){ + for(int j=0;j<=5;j++){ + for(int k=0;k::derivMultipole(ds_u_tmp,cchi,ppsi,isign,cb); + + for(int i=0;i(sign * tab[block][row][i] / (REALT)(fact)); } } - } + + //HMC: adding the calculation of the C_ij + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + tri[site].C[block][i][j] = RScalar(tab[block][0][i])*RScalar(tab[block][0][j]); + } + } + + fact = 1; + unsigned long fact_row = 1; + + for (unsigned int row = 0; row <= N_exp; ++row) + { + if (row!=0) + fact_row *= (unsigned long)(row); + fact=fact_row*(unsigned long)(row+1); + for(unsigned int col = 0; col <= N_exp-row; ++col) + { + if(row!=0 || col!=0) //row=0, col=0 computed above + { + + //This is the factor on the exp = c_n x^n, for the derivative of the n-term x^row x'x^col + //the factor is row+col+1,where row+col=n-1 + if( col !=0) + fact *= (unsigned long)(row+col+1); + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + tri[site].C[block][i][j] += RScalar(tab[block][col][j])*RScalar(tab[block][row][i] / (REALT)(fact)); + } + } + } + } + } + + }//for block ends // Assemble te exponential from the q-s and powers of A. 
// siteExponentiate(tri[site]); @@ -1407,8 +1685,23 @@ namespace Chroma const ExpClovTriang* tri; int cb; int power = 1; + ApplyPowerArgs(T& _chi, const T& _psi,const ExpClovTriang* _tri,int _cb,int _power) : chi(_chi), psi(_psi), tri(_tri), cb(_cb), power(_power) {} }; + template + struct ApplyDerivCoeffArgs { + typedef typename WordType::Type_t REALT; + T& chi; + const T& psi; + const ExpClovTriang* tri; + int cb; + int pow_i = 1; + int pow_j = 1; + ApplyDerivCoeffArgs(T& _chi, const T& _psi,const ExpClovTriang* _tri,int _cb,int _pow_i, int _pow_j) : chi(_chi), psi(_psi), tri(_tri), cb(_cb), pow_i(_pow_i), pow_j(_pow_j) {} + + }; + + template void applySitePowerLoop(int lo, int hi, int MyId, ApplyPowerArgs* arg) { @@ -1452,6 +1745,43 @@ namespace Chroma int cb; }; + template + void applySiteCoeffLoop(int lo, int hi, int MyId, ApplyDerivCoeffArgs* arg) + { +#ifndef QDP_IS_QDPJIT + // This is essentially the body of the previous "Apply" + // but now the args are handed in through user arg struct... + + START_CODE(); + + typedef typename WordType::Type_t REALT; + // Unwrap the args... 
+ T& chi = arg->chi; + const T& psi = arg->psi; + const ExpClovTriang* tri = arg->tri; + int cb = arg->cb; + int pow_i = arg->pow_i; + int pow_j = arg->pow_j; + const int n = 2 * Nc; + + for (int ssite = lo; ssite < hi; ++ssite) + { + + int site = rb[cb].siteTable()[ssite]; + + RComplex* cchi = (RComplex*)&(chi.elem(site).elem(0).elem(0)); + + const RComplex* const ppsi = + (const RComplex* const) & (psi.elem(site).elem(0).elem(0)); + + siteApplicationCoeff(cchi, tri[site], pow_i,pow_j, ppsi); + } + END_CODE(); +#endif + } // Function + + + template void applySiteLoop(int lo, int hi, int MyId, ApplyArgs* arg) { @@ -1546,6 +1876,30 @@ namespace Chroma #endif } + template + void QDPExpCloverTermT::applyCoeff(T& chi, const T& psi, enum PlusMinus isign,int cb, int pow_i, int pow_j) const + { +#ifndef QDP_IS_QDPJIT + START_CODE(); + + if (Ns != 4) + { + QDPIO::cerr << __func__ << ": CloverTerm::apply requires Ns==4" << std::endl; + QDP_abort(1); + } + + QDPExpCloverEnv::ApplyDerivCoeffArgs arg = {chi, psi, tri, cb, pow_i, pow_j}; + int num_sites = rb[cb].siteTable().size(); + + // The dispatch function is at the end of the file + // ought to work for non-threaded targets too... 
+ dispatch_to_threads(num_sites, arg, QDPExpCloverEnv::applySiteCoeffLoop); + (*this).getFermBC().modifyF(chi, QDP::rb[cb]); + + END_CODE(); +#endif + } + template void QDPExpCloverTermT::apply(T& chi, const T& psi, enum PlusMinus isign, int cb) const diff --git a/lib/actions/ferm/linop/exp_clover_term_w.h b/lib/actions/ferm/linop/exp_clover_term_w.h index f56cfbf999..74a8f25be7 100644 --- a/lib/actions/ferm/linop/exp_clover_term_w.h +++ b/lib/actions/ferm/linop/exp_clover_term_w.h @@ -20,5 +20,9 @@ namespace Chroma using ExpCloverTermF = QDPExpCloverTermF<>; using ExpCloverTermD = QDPExpCloverTermD<>; + template + using ExpCloverTermT = QDPExpCloverTermT; + } + #endif diff --git a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.cc b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.cc index 844deef993..b7a4252606 100644 --- a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.cc +++ b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.cc @@ -12,6 +12,8 @@ #include "actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + using namespace QDP::Hints; namespace Chroma @@ -260,3 +262,5 @@ namespace Chroma #endif #endif #endif + +#endif diff --git a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.h b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.h index 30b34bd8ba..c2bb377b86 100644 --- a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.h +++ b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_clover_linop_w.h @@ -13,6 +13,8 @@ #include "actions/ferm/linop/clover_term_w.h" #include "actions/ferm/linop/ilu2prec_s_cprec_t_wilsonlike_linop_w.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { //! 
Clover Dirac Operator - Unpreconditioned in Space, Centrally Preconditioned in time @@ -171,3 +173,5 @@ namespace Chroma #endif #endif + +#endif diff --git a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.cc b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.cc index 064fc8eb36..57966d164c 100644 --- a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.cc +++ b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.cc @@ -11,6 +11,8 @@ #include "actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + using namespace QDP::Hints; namespace Chroma @@ -209,3 +211,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.h b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.h index fb76968064..9ae62d1b7e 100644 --- a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.h +++ b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilson_linop_w.h @@ -14,6 +14,8 @@ #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { //! Wilson Dirac Operator - Unpreconditioned in Space, Centrally Preconditioned in time @@ -190,3 +192,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilsonlike_linop_w.h b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilsonlike_linop_w.h index aff75d0509..facda784a5 100644 --- a/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilsonlike_linop_w.h +++ b/lib/actions/ferm/linop/ilu2prec_s_cprec_t_wilsonlike_linop_w.h @@ -12,6 +12,8 @@ #include "actions/ferm/linop/dslash_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { //! 
Wilson Dirac Operator - Unpreconditioned in Space, Centrally Preconditioned in time @@ -268,3 +270,5 @@ namespace Chroma #endif #endif #endif + +#endif diff --git a/lib/actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.cc b/lib/actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.cc index 7a9a12bf89..4d95231b99 100644 --- a/lib/actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.cc +++ b/lib/actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.cc @@ -12,6 +12,8 @@ #include "actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + using namespace QDP::Hints; namespace Chroma @@ -260,3 +262,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.h b/lib/actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.h index b614ef4c6e..783f880e24 100644 --- a/lib/actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.h +++ b/lib/actions/ferm/linop/iluprec_s_cprec_t_clover_linop_w.h @@ -13,6 +13,8 @@ #include "actions/ferm/linop/clover_term_w.h" #include "actions/ferm/linop/iluprec_s_cprec_t_wilsonlike_linop_w.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { //! Clover Dirac Operator - Unpreconditioned in Space, Centrally Preconditioned in time @@ -193,3 +195,4 @@ namespace Chroma #endif #endif +#endif diff --git a/lib/actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.cc b/lib/actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.cc index 8edaf7ff67..5a16398abe 100644 --- a/lib/actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.cc +++ b/lib/actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.cc @@ -11,6 +11,8 @@ #include "actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! 
defined (QDP_IS_QDPJIT2) + using namespace QDP::Hints; namespace Chroma @@ -209,3 +211,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.h b/lib/actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.h index 71d0506a84..b4e7326a3e 100644 --- a/lib/actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.h +++ b/lib/actions/ferm/linop/iluprec_s_cprec_t_wilson_linop_w.h @@ -14,6 +14,8 @@ #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { //! Wilson Dirac Operator - Unpreconditioned in Space, Centrally Preconditioned in time @@ -179,3 +181,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/linop/iluprec_s_cprec_t_wilsonlike_linop_w.h b/lib/actions/ferm/linop/iluprec_s_cprec_t_wilsonlike_linop_w.h index ba5e48be13..09e9a58d78 100644 --- a/lib/actions/ferm/linop/iluprec_s_cprec_t_wilsonlike_linop_w.h +++ b/lib/actions/ferm/linop/iluprec_s_cprec_t_wilsonlike_linop_w.h @@ -12,6 +12,8 @@ #include "actions/ferm/linop/dslash_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { //! 
Wilson Dirac Operator - Unpreconditioned in Space, Centrally Preconditioned in time @@ -766,3 +768,5 @@ namespace Chroma #endif #endif #endif +#endif + diff --git a/lib/actions/ferm/linop/unprec_exp_clover_linop_w.cc b/lib/actions/ferm/linop/unprec_exp_clover_linop_w.cc index 77e8fdda57..4ccc9f6c45 100644 --- a/lib/actions/ferm/linop/unprec_exp_clover_linop_w.cc +++ b/lib/actions/ferm/linop/unprec_exp_clover_linop_w.cc @@ -58,6 +58,11 @@ namespace Chroma A.deriv(ds_u, chi, psi, isign); + for (int mu = 0; mu < Nd; mu++) + { + ds_u[mu] *= (Real(Nd) + param.Mass); + } + multi1d ds_tmp(Nd); ds_tmp = zero; diff --git a/lib/actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.cc b/lib/actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.cc index 4294890834..f019c48f18 100644 --- a/lib/actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.cc +++ b/lib/actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.cc @@ -13,6 +13,8 @@ #include "actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + using namespace QDP::Hints; namespace Chroma @@ -646,3 +648,4 @@ namespace Chroma #endif #endif #endif +#endif diff --git a/lib/actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.h b/lib/actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.h index 8f6e60128c..e507e9b2f6 100644 --- a/lib/actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.h +++ b/lib/actions/ferm/linop/unprec_s_cprec_t_wilson_linop_w.h @@ -17,6 +17,8 @@ #include "actions/ferm/linop/dslash_w.h" #include "actions/ferm/linop/central_tprec_nospin_utils.h" +#if ! defined (QDP_IS_QDPJIT2) + namespace Chroma { //! 
Wilson Dirac Operator - Unpreconditioned in Space, Centrally Preconditioned in time @@ -161,3 +163,4 @@ namespace Chroma #endif #endif +#endif diff --git a/lib/actions/ferm/qprop/eoprec_fermact_qprop.cc b/lib/actions/ferm/qprop/eoprec_fermact_qprop.cc index d1d4f72fc2..bfa8ff293f 100644 --- a/lib/actions/ferm/qprop/eoprec_fermact_qprop.cc +++ b/lib/actions/ferm/qprop/eoprec_fermact_qprop.cc @@ -46,11 +46,11 @@ namespace Chroma /* chi_tmp = chi_o - D_oe * A_ee^-1 * chi_e */ T chi_tmp; { - T tmp1, tmp2; + T tmp1, tmp2; - A->evenEvenInvLinOp(tmp1, chi, PLUS); - A->oddEvenLinOp(tmp2, tmp1, PLUS); - chi_tmp[rb[1]] = chi - tmp2; + A->evenEvenInvLinOp(tmp1, chi, PLUS); + A->oddEvenLinOp(tmp2, tmp1, PLUS); + chi_tmp[rb[1]] = chi - tmp2; } // Call inverter @@ -59,19 +59,19 @@ namespace Chroma /* Step (ii) */ /* psi_e = A_ee^-1 * [chi_e - D_eo * psi_o] */ { - T tmp1, tmp2; + T tmp1, tmp2; - A->evenOddLinOp(tmp1, psi, PLUS); - tmp2[rb[0]] = chi - tmp1; - A->evenEvenInvLinOp(psi, tmp2, PLUS); + A->evenOddLinOp(tmp1, psi, PLUS); + tmp2[rb[0]] = chi - tmp1; + A->evenEvenInvLinOp(psi, tmp2, PLUS); } - // Compute residual + // Compute the unprec residual { - T r; - A->unprecLinOp(r, psi, PLUS); - r -= chi; - res.resid = sqrt(norm2(r)); + T r; + A->unprecLinOp(r, psi, PLUS); + r -= chi; + res.resid = sqrt(norm2(r)); } END_CODE(); diff --git a/lib/actions/ferm/qprop/fermact_qprop.cc b/lib/actions/ferm/qprop/fermact_qprop.cc index f08ca0e31f..49b8385fec 100644 --- a/lib/actions/ferm/qprop/fermact_qprop.cc +++ b/lib/actions/ferm/qprop/fermact_qprop.cc @@ -49,22 +49,38 @@ namespace Chroma { START_CODE(); - // Call inverter + // Call inverter -- convention: invA will return res ( || r ||/ || b || ) SystemSolverResults_t res = (*invA)(psi, chi); - - // Compute residual - { - T r; - (*A)(r, psi, PLUS); - r -= chi; - res.resid = sqrt(norm2(r)); - } + + // For some reason chroma convention is for Qprop to return || r || (absolute) + // So we multiply res.resid * sqrt(norm(chi)); + res.resid 
*= sqrt(norm2(chi)); END_CODE(); return res; } + std::vector operator() (const std::vector>& psis, + const std::vector>& chis) const override + { + START_CODE(); + + assert(psis.size() == chis.size()); + + // Call inverter - convention returns residuum + std::vector res = (*invA)(psis, chis); + + // for some reason chroma convention is for Qprop to return || r || (absolute) + // we can obtaint his by multiplying || r || / || b || by || b ||. + for (int col=0; col< psis.size(); ++col) { + res[col].resid *= sqrt(norm2(*(chis[col]))); + } + + END_CODE(); + return res; + } + private: // Hide default constructor FermActQprop() {} diff --git a/lib/actions/ferm/qprop/quarkprop4_w.cc b/lib/actions/ferm/qprop/quarkprop4_w.cc index bf913b6edc..a46db80c82 100644 --- a/lib/actions/ferm/qprop/quarkprop4_w.cc +++ b/lib/actions/ferm/qprop/quarkprop4_w.cc @@ -38,147 +38,164 @@ namespace Chroma { START_CODE(); - QDPIO::cout << "Entering quarkProp4" << std::endl; + QDPIO::cout << "Entering quarkProp4 - MRHS interface" << std::endl; push(xml_out, "QuarkProp4"); ncg_had = 0; int start_spin; int end_spin; + int num_spin; switch (quarkSpinType) { case QUARK_SPIN_TYPE_FULL: start_spin = 0; end_spin = Ns; + num_spin = Ns; break; case QUARK_SPIN_TYPE_UPPER: start_spin = 0; end_spin = Ns/2; + num_spin = Ns/2; break; case QUARK_SPIN_TYPE_LOWER: start_spin = Ns/2; end_spin = Ns; + num_spin = Ns/2; break; } -// LatticeFermion psi = zero; // note this is ``zero'' and not 0 - - // This version loops over all color and spin indices - for(int color_source = 0; color_source < Nc; ++color_source) - { - for(int spin_source = start_spin; spin_source < end_spin; ++spin_source) - { - LatticeFermion psi = zero; // note this is ``zero'' and not 0 - LatticeFermion chi; - - // Extract a fermion source - PropToFerm(q_src, chi, color_source, spin_source); - - // Use the last initial guess as the current initial guess - - /* - * Normalize the source in case it is really huge or small - - * a trick to avoid 
overflows or underflows - */ - Real fact = 1.0; - Real nrm = sqrt(norm2(chi)); - if (toFloat(nrm) != 0.0) - fact /= nrm; - - // Rescale - chi *= fact; - - // Compute the propagator for given source color/spin. - { - SystemSolverResults_t result = (*qprop)(psi,chi); - ncg_had += result.n_count; - - push(xml_out,"Qprop"); - write(xml_out, "color_source", color_source); - write(xml_out, "spin_source", spin_source); - write(xml_out, "n_count", result.n_count); - write(xml_out, "resid", result.resid); - pop(xml_out); - } - - // Unnormalize the source following the inverse of the normalization above - fact = Real(1) / fact; - psi *= fact; - - /* - * Move the solution to the appropriate components - * of quark propagator. - */ - FermToProp(psi, q_sol, color_source, spin_source); - } /* end loop over spin_source */ - } /* end loop over color_source */ + { + multi1d norm_chi(Nc*num_spin); + multi1d fact(Nc*num_spin); + std::vector< std::shared_ptr > chi_ptrs(Nc*num_spin); + std::vector< std::shared_ptr > psi_ptrs(Nc*num_spin); + // This version loops over all color and spin indices + int idx=0; + for(int color_source = 0; color_source < Nc; ++color_source) + { + for(int spin_source = start_spin; spin_source < end_spin; ++spin_source) + { + psi_ptrs[idx] = std::make_shared(zero); + + + // Extract a fermion source + // Due to the vaguaries of initializing a std::shared + // We go via a temporary. + LatticeFermion tmp; + PropToFerm(q_src, tmp, color_source, spin_source); + + // Normalize temporary + norm_chi[idx] = sqrt(norm2(tmp)); + fact[idx] = toDouble(1)/norm_chi[idx]; + tmp *= fact[idx]; + + // Create the RHS + chi_ptrs[idx] = std::make_shared(tmp); + + // Update Index + idx++; + } + } + + // Do the MultiRHS solve + // + // Convention: In true multiRHS solve only solution 0 will have non-zero + // n-count for now. That way accumulating ncg_had by adding 0s potentially + // will work. 
+ std::vector results = (*qprop)(psi_ptrs, chi_ptrs); + + // Accumulate ncg_had and restore solution into solution prop + ncg_had = 0; + for(int idx=0; idx < Nc*num_spin; idx++) { + + // Undo rescale by multiplying by 1/fact = norm_chi[idx] + *(psi_ptrs[idx]) *= norm_chi[idx]; + + // break colorspin index into color and spin indices. + int spin_idx = idx%num_spin + start_spin; + int col_idx =idx/num_spin; + + // Insert solution into propagator + FermToProp(*(psi_ptrs[idx]), q_sol, col_idx, spin_idx); + + // Accumulate ncg_had. This will be correct if we follow + // the convention that true mrhs solvers return only a count + // in results[0].n_count and keep all others as zero + // Fake MRHS solvers (which loop over sources) can fill out + // an accurate iteration count for each solve. + ncg_had += results[idx].n_count; + push(xml_out,"Qprop"); + write(xml_out, "color_source", col_idx); + write(xml_out, "spin_source", spin_idx); + write(xml_out, "n_count", results[idx].n_count); + write(xml_out, "resid", results[idx].resid); + pop(xml_out); + + } /* end loop over solutions */ + } // psis, chis etc go away here. switch (quarkSpinType) { - case QUARK_SPIN_TYPE_FULL: - // Do nothing here - break; - - case QUARK_SPIN_TYPE_UPPER: - { - /* Since this is a non-relativistic prop - * negate the quark props 'lower' components - * This is because I should have only done a half inversion - * on non relativistic channels, where the last two columns of the - * source MUST be the negation of the first two columns. - * Hence the last two columns of the solution must also be - * negations of the first two columns. The half inversion itself - * has not put in the minus sign, it just copied the columns. 
- * The post multiply by Gamma_5 adds in the required - sign - * in the last two columns - */ - /* Apply Gamma_5 = Gamma(15) by negating the fermion extracted */ - for(int color_source = 0; color_source < Nc ; ++color_source) - { - for(int spin_source = Ns/2; spin_source < Ns; ++spin_source) - { - int copyfrom = spin_source - Ns/2; - LatticeFermion psi; - - PropToFerm(q_sol, psi, color_source, copyfrom); - FermToProp(LatticeFermion(-psi), q_sol, color_source, spin_source); - } - } - } - break; - - case QUARK_SPIN_TYPE_LOWER: - { - /* Since this is a non-relativistic prop - * negate the quark props 'lower' components - * This is because I should have only done a half inversion - * on non relativistic channels, where the last two columns of the - * source MUST be the negation of the first two columns. - * Hence the last two columns of the solution must also be - * negations of the first two columns. The half inversion itself - * has not put in the minus sign, it just copied the columns. - * The post multiply by Gamma_5 adds in the required - sign - * in the last two columns - */ - /* Apply Gamma_5 = Gamma(15) by negating the fermion extracted */ - for(int color_source = 0; color_source < Nc ; ++color_source) - { - for(int spin_source = 0; spin_source < Ns/2; ++spin_source) - { - int copyfrom = spin_source + Ns/2; - LatticeFermion psi; - - PropToFerm(q_sol, psi, color_source, copyfrom); - // There is no need for (-) in the lower component case (KNO) - FermToProp(LatticeFermion(psi), q_sol, color_source, spin_source); - } - } - } - break; + case QUARK_SPIN_TYPE_FULL: + // Do nothing here + break; + + case QUARK_SPIN_TYPE_UPPER: + { + /* Since this is a non-relativistic prop + * negate the quark props 'lower' components + * This is because I should have only done a half inversion + * on non relativistic channels, where the last two columns of the + * source MUST be the negation of the first two columns. 
+ * Hence the last two columns of the solution must also be + * negations of the first two columns. The half inversion itself + * has not put in the minus sign, it just copied the columns. + * The post multiply by Gamma_5 adds in the required - sign + * in the last two columns + */ + /* Apply Gamma_5 = Gamma(15) by negating the fermion extracted */ + for(int color_source = 0; color_source < Nc ; ++color_source) { + for(int spin_source = Ns/2; spin_source < Ns; ++spin_source) { int copyfrom = spin_source - Ns/2; + LatticeFermion psi; + PropToFerm(q_sol, psi, color_source, copyfrom); + FermToProp(LatticeFermion(-psi), q_sol, color_source, spin_source); + } + } + } + break; + + case QUARK_SPIN_TYPE_LOWER: + { + /* Since this is a non-relativistic prop + * negate the quark props 'lower' components + * This is because I should have only done a half inversion + * on non relativistic channels, where the last two columns of the + * source MUST be the negation of the first two columns. + * Hence the last two columns of the solution must also be + * negations of the first two columns. The half inversion itself + * has not put in the minus sign, it just copied the columns. 
+ * The post multiply by Gamma_5 adds in the required - sign + * in the last two columns + */ + /* Apply Gamma_5 = Gamma(15) by negating the fermion extracted */ + for(int color_source = 0; color_source < Nc ; ++color_source) { + for(int spin_source = 0; spin_source < Ns/2; ++spin_source) { + int copyfrom = spin_source + Ns/2; + LatticeFermion psi; + + PropToFerm(q_sol, psi, color_source, copyfrom); + + // There is no need for (-) in the lower component case (KNO) + FermToProp(LatticeFermion(psi), q_sol, color_source, spin_source); + } + } + } + break; } // end switch(quarkSpinType) pop(xml_out); diff --git a/lib/actions/gauge/gaugebcs/schr_chromomag_gaugebc.cc b/lib/actions/gauge/gaugebcs/schr_chromomag_gaugebc.cc index 7878df9f79..9b82ac8964 100644 --- a/lib/actions/gauge/gaugebcs/schr_chromomag_gaugebc.cc +++ b/lib/actions/gauge/gaugebcs/schr_chromomag_gaugebc.cc @@ -122,7 +122,7 @@ namespace Chroma for(int mu = 1; mu < Nd; ++mu) fld[mu] = 1; - Real ftmp = Chroma::twopi * p.SchrPhiMult / Real(QDP::Layout::lattSize()[var_dir]); + Real ftmp = Chroma::constant().twopi * p.SchrPhiMult / Real(QDP::Layout::lattSize()[var_dir]); LatticeReal lftmp = ftmp * Layout::latticeCoordinate(var_dir); fld[0] = 1.0; diff --git a/lib/actions/gauge/gaugebcs/schr_coupling_gaugebc.cc b/lib/actions/gauge/gaugebcs/schr_coupling_gaugebc.cc index 0d1a626a1e..8db7eb8d21 100644 --- a/lib/actions/gauge/gaugebcs/schr_coupling_gaugebc.cc +++ b/lib/actions/gauge/gaugebcs/schr_coupling_gaugebc.cc @@ -54,7 +54,7 @@ namespace Chroma phases.lower.resize(Nc); phases.upper.resize(Nc); - Real ftmp = Chroma::twopi * 0.5 * SchrPhiMult(); + Real ftmp = Chroma::constant().twopi * 0.5 * SchrPhiMult(); switch (Nc) { diff --git a/lib/actions/gauge/gaugebcs/schr_nonpert_gaugebc.cc b/lib/actions/gauge/gaugebcs/schr_nonpert_gaugebc.cc index b932a98f70..8bf27728ad 100644 --- a/lib/actions/gauge/gaugebcs/schr_nonpert_gaugebc.cc +++ b/lib/actions/gauge/gaugebcs/schr_nonpert_gaugebc.cc @@ -54,7 +54,7 @@ namespace 
Chroma phases.lower.resize(Nc); phases.upper.resize(Nc); - Real ftmp = Chroma::twopi * 0.5 * SchrPhiMult(); + Real ftmp = Chroma::constant().twopi * 0.5 * SchrPhiMult(); switch (Nc) { diff --git a/lib/chromabase.h b/lib/chromabase.h index d2b1535113..6077dad959 100644 --- a/lib/chromabase.h +++ b/lib/chromabase.h @@ -33,6 +33,13 @@ struct PropTypeTraits typedef LatticeDiracPropagator Type_t; }; +#if defined (QDP_IS_QDPJIT2) +template<> +struct PropTypeTraits +{ + typedef LatticePropagator Type_t; +}; +#endif template<> struct PropTypeTraits @@ -45,15 +52,14 @@ struct PropTypeTraits enum PlusMinus {PLUS = 1, MINUS = -1}; -// Useful constants -#if BASE_PRECISION == 32 -const Real fuzz = 1.0e-5; -#elif BASE_PRECISION == 64 -const Real fuzz = 1.0e-10; -#endif - -const Real twopi = 6.283185307179586476925286; - + struct __chroma_constant + { + Real twopi; + Real fuzz; + }; + const __chroma_constant& constant(); + void constant_destroy(); + // Hooks for various things #if defined(QDP_DEBUG_MEMORY) diff --git a/lib/constant.cc b/lib/constant.cc new file mode 100644 index 0000000000..e2ff85b675 --- /dev/null +++ b/lib/constant.cc @@ -0,0 +1,39 @@ +#include "chroma_config.h" +#include "chromabase.h" + +namespace Chroma +{ + namespace + { + __chroma_constant* constant_data; + bool constant_data_allocated = false; + } + + const __chroma_constant& constant() + { + if (!constant_data) + { + constant_data = new __chroma_constant; + constant_data_allocated = true; + + constant_data->twopi = 6.283185307179586476925286; + +#if BASE_PRECISION == 32 + constant_data->fuzz = 1.0e-5; +#elif BASE_PRECISION == 64 + constant_data->fuzz = 1.0e-10; +#endif + } + + return *constant_data; + } + + + void constant_destroy() + { + if (constant_data_allocated) + delete constant_data; + constant_data_allocated = false; + } + +} diff --git a/lib/eoprec_wilstype_fermact_w.h b/lib/eoprec_wilstype_fermact_w.h index e9a425cb50..d5ad5355e2 100644 --- a/lib/eoprec_wilstype_fermact_w.h +++ 
b/lib/eoprec_wilstype_fermact_w.h @@ -9,6 +9,7 @@ #include "wilstype_fermact_w.h" #include "eoprec_linop.h" +#include "actions/ferm/linop/lunprec_w.h" namespace Chroma { @@ -29,6 +30,12 @@ namespace Chroma /*! Covariant return rule - override base class function */ virtual EvenOddPrecLinearOperator* linOp(Handle< FermState > state) const = 0; + //! Produce a linear operator for this action + LinearOperator* genLinOp(Handle> state) const override + { + return new Lunprec(linOp(state)); + } + //! Return quark prop solver, solution of unpreconditioned system /*! Default implementation provided */ virtual SystemSolver* qprop(Handle< FermState > state, diff --git a/lib/fermact.h b/lib/fermact.h index 6114135f63..402cf182c4 100644 --- a/lib/fermact.h +++ b/lib/fermact.h @@ -91,6 +91,12 @@ namespace Chroma */ virtual const CreateFermState& getCreateState() const = 0; + //! Produce a linear operator for this action + virtual LinearOperator* genLinOp(Handle> state) const + { + return nullptr; + } + //! 
Return quark prop solver, solution of unpreconditioned system virtual SystemSolver* qprop(Handle< FermState > state, const GroupXML_t& invParam) const = 0; diff --git a/lib/init/chroma_init.cc b/lib/init/chroma_init.cc index 208439e893..f87ba71d1c 100644 --- a/lib/init/chroma_init.cc +++ b/lib/init/chroma_init.cc @@ -154,11 +154,7 @@ namespace Chroma << " --chroma-i [" << getXMLInputFileName() << "] xml input file name\n" << " -o [" << getXMLOutputFileName() << "] xml output file name\n" << " --chroma-p [" << getXMLOutputFileName() << "] xml output file name\n" - -#ifdef ARCH_PARSCALAR -#include "qmp.h" -#endif -<< " -l [" << getXMLLogFileName() << "] xml log file name\n" + << " -l [" << getXMLLogFileName() << "] xml log file name\n" << " --chroma-l [" << getXMLLogFileName() << "] xml log file name\n" << " -cwd [" << getCWD() << "] xml working directory\n" << " --chroma-cwd [" << getCWD() << "] xml working directory\n" @@ -276,18 +272,17 @@ namespace Chroma # endif setVerbosityQuda(QUDA_SUMMARIZE, "", stdout); - - QDPIO::cout << "Calling initCommsGridQuda\n"; -#ifdef ARCH_PARSCALAR - int ndim = QMP_get_logical_number_of_dimensions(); - const int *dims = QMP_get_logical_dimensions(); -#else - int ndim=4; - const int dims[4]={1,1,1,1}; -#endif - QDPIO::cout << "calling initCommsGridQuda with ndim = " << ndim << " and geom=( " << dims[0] << ", " - << dims[1] << ", " << dims[2] << ", " << dims[3] << " )\n"; - initCommsGridQuda(ndim, dims, nullptr, nullptr); + QDPIO::cout << "Calling initCommsGridQuda\n"; +# ifdef ARCH_PARSCALAR + int ndim = QMP_get_logical_number_of_dimensions(); + const int* dims = QMP_get_logical_dimensions(); +# else + int ndim = 4; + const int dims[4] = {1, 1, 1, 1}; +# endif + QDPIO::cout << "calling initCommsGridQuda with ndim = " << ndim << " and geom=( " << dims[0] + << ", " << dims[1] << ", " << dims[2] << ", " << dims[3] << " )\n"; + initCommsGridQuda(ndim, dims, nullptr, nullptr); QDPIO::cout << "Initializing QUDA device (using CUDA device 
no. " << cuda_device << ")" << std::endl; @@ -321,12 +316,24 @@ namespace Chroma # endif # endif // BUILD_CUDA +#elif defined(BUILD_SB) && defined(SUPERBBLAS_USE_GPU) + // Get device to run + int gpu_device = SB::detail::getGpuContext()->device; + +# ifdef BUILD_QUDA + setVerbosityQuda(QUDA_SUMMARIZE, "", stdout); + QDPIO::cout << "Initializing QUDA device (using CUDA device no. " << gpu_device << ")" + << std::endl; + initQudaDevice(gpu_device); + initQudaMemory(); +# endif + #else // defined QDP_IS_QDPJIT # ifdef BUILD_QUDA - { - std::cout << "Initializing QUDA with initQuda(-1)" << std::endl; - initQuda(-1); - } + { + std::cout << "Initializing QUDA with initQuda(-1)" << std::endl; + initQuda(-1); + } # endif #endif @@ -404,6 +411,11 @@ namespace Chroma #endif #endif +#ifdef BUILD_SB + // Call superbblas finisher + SB::finish(); +#endif + if (! QDP_isInitialized()) return; @@ -420,12 +432,13 @@ namespace Chroma Chroma::getXMLLogInstance().close(); } + // Free memory for constants + constant_destroy(); + // Destroy singletons destroySingletons(); QDP_finalize(); - - } diff --git a/lib/io/readcppacs.cc b/lib/io/readcppacs.cc index 4782872912..3657037633 100644 --- a/lib/io/readcppacs.cc +++ b/lib/io/readcppacs.cc @@ -24,6 +24,7 @@ namespace Chroma { * \param cfg_file path ( Read ) */ +#if ! defined (QDP_IS_QDPJIT2) void readCPPACS(CPPACSGauge_t& header, multi1d& u, const std::string& cfg_file) { START_CODE(); @@ -115,7 +116,13 @@ void readCPPACS(CPPACSGauge_t& header, multi1d& u, const std END_CODE(); } - +#else +void readCPPACS(CPPACSGauge_t& header, multi1d& u, const std::string& cfg_file) +{ + QDPIO::cerr << __func__ << " not implemented." << std::endl; + QDP_abort(1); +} +#endif //! Read a CPPACS configuration file diff --git a/lib/io/readmilc.cc b/lib/io/readmilc.cc index 0652d4a56d..3f5dee8674 100644 --- a/lib/io/readmilc.cc +++ b/lib/io/readmilc.cc @@ -19,6 +19,7 @@ namespace Chroma { * \param cfg_file path ( Read ) */ +#if ! 
defined (QDP_IS_QDPJIT2) void readMILC(MILCGauge_t& header, multi1d& u, const std::string& cfg_file) { START_CODE(); @@ -104,7 +105,13 @@ void readMILC(MILCGauge_t& header, multi1d& u, const std::s END_CODE(); } - +#else +void readMILC(MILCGauge_t& header, multi1d& u, const std::string& cfg_file) +{ + QDPIO::cerr << __func__ << " not implemented." << std::endl; + QDP_abort(1); +} +#endif //! Read a MILC configuration file diff --git a/lib/io/readwupp.cc b/lib/io/readwupp.cc index b1c4b9b69e..6d80917ff4 100644 --- a/lib/io/readwupp.cc +++ b/lib/io/readwupp.cc @@ -24,7 +24,8 @@ namespace Chroma { * \param cfg_file path ( Read ) */ - void readWupp(multi1d& u, const std::string& cfg_file) +#if ! defined (QDP_IS_QDPJIT2) +void readWupp(multi1d& u, const std::string& cfg_file) { START_CODE(); @@ -119,7 +120,13 @@ namespace Chroma { END_CODE(); } - +#else +void readWupp(multi1d& u, const std::string& cfg_file) +{ + QDPIO::cerr << __func__ << " not implemented." << std::endl; + QDP_abort(1); +} +#endif //! Read a Wupp configuration file diff --git a/lib/meas/eig/ischiral_w.cc b/lib/meas/eig/ischiral_w.cc index f244adf35c..a2f6259163 100644 --- a/lib/meas/eig/ischiral_w.cc +++ b/lib/meas/eig/ischiral_w.cc @@ -29,7 +29,7 @@ Chirality isChiralVector(const LatticeFermion& chi) // To get a boolean out of < operator I have to apply // toBool. Is this because otherwise it is some kind of selector // for a mask? 
- if ( toBool(tmp1 > fabs(chirality)*fuzz) ) { + if ( toBool(tmp1 > fabs(chirality)*Chroma::constant().fuzz) ) { ret_val = CH_NONE; } else { diff --git a/lib/meas/gfix/grelax.cc b/lib/meas/gfix/grelax.cc index 30520979e5..60e70e3e6c 100644 --- a/lib/meas/gfix/grelax.cc +++ b/lib/meas/gfix/grelax.cc @@ -89,7 +89,7 @@ void grelax(LatticeColorMatrix& g, // Normalize LatticeBoolean lbtmp; - lbtmp[rb[cb]] = r_l > fuzz; + lbtmp[rb[cb]] = r_l > Chroma::constant().fuzz; LatticeReal lftmp; lftmp[rb[cb]] = 1.0 / where(lbtmp, r_l, LatticeReal(1)); @@ -118,7 +118,7 @@ void grelax(LatticeColorMatrix& g, /* compute sin(new)/sin(old) */ /* set the ratio to 0, if sin(old) < FUZZ */ - lftmp[rb[cb]] = where(oldsin > fuzz, sin(theta_new) / oldsin, LatticeReal(0)); + lftmp[rb[cb]] = where(oldsin > Chroma::constant().fuzz, sin(theta_new) / oldsin, LatticeReal(0)); /* get the new cos = a[0] */ a[0][rb[cb]] = cos(theta_new); @@ -144,7 +144,7 @@ void grelax(LatticeColorMatrix& g, // Normalize LatticeBoolean lbtmp; - lbtmp[rb[cb]] = r_l > fuzz; + lbtmp[rb[cb]] = r_l > Chroma::constant().fuzz; LatticeReal lftmp; lftmp[rb[cb]] = 1.0 / where(lbtmp, r_l, LatticeReal(1)); @@ -157,7 +157,7 @@ void grelax(LatticeColorMatrix& g, /* Now do the overrelaxation, if desired */ if( ordo ) { - Real pi = 0.5 * twopi; + Real pi = 0.5 * Chroma::constant().twopi; /* get angle */ LatticeReal theta; diff --git a/lib/meas/glue/mesplq.cc b/lib/meas/glue/mesplq.cc index ecd787247c..a2d04b2495 100644 --- a/lib/meas/glue/mesplq.cc +++ b/lib/meas/glue/mesplq.cc @@ -81,6 +81,7 @@ namespace Chroma END_CODE(); } +#if ! defined (QDP_IS_QDPJIT2) void MesPlq(const multi1d& u, multi2d& plane_plaq, Double& link) { @@ -92,6 +93,13 @@ namespace Chroma { MesPlq_t(u,plane_plaq, link); } +#else + void MesPlq(const multi1d& u, + multi2d& plane_plaq, Double& link) + { + MesPlq_t(u,plane_plaq, link); + } +#endif //! Return the value of the average plaquette normalized to 1 /*! 
@@ -144,6 +152,7 @@ namespace Chroma END_CODE(); } +#if ! defined (QDP_IS_QDPJIT2) void MesPlq(const multi1d& u, Double& w_plaq, Double& s_plaq, Double& t_plaq, multi2d& plane_plaq, @@ -151,7 +160,6 @@ namespace Chroma { MesPlq_t(u,w_plaq,s_plaq,t_plaq, plane_plaq, link); } - void MesPlq(const multi1d& u, Double& w_plaq, Double& s_plaq, Double& t_plaq, multi2d& plane_plaq, @@ -159,6 +167,15 @@ namespace Chroma { MesPlq_t(u,w_plaq,s_plaq,t_plaq, plane_plaq, link); } +#else + void MesPlq(const multi1d& u, + Double& w_plaq, Double& s_plaq, Double& t_plaq, + multi2d& plane_plaq, + Double& link) + { + MesPlq_t(u,w_plaq,s_plaq,t_plaq, plane_plaq, link); + } +#endif //! Return the value of the average plaquette normalized to 1 /*! @@ -171,6 +188,7 @@ namespace Chroma * \param link space-time average link (Write) */ +#if ! defined (QDP_IS_QDPJIT2) void MesPlq(const multi1d& u, Double& w_plaq, Double& s_plaq, Double& t_plaq, Double& link) { @@ -194,6 +212,19 @@ namespace Chroma END_CODE(); } +#else + void MesPlq(const multi1d& u, + Double& w_plaq, Double& s_plaq, Double& t_plaq, Double& link) + { + START_CODE(); + + multi2d plane_plaq; + + MesPlq(u, w_plaq, s_plaq, t_plaq, plane_plaq, link); + + END_CODE(); + } +#endif //! Print the value of the average plaquette normalized to 1 /*! @@ -265,6 +296,7 @@ namespace Chroma END_CODE(); } +#if ! defined (QDP_IS_QDPJIT2) void MesPlq(XMLWriter& xml, const std::string& xml_group, const multi1d& u) @@ -278,5 +310,13 @@ namespace Chroma { MesPlq_t(xml, xml_group, u); } - +#else + void MesPlq(XMLWriter& xml, + const std::string& xml_group, + const multi1d& u) + { + MesPlq_t(xml, xml_group, u); + } +#endif + } // end namespace Chroma diff --git a/lib/meas/glue/mesplq.h b/lib/meas/glue/mesplq.h index d63e239bfc..33fe78629d 100644 --- a/lib/meas/glue/mesplq.h +++ b/lib/meas/glue/mesplq.h @@ -16,12 +16,17 @@ namespace Chroma * \param t_plaq time-like plaquette average (Write) * \param link space-time average link (Write) */ +#if ! 
defined (QDP_IS_QDPJIT2) void MesPlq(const multi1d& u, Double& w_plaq, Double& s_plaq, Double& t_plaq, Double& link); void MesPlq(const multi1d& u, Double& w_plaq, Double& s_plaq, Double& t_plaq, Double& link); - +#else + void MesPlq(const multi1d& u, + Double& w_plaq, Double& s_plaq, Double& t_plaq, Double& link); +#endif + //! Return the value of the average plaquette normalized to 1 /*! * \ingroup glue @@ -34,6 +39,7 @@ namespace Chroma * \param link space-time average link (Write) */ +#if ! defined (QDP_IS_QDPJIT2) void MesPlq(const multi1d& u, Double& w_plaq, Double& s_plaq, Double& t_plaq, multi2d& plane_plaq, @@ -43,7 +49,13 @@ namespace Chroma Double& w_plaq, Double& s_plaq, Double& t_plaq, multi2d& plane_plaq, Double& link); - +#else + void MesPlq(const multi1d& u, + Double& w_plaq, Double& s_plaq, Double& t_plaq, + multi2d& plane_plaq, + Double& link); +#endif + //! Print the value of the average plaquette normalized to 1 /*! * \ingroup glue @@ -51,6 +63,7 @@ namespace Chroma * \param xml plaquette average (Write) * \param u gauge field (Read) */ +#if ! defined (QDP_IS_QDPJIT2) void MesPlq(XMLWriter& xml, const std::string& xml_group, const multi1d& u); @@ -58,7 +71,12 @@ namespace Chroma void MesPlq(XMLWriter& xml, const std::string& xml_group, const multi1d& u); - +#else + void MesPlq(XMLWriter& xml, + const std::string& xml_group, + const multi1d& u); +#endif + } // end namespace Chroma #endif diff --git a/lib/meas/glue/polylp.cc b/lib/meas/glue/polylp.cc index 7c9e838398..8a40106f04 100644 --- a/lib/meas/glue/polylp.cc +++ b/lib/meas/glue/polylp.cc @@ -22,11 +22,11 @@ namespace Chroma START_CODE(); // Initial Polyakov loop - LatticeColorMatrix poly = u[mu]; + Q poly = u[mu]; for(int n = 1; n < Layout::lattSize()[mu]; ++n) // run over all links in mu dir { - LatticeColorMatrix tmp = shift(poly, FORWARD, mu); + Q tmp = shift(poly, FORWARD, mu); poly = u[mu] * tmp; } @@ -36,7 +36,7 @@ namespace Chroma END_CODE(); } - +#if ! 
defined (QDP_IS_QDPJIT2) void polylp(const multi1d& u, DComplex& poly_loop, int mu) { polylp_t( u, poly_loop, mu); @@ -46,6 +46,12 @@ namespace Chroma { polylp_t(u, poly_loop, mu); } +#else + void polylp(const multi1d& u, DComplex& poly_loop, int mu) + { + polylp_t( u, poly_loop, mu); + } +#endif //! Compute Polyakov loop /*! @@ -68,6 +74,7 @@ namespace Chroma } +#if ! defined (QDP_IS_QDPJIT2) void polylp(const multi1d& u, multi1d& poly_loop) { polylp_t(u,poly_loop); @@ -77,4 +84,11 @@ namespace Chroma { polylp_t(u,poly_loop); } +#else + void polylp(const multi1d& u, multi1d& poly_loop) + { + polylp_t(u,poly_loop); + } +#endif + } // end namespace Chroma diff --git a/lib/meas/glue/polylp.h b/lib/meas/glue/polylp.h index 8c586a5a8d..02272811df 100644 --- a/lib/meas/glue/polylp.h +++ b/lib/meas/glue/polylp.h @@ -18,10 +18,13 @@ namespace Chroma * \param mu direction of Polyakov loop (Read) */ +#if ! defined (QDP_IS_QDPJIT2) void polylp(const multi1d& u, DComplex& poly_loop, int mu); - void polylp(const multi1d& u, DComplex& poly_loop, int mu); - +#else + void polylp(const multi1d& u, DComplex& poly_loop, int mu); +#endif + //! Compute Polyakov loop /*! * \ingroup glue @@ -30,9 +33,13 @@ namespace Chroma * \param poly_loop Polyakov loop average (Write) */ +#if ! 
defined (QDP_IS_QDPJIT2) void polylp(const multi1d& u, multi1d& poly_loop); void polylp(const multi1d& u, multi1d& poly_loop); - +#else + void polylp(const multi1d& u, multi1d& poly_loop); +#endif + } // end namespace Chroma #endif diff --git a/lib/meas/glue/qactden.cc b/lib/meas/glue/qactden.cc index 2d9ba00d11..1e189c0e73 100644 --- a/lib/meas/glue/qactden.cc +++ b/lib/meas/glue/qactden.cc @@ -230,13 +230,14 @@ namespace Chroma qtop_tmp = real(trace(tmp_2)); lrqtop -= qtop_tmp; - } + } + /* Lattice version of S_ratio */ - lract /= (4*Chroma::twopi*Chroma::twopi); + lract /= ( 4 * Chroma::constant().twopi * Chroma::constant().twopi); /* Lattice version of qtop */ - lrqtop /= (64*Chroma::twopi*Chroma::twopi); + lrqtop /= ( 64 * Chroma::constant().twopi * Chroma::constant().twopi); END_CODE(); } diff --git a/lib/meas/glue/qnaive.cc b/lib/meas/glue/qnaive.cc index 09f2bc7619..003735db81 100644 --- a/lib/meas/glue/qnaive.cc +++ b/lib/meas/glue/qnaive.cc @@ -662,7 +662,7 @@ namespace Chroma } /* Topological charge */ - qtop /= ( 16*16*twopi*twopi ); + qtop /= ( 16*16 * Chroma::constant().twopi * Chroma::constant().twopi ); QDPIO::cout << "qtop = " << qtop << std::endl; END_CODE(); diff --git a/lib/meas/hadron/greedy_coloring.cc b/lib/meas/hadron/greedy_coloring.cc index 99225c17ff..b32b01a7f8 100644 --- a/lib/meas/hadron/greedy_coloring.cc +++ b/lib/meas/hadron/greedy_coloring.cc @@ -71,8 +71,8 @@ namespace Chroma // // NOTE: we used anti-natural order, the last coordinate moves the fastest - Indices coor2index(const Coors& coors, const CoorType dim, - const CoorType order = antiNaturalOrder()) + Indices coor2index(const Coors& coors, const CoorType& dim, + const CoorType& order = antiNaturalOrder()) { // Quick exit if (dim.size() <= 0) @@ -113,8 +113,8 @@ namespace Chroma // // NOTE: we used anti-natural order, the last coordinate moves the fastest - Coors index2coor(const Indices& indices, const CoorType dim, - const CoorType order = antiNaturalOrder()) + Coors 
index2coor(const Indices& indices, const CoorType& dim, + const CoorType& order = antiNaturalOrder()) { // Quick exit if (dim.size() <= 0) @@ -144,7 +144,7 @@ namespace Chroma // // Return the neighbors' coordinates of the vertices 'coors'. - Coors neighbors(const Coors coors, const CoorType dim) + Coors neighbors(const Coors coors, const CoorType& dim) { if (dim.size() <= 0) return Coors(); @@ -180,7 +180,7 @@ namespace Chroma // Return the number of vertices in a lattice // \param dim: lattice dimensions - std::size_t volume(const CoorType dim) + std::size_t volume(const CoorType& dim) { if (dim.size() <= 0) return 0; @@ -230,6 +230,18 @@ namespace Chroma return (dim == 0 ? 0 : coor % dim); } +// Avoid intel compiler explosion +#ifdef __INTEL_COMPILER +# pragma intel optimization_level 0 +#endif + CoorType normalize_coor(const CoorType& coor, const CoorType& dim) + { + CoorType r; + for (unsigned int i = 0; i < coor.size(); ++i) + r[i] = normalize_coor(coor[i], dim[i]); + return r; + } + IndexType euclidian_dist_squared(const CoorType& a, const CoorType& b, const CoorType& dim) { CoorType d; @@ -243,8 +255,20 @@ namespace Chroma return dist; } +// Avoid intel compiler explosion +#ifdef __INTEL_COMPILER +# pragma intel optimization_level 0 +#endif + CoorType minus(const CoorType& a, const CoorType& b) + { + CoorType r; + for (unsigned int i = 0; i < a.size(); ++i) + r[i] = a[i] - b[i]; + return r; + } + // Return all neighbors up to a given distance WITH THE SAME PARITY - // \param dist: distance in the z-direction + // \param dists: shifts to consider // \param power: all neighbors up to this distance // // Return a vector of coordinate differences to all neighbors up to @@ -255,15 +279,18 @@ namespace Chroma // NOTE: no performance requirements for this function; the function // 'plus' is doing the heavy lifting. 
- Coors neighbors_upto_distance(unsigned int dist, unsigned int power, const CoorType& dim) + Coors neighbors_upto_distance(const Coors& dists, unsigned int power, const CoorType& dim) { // Find all neighbors of the vertex at origin up to the - // given distance; a regular code should do something like: + // given distances; a regular code should do something like: // for i=1:dist, vertices=union(vertices, neighbors(vertices)) // BUT get_motive ONLY CARES ABOUT EVEN VERTICES - Coors centers(2); - centers[0][2] = dist; - centers[1][2] = normalize_coor(dim[2] - dist, dim[2]); + Coors centers; + for (const auto& dist : dists) + { + centers.push_back(dist); + centers.push_back(normalize_coor(minus(dim, dist), dim)); + } Coors neighbors_pattern = centers, prev; for (unsigned int i = 0; i < power; i++) { @@ -275,7 +302,7 @@ namespace Chroma // Filter our neighbors further than power distance in euclidian metric from the centers Coors filter; - for (CoorType c : neighbors_pattern) + for (const CoorType& c : neighbors_pattern) if (coor2index(Coors(1, c), dim)[0] != 0 && (euclidian_dist_squared(c, centers[0], dim) <= power * power || euclidian_dist_squared(c, centers[1], dim) <= power * power)) @@ -297,10 +324,10 @@ namespace Chroma // * First color even vertices with greedy coloring // * Then color odd vertices copying the coloring of the even vertices - Indices get_colors(unsigned int dist, unsigned int power, const CoorType& dim, + Indices get_colors(const Coors& dists, unsigned int power, const CoorType& dim, unsigned int& num_colors) { - Coors neighbors_rel = neighbors_upto_distance(dist, power, dim); + Coors neighbors_rel = neighbors_upto_distance(dists, power, dim); const unsigned int vol = volume(dim); std::vector color(vol); @@ -356,10 +383,10 @@ namespace Chroma return color; } - bool check_coloring(const Indices& color, unsigned int dist, unsigned int power, + bool check_coloring(const Indices& color, const Coors& dists, unsigned int power, const CoorType& dim) { 
- Coors neighbors_rel = neighbors_upto_distance(dist, power, dim); + Coors neighbors_rel = neighbors_upto_distance(dists, power, dim); const unsigned int vol = volume(dim); for (unsigned int i = 0; i < vol; i++) @@ -381,42 +408,64 @@ namespace Chroma } // Construct a k-distance coloring - Coloring::Coloring(unsigned int distance, unsigned int power) + void Coloring::construct(const std::vector>& distances, unsigned int power, + const CoorType& latt_size, bool build_local) { - // Get lattice dimensions - CoorType latt_size; - for (unsigned int i = 0; i < latt_size.size(); i++) - latt_size[i] = Layout::lattSize()[i]; + // Get the absolute value of the distances + Coors abs_distances; + for (const auto& dist : distances) + abs_distances.push_back( + CoorType{(unsigned int)std::abs(dist[0]), (unsigned int)std::abs(dist[1]), + (unsigned int)std::abs(dist[2]), (unsigned int)std::abs(dist[3])}); + + // Compute the maximum shift/distance requested + CoorType max_distance{{}}; + for (const auto& dist : abs_distances) + for (unsigned int i = 0; i < dist.size(); ++i) + max_distance[i] = std::max(max_distance[i], dist[i]); // Compute the tile size; the tile size should be divisible by the lattice size and // greater or equal than 2*(dist+power) - CoorType tile_size; for (unsigned int i = 0; i < latt_size.size(); i++) { - tile_size[i] = std::min(2 * ((i == 2 ? 
distance : 0) + power), latt_size[i]); + tile_size[i] = std::min(2 * (max_distance[i] + power), latt_size[i]); while (latt_size[i] % tile_size[i] != 0) tile_size[i]++; } // Get colors for all nodes - Indices colors = get_colors(distance, power, tile_size, num_colors); + colors = get_colors(abs_distances, power, tile_size, num_colors); - // Store the colors of the local nodes - int this_node = Layout::nodeNumber(); - local_colors.resize(Layout::sitesOnNode()); - for (unsigned int i = 0; i < Layout::sitesOnNode(); i++) + if (build_local) { - // Local coordinates of node i - multi1d x = Layout::siteCoords(this_node, i); + // Store the colors of the local nodes + int this_node = Layout::nodeNumber(); + local_colors.resize(Layout::sitesOnNode()); + for (unsigned int i = 0; i < Layout::sitesOnNode(); i++) + { + // Local coordinates of node i + multi1d x = Layout::siteCoords(this_node, i); - CoorType c; - for (unsigned int j = 0; j < c.size(); j++) - c[j] = x[j]; + CoorType c; + for (unsigned int j = 0; j < c.size(); j++) + c[j] = x[j]; - local_colors[i] = colors[coor2index(Coors(1, c), tile_size)[0]]; + local_colors[i] = colors[coor2index(Coors(1, c), tile_size)[0]]; + } } } + // Construct a k-distance coloring + Coloring::Coloring(const std::vector>& distances, unsigned int power) + { + // Get lattice dimensions + CoorType latt_size; + for (unsigned int i = 0; i < latt_size.size(); i++) + latt_size[i] = Layout::lattSize()[i]; + + construct(distances, power, latt_size, true); + } + // Read the coloring from a file Coloring::Coloring(const std::string& filename) { @@ -471,6 +520,8 @@ namespace Chroma { if (color >= num_colors) throw std::runtime_error("Invalid color value"); + if (local_colors.size() == 0) + throw std::runtime_error("Invalid function"); int node = Layout::nodeNumber(); for (int s(0); s < Layout::sitesOnNode(); s++) @@ -481,4 +532,31 @@ namespace Chroma } } + // Return the color for a site + unsigned int Coloring::getColor(const std::array& coor) const + { 
+ CoorType coor0{(unsigned int)coor[0], (unsigned int)coor[1], (unsigned int)coor[2], + (unsigned int)coor[3]}; + return colors[coor2index(Coors(1, coor0), tile_size)[0]]; + } + + // Return all neighbors + std::vector> Coloring::all_neighbors(unsigned int farthest_neighbor, + const std::array& dim) + { + std::array dimu{(unsigned int)dim[0], (unsigned int)dim[1], + (unsigned int)dim[2], (unsigned int)dim[3]}; + Coors neighbors_coors(1); + for (unsigned int i = 0; i < farthest_neighbor; i++) + { + auto new_neighbors_coors = neighbors(neighbors_coors, dimu); + neighbors_coors.insert(neighbors_coors.end(), new_neighbors_coors.begin(), + new_neighbors_coors.end()); + neighbors_coors = index2coor(unique_and_sort(coor2index(neighbors_coors, dimu)), dimu); + } + std::vector> r; + for (const auto& i : neighbors_coors) + r.push_back(std::array{(int)i[0], (int)i[1], (int)i[2], (int)i[3]}); + return r; + } } diff --git a/lib/meas/hadron/greedy_coloring.h b/lib/meas/hadron/greedy_coloring.h index cf474b2b61..c6efaaeba1 100644 --- a/lib/meas/hadron/greedy_coloring.h +++ b/lib/meas/hadron/greedy_coloring.h @@ -10,28 +10,58 @@ #include "chromabase.h" +#include +#include + namespace Chroma { // Interface for computes distance-k coloring for toroidal lattices struct Coloring { // Construct a k-distance coloring - Coloring(unsigned int distance, unsigned int power); + Coloring(const std::vector>& distances, unsigned int power); + + // Construct a k-distance coloring + Coloring(unsigned int power, const std::array& dim) + { + std::array dimu{(unsigned int)dim[0], (unsigned int)dim[1], + (unsigned int)dim[2], (unsigned int)dim[3]}; + construct(std::vector>{std::array{{}}}, power, dimu, false); + } + + // Construct a k-distance coloring + Coloring(const std::vector>& distances, unsigned int power, const std::array& dim) + { + std::array dimu{(unsigned int)dim[0], (unsigned int)dim[1], + (unsigned int)dim[2], (unsigned int)dim[3]}; + construct(distances, power, dimu, false); + } + // 
Reading the coloring from a file Coloring(const std::string& filename); // Return a probing vector for the given color void getVec(LatticeInteger& vec, unsigned int color) const; + // Return the color for each node + unsigned int getColor(const std::array& dim) const; + // Return the number of colors unsigned int numColors() const { return num_colors; } + static std::vector> all_neighbors(unsigned int farthest_neighbor, + const std::array& dim); + private: - multi1d local_colors; + std::vector colors; + std::vector local_colors; + std::array tile_size; unsigned int num_colors; + void construct(const std::vector>& distances, unsigned int power, + const std::array& latt_size, bool build_local); }; } diff --git a/lib/meas/hadron/mesQl_w.cc b/lib/meas/hadron/mesQl_w.cc index fffc5035e0..77302d2a51 100644 --- a/lib/meas/hadron/mesQl_w.cc +++ b/lib/meas/hadron/mesQl_w.cc @@ -4,6 +4,7 @@ #include "mesQl_w.h" #include "barQll_w.h" +#include namespace Chroma { @@ -37,6 +38,7 @@ void Qlbar(const multi1d& u, const std::string& xml_group, const int bc) { +#if !defined(__INTEL_LLVM_COMPILER) START_CODE(); if ( Ns != 4 ) /* Code is specific to Ns=4 */ @@ -84,6 +86,9 @@ void Qlbar(const multi1d& u, pop(xml); END_CODE(); +#else + throw std::runtime_error("shitty intel compiler refuses to compile this code"); +#endif } //! 
Heavy-light meson 2-pt function with backwards moving static quark @@ -116,6 +121,7 @@ void QlbarBACK(const multi1d& u, const std::string& xml_group, const int bc) { +#if !defined(__INTEL_LLVM_COMPILER) START_CODE(); if ( Ns != 4 ) /* Code is specific to Ns=4 */ @@ -158,6 +164,9 @@ void QlbarBACK(const multi1d& u, pop(xml); END_CODE(); +#else + throw std::runtime_error("shitty intel compiler refuses to compile this code"); +#endif } } diff --git a/lib/meas/hadron/photon_seqsrc_w.cc b/lib/meas/hadron/photon_seqsrc_w.cc index 1e91180b72..f92fec9e42 100644 --- a/lib/meas/hadron/photon_seqsrc_w.cc +++ b/lib/meas/hadron/photon_seqsrc_w.cc @@ -175,7 +175,7 @@ namespace Chroma { if (mu != params.j_decay) { - pp_f[j] = params.sink_mom[j] * twopi / Real(Layout::lattSize()[mu]); + pp_f[j] = params.sink_mom[j] * Chroma::constant().twopi / Real(Layout::lattSize()[mu]); if (params.sink_mom[j] != 0) p_dot_x += (Layout::latticeCoordinate(mu) - getTSrce()[mu]) * pp_f[j]; @@ -280,7 +280,7 @@ namespace Chroma { if (mu != params.j_decay) { - pp_f[j] = params.sink_mom[j] * twopi / Real(Layout::lattSize()[mu]); + pp_f[j] = params.sink_mom[j] * Chroma::constant().twopi / Real(Layout::lattSize()[mu]); if (params.sink_mom[j] != 0) p_dot_x += (Layout::latticeCoordinate(mu) - getTSrce()[mu]) * pp_f[j]; @@ -322,7 +322,7 @@ namespace Chroma { if (params.sink_mom[params.pol_dir] != 0) { - Real pp_f = - params.sink_mom[params.pol_dir] * twopi / Real(Layout::lattSize()[params.pol_dir]); + Real pp_f = - params.sink_mom[params.pol_dir] * Chroma::constant().twopi / Real(Layout::lattSize()[params.pol_dir]); exp_p_dot_x_b *= cmplx(cos(pp_f),sin(pp_f)); } } diff --git a/lib/meas/hadron/qqbar_w.cc b/lib/meas/hadron/qqbar_w.cc index 4de67ff713..ce078ca503 100644 --- a/lib/meas/hadron/qqbar_w.cc +++ b/lib/meas/hadron/qqbar_w.cc @@ -106,8 +106,9 @@ namespace Chroma QDPIO::cout<<"Starting the qqbar code\n"; // Length of lattice in decay direction - Set sft_set(phases.getSet()) ; - int 
length(sft_set.numSubsets()); + + int length(phases.getSet().numSubsets()); + //QDPIO::cout<<"Time length: "<(params.named_obj.baryon_op_file, metadata, order, - SB::kvcoors<6>(order, {{'i', params.param.num_vecs}, - {'j', params.param.num_vecs}, - {'k', params.param.num_vecs}, - {'t', Nt}, - {'d', displacement_list.size()}, - {'m', moms.size()}}), - SB::Sparse, SB::checksum_type::BlockChecksum); + st = SB::StorageTensor<6, SB::ComplexD>( + params.named_obj.baryon_op_file, metadata, order, + SB::kvcoors<6>(order, {{'i', params.param.num_vecs}, + {'j', params.param.num_vecs}, + {'k', params.param.num_vecs}, + {'t', Nt}, + {'d', displacement_list.size()}, + {'m', moms.size()}}), + SB::Sparse, SB::checksum_type::BlockChecksum, + params.param.output_file_is_local ? SB::LocalFSFile : SB::SharedFSFile); st.preallocate(params.param.num_vecs * params.param.num_vecs * params.param.num_vecs * t_slices_to_write.size() * displacement_list.size() * moms.size() * - sizeof(SB::ComplexD)); + sizeof(SB::ComplexD) / + (params.param.output_file_is_local ? Layout::numNodes() : 1)); } @@ -742,77 +750,93 @@ namespace Chroma double time_storing = 0; // total time in writing elementals - // Iterate over time-slices - for (int tfrom0 = 0, this_tsize = std::min(tsize, params.param.max_tslices_in_contraction); - tfrom0 < tsize; tfrom0 += this_tsize, - this_tsize = std::min(params.param.max_tslices_in_contraction, tsize - tfrom0)) + // NOTE: st needs MPI synchronization when closing, so capture exception and abort in that case + // to avoid hangs + try { - int this_tfrom = (tfrom + tfrom0) % Nt; + int max_tslices_in_contraction = params.param.max_tslices_in_contraction == 0 + ? 
tsize + : params.param.max_tslices_in_contraction; - // Get num_vecs colorvecs on time-slice t_source - SB::Tensor source_colorvec = - SB::getColorvecs(colorvecsSto, u, params.param.decay_dir, this_tfrom, - this_tsize, params.param.num_vecs, "cxyzXnt", phase); + // Iterate over time-slices + for (int tfrom0 = 0, this_tsize = std::min(tsize, max_tslices_in_contraction); + tfrom0 < tsize; tfrom0 += this_tsize, + this_tsize = std::min(max_tslices_in_contraction, tsize - tfrom0)) + { + int this_tfrom = (tfrom + tfrom0) % Nt; - // Call for storing the baryons - SB::ColorContractionFn call([&](SB::Tensor<5, SB::Complex> tensor, int disp, - int first_tslice, int first_mom) { - StopWatch tstoring; - tstoring.reset(); - tstoring.start(); + // Get num_vecs colorvecs on time-slice t_source + SB::Tensor source_colorvec = + SB::getColorvecs(colorvecsSto, u, params.param.decay_dir, this_tfrom, + this_tsize, params.param.num_vecs, "cxyzXnt", phase); - if (params.param.use_superb_format) - { - for (int t = 0, numt = tensor.kvdim()['t']; t < numt; ++t) - { - if (t_slices_to_write.count((first_tslice + t) % Nt) == 0) - continue; - st.kvslice_from_size({{'t', (first_tslice + t) % Nt}, {'d', disp}, {'m', first_mom}}, - {{'t', 1}, {'d', 1}, {'m', tensor.kvdim()['m']}}) - .copyFrom(tensor.kvslice_from_size({{'t', t}}, {{'t', 1}})); - } - } - else - { - // Only the master node writes the elementals and we assume that tensor is only supported on master - assert(tensor.dist == SB::OnMaster); - tensor = tensor.getLocal(); - if (tensor) // if the local tensor isn't empty, ie this node holds the tensor - { - // Open the database - open_db(); - - KeyBaryonElementalOperator_t key; - ValBaryonElementalOperator_t val(params.param.num_vecs); - - for (int t = 0, numt = tensor.kvdim()['t']; t < numt; ++t) + // Call for storing the baryons + SB::ColorContractionFn call( + [&](SB::Tensor<5, SB::Complex> tensor, int disp, int first_tslice, int first_mom) { + StopWatch tstoring; + tstoring.reset(); + 
tstoring.start(); + + if (params.param.use_superb_format) { - if (t_slices_to_write.count((first_tslice + t) % Nt) == 0) - continue; - for (int m = 0, numm = tensor.kvdim()['m']; m < numm; ++m) + for (int t = 0, numt = tensor.kvdim()['t']; t < numt; ++t) { - key.t_slice = (first_tslice + t) % Nt; - key.left = SB::tomulti1d(displacement_list[disp][0]); - key.middle = SB::tomulti1d(displacement_list[disp][1]); - key.right = SB::tomulti1d(displacement_list[disp][2]); - key.mom = SB::tomulti1d(mom_list[first_mom + m]); - tensor.kvslice_from_size({{'t', t}, {'m', m}}, {{'t', 1}, {'m', 1}}).copyTo(val); - qdp_db[0].insert(key, val); + if (t_slices_to_write.count((first_tslice + t) % Nt) == 0) + continue; + st.kvslice_from_size( + {{'t', (first_tslice + t) % Nt}, {'d', disp}, {'m', first_mom}}, + {{'t', 1}, {'d', 1}, {'m', tensor.kvdim()['m']}}) + .copyFrom(tensor.kvslice_from_size({{'t', t}}, {{'t', 1}})); + } + } + else + { + // Only the master node writes the elementals and we assume that tensor is only supported on master + assert(tensor.dist == SB::OnMaster); + tensor = tensor.getLocal(); + if (tensor) // if the local tensor isn't empty, ie this node holds the tensor + { + // Open the database + open_db(); + + KeyBaryonElementalOperator_t key; + ValBaryonElementalOperator_t val(params.param.num_vecs); + + for (int t = 0, numt = tensor.kvdim()['t']; t < numt; ++t) + { + if (t_slices_to_write.count((first_tslice + t) % Nt) == 0) + continue; + for (int m = 0, numm = tensor.kvdim()['m']; m < numm; ++m) + { + key.t_slice = (first_tslice + t) % Nt; + key.left = SB::tomulti1d(displacement_list[disp][0]); + key.middle = SB::tomulti1d(displacement_list[disp][1]); + key.right = SB::tomulti1d(displacement_list[disp][2]); + key.mom = SB::tomulti1d(mom_list[first_mom + m]); + tensor.kvslice_from_size({{'t', t}, {'m', m}}, {{'t', 1}, {'m', 1}}) + .copyTo(val); + qdp_db[0].insert(key, val); + } + } } } - } - } - - tstoring.stop(); - time_storing += tstoring.getTimeInSeconds(); - 
}); - // Do the color-contraction - SB::doMomDisp_colorContractions( - u_smr, source_colorvec, mom_list, this_tfrom, displacement_list, params.param.use_derivP, - call, 0 /*params.param.max_tslices_in_contraction==0 means to do all */, - params.param.max_moms_in_contraction, params.param.max_vecs, SB::none, - SB::OnDefaultDevice, params.param.use_superb_format ? SB::OnEveryone : SB::OnMaster); + tstoring.stop(); + time_storing += tstoring.getTimeInSeconds(); + }); + + // Do the color-contraction + SB::doMomDisp_colorContractions( + u_smr, source_colorvec, mom_list, this_tfrom, displacement_list, + params.param.use_derivP, call, 0 /* it means to do all */, + params.param.max_moms_in_contraction, params.param.max_vecs, SB::none, + SB::OnDefaultDevice, + params.param.use_superb_format ? SB::none : SB::Maybe(SB::OnMaster)); + } + } catch (const std::exception& e) + { + std::cerr << "caught error: " << e.what() << std::endl; + QDP_abort(1); } // Close db diff --git a/lib/meas/inline/hadron/inline_baryon_matelem_colorvec_superb_w.h b/lib/meas/inline/hadron/inline_baryon_matelem_colorvec_superb_w.h index 27a67c859c..1fd7da3c9e 100644 --- a/lib/meas/inline/hadron/inline_baryon_matelem_colorvec_superb_w.h +++ b/lib/meas/inline/hadron/inline_baryon_matelem_colorvec_superb_w.h @@ -55,6 +55,7 @@ namespace Chroma int max_moms_in_contraction;/*! maximum number of contracted momenta simultaneously */ int max_vecs; /*! maximum number of columns from the first tensor being contracted */ bool use_superb_format; /*! 
whether to use the superb file format for storing the data */ + bool output_file_is_local; /*!< Whether the output file is in a not shared filesystem */ }; struct NamedObject_t diff --git a/lib/meas/inline/hadron/inline_baryon_matelem_colorvec_w.cc b/lib/meas/inline/hadron/inline_baryon_matelem_colorvec_w.cc index 74ae8d4030..6be0c871a5 100644 --- a/lib/meas/inline/hadron/inline_baryon_matelem_colorvec_w.cc +++ b/lib/meas/inline/hadron/inline_baryon_matelem_colorvec_w.cc @@ -84,7 +84,6 @@ namespace Chroma read(paramtop, "displacement_list", param.displacement_list); read(paramtop, "num_vecs", param.num_vecs); read(paramtop, "decay_dir", param.decay_dir); - read(paramtop, "site_orthog_basis", param.site_orthog_basis); param.link_smearing = readXMLGroup(paramtop, "LinkSmearing", "LinkSmearingType"); } @@ -104,7 +103,6 @@ namespace Chroma write(xml, "displacement_list", param.displacement_list); write(xml, "num_vecs", param.num_vecs); write(xml, "decay_dir", param.decay_dir); - write(xml, "site_orthog_basis", param.site_orthog_basis); xml << param.link_smearing.xml; pop(xml); diff --git a/lib/meas/inline/hadron/inline_disco_prob_defl_superb_w.cc b/lib/meas/inline/hadron/inline_disco_prob_defl_superb_w.cc index ecc18948fc..91ad444320 100644 --- a/lib/meas/inline/hadron/inline_disco_prob_defl_superb_w.cc +++ b/lib/meas/inline/hadron/inline_disco_prob_defl_superb_w.cc @@ -1,7 +1,7 @@ /*! 
\file * \brief Compute the disconnected diagrams with 4D probing * - * Propagator calculation on a colorstd::vector + * Propagator calculation on a color vector */ #include "inline_disco_prob_defl_superb_w.h" @@ -18,26 +18,27 @@ #include "util/ferm/key_val_db.h" #include "util/ferm/map_obj/map_obj_aggregate_w.h" #include "util/ferm/map_obj/map_obj_factory_w.h" +#include "util/ferm/mgproton.h" #include "util/ferm/subset_vectors.h" #include "util/ferm/superb_contractions.h" #include "util/ferm/transf.h" -#include "util/ft/sftmom.h" #include "util/info/proginfo.h" #include -#include #include +#include #ifdef BUILD_SB -namespace Chroma -{ - namespace InlineDiscoProbDeflSuperb +namespace Chroma +{ + + namespace InlineDiscoProbDeflSuperb { - //! Propagator input - void read(XMLReader& xml, const std::string& path, InlineDiscoProbDeflSuperb::Params::NamedObject_t& input) + void read(XMLReader& xml, const std::string& path, + InlineDiscoProbDeflSuperb::Params::NamedObject_t& input) { XMLReader inputtop(xml, path); @@ -46,7 +47,8 @@ namespace Chroma } //! Propagator output - void write(XMLWriter& xml, const std::string& path, const InlineDiscoProbDeflSuperb::Params::NamedObject_t& input) + void write(XMLWriter& xml, const std::string& path, + const InlineDiscoProbDeflSuperb::Params::NamedObject_t& input) { push(xml, path); @@ -57,109 +59,118 @@ namespace Chroma } //! Propagator input - void read(XMLReader& xml, const std::string& path, InlineDiscoProbDeflSuperb::Params::Param_t& param) + void read(XMLReader& xml, const std::string& path, + InlineDiscoProbDeflSuperb::Params::Param_t& param) { XMLReader inputtop(xml, path); - - read(inputtop,"max_path_length",param.max_path_length); - if(inputtop.count("p2_max")!=0){ - read(inputtop,"p2_max",param.p2_max); - param.use_p_list = false; - QDPIO::cout<<"Using momenta centered at the origin, with a max of "<>n>>m>>l){ - c1[i]=n; - c2[i]=m; - c3[i]=l; - i++; - } - lines=i-1;//Total number of momenta. 
- param.p_list.resize(lines, Nd - 1); - for(int mom = 0; mom < lines; mom++) - { - param.p_list[mom][0] = c1[mom]; - param.p_list[mom][1] = c2[mom]; - param.p_list[mom][2] = c3[mom]; - QDPIO::cout<<"Momentum number "< 0) + { + read(inputtop, "mom2_min", param.mom2_min); } - else + + param.mom2_max = 0; + if (inputtop.count("mom2_max") > 0) { - QDPIO::cout<<"Could not find valid XML momentum input."< 0) + { + read(inputtop, "mom_list", param.mom_list); } - read(inputtop,"mass_label",param.mass_label); - read(inputtop,"Propagator",param.prop) ; - - if(inputtop.count("use_ferm_state_links")!=0){ - read(inputtop,"use_ferm_state_links",param.use_ferm_state_links) ; - QDPIO::cout<<"Ferm state links set equal to "< disp ; /*!< Displacement dirs of quark (right)*/ - multi1d mom ; /*!< D-1 momentum of this operator */ - std::string mass_label ; /*!< Mass label */ - - KeyOperator_t(){ - mom.resize(Nd-1); + //! Meson operator + struct KeyOperator_t { + int t_slice; /*!< Meson operator time slice */ + multi1d disp; /*!< Displacement dirs of quark (right)*/ + multi1d mom; /*!< D-1 momentum of this operator */ + std::string mass_label; /*!< Mass label */ + + KeyOperator_t() + { + mom.resize(Nd - 1); } }; - - bool operator<(const KeyOperator_t& a, const KeyOperator_t& b) { - return (a.t_slice != b.t_slice ? a.t_slice < b.t_slice : ( - a.mom != b.mom ? a.mom < b.mom : ( - a.disp != b.disp ? a.disp < b.disp : ( - a.mass_label < b.mass_label)))); - } - + + //! Meson operator + struct ValOperator_t : public SB::Tensor<1, SB::ComplexD> { + ValOperator_t() : SB::Tensor<1, SB::ComplexD>("i", {Ns * Ns}, SB::OnHost, SB::Local) + { + } + }; + template T& operator<<(T& os, const multi1d& d) { - for (int i=0; i op ; - ValOperator_t(){op.resize(Ns*Ns);} // Here go the 16 gamma matrices - ~ValOperator_t(){} - } ; //------------------------------------------------------------------------- - //! 
stream IO - template - T& operator<<(T& os, const ValOperator_t& d) - { - os << "ValOperator_t:\n"; - for (int i=0; i, std::vector> r = - SB::doMomGammaDisp_contractions<9>(u, std::move(qbart), std::move(qt), 0, p, 0, SB::none, - gamma_mats, disps, false, order_out); + // Normalize paths: replace empty by [0] + std::vector> norm_disps; + norm_disps.reserve(disps.size()); + for (const auto& it : disps) + norm_disps.push_back(it.size() == 0 ? std::vector(1) : it); - // Gather all traces at the master node - SB::Tensor<9, SB::Complex> con = - r.first.make_sure(SB::none, SB::OnHost, SB::OnMaster).getLocal(); - - const std::vector& disps_perm = r.second; - - // Do the update only on the master node - if (con) - { - std::pair kv; - kv.first.mom.resize(Nd - 1); - kv.second.op.resize(Ns * Ns); - for (int i = 0; i < Ns * Ns; ++i) - kv.second.op[i] = 0.0; - - for (int d = 0; d < disps_perm.size(); ++d) + // Contract S and Q with all the gammas, and apply the displacements + std::string order_out = "gmNnsqt*"; + auto call = [&](SB::Tensor<8, SB::Complex> r, int disp_index, int tfrom, int mfrom) { + // Gather all traces at the master node + SB::Tensor<8, SB::Complex> con = + r.make_sure(order_out, SB::OnHost, SB::OnMaster).getLocal(); + + // Do the update only on the master node + if (con) { - // Normalize paths - int disp_d_len = disps[disps_perm[d]].size(); - kv.first.disp.resize(std::max(disp_d_len, 1)); - for (int i = 0; i < disp_d_len; ++i) - kv.first.disp[i] = disps[disps_perm[d]][i]; - if (disp_d_len == 0) - kv.first.disp[0] = 0; - - for (int mom = 0; mom < p.numMom(); ++mom) + int tsize = r.kvdim().at('t'); + for (int mom = 0; mom < mom_list.size(); ++mom) { - for (int i = 0; i < Nd - 1; ++i) - kv.first.mom[i] = p.numToMom(mom)[i]; - - for (int t = 0; t < Nt; ++t) + for (int t = 0; t < tsize; ++t) { - kv.first.t_slice = t; + MesonKey k{(tfrom + t) % Nt, norm_disps[disp_index], mom_list[mfrom + mom]}; - auto it = db.find(kv.first); + auto it = db.find(k); if (it == 
db.end()) - it = db.insert(kv).first; + it = db.insert({k, std::vector>(Ns * Ns)}).first; for (int ai = 0; ai < a; ++ai) { for (int g = 0; g < Ns * Ns; ++g) { - std::complex a = con.get({g, mom, 1, 1, d, 1, 1, t, ai}); -#ifdef QDP_IS_QDPJIT - it->second.op[g].elem().elem().elem().real().elem() += a.real(); - it->second.op[g].elem().elem().elem().imag().elem() += a.imag(); -#else - it->second.op[g].elem().elem().elem().real() += a.real(); - it->second.op[g].elem().elem().elem().imag() += a.imag(); -#endif + it->second[g] += con.get({g, mom, 1, 1, 1, 1, t, ai}); } } } } } - } + }; + SB::doMomGammaDisp_contractions<8, Nd + 6, Nd + 6, SB::Complex>( + u, std::move(qbart), std::move(qt), 0 /* first t_slize */, 0 /* save from */, + Nt /* save size */, mom_list, gamma_mats, disps, false /*no deriv*/, call, order_out); } // Update the mean and var for each observable in db - void do_update(std::map< KeyOperator_t, ValOperator_t >& dbmean, - std::map< KeyOperator_t, ValOperator_t >& dbvar, - const std::map< KeyOperator_t, ValOperator_t >& db, bool first_it) + void do_update(Traces& dbmean, TracesVariance& dbvar, const Traces& db, bool first_it) { - for(std::map< KeyOperator_t, ValOperator_t >::const_iterator it=db.begin(); it != db.end(); it++) { - std::pair::iterator, bool> itbo; - // Insert mean - itbo = dbmean.insert(*it); - assert(itbo.second == first_it); - if(!itbo.second ){ - // if insert fails, key already exists, so add result - itbo.first->second.op += it->second.op; - } - - // Insert variance - std::pair kv; - kv.first = it->first; - kv.second.op.resize(it->second.op.size()); - for(int i(0); isecond.op.size(); i++) - kv.second.op[i] = it->second.op[i] * conj(it->second.op[i]); - itbo = dbvar.insert(kv); + for (const auto& it : db) + { + // Insert mean + { + auto itbo = dbmean.insert(it); + assert(itbo.second == first_it); + if (!itbo.second) + { + // if insert fails, key already exists, so add result + for (int k = 0; k < Ns * Ns; ++k) + itbo.first->second[k] += 
it.second[k]; + } + } + + // Insert variance + auto key = it.first; + std::vector val(Ns * Ns); + for (int i = 0; i < Ns * Ns; i++) + val[i] = std::norm(it.second[i]); + auto itbo = dbvar.insert({key, val}); assert(itbo.second == first_it); - if(!itbo.second ){ - // if insert fails, key already exists, so add result - itbo.first->second.op += kv.second.op; - } + if (!itbo.second) + { + // if insert fails, key already exists, so add result + for (int i = 0; i < Ns * Ns; i++) + itbo.first->second[i] += val[i]; + } } } - void show_stats(const std::map< KeyOperator_t, ValOperator_t >& dbmean, - const std::map< KeyOperator_t, ValOperator_t >& dbvar, - const std::map< KeyOperator_t, ValOperator_t >& dbdet, - unsigned int hadamard_normalization, unsigned num_noise) + void show_stats(const Traces& dbmean, const TracesVariance& dbvar, const Traces& dbdet, + unsigned int hadamard_normalization, unsigned num_noise) { - if (num_noise <= 1) return; + if (num_noise <= 1) + return; + // Average the stats over all t_slice and absolute momenta and absolute displacement const int Nt = Layout::lattSize()[3]; - std::map> dbmean_avg, dbdet_avg, dbvar_avg; - std::map avg_n; // number of averaged values - for(std::map< KeyOperator_t, ValOperator_t >::const_iterator it=dbvar.cbegin();it != dbvar.cend(); it++){ - // Average over t_slice and forward/backward directions - std::pair> kv; - kv.first = it->first; - kv.first.t_slice = 0; - for(int k=0;kfirst.disp.size();k++) kv.first.disp[k] = abs(kv.first.disp[k]); - - // Update dbvar_avg - // Compute the variance as E[x^2] - E[x]^2 - std::map< KeyOperator_t, ValOperator_t >::const_iterator itmean = dbmean.find(it->first); - assert(itmean != dbmean.cend()); - kv.second.resize(it->second.op.size()); - for(int i(0);isecond.op.size();i++) { - DComplex a = it->second.op[i] / num_noise - itmean->second.op[i] * conj(itmean->second.op[i]) / num_noise / num_noise / hadamard_normalization / hadamard_normalization; - kv.second[i] = 
detox(a.elem().elem().elem().real()); - } - std::pair >::iterator, bool> itbo = dbvar_avg.insert(kv); - if(itbo.second ){ - avg_n[kv.first] = 1; - } else { - // if insert fails, key already exists, so add result - for(int i(0);isecond[i] += kv.second[i]; - avg_n[kv.first]++; - } - - // Update dbmean_avg - for(int i(0);isecond.op.size();i++) kv.second[i] = abs(std::complex(detox(itmean->second.op[i].elem().elem().elem().real()), detox(itmean->second.op[i].elem().elem().elem().imag()))) / hadamard_normalization / num_noise; - itbo = dbmean_avg.insert(kv); - if(!itbo.second){ - for(int i(0);isecond.op.size();i++) itbo.first->second[i] += kv.second[i]; - } - - // Update dbdet_avg - itmean = dbdet.find(it->first); - if (itmean != dbdet.cend()) { - for(int i(0);isecond.op.size();i++) kv.second[i] = abs(std::complex(detox(itmean->second.op[i].elem().elem().elem().real()), detox(itmean->second.op[i].elem().elem().elem().imag()))); - } else { - for(int i(0);isecond.op.size();i++) kv.second[i] = 0.0; - } - itbo = dbdet_avg.insert(kv); - if(!itbo.second){ - for(int i(0);isecond.op.size();i++) itbo.first->second[i] += kv.second[i]; - } - } - for(std::map< KeyOperator_t, std::vector>::iterator it=dbvar_avg.begin();it != dbvar_avg.end(); it++) { - const unsigned int n = avg_n[it->first]; - QDPIO::cout << "DISCO VARIANCE with " << num_noise << " noise vectors key: disp = " << it->first.disp << " mom = " << it->first.mom << " val: " << std::endl; - for(int i(0);isecond.size();i++) QDPIO::cout << "Gamma[" << i << "]: avg_det = " << dbdet_avg[it->first][i]/n << " avg = " << dbmean_avg[it->first][i]/n << " var = " << it->second[i]/n << std::endl; + TracesVariance dbmean_avg(dbmean.size()), dbdet_avg(dbmean.size()), dbvar_avg(dbmean.size()); + MesonMap avg_n; // number of averaged values + for (const auto& it : dbvar) + { + // Average over t_slice, forward/backward directions, and absolute momenta + MesonKey key = it.first; + key.t_slice = 0; + for (int k = 0; k < 
it.first.disp.size(); k++) + key.disp[k] = abs(it.first.disp[k]); + for (int k = 0; k < it.first.mom.size(); k++) + key.mom[k] = abs(it.first.mom[k]); + + // Update dbvar_avg + // Compute the variance as E[x^2] - E[x]^2 + std::vector val(Ns * Ns); + auto itmean = dbmean.find(it.first); + assert(itmean != dbmean.cend()); + for (int i = 0; i < Ns * Ns; i++) + { + val[i] = it.second[i] / num_noise - std::norm(itmean->second[i]) / num_noise / num_noise / + hadamard_normalization / hadamard_normalization; + } + auto itbo = dbvar_avg.insert({key, val}); + if (itbo.second) + { + avg_n[key] = 1; + } + else + { + // if insert fails, key already exists, so add result + for (int i = 0; i < Ns * Ns; i++) + itbo.first->second[i] += val[i]; + ++avg_n[key]; + } + + // Update dbmean_avg + for (int i = 0; i < Ns * Ns; i++) + val[i] = abs(itmean->second[i]) / hadamard_normalization / num_noise; + itbo = dbmean_avg.insert({key, val}); + if (!itbo.second) + { + for (int i = 0; i < Ns * Ns; i++) + itbo.first->second[i] += val[i]; + } + + // Update dbdet_avg + itmean = dbdet.find(it.first); + if (itmean != dbdet.cend()) + { + for (int i = 0; i < Ns * Ns; i++) + val[i] = abs(itmean->second[i]); + } + else + { + for (int i = 0; i < Ns * Ns; i++) + val[i] = 0; + } + itbo = dbdet_avg.insert({key, val}); + if (!itbo.second) + { + for (int i = 0; i < Ns * Ns; i++) + itbo.first->second[i] += val[i]; + } + } + + // Order the keys by momenta and displacement + std::vector keys; + keys.reserve(dbvar_avg.size()); + for (const auto& it : dbvar_avg) + keys.push_back(it.first); + std::sort(keys.begin(), keys.end(), [=](const MesonKey& a, const MesonKey& b) { + auto a_mom = std::vector(a.mom.begin(), a.mom.end()); + auto b_mom = std::vector(b.mom.begin(), b.mom.end()); + return (a_mom < b_mom || (a_mom == b_mom && // compare mom + (a.disp < b.disp || a.disp == b.disp) // compare disp + )); + }); + + // Print the stats + for (const auto& key : keys) + { + const unsigned int n = avg_n[key]; + 
QDPIO::cout << "DISCO VARIANCE with " << num_noise + << " noise vectors key: disp = " << SB::tomulti1d(key.disp) + << " mom = " << SB::tomulti1d(key.mom) << " val: " << std::endl; + for (int i = 0; i < Ns * Ns; i++) + QDPIO::cout << "Gamma[" << i << "]: avg_det = " << dbdet_avg[key][i] / n + << " avg = " << dbmean_avg[key][i] / n << " var = " << dbvar_avg[key][i] / n + << std::endl; } } - + namespace { - AbsInlineMeasurement* createMeasurement(XMLReader& xml_in, - const std::string& path) + AbsInlineMeasurement* createMeasurement(XMLReader& xml_in, const std::string& path) { return new InlineMeas(Params(xml_in, path)); } @@ -529,14 +543,14 @@ namespace Chroma //! Local registration flag bool registered = false; } - + const std::string name = "DISCO_PROBING_DEFLATION_SUPERB"; //! Register all the factories - bool registerAll() + bool registerAll() { - bool success = true; - if (! registered) + bool success = true; + if (!registered) { success &= WilsonTypeFermActsEnv::registerAll(); success &= TheInlineMeasurementFactory::Instance().registerObject(name, createMeasurement); @@ -545,14 +559,16 @@ namespace Chroma return success; } - //---------------------------------------------------------------------------- // Param stuff - Params::Params() { frequency = 0; } + Params::Params() + { + frequency = 0; + } - Params::Params(XMLReader& xml_in, const std::string& path) + Params::Params(XMLReader& xml_in, const std::string& path) { - try + try { XMLReader paramtop(xml_in, path); @@ -568,23 +584,19 @@ namespace Chroma read(paramtop, "NamedObject", named_obj); // Possible alternate XML file pattern - if (paramtop.count("xml_file") != 0) + if (paramtop.count("xml_file") != 0) { read(paramtop, "xml_file", xml_file); } - } - catch(const std::string& e) + } catch (const std::string& e) { QDPIO::cerr << __func__ << ": Caught Exception reading XML: " << e << std::endl; QDP_abort(1); } } - // Function call - void - InlineMeas::operator()(unsigned long update_no, - XMLWriter& 
xml_out) + void InlineMeas::operator()(unsigned long update_no, XMLWriter& xml_out) { // If xml file not empty, then use alternate if (params.xml_file != "") @@ -605,15 +617,12 @@ namespace Chroma } } - // Real work done here - void - InlineMeas::func(unsigned long update_no, - XMLWriter& xml_out) + void InlineMeas::func(unsigned long update_no, XMLWriter& xml_out) { - typedef LatticeFermion T; - typedef multi1d P; - typedef multi1d Q; + typedef LatticeFermion T; + typedef multi1d P; + typedef multi1d Q; START_CODE(); @@ -626,15 +635,14 @@ namespace Chroma XMLBufferWriter gauge_xml; try { - u = TheNamedObjMap::Instance().getData< multi1d >(params.named_obj.gauge_id); + u = TheNamedObjMap::Instance().getData>( + params.named_obj.gauge_id); TheNamedObjMap::Instance().get(params.named_obj.gauge_id).getRecordXML(gauge_xml); - } - catch( std::bad_cast ) + } catch (std::bad_cast) { QDPIO::cerr << name << ": caught dynamic cast error" << std::endl; QDP_abort(1); - } - catch (const std::string& e) + } catch (const std::string& e) { QDPIO::cerr << name << ": std::map call failed: " << e << std::endl; QDP_abort(1); @@ -645,7 +653,7 @@ namespace Chroma QDPIO::cout << name << ": disconnected diagram calculation" << std::endl; - proginfo(xml_out); // Print out basic program info + proginfo(xml_out); // Print out basic program info // Write out the input write(xml_out, "Input", params); @@ -661,217 +669,263 @@ namespace Chroma MesPlq(xml_out, "Observables", u); std::shared_ptr coloring; - if (!params.param.probing_file.empty()) { - QDPIO::cout << "Reading colors from file " << params.param.probing_file << std::endl; - coloring.reset(new Coloring(params.param.probing_file)); - } else { + if (!params.param.probing_file.empty()) + { + QDPIO::cout << "Reading colors from file " << params.param.probing_file << std::endl; + coloring.reset(new Coloring(params.param.probing_file)); + } + else + { QDPIO::cout << "Generating a " << params.param.probing_distance << "-distance coloring with 
a power " << params.param.probing_power << std::endl; - coloring.reset(new Coloring(params.param.probing_distance, params.param.probing_power)); + // Do a k-distance coloring with k being params.param.probing_distance and taking as + // shifts, zero and the given probing distance. We include always the zero shift because + // the disconnected loops can be small at z=0 for several gammas. + coloring.reset(new Coloring( + std::vector>{{{}}, {0, 0, params.param.probing_distance, 0}}, + params.param.probing_power)); } - - // - // Initialize fermion action - // - std::istringstream xml_s(params.param.prop.fermact.xml); - XMLReader fermacttop(xml_s); - QDPIO::cout << "FermAct = " << params.param.prop.fermact.id << std::endl; - - Handle< FermionAction > - S_f(TheFermionActionFactory::Instance().createObject(params.param.prop.fermact.id, - fermacttop, - params.param.prop.fermact.path)); - Handle< FermState > state(S_f->createState(u)); + // Initialize fermion action + SB::ChimeraSolver PP{params.param.prop.fermact, params.param.prop.invParam, u}; + SB::ChimeraProjector proj{params.param.prop.fermact, params.param.projParam, u}; - Handle< SystemSolver > PP = S_f->qprop(state, - params.param.prop.invParam); - Handle< Projector > proj = S_f->projector(state, params.param.projParam); + std::istringstream xml_s(params.param.prop.fermact.xml); + XMLReader fermacttop(xml_s); + Handle> S_f(TheFermionActionFactory::Instance().createObject( + params.param.prop.fermact.id, fermacttop, params.param.prop.fermact.path)); + Handle> state(S_f->createState(u)); // Initialize the slow Fourier transform phases - int decay_dir = Nd-1 ; // hadamard needs this for now - //Initialize ft differently based on momentum list or max value. - SftMom ft = params.param.use_p_list ? 
SftMom(params.param.p_list, decay_dir) : SftMom(params.param.p2_max, false, decay_dir); + int decay_dir = Nd - 1; // hadamard needs this for now + + // + // If a list of momenta has been specified only need phases corresponding to these + // + SB::CoorMoms mom_list; + if (params.param.mom_list.size() == 0) + { + mom_list = SB::getMomenta(params.param.mom2_min, params.param.mom2_max); + } + else + { + mom_list = SB::getMomenta(params.param.mom_list); + } // number of colors int Nsrc = coloring->numColors(); QDPIO::cout << "num colors " << Nsrc << std::endl; - DComplex tr = 0.0 ; - DComplex trDef = 0.0 ; - StopWatch swatch; swatch.start(); // Do the projector part of the trace // Loop over the U and V vectors - QDPIO::cout<<"Now computing the projector contribution"< dbdet; - for (int k = 0 ; k < proj->rank() ; k++) { - // collect dk pairs of vectors - LatticeFermion vi_lambda, // = v[i]/(u[i]'*Dslash*v[i]) - ui, vi; // = u[i], v[i] - proj->V(k,vi); - DComplex lambda; - proj->lambda(k, lambda); - vi_lambda = vi / lambda; - proj->U(k,ui); - - std::vector> vi_lambda_sh( - 1, std::shared_ptr(&vi_lambda, [](LatticeFermion*) {})); - std::vector> ui_sh( - 1, std::shared_ptr(&ui, [](LatticeFermion*) {})); - do_disco(dbdet, vi_lambda_sh, ui_sh, ft, - params.param.use_ferm_state_links ? 
state->getLinks() : u, - params.param.max_path_length); + Traces dbdet; + if (params.param.first_color == 0) + { + unsigned int rank = SB::getProjectorRank(proj); + unsigned int blk = std::min(rank, 12u); + std::vector> vi_lambda_sh, ui_sh; + vi_lambda_sh.reserve(blk); + ui_sh.reserve(blk); + for (std::size_t i = 0; i < blk; ++i) + { + vi_lambda_sh.push_back(std::make_shared()); + ui_sh.push_back(std::make_shared()); + } + + for (unsigned int k = 0, nk = blk; k < rank; k += nk, nk = std::min(blk, rank - k)) + { + // collect dk pairs of vectors + auto vk = std::vector>(vi_lambda_sh.begin(), + vi_lambda_sh.begin() + nk); + auto uk = std::vector>(ui_sh.begin(), ui_sh.begin() + nk); + getV(proj, k, uk); + for (unsigned int ki = 0; ki < nk; ++ki) + *vk[ki] = *uk[ki] / getLambda(proj, k + ki); + getU(proj, k, uk); + + // Added to dbdet the results of \Omega*P*inv(A)=\Omega*V*inv(U'*A*V)*U', where \Omega are + do_disco(dbdet, uk, vk, mom_list, + params.param.use_ferm_state_links ? state->getLinks() : u, + params.param.max_path_length); + } } + swatch_det.stop(); - QDPIO::cout << "Projector contribution computed in time= " << swatch_det.getTimeInSeconds() << " secs" << std::endl; - + QDPIO::cout << "Projector contribution computed in time= " << swatch_det.getTimeInSeconds() + << " secs" << std::endl; + + const int N_rhs = (std::max(params.param.max_rhs, 1) + Ns * Nc - 1) / Ns / Nc; + const int max_color = params.param.num_colors < 0 + ? Nsrc + : std::min(Nsrc, params.param.first_color + params.param.num_colors); // Loop over the source color and spin, creating the source // and calling the relevant propagator routines. 
- std::map< KeyOperator_t, ValOperator_t > dbmean, dbvar; - for (int noise = 0 ; noise < params.param.noise_vectors; noise++) { - std::map< KeyOperator_t, ValOperator_t > db; + Traces dbmean(16); + TracesVariance dbvar(16); + for (int noise = 0; noise < params.param.noise_vectors; noise++) + { + Traces db; - // doing a new noise vector - QDPIO::cout << " Doing noise vector " << noise << std::endl; + // doing a new noise vector + QDPIO::cout << " Doing noise vector " << noise << std::endl; //generate a random std::vector - LatticeComplex vec ; + LatticeComplex vec; LatticeReal rnd1, theta; - random(rnd1); - Real twopiN = Chroma::twopi / 4; - theta = twopiN * floor(4*rnd1); - vec = cmplx(cos(theta),sin(theta)); - - // All the loops - const int N_rhs = (params.param.max_rhs + Ns * Nc - 1) / Ns / Nc; - for (int k1 = 0, dk = std::min(Nsrc, N_rhs); k1 < Nsrc ; k1 += dk, dk = std::min(Nsrc - k1, N_rhs)) { - // collect (Ns*Nc*dk) pairs of vectors - std::vector> v_chi(Ns * Nc * dk), v_psi(Ns * Nc * dk), v_q(Ns * Nc * dk); - for (int col=0; colgetVec(hh, k1 + i_v); - LatticeComplex rv = vec*hh; - for(int color_source(0);color_source>(v_chi.begin(), v_chi.end())); - proj->VUAObliqueProjector(v_q, std::vector>(v_psi.begin(), v_psi.end())); - for (int i=0; igetLinks() : u, + random(rnd1); + Real twopiN = Chroma::constant().twopi / 4; + theta = twopiN * floor(4 * rnd1); + vec = cmplx(cos(theta), sin(theta)); + + // All the loops + for (int k1 = params.param.first_color, dk = std::min(max_color - k1, N_rhs); + k1 < max_color; k1 += dk, dk = std::min(max_color - k1, N_rhs)) + { + // collect (Ns*Nc*dk) pairs of vectors + std::vector> v_chi(Ns * Nc * dk), v_psi(Ns * Nc * dk), + v_q(Ns * Nc * dk), v_prj(Ns * Nc * dk); + for (int col = 0; col < v_chi.size(); col++) + v_chi[col].reset(new LatticeFermion); + for (int col = 0; col < v_psi.size(); col++) + v_psi[col].reset(new LatticeFermion); + for (int col = 0; col < v_q.size(); col++) + v_q[col].reset(new LatticeFermion); + for (int 
col = 0; col < v_prj.size(); col++) + v_prj[col].reset(new LatticeFermion); + for (int i_v = 0; i_v < dk; i_v++) + { + LatticeInteger hh; + coloring->getVec(hh, k1 + i_v); + LatticeComplex rv = vec * hh; + for (int color_source(0); color_source < Nc; color_source++) + { + LatticeColorVector vec_srce = zero; + pokeColor(vec_srce, rv, color_source); + + for (int spin_source = 0; spin_source < Ns; ++spin_source) + { + // Insert a ColorVector into spin index spin_source + // This only overwrites sections, so need to initialize first + *v_chi[i_v * Ns * Nc + color_source * Ns + spin_source] = zero; + CvToFerm(vec_srce, *v_chi[i_v * Ns * Nc + color_source * Ns + spin_source], + spin_source); + *v_psi[i_v * Ns * Nc + color_source * Ns + spin_source] = zero; + } + } + } + + SB::doInversion( + PP, v_psi, + std::vector>(v_chi.begin(), v_chi.end())); + doVUAObliqueProjector( + proj, v_prj, + std::vector>(v_psi.begin(), v_psi.end())); + for (int i = 0; i < v_psi.size(); ++i) + *v_q[i] = *v_psi[i] - *v_prj[i]; // q <= (I - V*inv(U'*A*V)*U'*A)*quark_soln + + // Added to db the results of chi'*\Omega*(I-P)*inv(A)*chi, where \Omega are + // local operators in spin and space + StopWatch swatch_dots; + swatch_dots.start(); + do_disco(db, v_chi, v_q, mom_list, + params.param.use_ferm_state_links ? 
state->getLinks() : u, params.param.max_path_length); swatch_dots.stop(); QDPIO::cout << "Computing inner products " << swatch_dots.getTimeInSeconds() << " secs" << std::endl; - } // for k1 + } // for k1 - // Update dbmean, dbvar - do_update(dbmean, dbvar, db, noise == 0); + // Update dbmean, dbvar + do_update(dbmean, dbvar, db, noise == 0); - // Show stats - show_stats(dbmean, dbvar, dbdet, 1, noise+1); + // Show stats + show_stats(dbmean, dbvar, dbdet, 1, noise + 1); } // noise // Normalize the traces - for(std::map< KeyOperator_t, ValOperator_t >::iterator it=dbmean.begin();it != dbmean.end(); it++){ - for(int k=0;ksecond.op.size();k++){ - it->second.op[k] = it->second.op[k]/toDouble(params.param.noise_vectors); - } + for (auto& it : dbmean) + { + for (int k = 0; k < Ns * Ns; k++) + it.second[k] /= (double)params.param.noise_vectors; } // Add the deterministic part to the traces - for(std::map< KeyOperator_t, ValOperator_t >::iterator it=dbmean.begin();it != dbmean.end(); it++) - it->second.op += dbdet[it->first].op; + if (params.param.first_color == 0 && dbdet.size() > 0) + for (auto& it : dbmean) + for (int k = 0; k < Ns * Ns; k++) + it.second[k] += dbdet[it.first][k]; swatch.stop(); - QDPIO::cout << "Traces were computed: time= " - << swatch.getTimeInSeconds() - << " secs" << std::endl; - + QDPIO::cout << "Traces were computed: time= " << swatch.getTimeInSeconds() << " secs" + << std::endl; // write out the results - - // DB storage - BinaryStoreDB,SerialDBData > qdp_db; - - // Open the file, and write the meta-data and the binary for this operator + + if (Layout::nodeNumber() == 0) { + // DB storage + LocalBinaryStoreDB, LocalSerialDBData> + qdp_db; + + // Open the file, and write the meta-data and the binary for this operator XMLBufferWriter file_xml; - + push(file_xml, "DBMetaData"); write(file_xml, "id", std::string("DiscoBlocks")); write(file_xml, "lattSize", QDP::Layout::lattSize()); write(file_xml, "decay_dir", decay_dir); write(file_xml, "Params", 
params.param); write(file_xml, "Config_info", gauge_xml); + write(file_xml, "first_computed_color", params.param.first_color); + write(file_xml, "num_computed_colors", std::max(0, max_color - params.param.first_color)); + write(file_xml, "num_colors", Nsrc); pop(file_xml); std::string file_str(file_xml.str()); qdp_db.setMaxUserInfoLen(file_str.size()); - //qdp_db.open(params.named_obj.sdb_file, O_RDWR | O_CREAT, 0664); //Slightly modify code to account for changes from multifile write. //Be consistent with old mode of filename write. std::string file_name = params.named_obj.sdb_file; qdp_db.open(file_name, O_RDWR | O_CREAT, 0664); qdp_db.insertUserdata(file_str); + + KeyOperator_t key; + ValOperator_t val; + // Store all the data + for (const auto& it : dbmean) + { + key.t_slice = it.first.t_slice; + key.disp = SB::tomulti1d(it.first.disp); + key.mom = SB::tomulti1d(it.first.mom); + key.mass_label = params.param.mass_label; + for (int i = 0; i < Ns * Ns; i++) + val.set({i}, it.second[i]); + qdp_db.insert(key, val); + } + qdp_db.close(); } - - SerialDBKey key ; - SerialDBData val ; - std::map< KeyOperator_t, ValOperator_t >::iterator it; - // Store all the data - for(it=dbmean.begin();it!=dbmean.end();it++){ - key.key() = it->first ; - key.key().mass_label = params.param.mass_label; - val.data().op.resize(it->second.op.size()) ; - for(int i(0);isecond.op.size();i++) - val.data().op[i] = it->second.op[i]; - qdp_db.insert(key,val); - } - - pop(xml_out); // close last tag + pop(xml_out); // close last tag snoop.stop(); - QDPIO::cout << name << ": total time = " - << snoop.getTimeInSeconds() - << " secs" << std::endl; - + QDPIO::cout << name << ": total time = " << snoop.getTimeInSeconds() << " secs" << std::endl; + QDPIO::cout << name << ": ran successfully" << std::endl; - - END_CODE(); - } + END_CODE(); + } - }// namespace + } // namespace } // namespace Chroma // vim: sw=2 sts=2 diff --git a/lib/meas/inline/hadron/inline_disco_prob_defl_superb_w.h 
b/lib/meas/inline/hadron/inline_disco_prob_defl_superb_w.h index 65b93294fc..a8da318072 100644 --- a/lib/meas/inline/hadron/inline_disco_prob_defl_superb_w.h +++ b/lib/meas/inline/hadron/inline_disco_prob_defl_superb_w.h @@ -33,18 +33,17 @@ namespace Chroma struct Param_t { int max_path_length ; /*! maximum displacement path */ - int p2_max ; /*! maximum p2 */ - multi2d p_list; //Instead of a max momentum, a list is possible as an input. - int p_num; //Maximum number of momenta in the file. - std::string p_file; //Name of file that contains list of momenta. - bool use_p_list; //A boolean that keeps track of which momentum structure to pass to the fourier transform. - bool multifile_write; //A boolean that switches between new and old code for writing to multiple databases, + int mom2_min; /*!< (mom)^2 >= mom2_min */ + int mom2_max; /*!< (mom)^2 <= mom2_max */ + std::vector> mom_list; /*!< Alternative array of momenta to generate */ std::string mass_label ; /*! a std::string flag maybe used in analysis*/ int max_rhs; /*! maximum number of linear systems solved simultaneously */ ChromaProp_t prop; GroupXML_t projParam; int probing_distance; int probing_power; + int first_color; + int num_colors; std::string probing_file; int noise_vectors; bool use_ferm_state_links ; diff --git a/lib/meas/inline/hadron/inline_disco_prob_defl_w.cc b/lib/meas/inline/hadron/inline_disco_prob_defl_w.cc index c99dd2a26f..d7dd2dcb0a 100644 --- a/lib/meas/inline/hadron/inline_disco_prob_defl_w.cc +++ b/lib/meas/inline/hadron/inline_disco_prob_defl_w.cc @@ -129,11 +129,11 @@ namespace Chroma else param.noise_vectors = 1; - if(inputtop.count("max_rhs")!=0){ - read(inputtop,"max_rhs",param.max_rhs) ; + param.max_rhs = 0; + if (inputtop.count("max_rhs") != 0) + { + read(inputtop, "max_rhs", param.max_rhs); } - else - param.max_rhs = 1; } //! 
Propagator output @@ -627,16 +627,24 @@ namespace Chroma MesPlq(xml_out, "Observables", u); std::shared_ptr coloring; - if (!params.param.probing_file.empty()) { - QDPIO::cout << "Reading colors from file " << params.param.probing_file << std::endl; + if (!params.param.probing_file.empty()) + { + QDPIO::cout << "Reading colors from file " << params.param.probing_file << std::endl; coloring.reset(new Coloring(params.param.probing_file)); - } else { + } + else + { QDPIO::cout << "Generating a " << params.param.probing_distance << "-distance coloring with a power " << params.param.probing_power << std::endl; - coloring.reset(new Coloring(params.param.probing_distance, params.param.probing_power)); + // Do a k-distance coloring with k being params.param.probing_distance and taking as + // shifts, zero and the given probing distance. We include always the zero shift because + // the disconnected loops can be small at z=0 for several gammas. + coloring.reset(new Coloring( + std::vector>{{{}}, {0, 0, params.param.probing_distance, 0}}, + params.param.probing_power)); } - + // // Initialize fermion action // @@ -688,10 +696,10 @@ namespace Chroma multi1d d; if (params.param.use_ferm_state_links) - do_disco(dbdet, vi_lambda, ui, ft, state->getLinks(), + do_disco(dbdet, ui, vi_lambda, ft, state->getLinks(), d, params.param.max_path_length); else - do_disco(dbdet, vi_lambda, ui, ft, u, + do_disco(dbdet, ui, vi_lambda, ft, u, d, params.param.max_path_length); } swatch_det.stop(); @@ -711,12 +719,12 @@ namespace Chroma LatticeComplex vec ; LatticeReal rnd1, theta; random(rnd1); - Real twopiN = Chroma::twopi / 4; + Real twopiN = Chroma::constant().twopi / 4; theta = twopiN * floor(4*rnd1); vec = cmplx(cos(theta),sin(theta)); // All the loops - const int N_rhs = (params.param.max_rhs + Ns * Nc - 1) / Ns / Nc; + const int N_rhs = (std::max(params.param.max_rhs, 1) + Ns * Nc - 1) / Ns / Nc; for (int k1 = 0, dk = std::min(Nsrc, N_rhs); k1 < Nsrc ; k1 += dk, dk = std::min(Nsrc - k1, 
N_rhs)) { // collect (Ns*Nc*dk) pairs of vectors std::vector> v_chi(Ns * Nc * dk), v_psi(Ns * Nc * dk), v_q(Ns * Nc * dk); @@ -786,16 +794,17 @@ namespace Chroma << swatch.getTimeInSeconds() << " secs" << std::endl; - // write out the results - - // DB storage - BinaryStoreDB,SerialDBData > qdp_db; - - // Open the file, and write the meta-data and the binary for this operator + + if (Layout::nodeNumber() == 0) { + // DB storage + LocalBinaryStoreDB, LocalSerialDBData> + qdp_db; + + // Open the file, and write the meta-data and the binary for this operator XMLBufferWriter file_xml; - + push(file_xml, "DBMetaData"); write(file_xml, "id", std::string("DiscoBlocks")); write(file_xml, "lattSize", QDP::Layout::lattSize()); @@ -807,43 +816,41 @@ namespace Chroma std::string file_str(file_xml.str()); qdp_db.setMaxUserInfoLen(file_str.size()); - //qdp_db.open(params.named_obj.sdb_file, O_RDWR | O_CREAT, 0664); //Slightly modify code to account for changes from multifile write. //Be consistent with old mode of filename write. 
std::string file_name = params.named_obj.sdb_file; qdp_db.open(file_name, O_RDWR | O_CREAT, 0664); qdp_db.insertUserdata(file_str); + + LocalSerialDBKey key; + LocalSerialDBData val; + std::map::iterator it; + // Store all the data + for (it = dbmean.begin(); it != dbmean.end(); it++) + { + key.key() = it->first; + key.key().mass_label = params.param.mass_label; + val.data().op.resize(it->second.op.size()); + for (int i(0); i < it->second.op.size(); i++) + val.data().op[i] = it->second.op[i]; + qdp_db.insert(key, val); + } + + qdp_db.close(); } - - SerialDBKey key ; - SerialDBData val ; - std::map< KeyOperator_t, ValOperator_t >::iterator it; - // Store all the data - for(it=dbmean.begin();it!=dbmean.end();it++){ - key.key() = it->first ; - key.key().mass_label = params.param.mass_label; - val.data().op.resize(it->second.op.size()) ; - for(int i(0);isecond.op.size();i++) - val.data().op[i] = it->second.op[i]; - qdp_db.insert(key,val); - } - - pop(xml_out); // close last tag + pop(xml_out); // close last tag snoop.stop(); - QDPIO::cout << name << ": total time = " - << snoop.getTimeInSeconds() - << " secs" << std::endl; - + QDPIO::cout << name << ": total time = " << snoop.getTimeInSeconds() << " secs" << std::endl; + QDPIO::cout << name << ": ran successfully" << std::endl; - - END_CODE(); - } + END_CODE(); + } - }// namespace + } // namespace } // namespace Chroma // vim: sw=2 sts=2 diff --git a/lib/meas/inline/hadron/inline_eigenvalues_superb_w.cc b/lib/meas/inline/hadron/inline_eigenvalues_superb_w.cc new file mode 100644 index 0000000000..94aee7ea5d --- /dev/null +++ b/lib/meas/inline/hadron/inline_eigenvalues_superb_w.cc @@ -0,0 +1,463 @@ +/*! 
\file + * \brief Compute the smallest eigenvalues of D^\dagger*D + * + * Compute an approximation of the smallest eigenvalues/vectors of D^\dagger * D with + * an iterative eigensolver on \gamma_5 D^{-1}, where D^{-1} is approximate with + * a linear solver + */ + +#include "qdp.h" +#include "fermact.h" +#include "meas/inline/hadron/inline_eigenvalues_superb_w.h" +#include "meas/inline/abs_inline_measurement_factory.h" +#include "meas/glue/mesplq.h" +#include "qdp_map_obj.h" +#include "qdp_map_obj_disk.h" +#include "qdp_map_obj_disk_multiple.h" +#include "qdp_map_obj_memory.h" +#include "qdp_disk_map_slice.h" +#include "util/ferm/subset_vectors.h" +#include "util/ferm/key_val_db.h" +#include "util/ferm/transf.h" +#include "util/ferm/spin_rep.h" +#include "util/ferm/diractodr.h" +#include "util/ferm/twoquark_contract_ops.h" +#include "util/ferm/superb_contractions.h" +#include "util/ferm/mgproton.h" +#include "util/ft/time_slice_set.h" +#include "util/info/proginfo.h" +#include "actions/ferm/fermacts/fermact_factory_w.h" +#include "actions/ferm/fermacts/fermacts_aggregate_w.h" +#include "meas/inline/make_xml_file.h" + +#include "meas/inline/io/named_objmap.h" + +#include "chroma_config.h" + +#ifdef BUILD_SB + +namespace Chroma +{ + + //---------------------------------------------------------------------------- + namespace InlineEigenvaluesSuperbEnv + { + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params::NamedObject_t& input) + { + XMLReader inputtop(xml, path); + + read(inputtop, "gauge_id", input.gauge_id); + read(inputtop, "eigs_file", input.eigs_file); + } + + //! Propagator output + void write(XMLWriter& xml, const std::string& path, const Params::NamedObject_t& input) + { + push(xml, path); + + write(xml, "gauge_id", input.gauge_id); + write(xml, "eigs_file", input.eigs_file); + + pop(xml); + } + + + //! 
Propagator input + void read(XMLReader& xml, const std::string& path, Params::Param_t::Contract_t& input) + { + XMLReader inputtop(xml, path); + + read(inputtop, "num_vecs", input.num_vecs); + read(inputtop, "tolerance", input.tol); + read(inputtop, "mass_label", input.mass_label); + + input.max_rhs = 8; + if( inputtop.count("max_rhs") == 1 ) { + read(inputtop, "max_rhs", input.max_rhs); + } + } + + //! Propagator output + void write(XMLWriter& xml, const std::string& path, const Params::Param_t::Contract_t& input) + { + push(xml, path); + + write(xml, "num_vecs", input.num_vecs); + write(xml, "tolerance", input.tol); + write(xml, "mass_label", input.mass_label); + write(xml, "max_rhs", input.max_rhs); + + pop(xml); + } + + + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params::Param_t& input) + { + XMLReader inputtop(xml, path); + + read(inputtop, "Propagator", input.prop); + + input.eigensolver = ""; + if (inputtop.count("eigensolver") == 1) + { + XMLReader xml_tmp(inputtop, "eigensolver"); + std::ostringstream os; + xml_tmp.print(os); + input.eigensolver = os.str(); + } + + read(inputtop, "Contractions", input.contract); + } + + //! Propagator output + void write(XMLWriter& xml, const std::string& path, const Params::Param_t& input) + { + push(xml, path); + + write(xml, "Propagator", input.prop); + write(xml, "Contractions", input.contract); + + pop(xml); + } + + + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params& input) + { + Params tmp(xml, path); + input = tmp; + } + + //! Propagator output + void write(XMLWriter& xml, const std::string& path, const Params& input) + { + push(xml, path); + + write(xml, "Param", input.param); + write(xml, "NamedObject", input.named_obj); + + pop(xml); + } + + //---------------------------------------------------------------------------- + //! 
Unsmeared meson operator + struct KeyEigenpair_t + { + int idx; /*!< eigenpair index */ + std::string mass_label; /*!< Some kind of mass label */ + }; + + //! Eigenvalue and eigenvector + struct ValEigenpair_t : public SB::Tensor<6, SB::ComplexD> + { + double value; + ValEigenpair_t() + : SB::Tensor<6, SB::ComplexD>("csxyzt", + SB::latticeSize<6>("csxyzt", {{'x', Layout::lattSize()[0]}}), + SB::OnHost, SB::Local) + { + } + }; + + + //---------------------------------------------------------------------------- + //! KeyEigenpair_t reader + void read(BinaryReader& bin, KeyEigenpair_t& param) + { + read(bin, param.idx); + readDesc(bin, param.mass_label); + } + + //! KeyEigenpair_t write + void write(BinaryWriter& bin, const KeyEigenpair_t& param) + { + write(bin, param.idx); + writeDesc(bin, param.mass_label); + } + + //---------------------------------------------------------------------------- + //! ValEigenpair_t reader + void read(BinaryReader& bin, ValEigenpair_t& param) + { + double value; + read(bin, value); + param = ValEigenpair_t(); + SB::Tensor<6, SB::ComplexD> &t = param; + read(bin, t); + } + + //! ValEigenpair_t write + void write(BinaryWriter& bin, const ValEigenpair_t& param) + { + write(bin, param.value); + SB::Tensor<6, SB::ComplexD> t = param.reorder("csxyzt"); + write(bin, t); + } + + + } // namespace InlinePropDistillationSuperbEnv + + + //---------------------------------------------------------------------------- + namespace InlineEigenvaluesSuperbEnv + { + namespace + { + AbsInlineMeasurement* createMeasurement(XMLReader& xml_in, + const std::string& path) + { + return new InlineMeas(Params(xml_in, path)); + } + + //! Local registration flag + bool registered = false; + } + + const std::string name = "EIGENVALUES_SUPERB"; + + //! Register all the factories + bool registerAll() + { + bool success = true; + if (! 
registered) + { + success &= WilsonTypeFermActsEnv::registerAll(); + success &= TheInlineMeasurementFactory::Instance().registerObject(name, createMeasurement); + registered = true; + } + return success; + } + + + //---------------------------------------------------------------------------- + // Param stuff + Params::Params() { frequency = 0; } + + Params::Params(XMLReader& xml_in, const std::string& path) + { + try + { + XMLReader paramtop(xml_in, path); + + if (paramtop.count("Frequency") == 1) + read(paramtop, "Frequency", frequency); + else + frequency = 1; + + // Parameters for source construction + read(paramtop, "Param", param); + + // Read in the output propagator/source configuration info + read(paramtop, "NamedObject", named_obj); + + // Possible alternate XML file pattern + if (paramtop.count("xml_file") != 0) + { + read(paramtop, "xml_file", xml_file); + } + } + catch(const std::string& e) + { + QDPIO::cerr << __func__ << ": Caught Exception reading XML: " << e << std::endl; + QDP_abort(1); + } + } + + + //---------------------------------------------------------------------------- + //---------------------------------------------------------------------------- + // Function call + void + InlineMeas::operator()(unsigned long update_no, + XMLWriter& xml_out) + { + // If xml file not empty, then use alternate + if (params.xml_file != "") + { + std::string xml_file = makeXMLFileName(params.xml_file, update_no); + + push(xml_out, "Eigenvalues"); + write(xml_out, "update_no", update_no); + write(xml_out, "xml_file", xml_file); + pop(xml_out); + + XMLFileWriter xml(xml_file); + func(update_no, xml); + } + else + { + func(update_no, xml_out); + } + } + + + // Real work done here + void + InlineMeas::func(unsigned long update_no, + XMLWriter& xml_out) + { + START_CODE(); + +# ifdef __INTEL_COMPILER + throw std::runtime_error("crappy intel compiler failed compiling this module... 
sorry!"); +# else + StopWatch snoop; + snoop.reset(); + snoop.start(); + + // Test and grab a reference to the gauge field + multi1d u; + XMLBufferWriter gauge_xml; + try + { + u = TheNamedObjMap::Instance().getData< multi1d >(params.named_obj.gauge_id); + TheNamedObjMap::Instance().get(params.named_obj.gauge_id).getRecordXML(gauge_xml); + } + catch( std::bad_cast ) + { + QDPIO::cerr << name << ": caught dynamic cast error" << std::endl; + QDP_abort(1); + } + catch (const std::string& e) + { + QDPIO::cerr << name << ": std::map call failed: " << e << std::endl; + QDP_abort(1); + } + + push(xml_out, "Eigenvpairs"); + write(xml_out, "update_no", update_no); + + QDPIO::cout << name << ": eigenpairs calculation" << std::endl; + + proginfo(xml_out); // Print out basic program info + + // Write out the input + write(xml_out, "Input", params); + + // Write out the config header + write(xml_out, "Config_info", gauge_xml); + + push(xml_out, "Output_version"); + write(xml_out, "out_version", 1); + pop(xml_out); + + // Calculate some gauge invariant observables just for info. 
+ MesPlq(xml_out, "Observables", u); + + // Will use TimeSliceSet-s a lot + const int decay_dir = 3; + const int Lt = Layout::lattSize()[decay_dir]; + + // + // DB storage + // + std::vector< + LocalBinaryStoreDB, LocalSerialDBData>> + qdp_db; + + // Open the file, and write the meta-data and the binary for this operator + auto open_db = [&]() { + if (qdp_db.size() > 0) + return; + XMLBufferWriter file_xml; + push(file_xml, "DBMetaData"); + write(file_xml, "id", std::string("eigenpairsOp")); + write(file_xml, "lattSize", QDP::Layout::lattSize()); + proginfo(file_xml); // Print out basic program info + write(file_xml, "Params", params.param); + write(file_xml, "Config_info", gauge_xml); + pop(file_xml); + + std::string file_str(file_xml.str()); + qdp_db.resize(1); + qdp_db[0].setMaxUserInfoLen(file_str.size()); + + qdp_db[0].open(params.named_obj.eigs_file, O_RDWR | O_CREAT, 0664); + + qdp_db[0].insertUserdata(file_str); + }; + + // + // Try the factories + // + try + { + StopWatch swatch; + swatch.reset(); + swatch.start(); + + QDPIO::cout << "Try the various factories" << std::endl; + + // Initialize fermion action and create the solver + SB::ChimeraSolver PP{params.param.prop.fermact, params.param.prop.invParam, u}; + + // Prepare eigensolver + std::shared_ptr ops = + SB::getOptionsFromXML(SB::broadcast(params.param.eigensolver)); + auto eigensolver = SB::getInexactEigensolverGD( + SB::getOperator(PP, params.param.contract.max_rhs), ops->getValue("eigensolver")); + + // Run + auto values_vectors = eigensolver(params.param.contract.num_vecs, params.param.contract.tol); + auto values = std::get<0>(values_vectors); + auto vectors = std::get<1>(values_vectors); + assert(values.size() == (std::size_t)vectors.kvdim().at('n')); + + swatch.stop(); + QDPIO::cout << "Eigenpairs computed: time= " << swatch.getTimeInSeconds() << " secs" + << std::endl; + + // Store the eigenpairs + StopWatch swatch0; + swatch0.reset(); + swatch0.start(); + LocalSerialDBKey key; + 
key.key().mass_label = params.param.contract.mass_label; + for (int idx = 0; idx < values.size(); ++idx) + { + key.key().idx = idx; + auto v = + SB::detail::toNaturalOrdering(vectors.kvslice_from_size({{'n', idx}}, {{'n', 1}})) + .make_sure(SB::none, SB::OnHost, SB::OnMaster) + .getLocal(); + if (v) + { + open_db(); + LocalSerialDBData val; + val.data().value = values[idx]; + v.copyTo(val.data()); + qdp_db[0].insert(key, val); + } + } + + swatch0.stop(); + QDPIO::cout << "Eigenpairs stored: time= " << swatch0.getTimeInSeconds() << " secs" + << std::endl; + } + catch (const std::exception& e) + { + QDP_error_exit("%s: caught exception: %s\n", name.c_str(), e.what()); + } + + pop(xml_out); + + for (auto& db : qdp_db) + db.close(); + + snoop.stop(); + QDPIO::cout << name << ": total time = " + << snoop.getTimeInSeconds() + << " secs" << std::endl; + + QDPIO::cout << name << ": ran successfully" << std::endl; +# endif + END_CODE(); + } + + } + +} // namespace Chroma + +#endif // BUILD_SB diff --git a/lib/meas/inline/hadron/inline_eigenvalues_superb_w.h b/lib/meas/inline/hadron/inline_eigenvalues_superb_w.h new file mode 100644 index 0000000000..98838b69ed --- /dev/null +++ b/lib/meas/inline/hadron/inline_eigenvalues_superb_w.h @@ -0,0 +1,92 @@ +// -*- C++ -*- +/*! \file + * \brief Compute the propagator from distillation + * + * Propagator calculation in distillation + */ + +#ifndef __inline_eigenvalues_superb_w_h__ +#define __inline_eigenvalues_superb_w_h__ + +#include "chromabase.h" +#include "meas/inline/abs_inline_measurement.h" +#include "io/qprop_io.h" +#include "io/xml_group_reader.h" + +#ifdef BUILD_SB + +namespace Chroma +{ + /*! \ingroup inlinehadron */ + namespace InlineEigenvaluesSuperbEnv + { + bool registerAll(); + + //! Parameter structure + /*! 
\ingroup inlinehadron */ + struct Params + { + Params(); + Params(XMLReader& xml_in, const std::string& path); + + unsigned long frequency; + + struct Param_t + { + struct Contract_t + { + int num_vecs; /*!< Number of eigenvectors to compute */ + double tol; /*!< Tolerance of the eigenvectors to compute */ + std::string mass_label; /*!< mass label */ + int max_rhs; /*! maximum number of linear systems solved simultaneously */ + }; + + ChromaProp_t prop; + std::string eigensolver; /* eigensolver options */ + Contract_t contract; + }; + + struct NamedObject_t + { + std::string gauge_id; /*!< Gauge field */ + std::string eigs_file; /*!< File name for storing the eigenvalues and eigenvectors */ + }; + + Param_t param; + NamedObject_t named_obj; + std::string xml_file; /*!< Alternate XML file pattern */ + }; + + + //! Inline task for the propagator from distillation + /*! \ingroup inlinehadron */ + class InlineMeas : public AbsInlineMeasurement + { + public: + ~InlineMeas() {} + InlineMeas(const Params& p) : params(p) {} + InlineMeas(const InlineMeas& p) : params(p.params) {} + + unsigned long getFrequency(void) const {return params.frequency;} + + //! Do the measurement + void operator()(const unsigned long update_no, + XMLWriter& xml_out); + + protected: + //! 
Do the measurement + void func(const unsigned long update_no, + XMLWriter& xml_out); + + private: + Params params; + }; + + } // namespace PropColorVec + + +} + +#endif // BUILD_SB + +#endif diff --git a/lib/meas/inline/hadron/inline_hadron_aggregate.cc b/lib/meas/inline/hadron/inline_hadron_aggregate.cc index 82fda733c6..438bff03ec 100644 --- a/lib/meas/inline/hadron/inline_hadron_aggregate.cc +++ b/lib/meas/inline/hadron/inline_hadron_aggregate.cc @@ -72,9 +72,12 @@ #include "meas/inline/hadron/inline_meson_matelem_colorvec_superb_w.h" #include "meas/inline/hadron/inline_unsmeared_hadron_node_distillation_w.h" #include "meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.h" +#include "meas/inline/hadron/inline_eigenvalues_superb_w.h" #include "meas/inline/hadron/inline_genprop_matelem_colorvec_w.h" #include "meas/inline/hadron/inline_genprop_matelem_da_colorvec_w.h" #include "meas/inline/hadron/inline_genprop_matelem_pt_colorvec_w.h" +#include "meas/inline/hadron/inline_inverter_test_w.h" +#include "meas/inline/hadron/inline_inverter_test_superb_w.h" #include "meas/inline/hadron/inline_mres_w.h" #include "meas/inline/hadron/inline_qpropqio_w.h" #include "meas/inline/hadron/inline_qpropadd_w.h" @@ -175,10 +178,13 @@ namespace Chroma #ifdef BUILD_SB success &= InlinePropAndMatElemDistillationSuperbEnv::registerAll(); success &= InlineUnsmearedHadronNodeDistillationSuperbEnv::registerAll(); + success &= InlineEigenvaluesSuperbEnv::registerAll(); #endif #ifndef QDP_IS_QDPJIT_NO_NVPTX +#if ! 
defined (QDP_IS_QDPJIT2) success &= InlineMatElemDistillationEnv::registerAll(); success &= InlinePropAndMatElemDistillationEnv::registerAll(); +#endif success &= InlineUnsmearedHadronNodeDistillationEnv::registerAll(); #endif #ifndef QDP_IS_QDPJIT @@ -214,6 +220,10 @@ namespace Chroma success &= InlineStochHadronEnv::registerAll(); success &= InlineStochGroupBaryonEnv::registerAll(); success &= InlineStochGroupMesonEnv::registerAll(); + success &= InlineInverterTestEnv::registerAll(); +#ifdef BUILD_SB + success &= InlineInverterTestSuperbEnv::registerAll(); +#endif // success &= InlineStochLaphQuarkEnv::registerAll(); // success &= InlineStochLaphBaryonEnv::registerAll(); diff --git a/lib/meas/inline/hadron/inline_inverter_test_superb_w.cc b/lib/meas/inline/hadron/inline_inverter_test_superb_w.cc new file mode 100644 index 0000000000..27d2dd2f54 --- /dev/null +++ b/lib/meas/inline/hadron/inline_inverter_test_superb_w.cc @@ -0,0 +1,351 @@ +/*! \file + * \brief Compute propagators from distillation + * + * Propagator calculation in distillation + */ + +#include "qdp.h" +#include "fermact.h" +#include "meas/inline/hadron/inline_inverter_test_superb_w.h" +#include "meas/inline/abs_inline_measurement_factory.h" +#include "meas/glue/mesplq.h" +#include "util/ferm/transf.h" +#include "util/ferm/spin_rep.h" +#include "util/ferm/diractodr.h" +#include "util/info/proginfo.h" +#include "util/info/proginfo.h" +#include "actions/ferm/fermacts/fermact_factory_w.h" +#include "actions/ferm/fermacts/fermacts_aggregate_w.h" +#include "meas/inline/make_xml_file.h" + +#include "util/ferm/superb_contractions.h" +#include "util/ferm/mgproton.h" + +#include "meas/inline/io/named_objmap.h" + +#include "chroma_config.h" + +#ifdef BUILD_SB + +namespace Chroma +{ + //---------------------------------------------------------------------------- + namespace InlineInverterTestSuperbEnv + { + //! 
Propagator input + void read(XMLReader& xml, const std::string& path, Params::NamedObject_t& input) + { + XMLReader inputtop(xml, path); + + read(inputtop, "gauge_id", input.gauge_id); + } + + //! Propagator output + void write(XMLWriter& xml, const std::string& path, const Params::NamedObject_t& input) + { + push(xml, path); + + write(xml, "gauge_id", input.gauge_id); + + pop(xml); + } + + + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params::Param_t::Contract_t& input) + { + XMLReader inputtop(xml, path); + + read(inputtop, "num_vecs", input.num_vecs); + read(inputtop, "decay_dir", input.decay_dir); + + if( inputtop.count("max_rhs") == 1 ) { + read(inputtop, "max_rhs", input.max_rhs); + } else { + input.max_rhs.resize(1); + input.max_rhs[0] = 8; + } + } + + //! Propagator output + void write(XMLWriter& xml, const std::string& path, const Params::Param_t::Contract_t& input) + { + push(xml, path); + + write(xml, "num_vecs", input.num_vecs); + write(xml, "decay_dir", input.decay_dir); + write(xml, "max_rhs", input.max_rhs); + + pop(xml); + } + + + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params::Param_t& input) + { + XMLReader inputtop(xml, path); + + read(inputtop, "Propagator", input.prop); + read(inputtop, "Contractions", input.contract); + } + + //! Propagator output + void write(XMLWriter& xml, const std::string& path, const Params::Param_t& input) + { + push(xml, path); + + write(xml, "Propagator", input.prop); + write(xml, "Contractions", input.contract); + + pop(xml); + } + + + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params& input) + { + Params tmp(xml, path); + input = tmp; + } + + //! 
Propagator output + void write(XMLWriter& xml, const std::string& path, const Params& input) + { + push(xml, path); + + write(xml, "Param", input.param); + write(xml, "NamedObject", input.named_obj); + + pop(xml); + } + } // namespace InlinePropDistillationSuperbEnv + + + //---------------------------------------------------------------------------- + namespace InlineInverterTestSuperbEnv + { + namespace + { + AbsInlineMeasurement* createMeasurement(XMLReader& xml_in, + const std::string& path) + { + return new InlineMeas(Params(xml_in, path)); + } + + //! Local registration flag + bool registered = false; + } + + const std::string name = "INVERTER_TEST_SUPERB"; + + //! Register all the factories + bool registerAll() + { + bool success = true; + if (! registered) + { + success &= WilsonTypeFermActsEnv::registerAll(); + success &= TheInlineMeasurementFactory::Instance().registerObject(name, createMeasurement); + registered = true; + } + return success; + } + + + //---------------------------------------------------------------------------- + // Param stuff + Params::Params() { frequency = 0; } + + Params::Params(XMLReader& xml_in, const std::string& path) + { + try + { + XMLReader paramtop(xml_in, path); + + if (paramtop.count("Frequency") == 1) + read(paramtop, "Frequency", frequency); + else + frequency = 1; + + // Parameters for source construction + read(paramtop, "Param", param); + + // Read in the output propagator/source configuration info + read(paramtop, "NamedObject", named_obj); + + // Possible alternate XML file pattern + if (paramtop.count("xml_file") != 0) + { + read(paramtop, "xml_file", xml_file); + } + } + catch(const std::string& e) + { + QDPIO::cerr << __func__ << ": Caught Exception reading XML: " << e << std::endl; + QDP_abort(1); + } + } + + + //---------------------------------------------------------------------------- + //---------------------------------------------------------------------------- + // Function call + void + 
InlineMeas::operator()(unsigned long update_no, + XMLWriter& xml_out) + { + // If xml file not empty, then use alternate + if (params.xml_file != "") + { + std::string xml_file = makeXMLFileName(params.xml_file, update_no); + + push(xml_out, "InverterTest"); + write(xml_out, "update_no", update_no); + write(xml_out, "xml_file", xml_file); + pop(xml_out); + + XMLFileWriter xml(xml_file); + func(update_no, xml); + } + else + { + func(update_no, xml_out); + } + } + + + // Real work done here + void + InlineMeas::func(unsigned long update_no, + XMLWriter& xml_out) + { + START_CODE(); + + StopWatch snoop; + snoop.reset(); + snoop.start(); + + // Test and grab a reference to the gauge field + multi1d u; + XMLBufferWriter gauge_xml; + try + { + u = TheNamedObjMap::Instance().getData< multi1d >(params.named_obj.gauge_id); + TheNamedObjMap::Instance().get(params.named_obj.gauge_id).getRecordXML(gauge_xml); + } + catch( std::bad_cast ) + { + QDPIO::cerr << name << ": caught dynamic cast error" << std::endl; + QDP_abort(1); + } + catch (const std::string& e) + { + QDPIO::cerr << name << ": std::map call failed: " << e << std::endl; + QDP_abort(1); + } + + push(xml_out, "InverterTest"); + write(xml_out, "update_no", update_no); + + QDPIO::cout << name << ": propagator calculation" << std::endl; + + proginfo(xml_out); // Print out basic program info + + // Write out the input + write(xml_out, "Input", params); + + // Write out the config header + write(xml_out, "Config_info", gauge_xml); + + push(xml_out, "Output_version"); + write(xml_out, "out_version", 1); + pop(xml_out); + + // Calculate some gauge invariant observables just for info. 
+ MesPlq(xml_out, "Observables", u); + + // Will use TimeSliceSet-s a lot + const int decay_dir = params.param.contract.decay_dir; + const int Lt = Layout::lattSize()[decay_dir]; + + // A sanity check + if (decay_dir != Nd-1) + { + QDPIO::cerr << name << ": TimeSliceIO only supports decay_dir= " << Nd-1 << "\n"; + QDP_abort(1); + } + + // + // Try the factories + // + try + { + StopWatch swatch; + swatch.reset(); + QDPIO::cout << "Try the various factories" << std::endl; + + // Initialize fermion action and create the solver + SB::ChimeraSolver PP{params.param.prop.fermact, params.param.prop.invParam, u}; + + swatch.start(); + + const int num_vecs = params.param.contract.num_vecs; + for (int rhsi = 0; rhsi < params.param.contract.max_rhs.size(); ++rhsi) + { + const int max_rhs = params.param.contract.max_rhs[rhsi]; + + const int t_source = 0; + const int spin_source = 0; + + for (int colorvec_src0 = 0, colorvec_src_step = std::min(max_rhs, num_vecs); + colorvec_src0 < num_vecs; colorvec_src0 += colorvec_src_step, + colorvec_src_step = std::min(colorvec_src_step, num_vecs - colorvec_src0)) + { + // Create random colorvecs on a timeslice + const char* order = "cxyzXnt"; + SB::Tensor colorvec( + order, SB::latticeSize(order, {{'n', colorvec_src_step}, {'t', 1}})); + SB::nrand(colorvec); + + StopWatch snarss1; + snarss1.reset(); + snarss1.start(); + + // Solve + SB::doInversion(PP, colorvec, t_source, 0 /* first tslice */, 1 /* num tslices */, + {spin_source}, max_rhs, "cxyzXnSst"); + + snarss1.stop(); + QDPIO::cout << "Time to compute " << colorvec_src_step + << " inversions; time = " << snarss1.getTimeInSeconds() << " secs" + << std::endl; + + } // colorvec_src0 + } // rhsi + + swatch.stop(); + QDPIO::cout << "Propagators computed: time= " + << swatch.getTimeInSeconds() + << " secs" << std::endl; + } + catch (const std::string& e) + { + QDPIO::cout << name << ": caught exception around qprop: " << e << std::endl; + QDP_abort(1); + } + + pop(xml_out); // prop_dist + + 
snoop.stop(); + QDPIO::cout << name << ": total time = " + << snoop.getTimeInSeconds() + << " secs" << std::endl; + + QDPIO::cout << name << ": ran successfully" << std::endl; + + END_CODE(); + } + } +} // namespace Chroma + +#endif // BUILD_SB diff --git a/lib/meas/inline/hadron/inline_inverter_test_superb_w.h b/lib/meas/inline/hadron/inline_inverter_test_superb_w.h new file mode 100644 index 0000000000..6845a9f848 --- /dev/null +++ b/lib/meas/inline/hadron/inline_inverter_test_superb_w.h @@ -0,0 +1,86 @@ +// -*- C++ -*- +/*! \file + * \brief Compute the propagator from distillation + * + * Propagator calculation in distillation + */ + +#ifndef __inline_inverter_test_superb_w_h__ +#define __inline_inverter_test_superb_w_h__ + +#include "chromabase.h" +#include "meas/inline/abs_inline_measurement.h" +#include "io/qprop_io.h" +#include "io/xml_group_reader.h" + +#ifdef BUILD_SB + +namespace Chroma +{ + /*! \ingroup inlinehadron */ + namespace InlineInverterTestSuperbEnv + { + bool registerAll(); + + //! Parameter structure + /*! \ingroup inlinehadron */ + struct Params + { + Params(); + Params(XMLReader& xml_in, const std::string& path); + + unsigned long frequency; + + struct Param_t + { + struct Contract_t + { + int num_vecs; /*!< Number of color vectors to use */ + int decay_dir; /*!< Decay direction */ + multi1d max_rhs; /*! maximum number of linear systems solved simultaneously */ + }; + + ChromaProp_t prop; + Contract_t contract; + }; + + struct NamedObject_t + { + std::string gauge_id; /*!< Gauge field */ + }; + + Param_t param; + NamedObject_t named_obj; + std::string xml_file; /*!< Alternate XML file pattern */ + }; + + + //! Inline task for the propagator from distillation + /*! 
\ingroup inlinehadron */ + class InlineMeas : public AbsInlineMeasurement + { + public: + ~InlineMeas() {} + InlineMeas(const Params& p) : params(p) {} + InlineMeas(const InlineMeas& p) : params(p.params) {} + + unsigned long getFrequency(void) const {return params.frequency;} + + //! Do the measurement + void operator()(const unsigned long update_no, + XMLWriter& xml_out); + + protected: + //! Do the measurement + void func(const unsigned long update_no, + XMLWriter& xml_out); + + private: + Params params; + }; + } +} + +#endif // BUILD_SB + +#endif diff --git a/lib/meas/inline/hadron/inline_inverter_test_w.cc b/lib/meas/inline/hadron/inline_inverter_test_w.cc new file mode 100644 index 0000000000..e1220ab43e --- /dev/null +++ b/lib/meas/inline/hadron/inline_inverter_test_w.cc @@ -0,0 +1,387 @@ +/*! \file + * \brief Compute propagators from distillation + * + * Propagator calculation in distillation + */ + +#include "qdp.h" +#include "fermact.h" +#include "meas/inline/hadron/inline_inverter_test_w.h" +#include "meas/inline/abs_inline_measurement_factory.h" +#include "meas/glue/mesplq.h" +#include "util/ferm/transf.h" +#include "util/ferm/spin_rep.h" +#include "util/ferm/diractodr.h" +#include "util/info/proginfo.h" +#include "util/info/proginfo.h" +#include "actions/ferm/fermacts/fermact_factory_w.h" +#include "actions/ferm/fermacts/fermacts_aggregate_w.h" +#include "meas/inline/make_xml_file.h" + +#include "meas/inline/io/named_objmap.h" + +#include "chroma_config.h" + +namespace Chroma +{ + //---------------------------------------------------------------------------- + namespace InlineInverterTestEnv + { + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params::NamedObject_t& input) + { + XMLReader inputtop(xml, path); + + read(inputtop, "gauge_id", input.gauge_id); + } + + //! 
Propagator output + void write(XMLWriter& xml, const std::string& path, const Params::NamedObject_t& input) + { + push(xml, path); + + write(xml, "gauge_id", input.gauge_id); + + pop(xml); + } + + + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params::Param_t::Contract_t& input) + { + XMLReader inputtop(xml, path); + + read(inputtop, "num_vecs", input.num_vecs); + read(inputtop, "decay_dir", input.decay_dir); + + if( inputtop.count("max_rhs") == 1 ) { + read(inputtop, "max_rhs", input.max_rhs); + } else { + input.max_rhs.resize(1); + input.max_rhs[0] = 8; + } + } + + //! Propagator output + void write(XMLWriter& xml, const std::string& path, const Params::Param_t::Contract_t& input) + { + push(xml, path); + + write(xml, "num_vecs", input.num_vecs); + write(xml, "decay_dir", input.decay_dir); + write(xml, "max_rhs", input.max_rhs); + + pop(xml); + } + + + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params::Param_t& input) + { + XMLReader inputtop(xml, path); + + read(inputtop, "Propagator", input.prop); + read(inputtop, "Contractions", input.contract); + } + + //! Propagator output + void write(XMLWriter& xml, const std::string& path, const Params::Param_t& input) + { + push(xml, path); + + write(xml, "Propagator", input.prop); + write(xml, "Contractions", input.contract); + + pop(xml); + } + + + //! Propagator input + void read(XMLReader& xml, const std::string& path, Params& input) + { + Params tmp(xml, path); + input = tmp; + } + + //! 
Propagator output + void write(XMLWriter& xml, const std::string& path, const Params& input) + { + push(xml, path); + + write(xml, "Param", input.param); + write(xml, "NamedObject", input.named_obj); + + pop(xml); + } + } // namespace InlinePropDistillationEnv + + + //---------------------------------------------------------------------------- + namespace InlineInverterTestEnv + { + namespace + { + AbsInlineMeasurement* createMeasurement(XMLReader& xml_in, + const std::string& path) + { + return new InlineMeas(Params(xml_in, path)); + } + + //! Local registration flag + bool registered = false; + } + + const std::string name = "INVERTER_TEST"; + + //! Register all the factories + bool registerAll() + { + bool success = true; + if (! registered) + { + success &= WilsonTypeFermActsEnv::registerAll(); + success &= TheInlineMeasurementFactory::Instance().registerObject(name, createMeasurement); + registered = true; + } + return success; + } + + + //---------------------------------------------------------------------------- + // Param stuff + Params::Params() { frequency = 0; } + + Params::Params(XMLReader& xml_in, const std::string& path) + { + try + { + XMLReader paramtop(xml_in, path); + + if (paramtop.count("Frequency") == 1) + read(paramtop, "Frequency", frequency); + else + frequency = 1; + + // Parameters for source construction + read(paramtop, "Param", param); + + // Read in the output propagator/source configuration info + read(paramtop, "NamedObject", named_obj); + + // Possible alternate XML file pattern + if (paramtop.count("xml_file") != 0) + { + read(paramtop, "xml_file", xml_file); + } + } + catch(const std::string& e) + { + QDPIO::cerr << __func__ << ": Caught Exception reading XML: " << e << std::endl; + QDP_abort(1); + } + } + + + //---------------------------------------------------------------------------- + //---------------------------------------------------------------------------- + // Function call + void + InlineMeas::operator()(unsigned 
long update_no, + XMLWriter& xml_out) + { + // If xml file not empty, then use alternate + if (params.xml_file != "") + { + std::string xml_file = makeXMLFileName(params.xml_file, update_no); + + push(xml_out, "InverterTest"); + write(xml_out, "update_no", update_no); + write(xml_out, "xml_file", xml_file); + pop(xml_out); + + XMLFileWriter xml(xml_file); + func(update_no, xml); + } + else + { + func(update_no, xml_out); + } + } + + + // Real work done here + void + InlineMeas::func(unsigned long update_no, + XMLWriter& xml_out) + { + START_CODE(); + + StopWatch snoop; + snoop.reset(); + snoop.start(); + + // Test and grab a reference to the gauge field + multi1d u; + XMLBufferWriter gauge_xml; + try + { + u = TheNamedObjMap::Instance().getData< multi1d >(params.named_obj.gauge_id); + TheNamedObjMap::Instance().get(params.named_obj.gauge_id).getRecordXML(gauge_xml); + } + catch( std::bad_cast ) + { + QDPIO::cerr << name << ": caught dynamic cast error" << std::endl; + QDP_abort(1); + } + catch (const std::string& e) + { + QDPIO::cerr << name << ": std::map call failed: " << e << std::endl; + QDP_abort(1); + } + + push(xml_out, "InverterTest"); + write(xml_out, "update_no", update_no); + + QDPIO::cout << name << ": propagator calculation" << std::endl; + + proginfo(xml_out); // Print out basic program info + + // Write out the input + write(xml_out, "Input", params); + + // Write out the config header + write(xml_out, "Config_info", gauge_xml); + + push(xml_out, "Output_version"); + write(xml_out, "out_version", 1); + pop(xml_out); + + // Calculate some gauge invariant observables just for info. 
+ MesPlq(xml_out, "Observables", u); + + // Will use TimeSliceSet-s a lot + const int decay_dir = params.param.contract.decay_dir; + const int Lt = Layout::lattSize()[decay_dir]; + + // A sanity check + if (decay_dir != Nd-1) + { + QDPIO::cerr << name << ": TimeSliceIO only supports decay_dir= " << Nd-1 << "\n"; + QDP_abort(1); + } + + // + // Try the factories + // + try + { + StopWatch swatch; + swatch.reset(); + QDPIO::cout << "Try the various factories" << std::endl; + + // Typedefs to save typing + typedef LatticeFermion T; + typedef multi1d P; + typedef multi1d Q; + + // + // Initialize fermion action + // + std::istringstream xml_s(params.param.prop.fermact.xml); + XMLReader fermacttop(xml_s); + QDPIO::cout << "FermAct = " << params.param.prop.fermact.id << std::endl; + + // Generic Wilson-Type stuff + Handle< FermionAction > + S_f(TheFermionActionFactory::Instance().createObject(params.param.prop.fermact.id, + fermacttop, + params.param.prop.fermact.path)); + + Handle< FermState > state(S_f->createState(u)); + + Handle< SystemSolver > PP = S_f->qprop(state, + params.param.prop.invParam); + + QDPIO::cout << "Suitable factory found: compute all the quark props" << std::endl; + swatch.start(); + + const int num_vecs = params.param.contract.num_vecs; + for (int rhsi = 0; rhsi < params.param.contract.max_rhs.size(); ++rhsi) + { + const int max_rhs = params.param.contract.max_rhs[rhsi]; + + const int t_source = 0; + const int spin_source = 0; + + for (int colorvec_src0 = 0, colorvec_src_step = std::min(max_rhs, num_vecs); + colorvec_src0 < num_vecs; colorvec_src0 += colorvec_src_step, + colorvec_src_step = std::min(colorvec_src_step, num_vecs - colorvec_src0)) + { + std::vector> chis(colorvec_src_step), + quark_solns(colorvec_src_step); + for (int col = 0; col < colorvec_src_step; col++) + chis[col].reset(new LatticeFermion); + for (int col = 0; col < colorvec_src_step; col++) + quark_solns[col].reset(new LatticeFermion); + + for (int colorvec_src = 
colorvec_src0, col = 0; col < colorvec_src_step; + ++colorvec_src, ++col) + { + // Get the source std::vector + LatticeColorVector vec_srce; + random(vec_srce); + + // Insert a ColorVector into spin index spin_source + // This only overwrites sections, so need to initialize first + *chis[col] = zero; + CvToFerm(vec_srce, *chis[col], spin_source); + + *quark_solns[col] = zero; + } + + StopWatch snarss1; + snarss1.reset(); + snarss1.start(); + + // Solve for the solution std::vector + std::vector res = PP->operator()( + quark_solns, + std::vector>(chis.begin(), chis.end())); + + for (int col = 0; col < colorvec_src_step; col++) + { + QDPIO::cout << name << ": residual norm " << toDouble(res[col].resid) << std::endl; + } + + snarss1.stop(); + QDPIO::cout << "Time to compute " << colorvec_src_step + << " inversions; time = " << snarss1.getTimeInSeconds() << " secs" + << std::endl; + + } // colorvec_src0 + } // rhsi + + swatch.stop(); + QDPIO::cout << "Propagators computed: time= " + << swatch.getTimeInSeconds() + << " secs" << std::endl; + } + catch (const std::string& e) + { + QDPIO::cout << name << ": caught exception around qprop: " << e << std::endl; + QDP_abort(1); + } + + pop(xml_out); // prop_dist + + snoop.stop(); + QDPIO::cout << name << ": total time = " + << snoop.getTimeInSeconds() + << " secs" << std::endl; + + QDPIO::cout << name << ": ran successfully" << std::endl; + + END_CODE(); + } + } +} // namespace Chroma diff --git a/lib/meas/inline/hadron/inline_inverter_test_w.h b/lib/meas/inline/hadron/inline_inverter_test_w.h new file mode 100644 index 0000000000..adbc7d9b3b --- /dev/null +++ b/lib/meas/inline/hadron/inline_inverter_test_w.h @@ -0,0 +1,81 @@ +// -*- C++ -*- +/*! 
\file + * \brief Compute the propagator from distillation + * + * Propagator calculation in distillation + */ + +#ifndef __inline_inverter_test_w_h__ +#define __inline_inverter_test_w_h__ + +#include "chromabase.h" +#include "meas/inline/abs_inline_measurement.h" +#include "io/qprop_io.h" +#include "io/xml_group_reader.h" + +namespace Chroma +{ + /*! \ingroup inlinehadron */ + namespace InlineInverterTestEnv + { + bool registerAll(); + + //! Parameter structure + /*! \ingroup inlinehadron */ + struct Params + { + Params(); + Params(XMLReader& xml_in, const std::string& path); + + unsigned long frequency; + + struct Param_t + { + struct Contract_t + { + int num_vecs; /*!< Number of color vectors to use */ + int decay_dir; /*!< Decay direction */ + multi1d max_rhs; /*! maximum number of linear systems solved simultaneously */ + }; + + ChromaProp_t prop; + Contract_t contract; + }; + + struct NamedObject_t + { + std::string gauge_id; /*!< Gauge field */ + }; + + Param_t param; + NamedObject_t named_obj; + std::string xml_file; /*!< Alternate XML file pattern */ + }; + + + //! Inline task for the propagator from distillation + /*! \ingroup inlinehadron */ + class InlineMeas : public AbsInlineMeasurement + { + public: + ~InlineMeas() {} + InlineMeas(const Params& p) : params(p) {} + InlineMeas(const InlineMeas& p) : params(p.params) {} + + unsigned long getFrequency(void) const {return params.frequency;} + + //! Do the measurement + void operator()(const unsigned long update_no, + XMLWriter& xml_out); + + protected: + //! 
Do the measurement + void func(const unsigned long update_no, + XMLWriter& xml_out); + + private: + Params params; + }; + } +} +#endif diff --git a/lib/meas/inline/hadron/inline_matelem_distillation_w.cc b/lib/meas/inline/hadron/inline_matelem_distillation_w.cc index caee0dbb88..129bb1b397 100644 --- a/lib/meas/inline/hadron/inline_matelem_distillation_w.cc +++ b/lib/meas/inline/hadron/inline_matelem_distillation_w.cc @@ -36,6 +36,8 @@ #include "chroma_config.h" +#if ! defined (QDP_IS_QDPJIT2) + #ifndef QDP_IS_QDPJIT_NO_NVPTX #ifdef BUILD_JIT_CONTRACTION_KERNELS @@ -895,3 +897,4 @@ namespace Chroma } // namespace Chroma #endif +#endif diff --git a/lib/meas/inline/hadron/inline_meson_matelem_colorvec_superb_w.cc b/lib/meas/inline/hadron/inline_meson_matelem_colorvec_superb_w.cc index 7a9bb290c6..68ba854941 100644 --- a/lib/meas/inline/hadron/inline_meson_matelem_colorvec_superb_w.cc +++ b/lib/meas/inline/hadron/inline_meson_matelem_colorvec_superb_w.cc @@ -44,7 +44,7 @@ namespace Chroma if (paramtop.count("use_derivP") > 0) read(paramtop, "use_derivP", param.use_derivP); - param.mom_list.resize(0); + param.mom_list.resize(0); if (paramtop.count("mom_list") > 0) read(paramtop, "mom_list", param.mom_list); @@ -54,7 +54,7 @@ namespace Chroma param.mom2_max = 0; if (paramtop.count("mom2_max") > 0) - read(paramtop, "mom2_max", param.mom2_max); + read(paramtop, "mom2_max", param.mom2_max); read(paramtop, "displacement_list", param.displacement_list); read(paramtop, "num_vecs", param.num_vecs); diff --git a/lib/meas/inline/hadron/inline_meson_matelem_colorvec_w.cc b/lib/meas/inline/hadron/inline_meson_matelem_colorvec_w.cc index 86a9ec47b2..7ff1f045df 100644 --- a/lib/meas/inline/hadron/inline_meson_matelem_colorvec_w.cc +++ b/lib/meas/inline/hadron/inline_meson_matelem_colorvec_w.cc @@ -481,8 +481,12 @@ namespace Chroma { moms[i] = params.param.mom_list[i]; } +#if ! 
defined (QDP_IS_QDPJIT2) SftMom temp_phases(moms, params.param.decay_dir); phases = temp_phases; +#else + phases.reset(moms, params.param.decay_dir); +#endif params.param.mom2_min = 0; } diff --git a/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_superb_w.cc b/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_superb_w.cc index 8e62cfa7f5..98afe4d36b 100644 --- a/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_superb_w.cc +++ b/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_superb_w.cc @@ -24,6 +24,7 @@ #include "util/ferm/diractodr.h" #include "util/ferm/twoquark_contract_ops.h" #include "util/ferm/superb_contractions.h" +#include "util/ferm/mgproton.h" #include "util/ft/sftmom.h" #include "util/ft/time_slice_set.h" #include "util/info/proginfo.h" @@ -95,6 +96,15 @@ namespace Chroma read(inputtop, "use_device_for_contractions", input.use_device_for_contractions); } + input.use_superb_format = false; + if( inputtop.count("use_superb_format") == 1 ) { + read(inputtop, "use_superb_format", input.use_superb_format); + } + + input.output_file_is_local = false; + if( inputtop.count("output_file_is_local") == 1 ) { + read(inputtop, "output_file_is_local", input.output_file_is_local); + } } //! 
Propagator output @@ -110,6 +120,8 @@ namespace Chroma write(xml, "mass_label", input.mass_label); write(xml, "max_rhs", input.max_rhs); write(xml, "phase", input.phase); + write(xml, "use_superb_format", input.use_superb_format); + write(xml, "output_file_is_local", input.output_file_is_local); write(xml, "use_device_for_contractions", input.use_device_for_contractions); pop(xml); @@ -320,31 +332,71 @@ namespace Chroma // // DB storage // - BinaryStoreDB< SerialDBKey, SerialDBData > qdp_db; + std::vector, + SerialDBData>> + qdp_db{}; + SB::StorageTensor<6, SB::ComplexD> st; // Open the file, and write the meta-data and the binary for this operator - if (!qdp_db.fileExists(params.named_obj.prop_op_file)) + if (!params.param.contract.use_superb_format) { - XMLBufferWriter file_xml; - push(file_xml, "DBMetaData"); - write(file_xml, "id", std::string("propElemOp")); - write(file_xml, "lattSize", QDP::Layout::lattSize()); - write(file_xml, "decay_dir", params.param.contract.decay_dir); - proginfo(file_xml); // Print out basic program info - write(file_xml, "Params", params.param); - write(file_xml, "Config_info", gauge_xml); - pop(file_xml); - - std::string file_str(file_xml.str()); - qdp_db.setMaxUserInfoLen(file_str.size()); - - qdp_db.open(params.named_obj.prop_op_file, O_RDWR | O_CREAT, 0664); - - qdp_db.insertUserdata(file_str); + qdp_db.resize(1); + if (!qdp_db[0].fileExists(params.named_obj.prop_op_file)) + { + XMLBufferWriter file_xml; + push(file_xml, "DBMetaData"); + write(file_xml, "id", std::string("propElemOp")); + write(file_xml, "lattSize", QDP::Layout::lattSize()); + write(file_xml, "decay_dir", params.param.contract.decay_dir); + proginfo(file_xml); // Print out basic program info + write(file_xml, "Params", params.param); + write(file_xml, "Config_info", gauge_xml); + pop(file_xml); + + std::string file_str(file_xml.str()); + qdp_db[0].setMaxUserInfoLen(file_str.size()); + + qdp_db[0].open(params.named_obj.prop_op_file, O_RDWR | O_CREAT, 0664); + + 
qdp_db[0].insertUserdata(file_str); + } + else + { + qdp_db[0].open(params.named_obj.prop_op_file, O_RDWR, 0664); + } } else { - qdp_db.open(params.named_obj.prop_op_file, O_RDWR, 0664); + // Read order; letter meaning: + // n/N: source/sink eigenvector index + // s/q: source/sink spin index + // p/P: time slice source/sink + const char* order = "sqnNpP"; + XMLBufferWriter metadata_xml; + push(metadata_xml, "DBMetaData"); + write(metadata_xml, "id", std::string("propElemOp")); + write(metadata_xml, "lattSize", QDP::Layout::lattSize()); + write(metadata_xml, "decay_dir", params.param.contract.decay_dir); + proginfo(metadata_xml); // Print out basic program info + write(metadata_xml, "Config_info", gauge_xml); + write(metadata_xml, "Params", params.param); + write(metadata_xml, "mass_label", params.param.contract.mass_label); + write(metadata_xml, "tensorOrder", order); + pop(metadata_xml); + + // NOTE: metadata_xml only has a valid value on Master node; so do a broadcast + std::string metadata = SB::broadcast(metadata_xml.str()); + + st = SB::StorageTensor<6, SB::ComplexD>( + params.named_obj.prop_op_file, metadata, order, + SB::kvcoors<6>(order, {{'s', Ns}, + {'q', Ns}, + {'n', params.param.contract.num_vecs}, + {'N', params.param.contract.num_vecs}, + {'p', Lt}, + {'P', Lt}}), + SB::Sparse, SB::checksum_type::BlockChecksum, + params.param.contract.output_file_is_local ? 
SB::LocalFSFile : SB::SharedFSFile); } QDPIO::cout << "Finished opening peram file" << std::endl; @@ -374,30 +426,11 @@ namespace Chroma swatch.reset(); QDPIO::cout << "Try the various factories" << std::endl; - // Typedefs to save typing - typedef LatticeFermion T; - typedef multi1d P; - typedef multi1d Q; - // - // Initialize fermion action + // Initialize fermion action and create the solver // - std::istringstream xml_s(params.param.prop.fermact.xml); - XMLReader fermacttop(xml_s); - QDPIO::cout << "FermAct = " << params.param.prop.fermact.id << std::endl; - - // Generic Wilson-Type stuff - Handle< FermionAction > - S_f(TheFermionActionFactory::Instance().createObject(params.param.prop.fermact.id, - fermacttop, - params.param.prop.fermact.path)); + SB::ChimeraSolver PP{params.param.prop.fermact, params.param.prop.invParam, u}; - Handle< FermState > state(S_f->createState(u)); - - Handle< SystemSolver > PP = S_f->qprop(state, - params.param.prop.invParam); - - QDPIO::cout << "Suitable factory found: compute all the quark props" << std::endl; swatch.start(); // @@ -419,7 +452,7 @@ namespace Chroma QDPIO::cout << "t_source = " << t_source << std::endl; // Compute the first tslice and the number of tslices involved in the contraction - int first_tslice = t_source - params.param.contract.Nt_backward; + int first_tslice = SB::normalize_coor(t_source - params.param.contract.Nt_backward, Lt); int num_tslices = std::min( params.param.contract.Nt_backward + std::max(1, params.param.contract.Nt_forward), Lt); @@ -428,16 +461,16 @@ namespace Chroma colorvecsSto, u, decay_dir, first_tslice, num_tslices, num_vecs, "cxyzXnt", phase, dev); // Get all eigenvectors for `t_source` - auto source_colorvec = - colorvec.kvslice_from_size({{'t', t_source - first_tslice}}, {{'t', 1}}); + auto source_colorvec = colorvec.kvslice_from_size( + {{'t', SB::normalize_coor(t_source - first_tslice, Lt)}}, {{'t', 1}}); for (int spin_source = 0; spin_source < Ns; ++spin_source) { // Invert the 
source for `spin_source` spin and retrieve `num_tslices` tslices starting from tslice `first_tslice` // NOTE: s is spin source, and S is spin sink - SB::Tensor quark_solns = SB::doInversion( - *PP, source_colorvec, t_source, first_tslice, num_tslices, {spin_source}, max_rhs, - "cxyzXnSst"); + SB::Tensor quark_solns = + SB::doInversion(PP, source_colorvec, t_source, first_tslice, num_tslices, + {spin_source}, max_rhs, "cxyzXnSst"); StopWatch snarss1; snarss1.reset(); @@ -445,8 +478,9 @@ namespace Chroma // Contract the distillation elements // NOTE: N: is colorvec in sink, and n is colorvec in source - SB::Tensor<5, SB::Complex> elems("NnSst", {num_vecs, num_vecs, Ns, 1, num_tslices}, - SB::OnHost, SB::OnMaster); + SB::Tensor<5, SB::Complex> elems( + "NnSst", {num_vecs, num_vecs, Ns, 1, num_tslices}, SB::OnHost, + !params.param.contract.use_superb_format ? SB::OnMaster : SB::OnEveryone); elems.contract(colorvec, {{'n', 'N'}}, SB::Conjugate, quark_solns, {}, SB::NotConjugate); @@ -457,47 +491,56 @@ namespace Chroma snarss1.reset(); snarss1.start(); - ValPropElementalOperator_t val; - val.mat.resize(num_vecs, num_vecs); - val.mat = zero; - for (int i_tslice = 0; i_tslice < num_tslices; ++i_tslice) + if (!params.param.contract.use_superb_format) { - for (int spin_sink = 0; spin_sink < Ns; ++spin_sink) + ValPropElementalOperator_t val; + val.mat.resize(num_vecs, num_vecs); + val.mat = zero; + auto local_elems = elems.getLocal(); + for (int i_tslice = 0; i_tslice < num_tslices; ++i_tslice) { - KeyPropElementalOperator_t key; - key.t_source = t_source; - key.t_slice = SB::normalize_coor(i_tslice + first_tslice, Lt); - key.spin_src = spin_source; - key.spin_snk = spin_sink; - key.mass_label = params.param.contract.mass_label; - if (Layout::nodeNumber() == 0) + for (int spin_sink = 0; spin_sink < Ns; ++spin_sink) { - for (int colorvec_sink = 0; colorvec_sink < num_vecs; ++colorvec_sink) + KeyPropElementalOperator_t key; + key.t_source = t_source; + key.t_slice = 
SB::normalize_coor(i_tslice + first_tslice, Lt); + key.spin_src = spin_source; + key.spin_snk = spin_sink; + key.mass_label = params.param.contract.mass_label; + if (local_elems) { - for (int colorvec_source = 0; colorvec_source < num_vecs; ++colorvec_source) + for (int colorvec_sink = 0; colorvec_sink < num_vecs; ++colorvec_sink) { - std::complex e = elems.get( - {colorvec_sink, colorvec_source, spin_sink, 0, i_tslice}); - val.mat(colorvec_sink, colorvec_source).elem().elem().elem() = - RComplex(e.real(), e.imag()); + for (int colorvec_source = 0; colorvec_source < num_vecs; ++colorvec_source) + { + std::complex e = + local_elems.get({colorvec_sink, colorvec_source, spin_sink, 0, i_tslice}); + val.mat(colorvec_sink, colorvec_source).elem().elem().elem() = + RComplex(e.real(), e.imag()); + } } } + qdp_db[0].insert(key, val); } - qdp_db.insert(key, val); } } + else + { + st.kvslice_from_size({{'s', spin_source}, {'p', t_source}, {'P', first_tslice}}, + {{'s', 1}, {'p', 1}, {'P', num_tslices}}) + .copyFrom(elems.rename_dims({{'S', 'q'}, {'t', 'P'}})); + } snarss1.stop(); QDPIO::cout << "Time to store the props : " << snarss1.getTimeInSeconds() << " secs" << std::endl; - } // for spin_source - } // for tt + } // for spin_source + } // for tt - swatch.stop(); - QDPIO::cout << "Propagators computed: time= " - << swatch.getTimeInSeconds() - << " secs" << std::endl; - } + swatch.stop(); + QDPIO::cout << "Propagators computed: time= " << swatch.getTimeInSeconds() << " secs" + << std::endl; + } catch (const std::exception& e) { QDP_error_exit("%s: caught exception: %s\n", name.c_str(), e.what()); diff --git a/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_superb_w.h b/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_superb_w.h index f0a1b532c2..af09ca6bb9 100644 --- a/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_superb_w.h +++ b/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_superb_w.h @@ -45,6 +45,8 @@ namespace Chroma 
int num_tries; /*!< In case of bad things happening in the solution vectors, do retries */ bool use_device_for_contractions; /*!< Whether use gpu for contractions if available */ int max_rhs; /*! maximum number of linear systems solved simultaneously */ + bool use_superb_format; /*!< Whether use the superb format for storing the elementals */ + bool output_file_is_local; /*!< Whether the output file is in a not shared filesystem */ multi1d phase; /*!< Phase to apply to colorvecs */ }; diff --git a/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_w.cc b/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_w.cc index fcd337e2b5..e146bde0a6 100644 --- a/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_w.cc +++ b/lib/meas/inline/hadron/inline_prop_and_matelem_distillation_w.cc @@ -36,6 +36,8 @@ #include "chroma_config.h" +#if ! defined (QDP_IS_QDPJIT2) + #ifndef QDP_IS_QDPJIT_NO_NVPTX #ifdef BUILD_JIT_CONTRACTION_KERNELS @@ -968,3 +970,4 @@ namespace Chroma } // namespace Chroma #endif +#endif diff --git a/lib/meas/inline/hadron/inline_stoch_group_baryon_w.cc b/lib/meas/inline/hadron/inline_stoch_group_baryon_w.cc index 4f4877a9bd..29ed9eacc3 100644 --- a/lib/meas/inline/hadron/inline_stoch_group_baryon_w.cc +++ b/lib/meas/inline/hadron/inline_stoch_group_baryon_w.cc @@ -37,10 +37,6 @@ namespace Chroma const int N_quarks = 3; - // - // The spin basis matrix to goto Dirac - // - SpinMatrix rotate_mat(adj(DiracToDRMat())); // Reader for input parameters void read(XMLReader& xml, const std::string& path, InlineStochGroupBaryonEnv::Params::Param_t& param) @@ -449,6 +445,10 @@ namespace Chroma SmearedDispObjects::smearSource(int qnum , const KeySmearedQuark_t & key) { + // + // The spin basis matrix to goto Dirac + // + SpinMatrix rotate_mat(adj(DiracToDRMat())); std::map & qmap = smeared_src_maps[qnum]; @@ -510,6 +510,10 @@ namespace Chroma SmearedDispObjects::smearSolution(int qnum , const KeySmearedQuark_t & key) { + // + // The spin basis 
matrix to goto Dirac + // + SpinMatrix rotate_mat(adj(DiracToDRMat())); std::map & qmap = smeared_soln_maps[qnum]; diff --git a/lib/meas/inline/hadron/inline_stoch_group_meson_w.cc b/lib/meas/inline/hadron/inline_stoch_group_meson_w.cc index 1c85b00cc5..28590a370d 100644 --- a/lib/meas/inline/hadron/inline_stoch_group_meson_w.cc +++ b/lib/meas/inline/hadron/inline_stoch_group_meson_w.cc @@ -37,11 +37,6 @@ namespace Chroma const int N_quarks = 2; - // - // The spin basis matrix to goto Dirac - // - SpinMatrix rotate_mat(adj(DiracToDRMat())); - // Reader for input parameters void read(XMLReader& xml, const std::string& path, InlineStochGroupMesonEnv::Params::Param_t& param) { @@ -449,6 +444,11 @@ namespace Chroma SmearedDispObjects::smearSource(int qnum, const KeySmearedQuark_t & key) { + // + // The spin basis matrix to goto Dirac + // + SpinMatrix rotate_mat(adj(DiracToDRMat())); + std::map & qmap = smeared_src_maps[qnum]; //If entry is not in std::map create it @@ -517,6 +517,11 @@ namespace Chroma SmearedDispObjects::smearSolution(int qnum , const KeySmearedQuark_t & key) { + // + // The spin basis matrix to goto Dirac + // + SpinMatrix rotate_mat(adj(DiracToDRMat())); + std::map & qmap = smeared_soln_maps[qnum]; //If entry is not in std::map create it diff --git a/lib/meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.cc b/lib/meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.cc index d9866e2a38..1e19714255 100644 --- a/lib/meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.cc +++ b/lib/meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.cc @@ -23,9 +23,8 @@ #include "util/ferm/key_val_db.h" #include "util/ferm/subset_vectors.h" #include "util/ferm/superb_contractions.h" +#include "util/ferm/mgproton.h" #include "util/ferm/transf.h" -#include "util/ft/sftmom.h" -#include "util/ft/time_slice_set.h" #include "util/info/proginfo.h" #include "meas/inline/io/named_objmap.h" @@ -159,6 +158,12 
@@ namespace Chroma read(inputtop, "decay_dir", input.decay_dir); read(inputtop, "mass_label", input.mass_label); + input.do_summation = false; + if (inputtop.count("do_summation") == 1) + { + read(inputtop, "do_summation", input.do_summation); + } + input.max_rhs = 8; if( inputtop.count("max_rhs") == 1 ) { read(inputtop, "max_rhs", input.max_rhs); @@ -189,9 +194,9 @@ namespace Chroma read(inputtop, "use_genprop5_format", input.use_genprop5_format); } - input.use_multiple_writers = false; - if( inputtop.count("use_multiple_writers") == 1 ) { - read(inputtop, "use_multiple_writers", input.use_multiple_writers); + input.output_file_is_local = false; + if( inputtop.count("output_file_is_local") == 1 ) { + read(inputtop, "output_file_is_local", input.output_file_is_local); } if (inputtop.count("phase") == 1) @@ -246,8 +251,8 @@ namespace Chroma write(xml, "max_moms_in_contraction", input.max_moms_in_contraction); write(xml, "use_genprop4_format", input.use_genprop4_format); write(xml, "use_genprop5_format", input.use_genprop5_format); + write(xml, "output_file_is_local", input.output_file_is_local); write(xml, "use_device_for_contractions", input.use_device_for_contractions); - write(xml, "use_multiple_writers", input.use_multiple_writers); write(xml, "quarkPhase", SB::tomulti1d(input.quarkPhase)); write(xml, "aQuarkPhase", SB::tomulti1d(input.aQuarkPhase)); @@ -776,7 +781,7 @@ namespace Chroma QDPIO::cout << "Parse momentum list" << std::endl; // Possible momenta, gammas, and displacements - multi2d moms; + SB::CoorMoms moms; std::vector gammas; std::vector> disps; @@ -817,24 +822,28 @@ namespace Chroma QDP_abort(1); } - int num_mom = moms_set.size(); - int mom_size = Nd - 1; - QDPIO::cout << name << ": num_mom= " << num_mom - << " mom_size= " << mom_size << std::endl; - moms.resize(num_mom, mom_size); - int i = 0; - for (const auto &it : moms_set) { - for (unsigned int j = 0; j < Nd - 1; ++j) - moms[i][j] = it[j]; - i++; - } - + moms = SB::CoorMoms(moms_set.begin(), 
moms_set.end()); disps.resize(disps_set.size()); std::copy(disps_set.begin(), disps_set.end(), disps.begin()); gammas.resize(gammas_set.size()); std::copy(gammas_set.begin(), gammas_set.end(), gammas.begin()); } + // Get the maximum steps in the time direction + int t_extra = 0; + for (const auto& disp : disps) + { + int this_t_extra = 0; + for (const auto& dir : disp) + { + if (std::abs(dir) == 4) + { + this_t_extra += (dir < 0 ? -1 : 1); + t_extra = std::max(t_extra, std::abs(this_t_extra)); + } + } + } + // // Parse the phase // @@ -854,11 +863,6 @@ namespace Chroma negSinkPhase[i] = -params.param.contract.aQuarkPhase[i]; } - // - // Initialize the slow Fourier transform phases - // - SftMom phases(moms, params.param.contract.decay_dir); - // // Capture maximum number of vecs // @@ -880,8 +884,17 @@ namespace Chroma Params::Param_t::SinkSource_t ss; ss.t_sink = snk % Lt; ss.t_source = it.first % Lt; - ss.Nt_backward = it.first - params.param.contract.alt_t_start; - ss.Nt_forward = params.param.contract.alt_Nt_forward - ss.Nt_backward; + if (!params.param.contract.do_summation) + { + ss.Nt_backward = it.first - params.param.contract.alt_t_start; + ss.Nt_forward = params.param.contract.alt_Nt_forward - ss.Nt_backward; + } + else + { + // Compute from src+1 up to snk-1 + ss.Nt_backward = -1; + ss.Nt_forward = std::max(SB::normalize_coor(ss.t_sink - ss.t_source, Lt) + 1 - 2, 0); + } params.param.sink_source_pairs.push_back(ss); } } @@ -893,23 +906,30 @@ namespace Chroma std::vector active_tslices_source(Lt), active_tslices_sink0(Lt); std::vector& active_tslices_sink = negSinkPhase == sourcePhase ? 
active_tslices_source : active_tslices_sink0; - for (const auto& it : params.param.sink_source_pairs) + for (auto& it : params.param.sink_source_pairs) { // Check t_source and t_sink if (it.t_source < 0 || it.t_sink < 0) throw std::runtime_error("Invalid source or sink on SinkSourcePairs"); - int num_tslices_active = it.Nt_backward + it.Nt_forward + 1; + if (params.param.contract.do_summation) + { + // Compute from src+1 up to snk-1 + it.Nt_backward = -1; + it.Nt_forward = std::max(SB::normalize_coor(it.t_sink - it.t_source, Lt) + 1 - 2, 0); + } + + int num_tslices_active = std::min(it.Nt_backward + it.Nt_forward + 1 + 2 * t_extra, Lt); // Make the number of time-slices even; required by SB::doMomGammaDisp_contractions num_tslices_active = std::min(num_tslices_active + num_tslices_active % 2, Lt); FromSize fs = active_tslices_source[it.t_source % Lt]; - SB::union_interval(fs.from, fs.size, it.t_source - it.Nt_backward, num_tslices_active, Lt, - fs.from, fs.size); + SB::union_interval(fs.from, fs.size, it.t_source - it.Nt_backward - t_extra, + num_tslices_active, Lt, fs.from, fs.size); active_tslices_source[it.t_source % Lt] = fs; fs = active_tslices_sink[it.t_sink % Lt]; - SB::union_interval(fs.from, fs.size, it.t_source - it.Nt_backward, num_tslices_active, Lt, - fs.from, fs.size); + SB::union_interval(fs.from, fs.size, it.t_source - it.Nt_backward - t_extra, + num_tslices_active, Lt, fs.from, fs.size); active_tslices_sink[it.t_sink % Lt] = fs; } @@ -959,12 +979,6 @@ namespace Chroma } } - // Set how many processes are going to write elementals; each process is going to write in a - // independent file - bool use_multiple_writers = params.param.contract.use_multiple_writers; - if (params.param.contract.use_genprop5_format) - use_multiple_writers = true; - // // DB storage // @@ -979,56 +993,26 @@ namespace Chroma // Estimate the number of keys std::size_t max_tslices = 0; for (const auto& sink_source : params.param.sink_source_pairs) - max_tslices = - 
std::max(max_tslices, (std::size_t)sink_source.Nt_backward + sink_source.Nt_forward + 1); - std::size_t num_keys_gp4 = phases.numMom() * gammas.size() * disps.size() * max_tslices * + max_tslices = std::max(max_tslices, + (std::size_t)(sink_source.Nt_backward + sink_source.Nt_forward + 1)); + std::size_t num_keys_gp4 = moms.size() * gammas.size() * disps.size() * max_tslices * params.param.sink_source_pairs.size(); - for (auto& db : qdp_db) - db.setNumberBuckets(num_keys_gp4 * num_vecs * 2); - for (auto& db : qdp4_db) - db.setNumberBuckets(num_keys_gp4 * 2); - - // The final elementals are going to be distributed along the lattice `t` - // dimension, with no support on the lattice spatial dimension. Because - // of this, not all processes are going to have support on the final - // elementals. The processes that have are going to write them on disk bool db_is_open = false; //< whether qdp_db/qdp4_db has been opened - int this_proc_id_t = -1; //< the process id on the tensor holding the elementals - // This function open the output file, after changing the name with the process id if multiple writers is used - // \param proc_id_t: process rank on the tensor - // \param numprocs_t: number of processes with support on the tensor - std::function open_db = [&](int proc_id_t, int numprocs_t) { - if (params.param.contract.use_genprop5_format) - return; - - // If this process has not support on the tensor, do nothing - if (proc_id_t < 0) - return; + // This function open the output file when using filehash + auto open_db = [&]() { if (db_is_open) - { - assert(proc_id_t == this_proc_id_t); - assert((!params.param.contract.use_genprop4_format && qdp_db.size() == 1) || - (params.param.contract.use_genprop4_format && qdp4_db.size() == 1)); return; - } - this_proc_id_t = proc_id_t; db_is_open = true; - // If the final elementals are going to be spread among several processes, append the index - // of the current process on the `t` dimension to the filename - if 
(!params.param.contract.use_genprop4_format) - qdp_db.resize(1); - else - qdp4_db.resize(1); std::string filename = params.named_obj.dist_op_file; - if (use_multiple_writers) - filename += "." + std::to_string(proc_id_t + 1) + "_outof_" + std::to_string(numprocs_t); // Open the file, and write the meta-data and the binary for this operator if (!params.param.contract.use_genprop4_format) { + qdp_db.resize(1); + qdp_db[0].setNumberBuckets(num_keys_gp4 * num_vecs * 2); if (!qdp_db[0].fileExists(filename)) { XMLBufferWriter file_xml; @@ -1055,6 +1039,8 @@ namespace Chroma } else { + qdp4_db.resize(1); + qdp4_db[0].setNumberBuckets(num_keys_gp4 * 2); if (!qdp4_db[0].fileExists(filename)) { XMLBufferWriter file_xml; @@ -1094,10 +1080,10 @@ namespace Chroma write(metadata_xml, "Config_info", gauge_xml); write(metadata_xml, "tensorOrder", qdp5_order); write(metadata_xml, "displacements", disps); - std::vector> moms; - for (int i = 0; i < phases.numMom(); ++i) - moms.push_back(phases.numToMom(i)); - write(metadata_xml, "moms", moms); + std::vector> moms0; + for (int i = 0; i < moms.size(); ++i) + moms0.push_back(SB::tomulti1d(moms[i])); + write(metadata_xml, "moms", moms0); write(metadata_xml, "mass_label", params.param.contract.mass_label); write(metadata_xml, "gammas", gammas); write(metadata_xml, "quarkPhase", SB::tomulti1d(params.param.contract.quarkPhase)); @@ -1107,53 +1093,35 @@ namespace Chroma // NOTE: metadata_xml only has a valid value on Master node; so do a broadcast std::string metadata = SB::broadcast(metadata_xml.str()); - qdp5_db = - SB::StorageTensor<10, SB::ComplexD>(params.named_obj.dist_op_file, metadata, qdp5_order, - SB::kvcoors<10>(qdp5_order, {{'n', num_vecs}, - {'N', num_vecs}, - {'s', Ns}, - {'q', Ns}, - {'g', gammas.size()}, - {'d', disps.size()}, - {'m', moms.size()}, - {'t', Lt}, - {'p', Lt}, - {'P', Lt}}), - SB::Sparse, SB::checksum_type::BlockChecksum); - qdp5_db.preallocate(num_keys_gp4 * num_vecs * num_vecs * gammas.size() * 
sizeof(SB::ComplexD)); + qdp5_db = SB::StorageTensor<10, SB::ComplexD>( + params.named_obj.dist_op_file, metadata, qdp5_order, + SB::kvcoors<10>(qdp5_order, {{'n', num_vecs}, + {'N', num_vecs}, + {'s', Ns}, + {'q', Ns}, + {'g', gammas.size()}, + {'d', disps.size()}, + {'m', moms.size()}, + {'t', Lt}, + {'p', Lt}, + {'P', Lt}}), + SB::Sparse, SB::checksum_type::BlockChecksum, + params.param.contract.output_file_is_local ? SB::LocalFSFile : SB::SharedFSFile); + qdp5_db.preallocate(num_keys_gp4 * num_vecs * num_vecs * gammas.size() * + sizeof(SB::ComplexD) / + (params.param.contract.output_file_is_local ? Layout::numNodes() : 1)); } - // - // Try the factories - // + // Initialize fermion action + // NOTE: this gets out the following try-block because QUDA and MGPROTO solvers may + // hang when an exception is thrown, preventing the report of the exception message + SB::ChimeraSolver PP{params.param.prop.fermact, params.param.prop.invParam, u}; + + // NOTE: qdp5_db needs MPI synchronization when closing, so capture exception and abort in that case + // to avoid hangs try { StopWatch swatch; - swatch.reset(); - - // Typedefs to save typing - typedef LatticeFermion T; - typedef multi1d P; - typedef multi1d Q; - - // - // Initialize fermion action - // - std::istringstream xml_s(params.param.prop.fermact.xml); - XMLReader fermacttop(xml_s); - QDPIO::cout << "FermAct = " << params.param.prop.fermact.id << std::endl; - - // Generic Wilson-Type stuff - Handle< FermionAction > - S_f(TheFermionActionFactory::Instance().createObject(params.param.prop.fermact.id, - fermacttop, - params.param.prop.fermact.path)); - - Handle< FermState > state(S_f->createState(u)); - - Handle< SystemSolver > PP = S_f->qprop(state, - params.param.prop.invParam); - // // Loop over the source color and spin, creating the source @@ -1169,15 +1137,24 @@ namespace Chroma // Maximum number of tslices contracted at once (it has to be even) int max_tslices_in_contraction = 
params.param.contract.max_tslices_in_contraction; - if (max_tslices_in_contraction <= 0) + if (!params.param.contract.do_summation) + { + if (max_tslices_in_contraction <= 0) + max_tslices_in_contraction = Lt; + max_tslices_in_contraction = + max_tslices_in_contraction + (max_tslices_in_contraction % 2); + max_tslices_in_contraction = std::min(Lt, max_tslices_in_contraction); + } + else + { + // When doing summation, compute all middle time slices at once max_tslices_in_contraction = Lt; - max_tslices_in_contraction = max_tslices_in_contraction + (max_tslices_in_contraction % 2); - max_tslices_in_contraction = std::min(Lt, max_tslices_in_contraction); + } // Maximum number of momenta contracted at once int max_moms_in_contraction = params.param.contract.max_moms_in_contraction; if (max_moms_in_contraction <= 0) - max_moms_in_contraction = phases.numMom(); + max_moms_in_contraction = moms.size(); // Set place for doing the contractions SB::DeviceHost dev = @@ -1192,9 +1169,15 @@ namespace Chroma swatch.reset(); swatch.start(); - int first_tslice_active = t_source - sink_source.Nt_backward; - int num_tslices_active = - std::min(sink_source.Nt_backward + std::max(sink_source.Nt_forward, 1), Lt); + int first_tslice_active; // first middle time-slice to compute + int num_tslices_active; // number of middle time-slices to compute + first_tslice_active = + SB::normalize_coor(t_source - sink_source.Nt_backward - t_extra, Lt); + num_tslices_active = + std::min(sink_source.Nt_backward + + (sink_source.Nt_forward == 0 ? 
1 : sink_source.Nt_forward) + 2 * t_extra, + Lt); + // Make the number of time-slices even; required by SB::doMomGammaDisp_contractions num_tslices_active = std::min(num_tslices_active + num_tslices_active % 2, Lt); @@ -1213,10 +1196,9 @@ namespace Chroma // Invert the source for all spins and retrieve num_tslices_active // time-slices starting from time-slice first_tslice_active - invCacheSource[t_source] = SB::doInversion( - *PP, std::move(source_colorvec), t_source, - active_tslices_source[t_source].from, active_tslices_source[t_source].size, - {0, 1, 2, 3}, max_rhs, "cxyzXnSst"); + invCacheSource[t_source] = SB::doInversion( + PP, std::move(source_colorvec), t_source, active_tslices_source[t_source].from, + active_tslices_source[t_source].size, {0, 1, 2, 3}, max_rhs, "cxyzXnSst"); } if (!invCacheSink[t_sink]) @@ -1234,8 +1216,8 @@ namespace Chroma // Invert the sink for all spins and retrieve num_tslices_active time-slices starting from // time-slice first_tslice_active - invCacheSink[t_sink] = SB::doInversion( - *PP, std::move(sink_colorvec), t_sink, active_tslices_sink[t_sink].from, + invCacheSink[t_sink] = SB::doInversion( + PP, std::move(sink_colorvec), t_sink, active_tslices_sink[t_sink].from, active_tslices_sink[t_sink].size, {0, 1, 2, 3}, max_rhs, "ScnsxyzXt"); } @@ -1246,6 +1228,7 @@ namespace Chroma SB::Tensor invSink = invCacheSink[t_sink].kvslice_from_size( {{'t', first_tslice_active - active_tslices_sink[t_sink].from}}, {{'t', num_tslices_active}}); + invSink = invSink.rename_dims({{'n', 'N'}, {'s', 'q'}, {'S', 'Q'}}); // Remove from cache the source/sink inversions if the user suggests it or they are not going to be used anymore edges_on_tslice_source[t_source]--; @@ -1255,82 +1238,66 @@ namespace Chroma if (edges_on_tslice_sink[t_sink] == 0 || !cache_tslice[t_sink]) invCacheSink[t_sink].release(); - // Contract the spatial components of sink and source together with - // several momenta, gammas and displacements; but contract not more than - // 
max_tslices_in_contraction at once! + double time_in_writing = 0; // time in writing in genprops - invSink = invSink.rename_dims({{'n', 'N'}, {'s', 'q'}, {'S', 'Q'}}); - for (int tfrom = 0, tsize = std::min(max_tslices_in_contraction, num_tslices_active); - tfrom < num_tslices_active; tfrom += tsize, - tsize = std::min(max_tslices_in_contraction, num_tslices_active - tfrom)) - { - for (int mfrom = 0, msize = std::min(max_moms_in_contraction, phases.numMom()); - mfrom < phases.numMom(); - mfrom += msize, msize = std::min(max_moms_in_contraction, phases.numMom() - mfrom)) - { + auto call = + [&](SB::Tensor<7, SB::Complex> r, int disp_index, int tfrom, int mfrom) { - StopWatch snarss1; - snarss1.reset(); - snarss1.start(); + // Premultiply by g5, again; see above comment about this + r = SB::contract<7>(r, SB::Gamma(g5, dev).rename_dims({{'j', 'q'}}), "q") + .rename_dims({{'i', 'q'}}); - const char order_out[] = "qgmNndst"; - SB::Tensor this_invSource = - invSource.kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}); - SB::Tensor this_invSink = - invSink.kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}); - if (tfrom + tsize >= num_tslices_active && mfrom + msize >= phases.numMom()) + // + // Do summation over all time slices between t_source+1 and t_sink-1 + // + int tsize = r.kvdim().at('t'); + int msize = r.kvdim().at('m'); + if (params.param.contract.do_summation) { - invSource.release(); - invSink.release(); + auto s = r.like_this(SB::none, {{'t', 1}}); + s.set_zero(); + bool something_was_sum_up = false; + for (int t = 0; t < tsize; ++t) + { + if (SB::normalize_coor(tfrom + t - (t_source + 1), Lt) < + SB::normalize_coor(t_sink - 1 - (t_source + 1), Lt)) + { + r.kvslice_from_size({{'t', t}}, {{'t', 1}}).addTo(s); + something_was_sum_up = true; + } + } + if (!something_was_sum_up) + return; + r = s; } - std::pair, std::vector> r = - SB::doMomGammaDisp_contractions<8>( - u, std::move(this_invSink), std::move(this_invSource), - first_tslice_active + tfrom, phases, 
mfrom, msize, gamma_mats, disps, - params.param.contract.use_derivP, order_out, SB::none, dev); - - // Premultiply by g5, again; see above commit about this - SB::Tensor<8, SB::Complex> g5_con = r.first.like_this( - "qgmNndst", {}, SB::OnHost, use_multiple_writers ? SB::OnEveryone : SB::OnMaster); - g5_con.contract(SB::Gamma(g5, SB::OnDefaultDevice), {}, SB::NotConjugate, - std::move(r.first), {{'q', 'j'}}, SB::NotConjugate, {{'q', 'i'}}); - const std::vector disps_perm = r.second; - - snarss1.stop(); - QDPIO::cout << "Time to compute contractions for " << tsize - << " tslices from t= " << (first_tslice_active + tfrom) % Lt << " and " - << msize << " momenta from momentum " << mfrom << " : " - << snarss1.getTimeInSeconds() << " secs" << std::endl; // // Write the elementals // - snarss1.reset(); - snarss1.start(); + double time_writing_this = -SB::w_time(); if (params.param.contract.use_genprop5_format) { - auto g5_con_rearrange_d = g5_con.like_this(); - for (int d = 0; d < disps_perm.size(); ++d) - { - g5_con.kvslice_from_size({{'d', d}}, {{'d', 1}}) - .copyTo( - g5_con_rearrange_d.kvslice_from_size({{'d', disps_perm[d]}}, {{'d', 1}})); - } - qdp5_db - .kvslice_from_size({{'m', mfrom}, - {'t', (tfrom + first_tslice_active) % Lt}, - {'p', t_source}, - {'P', t_sink}}, - {{'p', 1}, {'P', 1}}) - .copyFrom(g5_con_rearrange_d); + .kvslice_from_size( + {{'m', mfrom}, + {'d', disp_index}, + {'t', !params.param.contract.do_summation ? 
tfrom : (t_source + 1) % Lt}, + {'p', t_source}, + {'P', t_sink}}, + {{'p', 1}, {'P', 1}, {'d', 1}}) + .copyFrom(r); } else { + // Move the result to the master node and do the writing (only the master node) + r = r.make_sure(SB::none, SB::OnHost, SB::OnMaster).getLocal(); + if (!r) + return; + // Open DB if they are not opened already - open_db(g5_con.p->procRank(), g5_con.p->numProcs()); + open_db(); // Store the tensor if (!params.param.contract.use_genprop4_format) @@ -1346,33 +1313,24 @@ namespace Chroma { for (int mom = 0; mom < msize; ++mom) { - for (int d = 0; d < disps_perm.size(); ++d) + for (int n = 0; n < num_vecs; ++n) { - for (int n = 0; n < num_vecs; ++n) - { - auto g5_con_t = - g5_con - .kvslice_from_size( - {{'g', g}, {'m', mom}, {'n', n}, {'d', d}, {'t', t}}, - {{'g', 1}, {'m', 1}, {'n', 1}, {'d', 1}, {'t', 1}}) - .getLocal(); - if (g5_con_t) - { - g5_con_t.copyTo(val.data()); - - key.key().derivP = params.param.contract.use_derivP; - key.key().t_sink = t_sink; - key.key().t_slice = (t + tfrom + first_tslice_active) % Lt; - key.key().t_source = t_source; - key.key().colorvec_src = n; - key.key().gamma = gammas[g]; - key.key().displacement = disps[disps_perm[d]]; - key.key().mom = phases.numToMom(mfrom + mom); - key.key().mass = params.param.contract.mass_label; - - qdp_db[use_multiple_writers ? mfrom + mom : 0].insert(key, val); - } - } + r.kvslice_from_size({{'g', g}, {'m', mom}, {'n', n}, {'t', t}}, + {{'g', 1}, {'m', 1}, {'n', 1}, {'t', 1}}) + .copyTo(val.data()); + + key.key().derivP = params.param.contract.use_derivP; + key.key().t_sink = t_sink; + key.key().t_slice = SB::normalize_coor( + !params.param.contract.do_summation ? 
t + tfrom : t_source + 1, Lt); + key.key().t_source = t_source; + key.key().colorvec_src = n; + key.key().gamma = gammas[g]; + key.key().displacement = disps[disp_index]; + key.key().mom = SB::tomulti1d(moms[mfrom + mom]); + key.key().mass = params.param.contract.mass_label; + + qdp_db[0].insert(key, val); } } } @@ -1391,39 +1349,45 @@ namespace Chroma { for (int mom = 0; mom < msize; ++mom) { - for (int d = 0; d < disps_perm.size(); ++d) - { - auto g5_con_t = - g5_con - .kvslice_from_size({{'g', g}, {'m', mom}, {'d', d}, {'t', t}}, - {{'g', 1}, {'m', 1}, {'d', 1}, {'t', 1}}) - .getLocal(); - - if (g5_con_t) - { - g5_con_t.copyTo(val.data()); - - key.key().t_sink = t_sink; - key.key().t_slice = (t + tfrom + first_tslice_active) % Lt; - key.key().t_source = t_source; - key.key().g = gammas[g]; - key.key().displacement = disps[disps_perm[d]]; - key.key().mom = phases.numToMom(mfrom + mom); - key.key().mass = params.param.contract.mass_label; - - qdp4_db[use_multiple_writers ? mfrom + mom : 0].insert(key, val); - } - } + r.kvslice_from_size({{'g', g}, {'m', mom}, {'t', t}}, + {{'g', 1}, {'m', 1}, {'t', 1}}) + .copyTo(val.data()); + + key.key().t_sink = t_sink; + key.key().t_slice = SB::normalize_coor( + !params.param.contract.do_summation ? t + tfrom : t_source + 1, Lt); + key.key().t_source = t_source; + key.key().g = gammas[g]; + key.key().displacement = disps[disp_index]; + key.key().mom = SB::tomulti1d(moms[mfrom + mom]); + key.key().mass = params.param.contract.mass_label; + + qdp4_db[0].insert(key, val); } } } } } - snarss1.stop(); - QDPIO::cout << "Time to store " << tsize - << " tslices : " << snarss1.getTimeInSeconds() << " secs" << std::endl; - } - } + + time_in_writing += SB::w_time() + time_writing_this; + }; + + // Contract the spatial components of sink and source together with + // several momenta, gammas and displacements; but contract not more than + // max_tslices_in_contraction at once! 
+ + double time_contracting_and_writing = -SB::w_time(); + SB::doMomGammaDisp_contractions<7, Nd + 5, Nd + 5, SB::Complex>( + u, std::move(invSink), std::move(invSource), first_tslice_active, t_extra, + num_tslices_active - 2 * t_extra, moms, gamma_mats, disps, + params.param.contract.use_derivP, call, "qgmNnst", max_tslices_in_contraction, + max_moms_in_contraction, dev); + time_contracting_and_writing += SB::w_time(); + + QDPIO::cout << "Time to contract: " << time_contracting_and_writing - time_in_writing + << " secs" << std::endl; + QDPIO::cout << "Time to store: " << time_in_writing << " secs" << std::endl; + swatch.stop(); QDPIO::cout << "SINK-SOURCE: time to compute all source solution vectors and insertions " "for t_sink= " diff --git a/lib/meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.h b/lib/meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.h index 707233d84e..61bc2bfde8 100644 --- a/lib/meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.h +++ b/lib/meas/inline/hadron/inline_unsmeared_hadron_node_distillation_superb_w.h @@ -74,7 +74,8 @@ namespace Chroma bool use_device_for_contractions; /*!< Whether use gpu for contractions if available */ bool use_genprop4_format; /*!< Use the efficient genprop4 format instead of the traditional one */ bool use_genprop5_format; /*!< Use the superb format instead of the traditional one */ - bool use_multiple_writers; /*!< Whether several processes are going to write down the elementals on separate files */ + bool output_file_is_local; /*!< Whether the output file is in a not shared filesystem */ + bool do_summation; /*!< Whether to add all time slices between source+1 and sink-1 */ std::vector quarkPhase; /*!< Phase to apply to the right colorvecs (quarks) */ std::vector aQuarkPhase; /*!< Phase to apply to the left colorvecs (antiquarks) */ }; diff --git a/lib/meas/inline/io/inline_milc_write_stag_source.cc 
b/lib/meas/inline/io/inline_milc_write_stag_source.cc index 5c31c1bcd2..f055922ae7 100644 --- a/lib/meas/inline/io/inline_milc_write_stag_source.cc +++ b/lib/meas/inline/io/inline_milc_write_stag_source.cc @@ -248,7 +248,9 @@ namespace Chroma // Write QDPIO::cout << "About to write single prec. source component: " << col << std::endl; +#if ! defined (QDP_IS_QDPJIT2) QDPIO::cout << "TypeSize is: " << sizeof(PColorVector,3>) << std::endl; +#endif write(qio_out,record_xml, fermOut, lower_left, upper_right); } diff --git a/lib/meas/inline/io/inline_qio_read_obj.cc b/lib/meas/inline/io/inline_qio_read_obj.cc index 275bb9e78e..76ad3e2a18 100644 --- a/lib/meas/inline/io/inline_qio_read_obj.cc +++ b/lib/meas/inline/io/inline_qio_read_obj.cc @@ -739,6 +739,8 @@ namespace Chroma //----------------------------------------------------------------------- //! Read a RitzPairs Type +#if ! defined (QDP_IS_QDPJIT2) + class QIOReadRitzPairsLatticeFermion : public QIOReadObject { private: @@ -798,7 +800,7 @@ namespace Chroma { return new QIOReadRitzPairsLatticeFermion(p); } - +#endif //------------------------------------------------------------------------ @@ -856,8 +858,10 @@ namespace Chroma success &= TheQIOReadObjectFactory::Instance().registerObject(std::string("EigenInfoLatticeFermion"), qioReadEigenInfoLatticeFermion); +#if ! defined (QDP_IS_QDPJIT2) success &= TheQIOReadObjectFactory::Instance().registerObject(std::string("RitzPairsLatticeFermion"), qioReadRitzPairsLatticeFermion); +#endif registered = true; } diff --git a/lib/meas/inline/io/inline_usqcd_write_ddpairs_prop.cc b/lib/meas/inline/io/inline_usqcd_write_ddpairs_prop.cc index 4a85119a03..ca81e940e1 100644 --- a/lib/meas/inline/io/inline_usqcd_write_ddpairs_prop.cc +++ b/lib/meas/inline/io/inline_usqcd_write_ddpairs_prop.cc @@ -193,7 +193,11 @@ namespace Chroma If not, a wider scope reference will be used to bind the prop and u. This is to avoid a very very long try {} catch block */ try { +#if ! 
defined (QDP_IS_QDPJIT2) LatticeDiracPropagator& trial_prop=TheNamedObjMap::Instance().getData(params.prop_id); +#else + LatticePropagator& trial_prop=TheNamedObjMap::Instance().getData(params.prop_id); +#endif const multi1d& u_trial = TheNamedObjMap::Instance().getData >(params.gauge_id); diff --git a/lib/meas/inline/io/qio_write_obj_funcmap.cc b/lib/meas/inline/io/qio_write_obj_funcmap.cc index 813fbb0d5c..d6fdfbee98 100644 --- a/lib/meas/inline/io/qio_write_obj_funcmap.cc +++ b/lib/meas/inline/io/qio_write_obj_funcmap.cc @@ -380,6 +380,7 @@ namespace Chroma close(to); } +#if ! defined (QDP_IS_QDPJIT2) //------------------------------------------------------------------------ //! Write out an RitzPairs Type void QIOWriteRitzPairsLatticeFermion(const std::string& buffer_id, @@ -416,7 +417,8 @@ namespace Chroma // Close close(to); } - +#endif + //---------------------------------------------------------------------- void QIOWriteSubsetVectors(const std::string& buffer_id, const std::string& file, @@ -546,8 +548,10 @@ namespace Chroma success &= TheQIOWriteObjFuncMap::Instance().registerFunction(std::string("EigenInfoLatticeFermion"), QIOWriteEigenInfo); +#if ! defined (QDP_IS_QDPJIT2) success &= TheQIOWriteObjFuncMap::Instance().registerFunction(std::string("RitzPairsLatticeFermion"), QIOWriteRitzPairsLatticeFermion); +#endif success &= TheQIOWriteObjFuncMap::Instance().registerFunction(std::string("SubsetVectorsLatticeColorVector"), QIOWriteSubsetVectors); diff --git a/lib/meas/sinks/wall_sink_smearing.cc b/lib/meas/sinks/wall_sink_smearing.cc index 5fa6e03fb2..0e82b3bc50 100644 --- a/lib/meas/sinks/wall_sink_smearing.cc +++ b/lib/meas/sinks/wall_sink_smearing.cc @@ -38,6 +38,7 @@ namespace Chroma return new SinkSmear(Params(xml_in, path), u); } +#if ! defined (QDP_IS_QDPJIT2) //! 
Callback function QuarkSourceSink* createStagProp(XMLReader& xml_in, const std::string& path, @@ -45,7 +46,8 @@ namespace Chroma { return new SinkSmear(Params(xml_in, path), u); } - +#endif + //! Callback function QuarkSourceSink* createFerm(XMLReader& xml_in, const std::string& path, @@ -71,7 +73,9 @@ namespace Chroma if (! registered) { success &= Chroma::ThePropSinkSmearingFactory::Instance().registerObject(name, createProp); +#if ! defined (QDP_IS_QDPJIT2) success &= Chroma::TheStagPropSinkSmearingFactory::Instance().registerObject(name, createStagProp); +#endif success &= Chroma::TheFermSinkSmearingFactory::Instance().registerObject(name, createFerm); registered = true; } @@ -147,6 +151,7 @@ namespace Chroma +#if ! defined (QDP_IS_QDPJIT2) //! Construct the sink smearing template<> void @@ -164,7 +169,7 @@ namespace Chroma return; } - +#endif //! Construct the sink smearing diff --git a/lib/meas/smear/displace.cc b/lib/meas/smear/displace.cc index 513cf53f48..74491b3860 100644 --- a/lib/meas/smear/displace.cc +++ b/lib/meas/smear/displace.cc @@ -73,6 +73,55 @@ namespace Chroma } + //! Apply a adjoint-displacement operator to a lattice gauge field + /*! + * \ingroup smear + * + * Arguments: + * + * \param u gauge field ( Read ) + * \param psi lattice field ( Read ) + * \param length length of displacement ( Read ) + * \param dir direction of displacement ( Read ) + * + * \return displaced field + */ + template + T adjDisplace(const multi1d& u, + const T& psi, + int length, int dir, + const Subset& sub) + { + if (dir < 0 || dir >= Nd) + { + QDPIO::cerr << __func__ << ": invalid direction: dir=" << dir << std::endl; + QDP_abort(1); + } + + T tmp; + T chi; + chi[sub] = psi; + + if (length > 0) + { + for(int n = 0; n < length; ++n) + { + tmp[sub] = shift(chi, FORWARD, dir); + chi[sub] = (u[dir] * tmp) * adj(u[dir]); + } + } + else // If length = or < 0. If length == 0, does nothing. 
+ { + for(int n = 0; n > length; --n) + { + tmp[sub] = shift((adj(u[dir])*chi)*u[dir], BACKWARD, dir); + chi[sub] = tmp; + } + } + return chi; + } + + // Apply a displacement operator to a lattice field LatticeColorVector displace(const multi1d& u, const LatticeColorVector& chi, @@ -203,6 +252,56 @@ namespace Chroma } + //! Apply a displacement path to a lattice field + /*! + * \ingroup smear + * + * Arguments: + * + * \param u gauge field ( Read ) + * \param chi color std::vector field ( Read ) + * \param length displacement length - must be greater than zero ( Read ) + * \param path array of direction of displacement paths - pos/neg, or zero ( Read ) + * \param sub Subset of sites to act ( Read ) + * + * \return displaced field + */ + template + T adjDisplace(const multi1d& u, + const T& psi, + int displacement_length, + const multi1d& path, + const Subset& sub) + { + if (displacement_length < 0) + { + QDPIO::cerr << __func__ << ": invalid length=" << displacement_length << std::endl; + QDP_abort(1); + } + + T chi; + chi[sub] = psi; + + for(int i=0; i < path.size(); ++i) + { + if (path[i] > 0) + { + int disp_dir = path[i] - 1; + int disp_len = displacement_length; + chi[sub] = adjDisplace(u, chi, disp_len, disp_dir, sub); + } + else if (path[i] < 0) + { + int disp_dir = -path[i] - 1; + int disp_len = -displacement_length; + chi[sub] = adjDisplace(u, chi, disp_len, disp_dir, sub); + } + } + + return chi; + } + + // Apply a displacement path to a lattice field LatticeColorVector displace(const multi1d& u, const LatticeColorVector& chi, @@ -220,7 +319,6 @@ namespace Chroma return displace(u, chi, length, path, sub); } - // Apply a displacement path to a lattice field LatticeColorMatrix displace(const multi1d& u, const LatticeColorMatrix& chi, @@ -229,6 +327,14 @@ namespace Chroma return displace(u, chi, length, path, QDP::all); } + // Apply an adj-displacement path to a lattice gauge field + LatticeColorMatrix adjDisplace(const multi1d& u, + const 
LatticeColorMatrix& chi, + int length, const multi1d& path) + { + return adjDisplace(u, chi, length, path, QDP::all); + } + // Apply a displacement path to a lattice field LatticeColorMatrix displace(const multi1d& u, const LatticeColorMatrix& chi, @@ -238,6 +344,15 @@ namespace Chroma return displace(u, chi, length, path, sub); } + // Apply a displacement path to a lattice field + LatticeColorMatrix adjDisplace(const multi1d& u, + const LatticeColorMatrix& chi, + int length, const multi1d& path, + const Subset& sub) + { + return adjDisplace(u, chi, length, path, sub); + } + // Apply a displacement path to a lattice field LatticeFermion displace(const multi1d& u, @@ -395,8 +510,8 @@ namespace Chroma const multi1d& u, int mu, int length, int mom) - { - Real angle = twopi*mom / Real(Layout::lattSize()[mu]); + { + Real angle = Chroma::constant().twopi * mom / Real(Layout::lattSize()[mu]); Complex phase = cmplx(cos(angle),sin(angle)); return (Real(1) + conj(phase))*displace(u, F, -length, mu, QDP::all) - (Real(1) + phase)*displace(u, F, length, mu, QDP::all); diff --git a/lib/meas/smear/displace.h b/lib/meas/smear/displace.h index 4a2a366d2e..9b137d0210 100644 --- a/lib/meas/smear/displace.h +++ b/lib/meas/smear/displace.h @@ -50,6 +50,13 @@ namespace Chroma int length, const multi1d& path, const Subset& sub); + //! Apply a displacement path to a lattice field + /*! \ingroup smear */ + LatticeColorMatrix adjDisplace(const multi1d& u, + const LatticeColorMatrix& chi, + int length, const multi1d& path, + const Subset& sub); + //! Apply a displacement path to a lattice field /*! \ingroup smear */ @@ -58,6 +65,13 @@ namespace Chroma int length, const multi1d& path); + //! Apply a adjoint-displacement path to a lattice gauge field, so displace on left and right + /*! \ingroup smear */ + LatticeColorMatrix adjDisplace(const multi1d& u, + const LatticeColorMatrix& chi, + int length, const multi1d& path); + + //! Apply a displacement path to a lattice field /*! 
\ingroup smear */ LatticeColorVector displace(const multi1d& u, diff --git a/lib/meas/smear/phase_stout_link_smearing.cc b/lib/meas/smear/phase_stout_link_smearing.cc index 4a5c12593f..cac98af9f9 100644 --- a/lib/meas/smear/phase_stout_link_smearing.cc +++ b/lib/meas/smear/phase_stout_link_smearing.cc @@ -96,11 +96,11 @@ namespace Chroma if(params.smear_dirs[d]){ if(params.k[d]!=0){ QDPIO::cout<<" Adding phase to direction: "< + +#ifdef BUILD_SB +namespace Chroma +{ + namespace SB + { + + /// Verbosity level for solvers + enum Verbosity { + NoOutput = 0, ///< Print nothing + JustSummary = 1, ///< Print summary at the end of the solver execution + Detailed = 2, ///< Print progress + VeryDetailed = 3 ///< Print whatever + }; + + /// Expected solver domain and image + enum SolverSpace { + FullSpace, ///< input and output vector with full space support + OnlyEvensSpace ///< input and output vector restricted to even sites + }; + + /// Return the map from string to Verbosity values + inline const std::map& getVerbosityMap() + { + static const std::map m{{"NoOutput", NoOutput}, + {"Summary", JustSummary}, + {"Detailed", Detailed}, + {"VeryDetailed", VeryDetailed}, + {"false", NoOutput}}; + return m; + } + + /// Return the map from string to Verbosity values + inline const std::map& getColOrderingMap() + { + static const std::map m{{"row", RowMajor}, {"column", ColumnMajor}}; + return m; + } + + namespace detail + { + /// Check that the common dimensions have the same size + /// \param V: tensor to check + /// \param W: other tensor to check + /// \param check_dims: dimension labels to check (or none for all of them) + /// \param not_check_dims: dimension labels to not check + + template + void check_compatible(Tensor V, Tensor W, + Maybe check_dims = none, + Maybe not_check_dims = none) + { + auto wdims = W.kvdim(); + for (auto it : V.kvdim()) + if ((!check_dims.hasSome() || check_dims.getSome().find(it.first) != std::string::npos) && + (!not_check_dims.hasSome() || + 
not_check_dims.getSome().find(it.first) != std::string::npos) && + wdims.count(it.first) > 0 && wdims.at(it.first) != it.second) + throw std::runtime_error("check_compatible: some label does not match"); + } + + /// Returns max ||I - C||_F, which is a heuristic of the orthogonality level of C=V^\dagger*V + /// \param C: matrix to test + /// \param order_t: dimension labels that do not participate in the orthogonalization + /// \param order_cols: dimension labels that are the columns of the matrices V and W + /// \param m_to_rows: map from columns to the alternative label for columns + + template + double ortho_level(Tensor C, const std::string& order_t, + const std::string& order_cols, const remap& m_to_rows) + { + // Check Nrows + if (order_cols.size() != Ncols) + throw std::runtime_error("ortho_level: invalid template argument `Ncols`"); + + // Check that the matrix is square + std::string order_rows = detail::update_order(order_cols, m_to_rows); + auto dim = C.kvdim(); + std::size_t m = volume(dim, order_rows); + if (m != volume(dim, order_cols)) + throw std::runtime_error("ortho_level: the input tensor is not square"); + + // Compute ||I - C||_F^2 + C = C.clone(); + identity(dim, m_to_rows, OnMaster).scale(-1).addTo(C); + auto fnorm2 = contract(C.conj(), C, order_rows + order_cols, OnHost, + OnEveryoneReplicated); + + // Return the largest square root + double tol = 0; + fnorm2.foreachWithCPUFun( + [&](const COMPLEX& t) { tol = std::max(tol, (double)std::sqrt(std::real(t))); }); + return tol; + } + } + + /// Whether to check orthogonalization level, used by `ortho` + + enum class CheckOrthoLevel { dontCheckOrthoLevel, doCheckOrthoLevel }; + + /// Orthonormalize W against V, W_out <- (I-V*V')*W_in*R, with R such that W_out'*W_out = I, + /// for each combination of values of the order_t dimensions, or once if it is empty. 
+ /// \param V: orthogonal matrix to orthogonalize against + /// \param W: matrix to orthogonalize + /// \param order_t: dimension labels that do not participate in the orthogonalization + /// \param order_rows: dimension labels that are the rows of the matrices V and W + /// \param order_cols: dimension labels that are the columns of the matrices V and W + /// \param max_its: maximum number of iterations + /// \param checkOrthoLevel: whether to stop earlier if the orthogonality level is good enough + /// \param verb: verbosity level + /// \param prefix: prefix for logging + /// + /// NOTE: ortho is mostly used to normalize vectors, that is, tensors whose columns are + /// aren't distributed. The function that checks the orthogonality level assumes that, + /// and it's very inefficient when columns are distributed. + + template + void ortho(Tensor V, Tensor W, const std::string& order_t, + const std::string& order_rows, const std::string& order_cols, + unsigned int max_its = 4, + CheckOrthoLevel checkOrthoLevel = CheckOrthoLevel::doCheckOrthoLevel, + Verbosity verb = NoOutput, const std::string& prefix = "") + { + // Check Nrows + if (order_rows.size() != Nrows) + throw std::runtime_error("ortho: invalid template argument `Nrows`"); + if (order_cols.size() != Ncols) + throw std::runtime_error("ortho: invalid template argument `Ncols`"); + + // Check that V and W are compatible, excepting the column dimensions + if (V) + detail::check_compatible(V, W, none, order_cols); + + // Find the ordering + std::string Wcorder = detail::remove_dimensions(W.order, order_t + order_rows); + std::string Vcorder = + V ? detail::remove_dimensions(V.order, order_t + order_rows) : std::string{}; + + // Create an alternative view of W with different labels for the column + remap Wac = detail::getNewLabels(Wcorder, (V ? 
V.order : std::string{}) + W.order); + std::string Wacorder = detail::update_order(Wcorder, Wac); + auto Wa = W.rename_dims(Wac); + + constexpr std::size_t Nt = NW - Nrows - Ncols; + double l = 0; + unsigned int i = 0; + for (; i <= max_its;) + { + // W = W - V*(V'*W) + if (V) + contract(V.scale(-1), contract(V.conj(), W, order_rows), Vcorder, + AddTo, W); + + // Compute Wa'*W and the orthogonality level of the basis + Tensor C; + if (checkOrthoLevel == CheckOrthoLevel::doCheckOrthoLevel) + { + C = contract(Wa.conj(), W, order_rows, none, OnEveryoneReplicated); + l = detail::ortho_level(C, order_t, Wcorder, Wac); + + if (verb >= Detailed) + QDPIO::cout << prefix << " ortho #its: " << i << " |I-V'*V|_F: " << detail::tostr(l) + << std::endl; + + // If ||I-C|| < 1, then the basis has no linear dependencies; the conditioning of the basis is + // also related to that norm, but we choose an arbitrary close but smaller value than one. + if (l < 0.7) + break; + } + else + { + C = contract(Wa.conj(), W, order_rows, none); + + if (verb >= Detailed) + QDPIO::cout << prefix << " ortho #its: " << i << " (no checking ortho level)" + << std::endl; + } + + if (i >= max_its) + throw std::runtime_error("ortho: failing in orthonormalizing the basis"); + + // W = W/chol(Wa'*W), where Wa'*W has dimensions (rows,cols)=(Wacorder,Wcorder) + cholInv(std::move(C), Wacorder, Wcorder, Wa, Wacorder, + CopyTo, W); + + ++i; + + // Stop by number of iterations when don't checking orthogonality level + if (checkOrthoLevel == CheckOrthoLevel::dontCheckOrthoLevel && i >= max_its) + break; + } + + if (verb >= JustSummary) + { + QDPIO::cout << prefix << " ortho summary rank: " << detail::volume(W.kvdim(), Wcorder) + << " #its: " << i; + if (checkOrthoLevel == CheckOrthoLevel::doCheckOrthoLevel) + QDPIO::cout << " |I-V'*V|_F: " << detail::tostr(l); + QDPIO::cout << std::endl; + } + } + + /// Orthonormalize W, W_out <- W_in*R, with R such that W_out'*W_out = I, + /// for each combination of values of 
the order_t dimensions, or once if it is empty. + /// \param W: matrix to orthogonalize + /// \param order_t: dimension labels that do not participate in the orthogonalization + /// \param order_rows: dimension labels that are the rows of the matrices V and W + /// \param order_cols: dimension labels that are the columns of the matrices V and W + /// \param max_its: maximum number of iterations + /// \param checkOrthoLevel: whether to stop earlier if the orthogonality level is good enough + /// \param verb: verbosity level + /// \param prefix: prefix for logging + /// + /// NOTE: ortho is mostly used to normalize vectors, that is, tensors whose columns are + /// aren't distributed. The function that checks the orthogonality level assumes that, + /// and it's very inefficient when columns are distributed. + + template + void ortho(Tensor W, const std::string& order_t, const std::string& order_rows, + const std::string& order_cols, unsigned int max_its = 4, + CheckOrthoLevel checkOrthoLevel = CheckOrthoLevel::doCheckOrthoLevel, + Verbosity verb = NoOutput, const std::string& prefix = "") + { + ortho(Tensor{}, W, order_t, order_rows, + order_cols, max_its, checkOrthoLevel, verb, prefix); + } + + /// Solve iteratively op * y = x using FGMRES + /// \param op: problem matrix + /// \param prec: left preconditioner + /// \param x: input right-hand-sides + /// \param y: the solution vectors + /// \param max_basis_size: maximum rank of the search subspace basis per t + /// \param tol: maximum tolerance + /// \param max_its: maximum number of iterations + /// \param error_if_not_converged: throw an error if the tolerance was not satisfied + /// \param ortho_each_its: orthogonalize every this number of iterations + /// \param max_residual_updates: recompute residual vector every this number of restarts + /// \param passing_initial_guess: whether `y` contains a solution guess + /// \param verb: verbosity level + /// \param prefix: prefix printed before every line + /// + /// 
For FGMRES we find an approximation of op^{-1}*b in a space Z by minimizing + /// || op*Z*x - b ||_2: + /// + /// argmin_x || op*Z*x - b ||_2 = (U'*op*Z)^{-1} * (U'*b), where U = op*Z + /// + /// In FGMRES, it just happens that Z_0 = prec * r and Z_i = prec * U_{i-1} if i>0. + + template + void fgmres(const Operator& op, Operator prec, + const Tensor& x, Tensor& y, + unsigned int max_basis_size, double tol, unsigned int max_its = 0, + bool error_if_not_converged = true, unsigned int ortho_each_its = 0, + unsigned int max_residual_updates = 0, bool passing_initial_guess = false, + Verbosity verb = NoOutput, std::string prefix = "") + { + detail::log(1, prefix + " starting fgmres"); + + // TODO: add optimizations for multiple operators + if (op.order_t.size() > 0) + throw std::runtime_error("Not implemented"); + + // Check options + if (max_its == 0 && tol <= 0) + throw std::runtime_error("fgmres: please give a stopping criterion, either a tolerance or " + "a maximum number of iterations"); + if (max_basis_size == 0) + max_basis_size = 5; + if (max_its == 0) + max_its = std::numeric_limits::max(); + if (ortho_each_its == 0) + ortho_each_its = (std::is_same::value || + std::is_same>::value) + ? 8 + : 4; + if (max_residual_updates == 0) + max_residual_updates = (std::is_same::value || + std::is_same>::value) + ? 
4 + : 2; + + // Check that the operator and the preconditioner are compatible with the input and output vectors + if (!op.d.is_compatible(x) || !op.d.is_compatible(y) || (prec && !prec.d.is_compatible(op.d))) + throw std::runtime_error("Either the input or the output vector isn't compatible with the " + "operator or the preconditioner"); + + // Get an unused label for the search subspace columns + char Vc = detail::get_free_label(x.order); + char Vac = detail::get_free_label(x.order + std::string(1, Vc)); + std::string order_cols = detail::remove_dimensions(x.order, op.i.order); + std::string order_rows = detail::remove_dimensions(op.d.order, op.order_t); + std::size_t num_cols = x.volume(order_cols); + if (num_cols == 0) + return; + + // Counting op and prec applications + unsigned int nops = 0, nprecs = 0; + + // Compute residual, r = op * y - x + Tensor r; + if (passing_initial_guess) + { + r = op(y); + nops += num_cols; + } + else + { + r = op.template make_compatible_img(order_cols, x.kvdim()); + r.set_zero(); + y.set_zero(); + } + x.scale(-1).addTo(r); + auto normr0 = norm<1>(r, op.order_t + order_cols); // type std::vector + if (max(normr0) == 0) + return; + + // Allocate the search subspace U (onto the left singular space), and + // Z (onto the right singular space) + auto U = r.template make_compatible(std::string({'%', Vc}), '%', "", + {{Vc, max_basis_size}}); + + // Allocate the search subspace Z (onto the right singular space) + auto Z = U.make_compatible(); + + // Extend r with Vc + auto r_Vc = r.append_dimension(Vc); + + // Do the iterations + auto normr = normr0.clone(); ///< residual norms + unsigned int it = 0; ///< iteration number + double max_tol = HUGE_VAL; ///< maximum residual norm + unsigned int ires = 0; ///< vector index for the last restart starting + unsigned int residual_updates = 0; ///< number of residual updates + for (it = 0; it < max_its;) + { + unsigned int expansion_size = + std::min(std::min(max_basis_size - ires, 
ortho_each_its), max_its - it); + + // Expand the search subspace from residual + if (prec) + { + for (unsigned int i = 0; i < expansion_size; ++i) + { + // Z(:,ires+i) = prec * U(:,ires+i-1) + prec(i == 0 ? r_Vc : U.kvslice_from_size({{Vc, ires + i - 1}}, {{Vc, 1}}), + Z.kvslice_from_size({{Vc, ires + i}}, {{Vc, 1}})); + nprecs += num_cols; + + // U(:,ires+i) = op * Z(:,ires+i) + op(Z.kvslice_from_size({{Vc, ires + i}}, {{Vc, 1}}), + U.kvslice_from_size({{Vc, ires + i}}, {{Vc, 1}})); + nops += num_cols; + + ++it; + } + } + else + { + // U(:,ires+i) = op ^ i * r for i=1..expansion_size-1 + op(r_Vc, U.kvslice_from_size({{Vc, ires}}, {{Vc, expansion_size}}), Vc); + r_Vc.copyTo(Z.kvslice_from_size({{Vc, ires}}, {{Vc, 1}})); + U.kvslice_from_size({{Vc, ires}}, {{Vc, expansion_size - 1}}) + .copyTo(Z.kvslice_from_size({{Vc, ires + 1}}, {{Vc, expansion_size - 1}})); + nops += num_cols * expansion_size; + it += expansion_size; + } + + // Orthogonalize U and put it into W: W = orth(U(:,2:end)) + // NOTE: for small max_basis_size and assuming r is far from a left singular vector, + // a light or none orthogonalization should be enough + unsigned int basis_size = ires + expansion_size; + auto Up = U.kvslice_from_size({}, {{Vc, basis_size}}); + auto Uo = Up.rename_dims({{Vc, Vac}}); + Uo = Uo.clone(); /// TODO: Avoid this + ortho(Uo, op.order_t + order_cols, order_rows, std::string(1, Vac), 4, + CheckOrthoLevel::doCheckOrthoLevel, verb, prefix); + + // Restrict to Uo: [x_rt H_rt] = Uo'*U = Uo'*[r Up] + auto x_rt = contract<2>(Uo.conj(), r, order_rows); + auto H_rt = contract<3>(Uo.conj(), Up, order_rows); + + // Solve the projected problem: y_rt = (Uo'*U(:2:end))\(Uo'*r); + auto y_rt = solve<1, 2>(H_rt, std::string(1, Vac), std::string(1, Vc), + x_rt.rename_dims({{Vac, Vc}}), std::string(1, Vc)) + .rename_dims({{Vac, Vc}}) + .make_sure(none, none, OnEveryoneReplicated); + + // Update solution: y += -Z*y_rt + contract(Z.kvslice_from_size({}, {{Vc, basis_size}}).scale(-1), 
y_rt, std::string(1, Vc), + AddTo, y); + + // Compute residual + if (residual_updates < max_residual_updates) + { + // Update residual by saving a matvec: r += -U*y_rt + contract(Up.scale(-1), y_rt, std::string(1, Vc), AddTo, r); + residual_updates++; + } + else + { + op(y, r); // r = op(y) + x.scale(-1).addTo(r); + nops += num_cols; + residual_updates = 0; + } + + // Compute the norm + auto normr = norm<1>(r, op.order_t + order_cols); + + // Check final residual + if (superbblas::getDebugLevel() > 0) + { + auto rd = r.make_compatible(); + op(y, rd); // rd = op(y) + nops += num_cols; + x.scale(-1).addTo(rd); + r.scale(-1).addTo(rd); + auto normrd = norm<1>(rd, op.order_t + order_cols); + double max_tol_d = max(div(normrd, normr)); + QDPIO::cout << prefix + << " MGPROTON FGMRES error in residual vector: " << detail::tostr(max_tol_d) + << std::endl; + } + + // Get the worse tolerance + max_tol = max(div(normr, normr0)); + + // Report iteration + if (verb >= Detailed) + QDPIO::cout << prefix << " MGPROTON FGMRES iteration #its.: " << it + << " max rel. residual: " << detail::tostr(max_tol, 2) << std::endl; + + // Stop if the residual tolerance is satisfied + if (max_tol <= tol) + break; + + // Start a new expansion after the current one if there's space left; otherwise do a full restart + ires += expansion_size; + if (ires >= max_basis_size) + ires = 0; + } + + // Check final residual + if (error_if_not_converged) + { + op(y, r); // r = op(y) + nops += num_cols; + x.scale(-1).addTo(r); + auto normr = norm<1>(r, op.order_t + order_cols); + max_tol = max(div(normr, normr0)); + if (tol > 0 && max_tol > tol) + throw std::runtime_error("fgmres didn't converged and you ask for checking the error"); + } + + // Report iteration + if (verb >= JustSummary) + QDPIO::cout << prefix << " MGPROTON FGMRES summary #its.: " << it + << " max rel. 
residual: " << detail::tostr(max_tol, 2) << " matvecs: " << nops + << " precs: " << nprecs << std::endl; + } + + /// Solve iteratively op * y = x using BICGSTAB + /// \param op: problem matrix + /// \param prec: left preconditioner + /// \param x: input right-hand-sides + /// \param y: the solution vectors + /// \param tol: maximum tolerance + /// \param max_its: maximum number of iterations + /// \param error_if_not_converged: throw an error if the tolerance was not satisfied + /// \param max_residual_updates: recompute residual vector every this number of restarts + /// \param passing_initial_guess: whether `y` contains a solution guess + /// \param verb: verbosity level + /// \param prefix: prefix printed before every line + /// + /// Method: + /// r0 = p = r = b - A * x // compute initial residual + /// rho = r0' * r + /// for i=1,2,.. + /// Kp = prec * p + /// AKp = A * Kp + /// alpha = rho / (r0' * AKp) + /// s = r - AKp * alpha + /// Ks = prec * s + /// AKs = A * Ks + /// omega = (AKs' * Ks) / (AKs' * AKs) + /// x = x + Kp * alpha + Ks * omega + /// r = s - AKs * omega + /// exit if |r| < |b|*tol + /// prev_rho = rho + /// rho = r0' * r + /// beta = (rho/prev_rho)*(alpha/omega) + /// p = r + beta*(p - omega*AKp) + /// end + + template + void bicgstab(const Operator& op, Operator prec, + const Tensor& x, Tensor& y, double tol, + unsigned int max_its = 0, bool error_if_not_converged = true, + unsigned int max_residual_updates = 0, bool passing_initial_guess = false, + Verbosity verb = NoOutput, std::string prefix = "") + { + detail::log(1, prefix + " starting bicgstab"); + + // TODO: add optimizations for multiple operators + if (op.order_t.size() > 0) + throw std::runtime_error("Not implemented"); + + // Check options + if (max_its == 0 && tol <= 0) + throw std::runtime_error( + "bicgstab: please give a stopping criterion, either a tolerance or " + "a maximum number of iterations"); + if (max_its == 0) + max_its = std::numeric_limits::max(); + if 
(max_residual_updates == 0) + max_residual_updates = (std::is_same::value || + std::is_same>::value) + ? 100 + : 100; + + // Check that the operator and the preconditioner are compatible with the input and output vectors + if (!op.d.is_compatible(x) || !op.d.is_compatible(y) || (prec && !prec.d.is_compatible(op.d))) + throw std::runtime_error("Either the input or the output vector isn't compatible with the " + "operator or the preconditioner"); + + // Get an unused label for the search subspace columns + std::string order_cols = detail::remove_dimensions(x.order, op.i.order); + std::string order_rows = detail::remove_dimensions(op.d.order, op.order_t); + std::size_t num_cols = x.volume(order_cols); + if (num_cols == 0) + return; + + // Counting op and prec applications + unsigned int nops = 0, nprecs = 0; + + // Compute residual, r = x - op * y + Tensor r; + if (passing_initial_guess) + { + r = op(y).scale(-1); + nops += num_cols; + } + else + { + r = op.template make_compatible_img(order_cols, x.kvdim()); + r.set_zero(); + y.set_zero(); + } + x.addTo(r); + auto normr0 = norm<1>(r, op.order_t + order_cols); // type std::vector + if (max(normr0) == 0) + return; + + // Choose an arbitrary vector that isn't orthogonal to r + auto r0 = op.template make_compatible_img(order_cols, x.kvdim()); + r.copyTo(r0); + + // Do the iterations + auto normr = normr0.clone(); ///< residual norms + unsigned int it = 0; ///< iteration number + double max_tol = HUGE_VAL; ///< maximum residual norm + unsigned int residual_updates = 0; ///< number of residual updates + auto p = r0.clone(); ///< p0 = r0 + auto rho = contract<1>(r, r0.conj(), order_rows); // rho = r0' * r + auto Kp = prec ? r0.make_compatible() : p; + auto AKp = r0.make_compatible(); + auto Ks = prec ? 
r0.make_compatible() : r; + auto AKs = r0.make_compatible(); + for (it = 0; it < max_its;) + { + // Kp = prec * p + if (prec) + { + prec(p, Kp); + nprecs += num_cols; + } + + // AKp = A * Kp + op(Kp, AKp); + nops += num_cols; + + // alpha = rho / (r0' * AKp) + auto alpha = div(rho, contract<1>(AKp, r0.conj(), order_rows)); + + // s = r - alpha * AKp + auto s = r; + contract(AKp, alpha.scale(-1), "", AddTo, s); + + // Ks = prec * s + if (prec) + { + prec(s, Ks); + nprecs += num_cols; + } + + // AKs = A * Ks + op(Ks, AKs); + nops += num_cols; + + // omega = (AKs' * Ks) / (AKs' * AKs) + auto omega = + div(contract<1>(AKs.conj(), Ks, order_rows), contract<1>(AKs, AKs.conj(), order_rows)); + + // y = y + alpha * Kp + omega * Ks + contract(Kp, alpha, "").addTo(y); + contract(Ks, omega, "").addTo(y); + + // r = s - omega * AKs (NOTE: s == r) + contract(AKs, omega.scale(-1), "", AddTo, r); + + // prev_rho = rho, rho = r0' * r + auto prev_rho = rho.clone(); + contract<1>(r, r0.conj(), order_rows, CopyTo, rho); + + // beta = rho / prev_rho * (alpha / omega) + auto rho_ratio = div(rho, prev_rho); + auto beta = mult(rho_ratio, div(alpha, omega)); + + // beta_omega = beta * omega = rho / prev_rho * alpha + auto beta_omega = mult(rho_ratio, alpha); + + // p = r + beta * p - beta_omega * AKp + auto aux = AKs; + contract(p, beta, "", CopyTo, aux); + aux.copyTo(p); + contract(AKp, beta_omega.scale(-1), "", AddTo, p); + r.addTo(p); + + // Recompute residual vector if needed + if (residual_updates < max_residual_updates) + { + residual_updates++; + } + else + { + op(y.scale(-1), r); // r = op(-y) + x.addTo(r); + r.copyTo(r0); + r.copyTo(p); + contract<1>(r, r0.conj(), order_rows, CopyTo, rho); + nops += num_cols; + residual_updates = 0; + } + + // Compute the norm + auto normr = norm<1>(r, op.order_t + order_cols); + + // Show error in the residual + if (superbblas::getDebugLevel() > 0) + { + auto rd = r.make_compatible(); + op(y, rd); // rd = op(y) + nops += num_cols; + 
x.scale(-1).addTo(rd); + r.addTo(rd); + auto normrd = norm<1>(rd, op.order_t + order_cols); + double max_tol_d = max(div(normrd, normr)); + QDPIO::cout << prefix + << " MGPROTON BICGSTAB error in residual vector: " << detail::tostr(max_tol_d) + << std::endl; + } + + // Get the worse tolerance + max_tol = max(div(normr, normr0)); + + // Report iteration + if (verb >= Detailed) + QDPIO::cout << prefix << " MGPROTON BICGSTAB iteration #its.: " << it + << " max rel. residual: " << detail::tostr(max_tol, 2) << std::endl; + + // Increase iterator counter + ++it; + + // Stop if the residual tolerance is satisfied + if (max_tol <= tol) + break; + } + + // Check final residual + if (error_if_not_converged) + { + op(y, r); // r = op(y) + nops += num_cols; + x.scale(-1).addTo(r); + auto normr = norm<1>(r, op.order_t + order_cols); + max_tol = max(div(normr, normr0)); + if (tol > 0 && max_tol > tol) + throw std::runtime_error("bicgstab didn't converged and you ask for checking the error"); + } + + // Report iteration + if (verb >= JustSummary) + QDPIO::cout << prefix << " MGPROTON BICGSTAB summary #its.: " << it + << " max rel. 
residual: " << detail::tostr(max_tol, 2) << " matvecs: " << nops + << " precs: " << nprecs << std::endl; + } + + /// Solve iteratively op * y = x using Minimal Residual (valid for positive definite linear systems) + /// \param op: problem matrix + /// \param prec: left preconditioner + /// \param x: input right-hand-sides + /// \param y: the solution vectors + /// \param tol: maximum tolerance + /// \param max_its: maximum number of iterations + /// \param error_if_not_converged: throw an error if the tolerance was not satisfied + /// \param max_residual_updates: recompute residual vector every this number of restarts + /// \param passing_initial_guess: whether `y` contains a solution guess + /// \param verb: verbosity level + /// \param prefix: prefix printed before every line + /// + /// Method: + /// r = b - A * x // compute initial residual + /// while |r| > |b|*tol + /// kr = prec * r + /// p = A * kr + /// alpha = (p' * r) / (p' * p) + /// x = x + kr * alpha + /// r = r - p * alpha + /// end + + template + void mr(const Operator& op, const Operator& prec, + const Tensor& x, Tensor& y, double tol, + unsigned int max_its = 0, bool error_if_not_converged = true, + unsigned int max_residual_updates = 0, bool passing_initial_guess = false, + Verbosity verb = NoOutput, std::string prefix = "") + { + detail::log(1, prefix + " starting minimal residual"); + + // TODO: add optimizations for multiple operators + if (op.order_t.size() > 0) + throw std::runtime_error("Not implemented"); + + // Check options + if (max_its == 0 && tol <= 0) + throw std::runtime_error("mr: please give a stopping criterion, either a tolerance or " + "a maximum number of iterations"); + if (max_its == 0) + max_its = std::numeric_limits::max(); + if (max_residual_updates == 0) + max_residual_updates = (std::is_same::value || + std::is_same>::value) + ? 
100 + : 100; + + // Check that the operator is compatible with the input and output vectors + if (!op.d.is_compatible(x) || !op.d.is_compatible(y)) + throw std::runtime_error( + "mr: Either the input or the output vector isn't compatible with the " + "operator"); + + // Get an unused label for the search subspace columns + std::string order_cols = detail::remove_dimensions(x.order, op.i.order); + std::string order_rows = detail::remove_dimensions(op.d.order, op.order_t); + std::size_t num_cols = x.volume(order_cols); + if (num_cols == 0) + return; + + // Counting op applications + unsigned int nops = 0, nprecs = 0; + + // Compute residual, r = x - op * y + Tensor r; + if (passing_initial_guess) + { + r = op(y).scale(-1); + nops += num_cols; + } + else + { + r = op.template make_compatible_img(order_cols, x.kvdim()); + r.set_zero(); + y.set_zero(); + } + x.addTo(r); + auto normr0 = norm<1>(r, op.order_t + order_cols); // type std::vector + if (max(normr0) == 0) + return; + + // Do the iterations + auto normr = normr0.clone(); ///< residual norms + unsigned int it = 0; ///< iteration number + double max_tol = HUGE_VAL; ///< maximum residual norm + unsigned int residual_updates = 0; ///< number of residual updates + auto p = r.make_compatible(); ///< p will hold A * prec * r + auto kr = prec ? 
r.make_compatible() : r; ///< p will hold prec * r + for (it = 0; it < max_its;) + { + // kr = prec * r + if (prec) + { + prec(r, kr); + nprecs += num_cols; + } + + // p = A * kr + op(kr, p); + nops += num_cols; + + // alpha = (p' * r) / (p' * p) + auto alpha = + div(contract<1>(r, p.conj(), order_rows), contract<1>(p, p.conj(), order_rows)); + + // y = y + alpha * kr + contract(kr, alpha, "", AddTo, y); + + // r = r - alpha * p + contract(p.scale(-1), alpha, "", AddTo, r); + + // Update residual if needed + if (residual_updates < max_residual_updates) + { + residual_updates++; + } + else + { + op(y.scale(-1), r); // r = op(-y) + x.addTo(r); + nops += num_cols; + residual_updates = 0; + } + + // Compute the norm + auto normr = norm<1>(r, op.order_t + order_cols); + + // Show residual error + if (superbblas::getDebugLevel() > 0) + { + auto rd = r.make_compatible(); + op(y, rd); // rd = op(y) + nops += num_cols; + x.scale(-1).addTo(rd); + r.addTo(rd); + auto max_norm = max(div(norm<1>(rd, op.order_t + order_cols), normr)); + QDPIO::cout << prefix + << " MGPROTON MR error in residual vector: " << detail::tostr(max_norm) + << std::endl; + } + + // Get the worse tolerance + max_tol = max(div(normr, normr0)); + + // Report iteration + if (verb >= Detailed) + QDPIO::cout << prefix << " MGPROTON MR iteration #its.: " << it + << " max rel. 
residual: " << detail::tostr(max_tol, 2) << std::endl; + + // Increase iterator counter + ++it; + + // Stop if the residual tolerance is satisfied + if (max_tol <= tol) + break; + } + + // Check final residual + if (error_if_not_converged) + { + op(y, r); // r = op(y) + nops += num_cols; + x.scale(-1).addTo(r); + auto normr = norm<1>(r, op.order_t + order_cols); + max_tol = max(div(normr, normr0)); + if (tol > 0 && max_tol > tol) + throw std::runtime_error("mr didn't converged and you ask for checking the error"); + } + + // Report iteration + if (verb >= JustSummary) + QDPIO::cout << prefix << " MGPROTON MR summary #its.: " << it + << " max rel. residual: " << detail::tostr(max_tol, 2) << " matvecs: " << nops + << " precs: " << nprecs << std::endl; + } + + /// Solve iteratively op * y = x using Generalized Conjugate Residual + /// \param op: problem matrix + /// \param x: input right-hand-sides + /// \param y: the solution vectors + /// \param tol: maximum tolerance + /// \param max_its: maximum number of iterations + /// \param error_if_not_converged: throw an error if the tolerance was not satisfied + /// \param max_residual_updates: recompute residual vector every this number of restarts + /// \param passing_initial_guess: whether `y` contains a solution guess + /// \param verb: verbosity level + /// \param prefix: prefix printed before every line + /// + /// Method: + /// r = b - A * x // compute initial residual + /// P = AP = [] + /// while |r| > |b|*tol + /// kr = prec * r + /// Akr = A * kr + /// beta = (AP' * AP)\(AP' * Akr) + /// p = (I - P/(P'A'AP)*P'A'A) * prec * r = kr - P * beta + /// Ap = A * p = Akr - AP * beta + /// alpha = (Ap' * r) / (Ap' * Ap) + /// x = x + p * alpha + /// r = r - Ap * alpha + /// P = [P p] + /// AP = [AP Ap] + /// end + + template + void gcr(const Operator& op, const Operator& prec, + const Tensor& x, Tensor& y, + unsigned int max_basis_size, double tol, unsigned int max_its = 0, + bool error_if_not_converged = true, unsigned 
int max_residual_updates = 0, + bool passing_initial_guess = false, Verbosity verb = NoOutput, std::string prefix = "") + { + detail::log(1, prefix + " starting minimal residual"); + + // TODO: add optimizations for multiple operators + if (op.order_t.size() > 0) + throw std::runtime_error("Not implemented"); + + // Check options + if (max_its == 0 && tol <= 0) + throw std::runtime_error("mr: please give a stopping criterion, either a tolerance or " + "a maximum number of iterations"); + if (max_its == 0) + max_its = std::numeric_limits::max(); + if (max_residual_updates == 0) + max_residual_updates = (std::is_same::value || + std::is_same>::value) + ? 100 + : 100; + + // Check that the operator is compatible with the input and output vectors + if (!op.d.is_compatible(x) || !op.d.is_compatible(y)) + throw std::runtime_error( + "mr: Either the input or the output vector isn't compatible with the " + "operator"); + + // Get an unused label for the search subspace columns + char Vc = detail::get_free_label(x.order); + std::string order_cols = detail::remove_dimensions(x.order, op.i.order); + std::string order_rows = detail::remove_dimensions(op.d.order, op.order_t); + std::size_t num_cols = x.volume(order_cols); + if (num_cols == 0) + return; + + // Counting op applications + unsigned int nops = 0, nprecs = 0; + + // Compute residual, r = x - op * y + Tensor r; + if (passing_initial_guess) + { + r = op(y).scale(-1); + nops += num_cols; + } + else + { + r = op.template make_compatible_img(order_cols, x.kvdim()); + r.set_zero(); + y.set_zero(); + } + x.addTo(r); + auto normr0 = norm<1>(r, op.order_t + order_cols); // type std::vector + if (max(normr0) == 0) + return; + + // Allocate the search subspace P (onto the left singular space), and + // AP (= op * P) + auto P = r.template make_compatible(std::string({'%', Vc}), '%', "", + {{Vc, max_basis_size}}); + auto AP = P.make_compatible(); + + // Do the iterations + auto normr = normr0.clone(); ///< residual norms + 
unsigned int it = 0; ///< iteration number + double max_tol = HUGE_VAL; ///< maximum residual norm + unsigned int residual_updates = 0; ///< number of residual updates + auto p = r.make_compatible(); ///< p will hold the next column in P + auto Ap = r.make_compatible(); ///< Ap will hold the next column in AP + auto kr = prec ? r.make_compatible() : r; ///< p will hold prec * r + auto Akr = r.make_compatible(); ///< Akr will hold A * kr + unsigned int active_P = 0; ///< number of active columns in P and AP + auto AP_norm2 = P.template like_this<2>(order_cols + std::string{Vc}, {}, none, + detail::compatible_replicated_distribution(P.dist)); + for (it = 0; it < max_its;) + { + // kr = prec * r + if (prec) + { + prec(r, kr); + nprecs += num_cols; + } + + // Akr = A * kr + op(kr, Akr); + nops += num_cols; + + if (active_P > 0) + { + auto P_act = P.kvslice_from_size({}, {{Vc, active_P}}); + auto AP_act = AP.kvslice_from_size({}, {{Vc, active_P}}); + auto AP_norm2_act = AP_norm2.kvslice_from_size({}, {{Vc, active_P}}); + + // beta = (AP' * Akr) / AP_norm2 + auto beta = div(contract<2>(Akr, AP_act.conj(), order_rows), AP_norm2_act); + + // p = kr - P * beta + kr.copyTo(p); + contract(P_act, beta.scale(-1), std::string{Vc}, AddTo, p); + + // Ap = Akr - AP * beta + Akr.copyTo(Ap); + contract(AP_act, beta.scale(-1), std::string{Vc}, AddTo, Ap); + } + else + { + kr.copyTo(p); + Akr.copyTo(Ap); + } + + // alpha = (Ap' * r) / (Ap' * Ap) + auto alpha = + div(contract<1>(r, Ap.conj(), order_rows), contract<1>(Ap, Ap.conj(), order_rows)); + + // y = y + alpha * p + contract(p, alpha, "", AddTo, y); + + // r = r - alpha * Ap + contract(Ap.scale(-1), alpha, "", AddTo, r); + + // Update residual if needed + if (residual_updates < max_residual_updates) + { + residual_updates++; + } + else + { + op(y.scale(-1), r); // r = op(-y) + x.addTo(r); + nops += num_cols; + active_P = 0; + residual_updates = 0; + } + + // Compute the norm + auto normr = norm<1>(r, op.order_t + order_cols); + + 
// Check final residual + if (superbblas::getDebugLevel() > 0) + { + auto rd = r.make_compatible(); + op(y, rd); // rd = op(y) + nops += num_cols; + x.scale(-1).addTo(rd); + r.addTo(rd); + auto max_norm = max(div(norm<1>(rd, op.order_t + order_cols), normr)); + QDPIO::cout << prefix + << " MGPROTON GCR error in residual vector: " << detail::tostr(max_norm) + << std::endl; + } + + // Get the worse tolerance + max_tol = max(div(normr, normr0)); + + // Report iteration + if (verb >= Detailed) + QDPIO::cout << prefix << " MGPROTON GCR iteration #its.: " << it + << " max rel. residual: " << detail::tostr(max_tol, 2) << std::endl; + + // Increase iterator counter + ++it; + + // Stop if the residual tolerance is satisfied + if (max_tol <= tol) + break; + + // Update P, AP, and AP_norm2 + if (active_P >= max_basis_size) + active_P = 1; + else + ++active_P; + auto P_new = P.kvslice_from_size({{Vc, active_P - 1}}, {{Vc, 1}}); + auto AP_new = AP.kvslice_from_size({{Vc, active_P - 1}}, {{Vc, 1}}); + p.copyTo(P_new); + Ap.copyTo(AP_new); + contract<2>(AP_new.conj(), AP_new, order_rows, CopyTo, + AP_norm2.kvslice_from_size({{Vc, active_P - 1}}, {{Vc, 1}})); + } + + // Check final residual + if (error_if_not_converged) + { + op(y, r); // r = op(y) + nops += num_cols; + x.scale(-1).addTo(r); + auto normr = norm<1>(r, op.order_t + order_cols); + max_tol = max(div(normr, normr0)); + if (tol > 0 && max_tol > tol) + throw std::runtime_error("gcr didn't converged and you ask for checking the error"); + } + + // Report iteration + if (verb >= JustSummary) + QDPIO::cout << prefix << " MGPROTON GCR summary #its.: " << it + << " max rel. 
residual: " << detail::tostr(max_tol, 2) << " matvecs: " << nops + << " precs: " << nprecs << std::endl; + } + + template + Operator getSolver(const Operator& op, const Options& ops, + const Operator& prec = Operator(), + SolverSpace solverSpace = FullSpace); + + template + Projector getProjector(const Operator& op, const Options& ops); + + namespace detail + { + /// Apply the given function to all the given columns in x + /// \param x: tensor to apply to fun + /// \param y: output tensor + /// \param max_rhs: maximum number of columns to apply `fun` at once + /// \param fun: function to apply + /// \param d: column dimension + + template + inline void foreachInChuncks(const Tensor& x, Tensor& y, unsigned int max_rhs, + const Func& fun, char d = 'n') + { + unsigned int n = x.kvdim().at(d); + if (max_rhs == 0) + max_rhs = n; + for (unsigned int i = 0, step = std::min(max_rhs, n); i < n; + i += step, step = std::min(max_rhs, n - i)) + fun(x.kvslice_from_size({{d, i}}, {{d, step}}), + y.kvslice_from_size({{d, i}}, {{d, step}})); + } + + /// Returns a FGMRES solver + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver from `solvers` and influence the solver construction + + template + Operator getFGMRESSolver(Operator op, const Options& ops, + Operator prec_) + { + // Get preconditioner + Operator prec; + Maybe precOps = getOptionsMaybe(ops, "prec"); + if (precOps && prec_) + throw std::runtime_error( + "getFGMRESSolver: invalid `prec` tag: the solver got a preconditioner already"); + if (precOps) + prec = getSolver(op, precOps.getSome()); + else if (prec_) + prec = prec_; + + // Get the remainder options + unsigned int max_basis_size = getOption(ops, "max_basis_size", 0); + double tol = getOption(ops, "tol", 0.0); + unsigned int max_its = getOption(ops, "max_its", 0); + if (max_its == 0 && tol <= 0) + ops.throw_error("set either `tol` or `max_its`"); + bool error_if_not_converged = getOption(ops, "error_if_not_converged", 
true); + unsigned int ortho_each_its = getOption(ops, "ortho_each_its", 0); + unsigned int max_residual_updates = getOption(ops, "max_residual_updates", 0); + unsigned int max_simultaneous_rhs = getOption(ops, "max_simultaneous_rhs", 0); + Verbosity verb = getOption(ops, "verbosity", getVerbosityMap(), NoOutput); + std::string prefix = getOption(ops, "prefix", ""); + + // Return the solver + return {[=](const Tensor& x, Tensor y) { + Tracker _t(std::string("fgmres ") + prefix); + _t.arity = x.kvdim().at('n'); + foreachInChuncks( + x, y, max_simultaneous_rhs, + [=](Tensor x, Tensor y) { + fgmres(op, prec, x, y, max_basis_size, tol, max_its, error_if_not_converged, + ortho_each_its, max_residual_updates, false /* no init guess */, verb, + prefix); + }, + 'n'); + }, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + } + + /// Returns a BICGSTAB solver + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver from `solvers` and influence the solver construction + + template + Operator getBicgstabSolver(Operator op, const Options& ops, + Operator prec_) + { + // Get preconditioner + Operator prec; + Maybe precOps = getOptionsMaybe(ops, "prec"); + if (precOps && prec_) + throw std::runtime_error( + "getBicgstabSolver: invalid `prec` tag: the solver got a preconditioner already"); + if (precOps) + prec = getSolver(op, precOps.getSome()); + else if (prec_) + prec = prec_; + + // Get the remainder options + double tol = getOption(ops, "tol", 0.0); + unsigned int max_its = getOption(ops, "max_its", 0); + if (max_its == 0 && tol <= 0) + ops.throw_error("set either `tol` or `max_its`"); + bool error_if_not_converged = getOption(ops, "error_if_not_converged", true); + unsigned int max_residual_updates = getOption(ops, "max_residual_updates", 0); + unsigned int max_simultaneous_rhs = getOption(ops, "max_simultaneous_rhs", 0); + 
Verbosity verb = getOption(ops, "verbosity", getVerbosityMap(), NoOutput); + std::string prefix = getOption(ops, "prefix", ""); + + // Return the solver + return {[=](const Tensor& x, Tensor y) { + Tracker _t(std::string("bicgstab ") + prefix); + _t.arity = x.kvdim().at('n'); + foreachInChuncks( + x, y, max_simultaneous_rhs, + [=](Tensor x, Tensor y) { + bicgstab(op, prec, x, y, tol, max_its, error_if_not_converged, + max_residual_updates, false /* no init guess */, verb, prefix); + }, + 'n'); + }, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + } + + /// Returns a MR solver + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver from `solvers` and influence the solver construction + + template + Operator getMRSolver(Operator op, const Options& ops, + Operator prec_) + { + // Get preconditioner + Operator prec; + Maybe precOps = getOptionsMaybe(ops, "prec"); + if (precOps && prec_) + throw std::runtime_error( + "getMRSolver: invalid `prec` tag: the solver got a preconditioner already"); + if (precOps) + prec = getSolver(op, precOps.getSome()); + else if (prec_) + prec = prec_; + + // Get the remainder options + double tol = getOption(ops, "tol", 0.0); + unsigned int max_its = getOption(ops, "max_its", 0); + if (max_its == 0 && tol <= 0) + ops.throw_error("set either `tol` or `max_its`"); + bool error_if_not_converged = getOption(ops, "error_if_not_converged", true); + unsigned int max_residual_updates = getOption(ops, "max_residual_updates", 0); + unsigned int max_simultaneous_rhs = getOption(ops, "max_simultaneous_rhs", 0); + Verbosity verb = getOption(ops, "verbosity", getVerbosityMap(), NoOutput); + std::string prefix = getOption(ops, "prefix", ""); + + // Return the solver + return {[=](const Tensor& x, Tensor y) { + Tracker _t(std::string("mr ") + prefix); + _t.arity = x.kvdim().at('n'); + foreachInChuncks( + 
x, y, max_simultaneous_rhs, + [=](Tensor x, Tensor y) { + mr(op, prec, x, y, tol, max_its, error_if_not_converged, max_residual_updates, + false /* no init guess */, verb, prefix); + }, + 'n'); + }, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + } + + /// Returns a GCR solver + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver from `solvers` and influence the solver construction + + template + Operator getGCRSolver(Operator op, const Options& ops, + Operator prec_) + { + // Get preconditioner + Operator prec; + Maybe precOps = getOptionsMaybe(ops, "prec"); + if (precOps && prec_) + throw std::runtime_error( + "getMRSolver: invalid `prec` tag: the solver got a preconditioner already"); + if (precOps) + prec = getSolver(op, precOps.getSome()); + else if (prec_) + prec = prec_; + + // Get the remainder options + unsigned int max_basis_size = getOption(ops, "max_basis_size", 3); + double tol = getOption(ops, "tol", 0.0); + unsigned int max_its = getOption(ops, "max_its", 0); + if (max_its == 0 && tol <= 0) + ops.throw_error("set either `tol` or `max_its`"); + bool error_if_not_converged = getOption(ops, "error_if_not_converged", true); + unsigned int max_residual_updates = getOption(ops, "max_residual_updates", 0); + unsigned int max_simultaneous_rhs = getOption(ops, "max_simultaneous_rhs", 0); + Verbosity verb = getOption(ops, "verbosity", getVerbosityMap(), NoOutput); + std::string prefix = getOption(ops, "prefix", ""); + + // Return the solver + return {[=](const Tensor& x, Tensor y) { + Tracker _t(std::string("mr ") + prefix); + _t.arity = x.kvdim().at('n'); + foreachInChuncks( + x, y, max_simultaneous_rhs, + [=](Tensor x, Tensor y) { + gcr(op, prec, x, y, max_basis_size, tol, max_its, error_if_not_converged, + max_residual_updates, false /* no init guess */, verb, prefix); + }, + 'n'); + }, + op.i, + op.d, + 
nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + } + + /// Return the block diagonal of an operator + /// \param op: operator to extract the block diagonal + /// \param block_labels: labels that compose the blocks (and will be the rows) + /// \param m: map from the operator block labels to the new labels for the columns + /// \return: tensor with the block labels as rows and the renamed labels as columns + + template + Tensor + getBlockDiag(const Operator& op, const std::string& block_labels, + const remap& m, + BlockingAsSparseDimensions blockingAsSparseDimensions = ConsiderBlockingSparse) + { + // Get a sparse tensor representation of the operator + remap rd; + SpTensor sop; + int power = 0; // clone only the block diagonal + ColOrdering coBlk = RowMajor; + const std::string prefix = "block diag"; + if (blockingAsSparseDimensions == ConsiderBlockingSparse) + { + auto t = cloneOperatorToSpTensor(op, power, coBlk, false, prefix); + sop = t.first; + rd = t.second; + } + else + { + auto t = cloneUnblockedOperatorToSpTensor(op, power, coBlk, false, prefix); + sop = t.first; + rd = t.second; + } + + // Create tensor to return + const std::string cols = update_order(block_labels, m); + std::string order = + block_labels + cols + detail::remove_dimensions(op.d.order, block_labels); + auto dims = op.d.kvdim(); + for (const auto& it : m) + dims[it.second] = dims[it.first]; + Tensor r = op.d.template make_compatible(order, dims); + + // Copy the blocks + remap m_cols = getNewLabels(cols, sop.data.order); + remap m_blk; + for (const auto& it : m) + m_blk[rd.at(it.first)] = m_cols.at(it.second); + sop.data.rename_dims(m_blk).copyTo(r.rename_dims(m_cols)); + + // Return tensor + return r; + } + + enum class SpinSplitting { + None, // one spin output + Chirality, // two spin output + Full // four spin output (homoiconic coarse operator) + }; + + /// Returns the prolongator constructed 
from + /// \param solvers: map of solvers + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver from `solvers` and influence the solver construction + + template + Operator + getMGProlongator(const Operator& op, unsigned int num_null_vecs, + const std::map& mg_blocking, + const std::map& layout_blocking, + SpinSplitting spin_splitting, const Options& null_vecs_ops, + SolverSpace solverSpace) + { + detail::log(1, "starting getMGProlongator"); + + // Check that the blocking divide the lattice sizes. For an operator with support only + // on the even sites, make sure that the blocking on x is divisible by two + auto opdims = op.d.kvdim(); + int X = op.imgLayout == EvensOnlyLayout ? 2 : opdims.at('X'); + bool x_blocking_divide_X = (mg_blocking.at('x') % X == 0); + if (!x_blocking_divide_X && op.imgLayout == EvensOnlyLayout) + throw std::runtime_error( + "When using even-odd preconditioning, the blocking on x should be divisible by 2"); + for (const auto it : getNatLatticeDims(opdims, op.imgLayout)) + { + if (it.second % mg_blocking.at(it.first) != 0) + throw std::runtime_error("The operator dimensions are not divisible by the blocking"); + } + + // Check that there is enough spins to do splitting + int ns = opdims.at('s'); + if (ns != 1 && ns != 2 && ns != Ns) + throw std::runtime_error("Error in getMGProlongator: Unsupported spin number"); + if (ns == 1) + spin_splitting = SpinSplitting::None; + + const Operator null_solver = + getSolver(op, getOptions(null_vecs_ops, "solver")); + auto eigensolver_ops = getOptionsMaybe(null_vecs_ops, "eigensolver"); + Tensor nv; + if (!eigensolver_ops) + { + // Create the random initial guesses to be used in solving Ax=0 + auto b = op.template make_compatible_img("n", {{'n', num_null_vecs}}); + if (solverSpace == FullSpace) + { + nrand(b); + } + else + { + b.set_zero(); + nrand(b.kvslice_from_size({}, {{'X', 1}})); + } + + // Solve Ax=0 with the random initial guesses + nv = 
null_solver(op(b)); + b.scale(-1).addTo(nv); + b.release(); + } + else + { + // Compute the eigenpairs of inv(op) * g5, the right singular vectors of op + auto eigensolver = SB::getInexactEigensolverGD(null_solver, eigensolver_ops.getSome()); + double tol = getOption(null_vecs_ops, "tol", 0.1); + auto values_vectors = eigensolver(num_null_vecs, tol); + nv = std::get<1>(values_vectors); + } + + Operator V; + auto opdims_nat = opdims; + if (opdims_nat.at('X') != 1) + { + opdims_nat['x'] *= opdims_nat['X']; + opdims_nat['X'] = 1; + } + if (spin_splitting != SpinSplitting::Full) + { + // Do chirality splitting nv2 = [ nv * gpos, nv * gneg ] + auto nv2 = nv; + if (spin_splitting == SpinSplitting::Chirality) + { + nv2 = nv.like_this(none, {{'n', num_null_vecs * 2}}); + auto g5 = getGamma5(ns, OnHost, nv.dist), g5pos = g5.cloneOn(OnHost), + g5neg = g5.cloneOn(OnHost); + for (int i = 0; i < ns; ++i) // make diagonal entries of gpos all positive or zero + g5pos.set({{i, i}}, g5.get({{i, i}}) + COMPLEX{1}); + for (int i = 0; i < ns; ++i) // make diagonal entries of gneg all negative or zero + g5neg.set({{i, i}}, g5.get({{i, i}}) - COMPLEX{1}); + nv2.kvslice_from_size({}, {{'n', num_null_vecs}}) + .contract(g5pos, {{'i', 's'}}, NotConjugate, nv, {{'s', 'j'}}, NotConjugate); + nv2.kvslice_from_size({{'n', num_null_vecs}}, {{'n', num_null_vecs}}) + .contract(g5neg, {{'i', 's'}}, NotConjugate, nv, {{'s', 'j'}}, NotConjugate); + } + nv.release(); + + // Do the blocking, which encompasses the following transformations: + // X0x -> WX0x, 1y -> Y1y, 2z -> Z2z, 3t -> T3t, + // where output X is a singlet dimension, and W,Y,Z, and T have size mg_blocking, + // and output's 0,1,2, and 3 have size layout_blocking, and the output's x,y,z, and t + // have the remaining + + std::map m_blk, m_blk_rev, m_blk_nv; + m_blk_nv = {{"X0x", "WX0x"}, {"1y", "Y1y"}, {"2z", "Z2z"}, {"3t", "T3t"}, + {"n", "cs"}, {"c", "C"}, {"s", "S"}}; + m_blk = {{"X0x", "WX0x"}, {"1y", "Y1y"}, {"2z", "Z2z"}, + {"3t", 
"T3t"}, {"c", "C"}, {"s", "S"}}; + m_blk_rev = {{"WX0x", "X0x"}, {"Y1y", "1y"}, {"Z2z", "2z"}, + {"T3t", "3t"}, {"C", "c"}, {"S", "s"}}; + + if (!x_blocking_divide_X) + nv2 = toNaturalOrdering(nv2); + auto nv_blk = nv2.template reshape_dimensions( + m_blk_nv, + {{'X', 1}, // we don't do even-odd layout on the coarse operator space + {'W', mg_blocking.at('x') / (op.imgLayout == EvensOnlyLayout ? 2 : 1)}, + {'Y', mg_blocking.at('y')}, + {'Z', mg_blocking.at('z')}, + {'T', mg_blocking.at('t')}, + {'0', layout_blocking.at('x')}, + {'1', layout_blocking.at('y')}, + {'2', layout_blocking.at('z')}, + {'3', layout_blocking.at('t')}, + {'c', num_null_vecs}}, + true); + nv2.release(); + + // User even-odd ordering for nv_blk + if (nv_blk.kvdim().at('0') != 1) + throw std::runtime_error("getMGProlongator: unsupported blocking on the x direction"); + + // Do the orthogonalization on each block and chirality + // NOTE: don't check the orthogonality level, it will replicate the inner products on all processes + ortho<6, 1>(nv_blk, "X0123xyzts", "WYZTSC", "c", 4, CheckOrthoLevel::dontCheckOrthoLevel, + JustSummary, "prolongator"); + + // Return the operator + auto nv_blk_eo_dim = nv_blk.kvdim(); + if (X == 2 && nv_blk_eo_dim.at('x') % 2 == 0 && + (nv_blk_eo_dim.at('y') == 1 || nv_blk_eo_dim.at('y') % 2 == 0) && + (nv_blk_eo_dim.at('z') == 1 || nv_blk_eo_dim.at('z') % 2 == 0) && + (nv_blk_eo_dim.at('t') == 1 || nv_blk_eo_dim.at('t') % 2 == 0)) + { + nv_blk_eo_dim['x'] /= 2; + nv_blk_eo_dim['X'] = 2; + } + Tensor d = op.d.like_this(none, nv_blk_eo_dim), i = op.i; + V = Operator{ + [=](const Tensor& x, Tensor y) { + auto y0 = contract(nv_blk, toNaturalOrdering(x), "cs") + .template reshape_dimensions( + m_blk_rev, x_blocking_divide_X ? opdims : opdims_nat, true); + if (!x_blocking_divide_X) + toEvenOddOrdering(y0).copyTo(y); + else + y0.copyTo(y); + }, + d, + i, + [=](const Tensor& x, Tensor y) { + auto x0 = x_blocking_divide_X ? 
x : toNaturalOrdering(x); + auto x_blk = x0.template reshape_dimensions(m_blk, nv_blk.kvdim(), true); + if (nv_blk_eo_dim.at('X') == 1) + contract(nv_blk.conj(), x_blk, "WYZTSC", CopyTo, y); + else + toEvenOddOrdering(contract(nv_blk.conj(), x_blk, "WYZTSC")).copyTo(y); + }, + op.order_t, + nv_blk_eo_dim.at('X') == 2 ? XEvenOddLayout : NaturalLayout, + op.imgLayout, + getNeighborsAfterBlocking(mg_blocking, op.d.kvdim(), op.neighbors, op.imgLayout), + op.preferred_col_ordering, + false /* no Kronecker format */}; + } + else + { + // Do the blocking, which encompasses the following transformations: + // X0x -> WX0x, 1y -> Y1y, 2z -> Z2z, 3t -> T3t, + // where output X is a singlet dimension, and W,Y,Z, and T have size mg_blocking, + // and output's 0,1,2, and 3 have size layout_blocking, and the output's x,y,z, and t + // have the remaining. + // Also, enforce the values to be the same across different spins so that the coarse operator links + // are also the tensor product of a spin matrix and a color matrix + + std::map m_blk, m_blk_rev, m_blk_nv; + m_blk_nv = {{"X0x", "WX0x"}, {"1y", "Y1y"}, {"2z", "Z2z"}, + {"3t", "T3t"}, {"ns", "c"}, {"c", "C"}}; + m_blk = {{"X0x", "WX0x"}, {"1y", "Y1y"}, {"2z", "Z2z"}, {"3t", "T3t"}, {"c", "C"}}; + m_blk_rev = {{"WX0x", "X0x"}, {"Y1y", "1y"}, {"Z2z", "2z"}, {"T3t", "3t"}, {"C", "c"}}; + + if (!x_blocking_divide_X) + nv = toNaturalOrdering(nv); + auto nv_blk = nv.template reshape_dimensions( + m_blk_nv, + {{'X', 1}, // we don't do even-odd layout on the coarse operator space + {'W', mg_blocking.at('x') / (op.imgLayout == EvensOnlyLayout ? 
2 : 1)}, + {'Y', mg_blocking.at('y')}, + {'Z', mg_blocking.at('z')}, + {'T', mg_blocking.at('t')}, + {'0', layout_blocking.at('x')}, + {'1', layout_blocking.at('y')}, + {'2', layout_blocking.at('z')}, + {'3', layout_blocking.at('t')}, + {'c', num_null_vecs * ns}}, + true); + nv.release(); + + // User even-odd ordering for nv_blk + if (nv_blk.kvdim().at('0') != 1) + throw std::runtime_error("getMGProlongator: unsupported blocking on the x direction"); + + // Do the orthogonalization on each block and chirality + // NOTE: don't check the orthogonality level, it will replicate the inner products on all processes + ortho<5, 1>(nv_blk, "X0123xyzt", "WYZTC", "c", 4, CheckOrthoLevel::dontCheckOrthoLevel, + JustSummary, "prolongator"); + + // Return the operator + auto nv_blk_eo_dim = nv_blk.kvdim(); + if (X == 2 && nv_blk_eo_dim.at('x') % 2 == 0 && + (nv_blk_eo_dim.at('y') == 1 || nv_blk_eo_dim.at('y') % 2 == 0) && + (nv_blk_eo_dim.at('z') == 1 || nv_blk_eo_dim.at('z') % 2 == 0) && + (nv_blk_eo_dim.at('t') == 1 || nv_blk_eo_dim.at('t') % 2 == 0)) + { + nv_blk_eo_dim['x'] /= 2; + nv_blk_eo_dim['X'] = 2; + } + Tensor d = op.d.like_this(none, nv_blk_eo_dim), i = op.i; + V = Operator{ + [=](const Tensor& x, Tensor y) { + auto y0 = contract(nv_blk, toNaturalOrdering(x), "c") + .template reshape_dimensions( + m_blk_rev, x_blocking_divide_X ? opdims : opdims_nat, true); + if (!x_blocking_divide_X) + toEvenOddOrdering(y0).copyTo(y); + else + y0.copyTo(y); + }, + d, + i, + [=](const Tensor& x, Tensor y) { + auto x0 = x_blocking_divide_X ? x : toNaturalOrdering(x); + auto x_blk = x0.template reshape_dimensions(m_blk, nv_blk.kvdim(), true); + if (nv_blk_eo_dim.at('X') == 1) + contract(nv_blk.conj(), x_blk, "WYZTC", CopyTo, y); + else + toEvenOddOrdering(contract(nv_blk.conj(), x_blk, "WYZTC")).copyTo(y); + }, + op.order_t, + nv_blk_eo_dim.at('X') == 2 ? 
XEvenOddLayout : NaturalLayout, + op.imgLayout, + getNeighborsAfterBlocking(mg_blocking, op.d.kvdim(), op.neighbors, op.imgLayout), + op.preferred_col_ordering, + true /* using Kronecker format */}; + } + + // Test that the transpose of the prolongator is consistent + { + auto xd = V.template make_compatible_dom("nk", {{'n', 4}, {'k', 1}}); + auto xi = V.template make_compatible_img("m", {{'m', 4}}); + nrand(xd); + nrand(xi); + auto xid = contract<3>(xi.conj(), V(xd), V.i.order).make_sure(none, OnHost, OnMaster); + auto xdi = + contract<3>(xd.conj(), V.tconj()(xi), V.d.order).make_sure(none, OnHost, OnMaster); + xid.conj().scale(-1).addTo(xdi); + double err = std::fabs(norm<1>(xdi, "k").get(Coor<1>{0})); + double rel = std::fabs(norm<1>(xid, "k").get(Coor<1>{0})); + if (err > rel * 1e-4) + throw std::runtime_error("The prolongator has an inconsistent transpose"); + } + + return V; + } + + /// Return a list of destroy callbacks after setting a solver + + inline std::vector& getEvenOddOperatorsCacheDestroyList() + { + static std::vector list; + return list; + } + + /// Call the destroy callbacks set up in `getEvenOddOperatorsCacheDestroyList` + inline void cleanEvenOddOperatorsCache() + { + for (const auto& f : getEvenOddOperatorsCacheDestroyList()) + f(); + } + + /// Tuple storing the operator even-odd and odd-even parts and the block diagonal and its inverse + + template + using EvenOddOperatorParts = std::tuple, Operator, + Tensor, Tensor>; + + /// Return the cache of block diagonals for even-odd operators generated by getEvenOddPrec + + template + std::map>& getEvenOddOperatorsPartsCache() + { + static std::map> m = []() { + getEvenOddOperatorsCacheDestroyList().push_back( + []() { getEvenOddOperatorsPartsCache().clear(); }); + return std::map>{}; + }(); + return m; + } + + /// Return a cache for the prolongators + /// NOTE: this one isn't destroyed by `cleanEvenOddOperatorsCache` + + template + static std::map>& getProlongatorCache() + { + static std::map> m = 
[] { + getDestroyList().push_back([] { getProlongatorCache().clear(); }); + return std::map>{}; + }(); + return m; + } + + /// Return the prolongator and the coarse operator and spin splitting + template + std::tuple, Operator, SpinSplitting> + getProlongatorAndCoarse(Operator op, const Options& ops, + SolverSpace solverSpace) + { + // Get prolongator, V + unsigned int num_null_vecs = getOption(ops, "num_null_vecs"); + std::vector mg_blocking_v = + getOption>(ops, "blocking"); + if (mg_blocking_v.size() != Nd) + ops.getValue("blocking") + .throw_error("getMGPrec: the blocking should be a vector with four elements"); + std::map mg_blocking{{'x', mg_blocking_v[0]}, + {'y', mg_blocking_v[1]}, + {'z', mg_blocking_v[2]}, + {'t', mg_blocking_v[3]}}; + std::vector layout_blocking_v = getOption>( + ops, "coarse_layout_blocking", std::vector{{1, 1, 1, 1}}); + if (layout_blocking_v.size() != Nd) + ops.getValue("coarse_layout_blocking") + .throw_error("getMGPrec: the blocking should be a vector with four elements"); + std::map layout_blocking{{'x', layout_blocking_v[0]}, + {'y', layout_blocking_v[1]}, + {'z', layout_blocking_v[2]}, + {'t', layout_blocking_v[3]}}; + static const std::map m_spin_splitting{ + {"none", SpinSplitting::None}, + {"chirality_splitting", SpinSplitting::Chirality}, + {"full", SpinSplitting::Full}}; + SpinSplitting spin_splitting = getOption( + ops, "spin_splitting", m_spin_splitting, SpinSplitting::Chirality); + + // Grab the prolongator from cache if the user name it + Operator V; + std::string prolongator_id = getOption(ops, "prolongator_id", ""); + if (prolongator_id.size() == 0 || + getProlongatorCache().count(prolongator_id) == 0) + { + V = getMGProlongator(op, num_null_vecs, mg_blocking, layout_blocking, spin_splitting, + getOptions(ops, "null_vecs"), solverSpace); + if (prolongator_id.size() > 0) + getProlongatorCache()[prolongator_id] = V; + } + else + { + QDPIO::cout << "Found prolongator for id " << prolongator_id << std::endl; + V = 
getProlongatorCache().at(prolongator_id); + } + + // Compute the coarse operator, either V' * op * V or V' * op * g5 * V + unsigned int create_coarse_max_rhs = + getOption(ops, "create_coarse_max_rhs", 0); + ColOrdering co = getOption(ops, "operator_ordering", getColOrderingMap(), + op.preferred_col_ordering); + ColOrdering co_blk = + getOption(ops, "operator_block_ordering", getColOrderingMap(), RowMajor); + int ns = op.d.kvdim().at('s'); + auto g5 = getGamma5(ns, op.d.getDev(), op.d.dist); + const Operator op_c = cloneOperator( + Operator{ + [&](const Tensor& x, Tensor y) { + foreachInChuncks(x, y, create_coarse_max_rhs, + [&](Tensor x, Tensor y) { + if (spin_splitting != SpinSplitting::None || ns == 1) + { + V.tconj()(op(V(x)), y); + } + else + { + V.tconj()( + contract(g5.rename_dims({{'j', 's'}}), op(V(x)), "s") + .rename_dims({{'i', 's'}}), + y); + } + }); + }, + V.d, V.d, nullptr, op.order_t, V.domLayout, V.domLayout, V.neighbors, + op.preferred_col_ordering, V.is_kronecker()}, + co, co_blk, ConsiderBlockingSparse, "coarse"); + + return {V, op_c, spin_splitting}; + } + + /// Returns a MG preconditioner. + /// + /// It returns an approximation of Op^{-1} = Op^{-1}*Q + Op^{-1}(I-Q), where Q is a projector + /// on the left singular space of Op. (MG can be derived also from P*Op^{-1} + (I-P)*Op^{-1}, + /// where P is on the right singular space, producing pre-smoothers instead.) + /// The approximation is constructed using an oblique projector and doing the inversions + /// approximately: + /// 1) Q = Op*V*(W'*Op*V)^{-1}*W', where W and V are left and right singular spaces. + /// 2) [Op^{-1}*Q + Op^{-1}(I-Q)]*x \approx + /// V*solver(W'*Op*V, W'*x) + solver(Op, x - Op*V*solver(W'*Op*V, W'*x)) + /// Note that if Op is \gamma_5-Hermitian, and W=\gamma_5*V, and \gamma_5 commutes with V + /// (\gamma_5*V = V*\gamma_5^coarse), then the projector reduces to Q=Op*V(V'*Op*V)^{-1}*V', + /// and, this coarse operator is easier to solve than V'\gamma_5*Op*V. 
+ /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getMGPrec(Operator op, const Options& ops, + Operator prec_, SolverSpace solverSpace) + { + if (prec_) + throw std::runtime_error("getMGPrec: unsupported input preconditioner"); + + // Get prolongator and coarse operator + auto prolongator_coarse_spin_splitting = getProlongatorAndCoarse(op, ops, solverSpace); + Operator V = std::get<0>(prolongator_coarse_spin_splitting); + Operator op_c = std::get<1>(prolongator_coarse_spin_splitting); + SpinSplitting spin_splitting = std::get<2>(prolongator_coarse_spin_splitting); + + // Get the solver for the projector + const Operator coarseSolver = + getSolver(op_c, getOptions(ops, "solver_coarse")); + + // Get the solver for the smoother + const Operator opSolver = getSolver(op, getOptions(ops, "solver_smoother")); + + OperatorFun solver; + int ns = op.d.kvdim().at('s'); + auto g5 = getGamma5(ns, op.d.getDev(), op.d.dist); + std::string prefix = getOption(ops, "prefix", ""); + if (getEvenOddOperatorsPartsCache().count(op.id.get()) == 0) + { + solver = [=](const Tensor& x, Tensor y) { + Tracker _t(std::string("mg solver ") + prefix); + + // x0 = g5 * x if !do_chirality_splitting && ns > 1 + auto x0 = x; + if (spin_splitting == SpinSplitting::None && ns > 1) + { + x0 = + contract(g5.rename_dims({{'j', 's'}}), x, "s").rename_dims({{'i', 's'}}); + } + + // y0 = V*solver(V'*Op*V, V'x0) + auto y0 = V(coarseSolver(V.tconj()(x0))); + + // x1 = x - op*y0 + auto x1 = op(y0.scale(-1)); + x.addTo(x1); + + // y = y0 + solver(Op, x1) + opSolver(std::move(x1), y); + y0.addTo(y); + }; + } + else + { + // Get the block diagonal of the operator with rows cs and columns CS + remap m_sc{{'s', 'S'}, {'c', 'C'}}; + auto t = getEvenOddOperatorsPartsCache().at(op.id.get()); + Operator op_eo = std::get<0>(t); + Operator op_oe = std::get<1>(t); + Tensor opDiag = std::get<2>(t); + solver = [=](const 
Tensor& x, Tensor y) { + Tracker _t(std::string("mg solver ") + prefix); + + // x0 = g5 * x if !do_chirality_splitting && ns > 1 + auto x0 = x; + if (spin_splitting == SpinSplitting::None && ns > 1) + { + x0 = + contract(g5.rename_dims({{'j', 's'}}), x, "s").rename_dims({{'i', 's'}}); + } + + // y0 = V*solver(V'*Op*V, V'x0) + auto y0 = V(coarseSolver(V.tconj()(x0))); + + // x1_ee = x - op*y0 + auto y0m = y0.scale(-1); + auto x1 = x.clone(); + contract(opDiag, y0m.rename_dims(m_sc), "CS", AddTo, x1); + op_oe(y0m.kvslice_from_size({{'X', 0}}, {{'X', 1}})) + .addTo(x1.kvslice_from_size({{'X', 1}}, {{'X', 1}})); + op_eo(y0m.kvslice_from_size({{'X', 1}}, {{'X', 1}})) + .addTo(x1.kvslice_from_size({{'X', 0}}, {{'X', 1}})); + + // y = y1 + solver(Op, x1) + opSolver(std::move(x1), y); + y0.addTo(y); + }; + } + + // Return the solver + return {solver, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + } + + /// Returns a general deflation preconditioner + /// + /// It returns an approximation of Op^{-1} = Op^{-1}*Q + Op^{-1}(I-Q), where Q is a projector + /// on the left singular space of Op. + /// The approximation is constructed using an oblique projector and doing the inversions + /// approximately: + /// 1) Q = Op*V*(W'*Op*V)^{-1}*W', where W and V are left and right singular spaces. 
+ /// 2) [Op^{-1}*Q + Op^{-1}(I-Q)]*x \approx + /// V*solver(W'*Op*V, W'*x) + solver(Op, x - Op*V*solver(W'*Op*V, W'*x)) + /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getProjPrec(Operator op, const Options& ops, + Operator prec_) + { + if (prec_) + throw std::runtime_error("getProjPrec: unsupported input preconditioner"); + + // Getting a projector + auto proj = getProjector(op, getOptions(ops, "proj")); + + // Get the solver for the smoother + const Operator opSolver = getSolver(op, getOptions(ops, "solver_smoother")); + + OperatorFun solver; + std::string prefix = getOption(ops, "prefix", ""); + if (getEvenOddOperatorsPartsCache().count(op.id.get()) == 0) + { + solver = [=](const Tensor& x, Tensor y) { + Tracker _t(std::string("proj solver ") + prefix); + + // y0 = V*inv(U'*Op*V) U' * x + auto y0 = proj.V_inv_Ut(x); + + // y1 = (I - Q)*x = x - op*y0 + auto y1 = op(y0.scale(-1)); + x.addTo(y1); + + // y = y0 + solver(Op, y1) + opSolver(std::move(y1), y); + y0.addTo(y); + }; + } + else + { + // Get the block diagonal of the operator with rows cs and columns CS + remap m_sc{{'s', 'S'}, {'c', 'C'}}; + auto t = getEvenOddOperatorsPartsCache().at(op.id.get()); + Operator op_eo = std::get<0>(t); + Operator op_oe = std::get<1>(t); + Tensor opDiag = std::get<2>(t); + solver = [=](const Tensor& x, Tensor y) { + Tracker _t(std::string("proj solver ") + prefix); + + // y0 = V*inv(U'*Op*V) U' * x + auto y0 = proj.V_inv_Ut(x); + + // y1_ee = x - op*y0 + auto y0m = y0.scale(-1); + auto y1 = x.clone(); + contract(opDiag, y0m.rename_dims(m_sc), "CS", AddTo, y1); + op_oe(y0m.kvslice_from_size({{'X', 0}}, {{'X', 1}})) + .addTo(y1.kvslice_from_size({{'X', 1}}, {{'X', 1}})); + op_eo(y0m.kvslice_from_size({{'X', 1}}, {{'X', 1}})) + .addTo(y1.kvslice_from_size({{'X', 0}}, {{'X', 1}})); + + // y = y1 + solver(Op, y1) + opSolver(std::move(y1), y); + y0.addTo(y); + }; + } + + // 
Return the solver + return {solver, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + } + + /// Returns an even-odd preconditioner. + /// + /// It returns an approximation of Op^{-1} by splitting the rows and columns into the two + /// (red-black, even-odd) and doing: + /// + /// Op^{-1} = R^{-1} * A^{-1} * L^{-1} -> + /// + /// [ Op_ee Op_eo ]^{-1} = [ I 0 ] * [ Op_ee-Op_eo*Op_oo^{-1}*Op_oe 0 ]^{-1} * + /// [ Op_oe Op_oo ] [ -Op_oo^{-1}*Op_oe I ] [ 0 Op_oo ] + /// + /// * [ I -Op_eo*Op_oo^{-1} ] + /// [ 0 I ] + /// + /// The matrix Op_oo^{-1} is block diagonal and is computed directly, while + /// (Op_ee-Op_eo*Op_oo^{-1}*Op_oe)^{-1} is approximated by an iterative solver. Note that + /// the residual norm while solving A_ee=Op_ee-Op_eo*Op_oo^{-1}*Op_oe is the same as the original + /// residual norm. To prove that, notice that the global residual will be zero for the odds, + /// and that the global residual for a solution x'_e of A_ee is + /// r = L*A*R * R^{-1}*[x'_e x'_o] - b = [r_e; 0]; + /// therefore ||L^{-1}r|| = ||r|| because of the form of L, making + /// ||r|| = || A_e*x'_e - (b_e-Op_eo*Op_oo^{-1}*b_o) ||. + /// + /// Notice that A_ee^{-1} is Op^{-1}_ee. So a way for preconditioning + /// the solution of A_ee is with a preconditioner on Op restricted to the even sites. The + /// solver does that when options for `prec_ee' are given. + /// + /// Also, A_ee can be additionally preconditioned by the left with Op_ee. Note that left + /// preconditioning will not change the original residual norm. This option is activated + /// when use_Aee_prec is true. 
+ /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getEvenOddPrec(Operator op, const Options& ops, + Operator prec_, SolverSpace solverSpace) + { + auto dims = op.d.kvdim(); + if (dims.count('X') == 0 || dims.at('X') != 2 || op.imgLayout != XEvenOddLayout) + ops.throw_error( + "getEvenOddPrec: only supported on explicitly colored operators with two colors"); + + if (getFurthestNeighborDistance(op) != 1) + throw std::runtime_error( + "getEvenOddPrec: not supported for operators with other than distance-1 neighbors"); + + if (prec_) + throw std::runtime_error("getEvenOddPrec: unsupported input preconditioner"); + + bool use_Aee_prec = getOption(ops, "use_Aee_prec", false); + std::string prefix = getOption(ops, "prefix", ""); + + // Get the block diagonal of the operator with rows cs and columns CS + remap m_sc{{'s', 'S'}, {'c', 'C'}}; + Tensor opDiag, opInvDiag; + Operator op_eo, op_oe; + if (getEvenOddOperatorsPartsCache().count(op.id.get()) == 0) + { + opDiag = getBlockDiag(op, "cs", m_sc); // rows cs, cols CS + opInvDiag = inv(opDiag, "cs", "CS"); + op_eo = op.kvslice_from_size({{'X', 1}}, {{'X', 1}}, {{'X', 0}}, {{'X', 1}}); + op_oe = op.kvslice_from_size({{'X', 0}}, {{'X', 1}}, {{'X', 1}}, {{'X', 1}}); + getEvenOddOperatorsPartsCache()[op.id.get()] = {op_eo, op_oe, opDiag, + opInvDiag}; + } + else + { + auto t = getEvenOddOperatorsPartsCache().at(op.id.get()); + op_eo = std::get<0>(t); + op_oe = std::get<1>(t); + opDiag = std::get<2>(t); + opInvDiag = std::get<3>(t); + } + + // Get an explicit form for A:=Op_ee-Op_eo*Op_oo^{-1}*Op_oe + unsigned int create_operator_max_rhs = + getOption(ops, "create_operator_max_rhs", 0); + ColOrdering co = getOption(ops, "operator_ordering", getColOrderingMap(), + op.preferred_col_ordering); + ColOrdering co_blk = + getOption(ops, "operator_block_ordering", getColOrderingMap(), RowMajor); + unsigned int max_dist_neighbors_opA 
= 2; + + Operator opA{ + use_Aee_prec ? + // Do opA = I - Op_eo * Op_oo^{-1} * Op_oe * Op_ee^{-1} if use_Aee_prec + OperatorFun( + [=](const Tensor& x, Tensor y) { + foreachInChuncks( + x, y, create_operator_max_rhs, + [&](Tensor x, Tensor y) { + Tracker _t(std::string("eo matvec Aee_prec ") + prefix); + + // y = x + x.copyTo(y); + + // y0 = Op_ee^{-1} * x + auto y0 = contract(opInvDiag.kvslice_from_size({{'X', 0}}, {{'X', 1}}), + x.rename_dims(m_sc), "CS"); + + // y1 = Op_oe * y0 + auto y1 = op_oe(std::move(y0)); + + // y2 = Op_oo^{-1} * y1 + auto y2 = contract(opInvDiag.kvslice_from_size({{'X', 1}}, {{'X', 1}}), + std::move(y1).rename_dims(m_sc), "CS"); + + // y += -Op_eo * y2 + op_eo(std::move(y2)).scale(-1).addTo(y); + }); + }) + : + // Otherwise, do opA = Op_ee - Op_eo * Op_oo^{-1} * Op_oe + OperatorFun( + [=](const Tensor& x, Tensor y) { + foreachInChuncks(x, y, create_operator_max_rhs, + [&](Tensor x, Tensor y) { + Tracker _t(std::string("eo matvec ") + prefix); + + // y = Op_ee * x + contract(opDiag.kvslice_from_size({{'X', 0}}, {{'X', 1}}), + x.rename_dims(m_sc), "CS", CopyTo, y); + + // y1 = Op_oe * x + auto y1 = op_oe(x); + + // y2 = Op_oo^{-1} * y1 + auto y2 = contract( + opInvDiag.kvslice_from_size({{'X', 1}}, {{'X', 1}}), + std::move(y1).rename_dims(m_sc), "CS"); + + // y += -Op_eo * y2 + op_eo(std::move(y2)).scale(-1).addTo(y); + }); + }), + op.d.kvslice_from_size({{'X', 0}}, {{'X', 1}}), + op.i.kvslice_from_size({{'X', 0}}, {{'X', 1}}), + nullptr, + op.order_t, + EvensOnlyLayout, + EvensOnlyLayout, + getNeighbors(op.i.kvdim(), max_dist_neighbors_opA, EvensOnlyLayout), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + + // Get preconditioner + auto precOps = getOptionsMaybe(ops, "prec_ee"); + Operator prec_ee; + if (precOps) + { + auto prec_ee0 = getSolver(op, precOps.getSome(), {}, OnlyEvensSpace) + .kvslice_from_size({}, {{'X', 1}}, {}, {{'X', 1}}); + if (use_Aee_prec) + { + prec_ee = Operator{ + [=](const Tensor& x, Tensor y) { 
+ Tracker _t(std::string("eo prec_ee ") + prefix); + + // y = Op_ee * prec_ee0(x) + contract(opDiag.kvslice_from_size({{'X', 0}}, {{'X', 1}}), + prec_ee0(x).rename_dims(m_sc), "CS", CopyTo, y); + }, + prec_ee0.d, prec_ee0.i, nullptr, prec_ee0}; + } + else + { + prec_ee = prec_ee0; + } + } + + // Get solver on opA + const Operator solver = getSolver(opA, getOptions(ops, "solver"), prec_ee); + + // Create the solver + Operator rop{ + [=](const Tensor& x, Tensor y) { + Tracker _t(std::string("eo solver ") + prefix); + + // be = x_e - Op_eo*Op_oo^{-1}*x_o + Tensor be; + if (solverSpace == FullSpace) + { + be = solver.template make_compatible_img("n", {{'n', x.kvdim().at('n')}}); + x.kvslice_from_size({{'X', 0}}, {{'X', 1}}).copyTo(be); + op_eo(contract(opInvDiag.kvslice_from_size({{'X', 1}}, {{'X', 1}}), + x.kvslice_from_size({{'X', 1}}, {{'X', 1}}).rename_dims(m_sc), + "CS")) + .scale(-1) + .addTo(be); + } + else + { + be = x.kvslice_from_size({{'X', 0}}, {{'X', 1}}); + } + + // Solve opA * y_e = be + if (solverSpace != FullSpace) + y.set_zero(); + auto ye = y.kvslice_from_size({{'X', 0}}, {{'X', 1}}); + solver(be, ye); + + // Do y_e = Op_ee^{-1} y_e if use_Aee_prec + if (use_Aee_prec) + { + contract(opInvDiag.kvslice_from_size({{'X', 0}}, {{'X', 1}}), + ye.rename_dims(m_sc), "CS") + .copyTo(ye); + } + + // y_o = Op_oo^{-1}*(-Op_oe*y_e + x_o) + if (solverSpace == FullSpace) + { + auto yo0 = be; + x.kvslice_from_size({{'X', 1}}, {{'X', 1}}).copyTo(yo0); + op_oe(ye).scale(-1).addTo(yo0); + contract(opInvDiag.kvslice_from_size({{'X', 1}}, {{'X', 1}}), + yo0.rename_dims(m_sc), "CS", CopyTo, + y.kvslice_from_size({{'X', 1}}, {{'X', 1}})); + } + }, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + + // Do a test + if (superbblas::getDebugLevel() > 0) + { + auto x = op.template make_compatible_img("n", {{'n', 2}}); + urand(x, -1, 1); + auto y = op(rop(x)); + 
x.scale(-1).addTo(y); + auto normx = norm<1>(x, "n"); + auto normdiff = norm<1>(y, "n"); + double max_err = 0; + for (int i = 0, vol = normdiff.volume(); i < vol; ++i) + max_err = std::max(max_err, (double)normdiff.get({{i}}) / normx.get({{i}})); + QDPIO::cout << " eo prec error: " << detail::tostr(max_err) << std::endl; + } + + return rop; + } + + /// Returns Schur complement preconditioner splitting over the spin components + /// + /// It returns an approximation of Op^{-1} by splitting the rows and columns into the two + /// (even-odd spin components) and doing: + /// + /// Op^{-1} = R^{-1} * A^{-1} * L^{-1} -> + /// + /// [ Op_ee Op_eo ]^{-1} = [ I 0 ] * [ Op_ee-Op_eo*Op_oo^{-1}*Op_oe 0 ]^{-1} * + /// [ Op_oe Op_oo ] [ -Op_oo^{-1}*Op_oe I ] [ 0 Op_oo ] + /// + /// * [ I -Op_eo*Op_oo^{-1} ] + /// [ 0 I ] + /// + /// The matrix Op_oo^{-1} is block diagonal and is computed directly, while + /// (Op_ee-Op_eo*Op_oo^{-1}*Op_oe)^{-1} is approximated by an iterative solver. Note that + /// the residual norm while solving A_ee=Op_ee-Op_eo*Op_oo^{-1}*Op_oe is the same as the original + /// residual norm. To prove that, notice that the global residual will be zero for the odds, + /// and that the global residual for a solution x'_e of A_ee is + /// r = L*A*R * R^{-1}*[x'_e x'_o] - b = [r_e; 0]; + /// therefore ||L^{-1}r|| = ||r|| because of the form of L, making + /// ||r|| = || A_e*x'_e - (b_e-Op_eo*Op_oo^{-1}*b_o) ||. + /// + /// Notice that A_ee^{-1} is Op^{-1}_ee. So a way for preconditioning + /// the solution of A_ee is with a preconditioner on Op restricted to the even sites. The + /// solver does that when options for `prec_ee' are given. + /// + /// Also, A_ee can be additionally preconditioned by the left with Op_ee. Note that left + /// preconditioning will not change the original residual norm. This option is activated + /// when use_Aee_prec is true. 
+ /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getSpinEvenOddPrec(Operator op, const Options& ops, + Operator prec_) + { + auto dims = op.d.kvdim(); + if (dims.count('s') == 0 || dims.at('s') % 2 != 0) + ops.throw_error( + "getSpinEvenOddPrec: only supported on operators with even spin components"); + + if (prec_) + throw std::runtime_error("getSpinEvenOddPrec: unsupported input preconditioner"); + + std::string prefix = getOption(ops, "prefix", ""); + + // Partition the operator: s -> Ss, where S is the spin oddity + char S = detail::get_free_label(op.d.order); + std::string Ss = std::string({S, 's'}); + int ns = dims.at('s'); + auto eg_d = op.d.like_this(none, {{'s', ns / 2}}).make_eg(); + auto eg_i = op.i.like_this(none, {{'s', ns / 2}}).make_eg(); + Operator op_ee{ + [=](const Tensor& x, Tensor y) { + auto x0 = op.template make_compatible_img("n", {{'n', x.kvdim().at('n')}}); + x0.set_zero(); + x.copyTo(x0.template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}) + .kvslice_from_size({{S, 0}}, {{S, 1}})); + op(x0) + .template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}) + .kvslice_from_size({{S, 0}}, {{S, 1}}) + .copyTo(y); + }, + eg_d, eg_i, nullptr, op}; + Operator op_eo{ + [=](const Tensor& x, Tensor y) { + auto x0 = op.template make_compatible_img("n", {{'n', x.kvdim().at('n')}}); + x0.set_zero(); + x.copyTo(x0.template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}) + .kvslice_from_size({{S, 1}}, {{S, 1}})); + op(x0) + .template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}) + .kvslice_from_size({{S, 0}}, {{S, 1}}) + .copyTo(y); + }, + eg_d, eg_i, nullptr, op}; + + Operator op_oe{ + [=](const Tensor& x, Tensor y) { + auto x0 = op.template make_compatible_img("n", {{'n', x.kvdim().at('n')}}); + x0.set_zero(); + x.copyTo(x0.template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}) + .kvslice_from_size({{S, 0}}, {{S, 1}})); + op(x0) + 
.template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}) + .kvslice_from_size({{S, 1}}, {{S, 1}}) + .copyTo(y); + }, + eg_d, eg_i, nullptr, op}; + + Operator op_oo{ + [=](const Tensor& x, Tensor y) { + auto x0 = op.template make_compatible_img("n", {{'n', x.kvdim().at('n')}}); + x0.set_zero(); + x.copyTo(x0.template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}) + .kvslice_from_size({{S, 1}}, {{S, 1}})); + op(x0) + .template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}) + .kvslice_from_size({{S, 1}}, {{S, 1}}) + .copyTo(y); + }, + eg_d, eg_i, nullptr, op}; + + + // Get solver on op_oo + const auto solver_oo = getSolver(op_oo, getOptions(ops, "solver_oo")); + + // Get an explicit form for A = Op_ee-Op_eo*Op_oo^{-1}*Op_oe + unsigned int max_dist_neighbors_opA = 2; + Operator opA{ + [=](const Tensor& x, Tensor y) { + Tracker _t(std::string("spin eo matvec ") + prefix); + + // y = Op_ee * x + op_ee(x, y); + + // y1 = Op_oe * x + auto y1 = op_oe(x); + + // y2 = Op_oo^{-1} * y1 + auto y2 = solver_oo(y1); + + // y += -Op_eo * y2 + op_eo(std::move(y2)).scale(-1).addTo(y); + }, + eg_d, + eg_i, + nullptr, + op.order_t, + op.domLayout, + op.imgLayout, + getNeighbors(op.i.kvdim(), max_dist_neighbors_opA, op.imgLayout), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + + // Get solver on opA + const auto solver = getSolver(opA, getOptions(ops, "solver_A")); + + // Create the solver + Operator rop{ + [=](const Tensor& x, Tensor y) { + Tracker _t(std::string("spin eo solver ") + prefix); + + auto x_eo = x.template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}); + auto y_eo = y.template split_dimension('s', Ss, {{S, 2}, {'s', ns / 2}}); + + // be = x_e - Op_eo*Op_oo^{-1}*x_o + auto x_e = x_eo.kvslice_from_size({{S, 0}}, {{S, 1}}); + auto x_o = x_eo.kvslice_from_size({{S, 1}}, {{S, 1}}); + auto b_e = op_eo(solver_oo(x_o).scale(-1)); + x_e.addTo(b_e); + + // Solve opA * y_e = be + auto y_e = y_eo.kvslice_from_size({{S, 0}}, {{S, 1}}); + auto y_o = 
y_eo.kvslice_from_size({{S, 1}}, {{S, 1}}); + solver(b_e, y_e); + + // y_o = Op_oo^{-1}*(-Op_oe*y_e + x_o) + auto yo0 = b_e; + x_o.copyTo(yo0); + op_oe(y_e).scale(-1).addTo(yo0); + solver_oo(yo0, y_o); + }, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + + // Do a test + if (superbblas::getDebugLevel() > 0) + { + auto x = op.template make_compatible_img("n", {{'n', 2}}); + urand(x, -1, 1); + auto y = op(rop(x)); + x.scale(-1).addTo(y); + auto normx = norm<1>(x, "n"); + auto normdiff = norm<1>(y, "n"); + double max_err = 0; + for (int i = 0, vol = normdiff.volume(); i < vol; ++i) + max_err = std::max(max_err, (double)normdiff.get({{i}}) / normx.get({{i}})); + QDPIO::cout << " spin eo prec error: " << detail::tostr(max_err) << std::endl; + } + + return rop; + } + + enum LandSeaDomain { Land, Sea, All }; + + /// Returns a generic two domains preconditioner. + /// + /// It returns an approximation of Op^{-1} by splitting the rows and columns into the two + /// domains (islands, domains without holes, and sea, the domain which connect all the domains) and doing: + /// + /// [ Op_ss Op_si ]^{-1} = [ I 0 ] * [ Op_ss-Op_si*Op_ii^{-1}*Op_is 0 ]^{-1} * + /// [ Op_is Op_ii ] [ -Op_ii^{-1}*Op_is I ] [ 0 Op_ii ] + /// + /// * [ I -Op_si*Op_ii^{-1} ] + /// [ 0 I ] + /// + /// The matrix Op_ii^{-1} is block diagonal, while + /// (Op_ss-Op_si*Op_ii^{-1}*Op_is)^{-1} should be close to Op_ss^{-1}. Note that + /// the residual norm while solving A_ss=Op_ss-Op_si*Op_ii^{-1}*Op_is is the same as the original + /// residual norm if the linear systems with Op_ii are solved exactly. See a prove in the comment of `getEvenOddPrec`. 
+ /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getHierarchicalPrec(Operator op, const Options& ops, + Operator prec_, + LandSeaDomain land_sea_domain = All, + std::array given_divisions = {}, + std::array given_land = {}) + { + auto dims = op.d.kvdim(); + + if (prec_) + throw std::runtime_error("getHierarchicalPrec: unsupported input preconditioner"); + if (!op.sp) + throw std::runtime_error("getHierarchicalPrec: unsupported implicit operators"); + + std::string prefix = getOption(ops, "prefix", ""); + auto divisions = land_sea_domain == All + ? getOption>(ops, "divisions") + : given_divisions; + auto land = + land_sea_domain == All ? getOption>(ops, "land") : given_land; + bool use_red_black = getOption(ops, "use_red_black", false); + + // Check divisions and land + if (divisions == std::array{{}}) + ops.getValue("divisions").throw_error("the four values cannot be zero"); + std::array latt_size{dims.at('x'), dims.at('y'), dims.at('z'), dims.at('t')}; + for (std::size_t i = 0; i < 4; ++i) + if (divisions[i] > latt_size[i]) + divisions[i] = latt_size[i]; + for (std::size_t i = 0; i < 4; ++i) + { + if (divisions[i] == 0) + continue; + if (land[i] == 0 || land[i] >= latt_size[i] / divisions[i]) + ops.getValue("land").throw_error( + "values cannot be zero nor the total size of the divisision"); + } + + // Function + // The i-th division starts at T / divisions * i + std::min(i, T % divisions). + const auto part_ith_starts = [&](int i, int d) { + return latt_size[d] / divisions[d] * i + std::min(i, latt_size[d] % (int)divisions[d]); + }; + // So all coordinates before division T % divisions have T / divisions + 1 elements, + // and the others have T / divisions elements. 
+ std::array first_coor_within_small_partition, partition_size_large_partition, + partition_size_small_partition; + for (int d = 0; d < 4; ++d) + { + if (divisions[d] == 0) + continue; + first_coor_within_small_partition[d] = part_ith_starts(latt_size[d] % divisions[d], d); + partition_size_large_partition[d] = latt_size[d] / divisions[d] + 1; + partition_size_small_partition[d] = latt_size[d] / divisions[d]; + } + + const auto is_true_island = [&](const std::array& coor) { + // The coordinate is island if for all components is island + int num_sea = 0; + for (int d = 0; d < 4; ++d) + { + if (divisions[d] == 0) + continue; + int c = coor[d], disp, land_size; + if (c < first_coor_within_small_partition[d]) + { + disp = c - part_ith_starts(c / partition_size_large_partition[d], d); + land_size = land[d]; + } + else + { + disp = c - part_ith_starts(latt_size[d] % divisions[d] + + (c - first_coor_within_small_partition[d]) / + partition_size_small_partition[d], + d); + land_size = land[d]; + } + if (disp >= land_size) + { + if (!use_red_black) + return false; + else + ++num_sea; + } + } + if (!use_red_black) + return true; + else + return num_sea % 2 == 1; + }; + // Border of the island + const auto is_coast = [&](const std::array& coor) { + // The coordinate is land border if for all components is island and for some component is border + bool r = false; + for (int d = 0; d < 4; ++d) + { + if (divisions[d] == 0) + continue; + int c = coor[d], disp, land_size; + if (c < first_coor_within_small_partition[d]) + { + disp = c - part_ith_starts(c / partition_size_large_partition[d], d); + land_size = land[d]; + } + else + { + disp = c - part_ith_starts(latt_size[d] % divisions[d] + + (c - first_coor_within_small_partition[d]) / + partition_size_small_partition[d], + d); + land_size = land[d]; + } + if (disp >= land_size) + return false; + if (disp == 0 || disp == land_size - 1) + r = true; + } + return r; + }; + + // Border of the sea + const auto is_littoral = [&](const 
std::array& coor) { + // The coordinate is sea border if for all components is sea and for some component is border + bool r = false; + for (int d = 0; d < 4; ++d) + { + if (divisions[d] == 0) + continue; + int c = coor[d], disp, land_size, size; + if (c < first_coor_within_small_partition[d]) + { + disp = c - part_ith_starts(c / partition_size_large_partition[d], d); + land_size = land[d]; + size = partition_size_large_partition[d]; + } + else + { + disp = c - part_ith_starts(latt_size[d] % divisions[d] + + (c - first_coor_within_small_partition[d]) / + partition_size_small_partition[d], + d); + land_size = land[d]; + size = partition_size_small_partition[d]; + } + if (disp < land_size) + return false; + if (disp == land_size || disp == size - 1) + r = true; + } + return r; + }; + + std::array latt_labels{'x', 'y', 'z', 't'}; + std::array idx_dom, idx_img; + for (std::size_t i = 0; i < 4; ++i) + { + idx_dom[i] = + std::find(op.d.order.begin(), op.d.order.end(), latt_labels[i]) - op.d.order.begin(); + idx_img[i] = + std::find(op.i.order.begin(), op.i.order.end(), latt_labels[i]) - op.i.order.begin(); + } + if (idx_dom != idx_img) + throw std::runtime_error("getHierarchicalPrec: unsupported operator, it should have the " + "same domain and image orderings"); + + const auto is_island = [&](const Coor& op_coor) { + std::array latt_coor; + for (std::size_t i = 0; i < 4; ++i) + latt_coor[i] = op_coor[idx_dom[i]]; + switch (land_sea_domain) + { + case All: return is_true_island(latt_coor); + case Land: return is_coast(latt_coor); + case Sea: return is_littoral(latt_coor); + } + return false; // avoid warning + }; + + // return whether the coordinate is image part: island, domain part: island + const auto is_island_island = [&](const Coor& dom_coor, const Coor& img_coor) { + return is_island(dom_coor) & is_island(img_coor); + }; + // return whether the coordinate is image part: island, domain part: sea + const auto is_island_sea = [&](const Coor& dom_coor, const Coor& 
img_coor) { + return !is_island(dom_coor) & is_island(img_coor); + }; + // return whether the coordinate is image part: sea, domain part: island + const auto is_sea_island = [&](const Coor& dom_coor, const Coor& img_coor) { + return is_island(dom_coor) & !is_island(img_coor); + }; + // return whether the coordinate is image part: sea, domain part: sea + const auto is_sea_sea = [&](const Coor& dom_coor, const Coor& img_coor) { + return !is_island(dom_coor) & !is_island(img_coor); + }; + + // Get the block diagonal of the operator with rows cs and columns CS + Operator op_ss = op.kvslice_from_size(is_sea_sea); + Operator op_si = op.kvslice_from_size(is_sea_island); + Operator op_is = op.kvslice_from_size(is_island_sea); + Operator op_ii = op.kvslice_from_size(is_island_island); + Operator id_ii = op.get_identiy().kvslice_from_size(is_island_island); + + // Get solvers for the island and the sea operators + const auto& solver_ssii_ops = getOptions(ops, "sea_land_solver"); + const Operator solver_ss = getSolver(op_ss, solver_ssii_ops); + const Operator solver_ii = getSolver(op_ii, solver_ssii_ops); + + // Get solvers for inside opA + const auto solver_opA_ops = getOptionsMaybe(ops, "schur_solver"); + const auto solver_ssii_schur_ops = + solver_opA_ops ? getOptionsMaybe(ops, "sea_land_schur_solver") : none; + bool is_solver_ssii_schur_hie = + solver_ssii_schur_ops && land_sea_domain == All + ? getOption(solver_ssii_schur_ops.getSome(), "type") == std::string("hie") + : false; + const Operator solver_ss_schur = + solver_ssii_schur_ops + ? (is_solver_ssii_schur_hie + ? getHierarchicalPrec(op_ss, solver_ssii_schur_ops.getSome(), + Operator{}, Sea, divisions, land) + : getSolver(op_ss, solver_ssii_schur_ops.getSome())) + : solver_ss; + const Operator solver_ii_schur = + solver_ssii_schur_ops + ? (is_solver_ssii_schur_hie + ? 
getHierarchicalPrec(op_ii, solver_ssii_schur_ops.getSome(), + Operator{}, Land, divisions, land) + : getSolver(op_ii, solver_ssii_schur_ops.getSome())) + : solver_ii; + + unsigned int create_operator_max_rhs = + getOption(ops, "create_operator_max_rhs", 0); + ColOrdering co = getOption(ops, "operator_ordering", getColOrderingMap(), + op.preferred_col_ordering); + ColOrdering co_blk = + getOption(ops, "operator_block_ordering", getColOrderingMap(), RowMajor); + unsigned int max_dist_neighbors_opA = 2; + + // Do opA = I - Op_si * Op_ii^{-1} * Op_is * Op_ss^{-1} + Operator opA{ + [=](const Tensor& x, Tensor y) { + foreachInChuncks(x, y, create_operator_max_rhs, + [&](Tensor x, Tensor y) { + Tracker _t(std::string("hierarchical matvec ") + prefix); + + // y = x + x.copyTo(y); + + // y -= Op_si * Op_ii^{-1} * Op_is * Op_ss^{-1} * x + op_si(solver_ii_schur(op_is(solver_ss_schur(x)))).scale(-1).addTo(y); + }); + }, + op_ss.d, op_ss.i, nullptr, op_ss}; + const Operator opA_solver = + solver_opA_ops ? getSolver(opA, solver_opA_ops.getSome()) : Operator{}; + + // Create the solver + Operator rop{ + [=](const Tensor& x, Tensor y) { + Tracker _t(std::string("hierarchical solver ") + prefix); + + // b_s = x_s - Op_si*Op_ii^{-1}*x_i = x - x_i - Op_si*Op_ii^{-1}*x_i + auto x_i = id_ii(x); + Tensor b_s = op_si(solver_ii(x_i.scale(-1))); + x.addTo(b_s); + x_i.scale(-1).addTo(b_s); + + // y_s = opA^{-1} * b_s + const auto y_s = opA_solver ? 
opA_solver(b_s) : b_s; + + // y = Op_ss^{-1} * y_s + solver_ss(y_s, y); + + // y += Op_ii^{-1}*(-Op_is*y + x_i) + op_is(y).scale(-1).addTo(x_i); + solver_ii(x_i, y_s); + y_s.addTo(y); + }, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + + // Do a test + if (superbblas::getDebugLevel() > 0) + { + auto x = op.template make_compatible_img("n", {{'n', 2}}); + urand(x, -1, 1); + auto y = op(rop(x)); + x.scale(-1).addTo(y); + auto normx = norm<1>(x, "n"); + auto normdiff = norm<1>(y, "n"); + double max_err = 0; + for (int i = 0, vol = normdiff.volume(); i < vol; ++i) + max_err = std::max(max_err, (double)normdiff.get({{i}}) / normx.get({{i}})); + QDPIO::cout << " hierarchical prec error: " << detail::tostr(max_err) << std::endl; + } + + return rop; + } + + /// Returns an approximation of the inverse of the domains local to the processes + /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + /// \param prec_: preconditioner + /// + /// By default (`with_correction` being false), the solver returns D^{-1}*x, + /// where D is the local part of the operator. 
If asking for extra correction, + /// `with_correciton` being true, then the solver returns instead: + /// + /// A^{-1} = (A - D + D)^{-1} = (I - (I - D^{-1}*A))^{-1}*D^{-1} + /// \approx (2*I - D^{-1}*A)*D^{-1} + + template + Operator getDomainDecompositionPrec(Operator op, + const Options& ops, + Operator prec_) + { + if (prec_) + throw std::runtime_error("getDomainDecompositionPrec: unsupported input preconditioner"); + + // Get options + bool with_correction = getOption(ops, "with_correction", false); + std::string prefix = getOption(ops, "prefix", ""); + + // Get the local operator and make the solver + auto local_op = op.getGlocal(); + const Operator solver = getSolver(local_op, getOptions(ops, "solver")); + + // Return the solver + if (!with_correction) + return {[=](const Tensor& x, Tensor y) { + // y = D^{-1}*x + solver(x.getGlocal(), y.getGlocal()); + }, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + else + return {[=](const Tensor& x, Tensor y) { + // y0 = D^{-1}*x + auto y0 = y.make_compatible(); + solver(x.getGlocal(), y0.getGlocal()); + // y1 = A*y0 + auto y1 = op(y0); + // y = -D^{-1}*y1 + solver(y1.scale(-1).getGlocal(), y.getGlocal()); + // y += 2*y0 + y0.scale(2).addTo(y); + }, + op.i, + op.d, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + } + + /// Returns the inverse of the block diagonal + /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getBlockJacobi(Operator op, const Options& ops, + Operator prec_) + { + if (prec_) + throw std::runtime_error("getBlockJacobi: unsupported input preconditioner"); + + // Get the blocking + auto dim = op.d.kvdim(); + std::vector default_blocking{ + {op.imgLayout == EvensOnlyLayout ? 
2u : 1u, 1u, 1u, 1u}}; + std::vector blocking = + getOption>(ops, "blocking", default_blocking); + if (blocking.size() != Nd) + ops.getValue("blocking") + .throw_error("getBlocking: the blocking should be a vector with four elements"); + std::map mblk{ + {'x', blocking[0]}, {'y', blocking[1]}, {'z', blocking[2]}, {'t', blocking[3]}}; + + // Shortcut for default blocking + if (blocking == default_blocking) + { + // Get the block diagonal of the operator with rows cs and columns CS + const std::string blk_rows = "cs"; // order of the block of rows to invert + remap m_blk = getNewLabels(blk_rows, op.d.order + op.i.order); // column labels + const std::string blk_cols = + update_order(blk_rows, m_blk); // order of the block of columns to invert + Tensor opDiag = getBlockDiag(op, blk_rows, m_blk); + + // Return the solver + return {[=](const Tensor& x, Tensor y) { + // y = Op_diag^{-1} * x + solve<2, NOp + 1, NOp + 2, NOp + 1, COMPLEX>( + opDiag, blk_rows, blk_cols, x.rename_dims(m_blk), blk_cols, CopyTo, y); + }, + op.d, op.i, nullptr, op}; + } + + // Check that the blocking divide the lattice sizes. For an operator with support only + // on the even sites, make sure that the blocking on x is divisible by two + int X = op.imgLayout == EvensOnlyLayout ? 
2 : dim.at('X'); + bool x_blocking_divide_X = (mblk.at('x') % X == 0); + if (!x_blocking_divide_X && op.imgLayout == EvensOnlyLayout) + ops.getValue("blocking") + .throw_error( + "When using even-odd preconditioning, the blocking on x should be divisible by 2"); + if (!x_blocking_divide_X && blocking[0] != default_blocking[0]) + ops.getValue("blocking") + .throw_error( + "unsupported blocking which is neither one nor divisible by 2 on the x direction"); + for (const auto it : getNatLatticeDims(dim, op.imgLayout)) + { + if (it.second % mblk.at(it.first) != 0) + ops.getValue("blocking") + .throw_error("The operator dimensions are not divisible by the blocking"); + } + + // Shortcut when no blocking on x direction + if (default_blocking[0] == blocking[0]) + { + // Get the block diagonal of the operator with rows cs and columns CS + const std::string blk_rows = "cs0123"; // order of the block of rows to invert + remap m_blk = getNewLabels(blk_rows, op.d.order + op.i.order); // column labels + const std::string blk_cols = + update_order(blk_rows, m_blk); // order of the block of columns to invert + + std::map blk{ + {'1', (int)blocking[1]}, {'2', (int)blocking[2]}, {'3', (int)blocking[3]}}; + std::map blk_u{ + {'x', 1u}, {'y', blocking[1]}, {'z', blocking[2]}, {'t', blocking[3]}}; + auto new_d = op.d + .template reshape_dimensions( + {{"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, blk, true) + .make_eg(); + auto new_i = op.i + .template reshape_dimensions( + {{"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, blk, true) + .make_eg(); + auto blk_op = Operator{ + [=](const Tensor& x, Tensor y) { + auto x0 = x.template reshape_dimensions( + {{"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, dim, true); + auto y0 = op(std::move(x0)); + y0.template reshape_dimensions({{"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, + blk, true) + .copyTo(y); + }, + new_d, + new_i, + nullptr, + op.order_t, + op.domLayout == EvensOnlyLayout ? NaturalLayout : op.domLayout, + op.imgLayout == EvensOnlyLayout ? 
NaturalLayout : op.imgLayout, + getNeighborsAfterBlocking(blk_u, op.d.kvdim(), op.neighbors, op.imgLayout), + op.preferred_col_ordering, + op.is_kronecker()}; + + Tensor opDiag = + getBlockDiag(blk_op, blk_rows, m_blk, ConsiderBlockingDense); + + // Return the solver + return Operator{ + [=](const Tensor& x, Tensor y) { + auto x0 = x.template reshape_dimensions( + {{"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, blk, true); + + // y = Op_diag^{-1} * x + auto y0 = solve<6, NOp + 1, NOp + 6, NOp + 1, COMPLEX>( + opDiag, blk_rows, blk_cols, std::move(x0).rename_dims(m_blk), blk_cols); + + y0.template reshape_dimensions({{"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, + dim, true) + .copyTo(y); + }, + op.d, op.i, nullptr, op}; + } + else + { + // Get the block diagonal of the operator with rows cs and columns CS + const std::string blk_rows = "csX0123"; // order of the block of rows to invert + remap m_blk = getNewLabels(blk_rows, op.d.order + op.i.order); // column labels + const std::string blk_cols = + update_order(blk_rows, m_blk); // order of the block of columns to invert + + std::map blk{{'0', (int)(blocking[1] / X)}, + {'X', 1}, + {'1', (int)blocking[1]}, + {'2', (int)blocking[2]}, + {'3', (int)blocking[3]}}; + std::map blk_u{ + {'x', blocking[1] / X}, {'y', blocking[1]}, {'z', blocking[2]}, {'t', blocking[3]}}; + + auto new_d = op.d + .template reshape_dimensions( + {{"X0x", "X0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, blk, true) + .make_eg(); + auto new_i = op.i + .template reshape_dimensions( + {{"X0x", "X0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, blk, true) + .make_eg(); + auto blk_op = Operator{ + [=](const Tensor& x, Tensor y) { + auto x0 = x.template reshape_dimensions( + {{"X0x", "X0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, dim, true); + auto y0 = op(std::move(x0)); + y0.template reshape_dimensions( + {{"X0x", "X0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, blk, true) + .copyTo(y); + }, + new_d, + new_i, + nullptr, + op.order_t, + 
NaturalLayout, + NaturalLayout, + getNeighborsAfterBlocking(blk_u, op.d.kvdim(), op.neighbors, op.imgLayout), + op.preferred_col_ordering, + op.is_kronecker()}; + + Tensor opDiag = + getBlockDiag(blk_op, blk_rows, m_blk, ConsiderBlockingDense); + + // Return the solver + return Operator{ + [=](const Tensor& x, Tensor y) { + auto x0 = x.template reshape_dimensions( + {{"X0x", "X0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, blk, true); + + // y = Op_diag^{-1} * x + auto y0 = solve<7, NOp + 1, NOp + 7, NOp + 1, COMPLEX>( + opDiag, blk_rows, blk_cols, std::move(x0).rename_dims(m_blk), blk_cols); + + y0.template reshape_dimensions( + {{"X0x", "X0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, dim, true) + .copyTo(y); + }, + op.d, op.i, nullptr, op}; + } + } + + /// Returns a blocking, which should enhanced the performance of the sparse-dense tensor contraction. + /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getBlocking(Operator op, const Options& ops, + Operator prec_) + { + if (prec_) + throw std::runtime_error("getBlocking: unsupported input preconditioner"); + + auto dim = op.d.kvdim(); + + std::vector default_blocking{ + {dim.count('0') == 1 ? (unsigned int)dim.at('0') : 1u, + dim.count('1') == 1 ? (unsigned int)dim.at('1') : 1u, + dim.count('2') == 1 ? (unsigned int)dim.at('2') : 1u, + dim.count('3') == 1 ? (unsigned int)dim.at('3') : 1u}}; + std::vector blocking = + getOption>(ops, "blocking", default_blocking); + if (blocking.size() != Nd) + ops.getValue("blocking") + .throw_error("getBlocking: the blocking should be a vector with four elements"); + std::map mblk{ + {'x', blocking[0]}, {'y', blocking[1]}, {'z', blocking[2]}, {'t', blocking[3]}}; + + // Check that the blocking divide the lattice sizes. 
For an operator with support only + // on the even sites, make sure that the blocking on x is divisible by two + auto opdims = op.d.kvdim(); + int X = op.imgLayout == EvensOnlyLayout ? 2 : opdims.at('X'); + bool x_blocking_divide_X = (mblk.at('x') % X == 0); + if (!x_blocking_divide_X && op.imgLayout == EvensOnlyLayout) + ops.getValue("blocking") + .throw_error( + "When using even-odd preconditioning, the blocking on x should be divisible by 2"); + for (const auto it : getNatLatticeDims(opdims, op.imgLayout)) + { + if (it.second % mblk.at(it.first) != 0) + ops.getValue("blocking") + .throw_error("The operator dimensions are not divisible by the blocking"); + } + + // Construct the blocked operator + ColOrdering co = getOption(ops, "operator_ordering", getColOrderingMap(), + op.preferred_col_ordering); + ColOrdering co_blk = + getOption(ops, "operator_block_ordering", getColOrderingMap(), RowMajor); + unsigned int power = getOption(ops, "power", 1); + bool make_explicit = getOption(ops, "make_explicit", true); + + auto blkd = op.d + .template reshape_dimensions( + {{"0x", "0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, + {{'0', std::max(X, (int)mblk.at('x')) / X}, + {'1', mblk.at('y')}, + {'2', mblk.at('z')}, + {'3', mblk.at('t')}}, + true) + .reorder("%0123Xxyzt", '%'); + + const auto blkdim = blkd.kvdim(); + const Operator sop = cloneOperator( + Operator{ + [&](const Tensor& x, Tensor y) { + op(x.template reshape_dimensions( + {{"0x", "0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, dim, true)) + .template reshape_dimensions( + {{"0x", "0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, blkdim, true) + .copyTo(y); + }, + blkd, blkd, nullptr, op}, + getFurthestNeighborDistance(op) * power, co, co_blk, ConsiderBlockingSparse, "blocking"); + + auto solverOps = getOptionsMaybe(ops, "solver"); + const Operator solver = + solverOps.hasSome() ? 
getSolver(sop, solverOps.getSome()) : sop; + + return {[=](const Tensor& x, Tensor y) { + solver(x.template reshape_dimensions( + {{"0x", "0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, blkdim, true)) + .template reshape_dimensions( + {{"0x", "0x"}, {"1y", "1y"}, {"2z", "2z"}, {"3t", "3t"}}, dim, true) + .copyTo(y); + }, + solverOps.hasSome() ? op.i : op.d, solverOps.hasSome() ? op.d : op.i, nullptr, sop}; + } + } + + /// Returns an inexact Generalized Davidson on op*g5, that is the left SV of op + /// NOTE: this is an eigensolver not a linear solver + /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + EigensolverFun getInexactEigensolverGD(Operator op, + const Options& ops) + { + using namespace detail; + + // Get eigensolver properties + unsigned int max_basis_size = getOption(ops, "max_basis_size", 0); + unsigned int max_block_size = getOption(ops, "max_block_size", 1); + Verbosity verb = getOption(ops, "verbosity", getVerbosityMap(), NoOutput); + auto op_double = op.template cast(); + + // Return the solver + return [=](int numEvals, + double tol) -> std::tuple, Tensor> { + DeviceHost primme_dev = OnHost; +# if defined(SUPERBBLAS_USE_GPU) + primme_dev = OnDefaultDevice; +# endif +# if defined(SUPERBBLAS_USE_HIP) + hipblasHandle_t gpublas_handle; + ns_getColorvecs::gpuBlasCheck(hipblasCreate(&gpublas_handle)); +# endif + + // Create an auxiliary struct for the PRIMME's matvec + // NOTE: Please keep 'n' as the slowest index; the rows of vectors taken by PRIMME's matvec has dimensions 'cxyztX', + // and 'n' is the dimension for the columns. + ns_getColorvecs::OperatorAux opaux{op_double, primme_dev}; + + // Make a bigger structure holding + primme_params primme; + primme_initialize(&primme); + + // Primme solver setup + primme.numEvals = numEvals; + primme.printLevel = + (verb == NoOutput ? 0 : verb == JustSummary ? 1 : verb == Detailed ? 
3 : 5); + primme.n = op_double.d.volume(); + primme.eps = tol; + primme.target = primme_largest_abs; + double zeros = 0; + primme.targetShifts = &zeros; + primme.numTargetShifts = 1; + + // Set parallel settings + primme.nLocal = op_double.d.getLocal().volume(); + primme.numProcs = QDP::Layout::numNodes(); + primme.procID = QDP::Layout::nodeNumber(); + primme.globalSumReal = ns_getColorvecs::primmeGlobalSum; + + // No preconditioner for my matrix + primme.matrixMatvec = ns_getColorvecs::primmeMatvecFermion; + primme.matrix = &opaux; + + // Set block size + primme.maxBasisSize = max_basis_size; + primme.maxBlockSize = max_block_size; + primme.ldOPs = primme.nLocal; + + // Should set lots of defaults + if (primme_set_method(PRIMME_DEFAULT_MIN_MATVECS, &primme) < 0) + { + QDPIO::cerr << __func__ << ": invalid preset method\n"; + QDP_abort(1); + } + + // Print primme options + if (verb >= VeryDetailed && Layout::nodeNumber() == 0) + primme_display_params(primme); + + // Allocate space for converged Ritz values and residual norms + std::vector evals(primme.numEvals); + std::vector rnorms(primme.numEvals); + Tensor evecs = op_double.d.template make_compatible( + "%n", '%', "", {{'n', (int)numEvals}}, primme_dev); +# if defined(SUPERBBLAS_USE_GPU) +# if defined(SUPERBBLAS_USE_CUDA) + superbblas::detail::GpuBlasHandle gpublas_handle = + superbblas::detail::getGpuBlasHandle(evecs.ctx().toGpu(0)); + // Make sure cublas handle operates on legacy stream for primme + ns_getColorvecs::gpuBlasCheck(cublasSetStream(gpublas_handle, 0)); + +# endif + primme.queue = &gpublas_handle; +# endif + + // Call primme +# if defined(SUPERBBLAS_USE_GPU) + int ret = cublas_zprimme(evals.data(), evecs.data(), rnorms.data(), &primme); +# else + int ret = zprimme(evals.data(), evecs.data(), rnorms.data(), &primme); +# endif + + if (verb != NoOutput) + { + QDPIO::cout << "Eigenpairs converged: " << primme.initSize << std::endl; + QDPIO::cout << "Tolerance : " << primme.aNorm * primme.eps << 
std::endl; + QDPIO::cout << "Iterations: " << (int)primme.stats.numOuterIterations << std::endl; + QDPIO::cout << "Restarts : " << (int)primme.stats.numRestarts << std::endl; + QDPIO::cout << "Matvecs : " << (int)primme.stats.numMatvecs << std::endl; + QDPIO::cout << "Preconds : " << (int)primme.stats.numPreconds << std::endl; + QDPIO::cout << "T. ortho : " << primme.stats.timeOrtho << std::endl; + QDPIO::cout << "T. matvec : " << primme.stats.timeMatvec << std::endl; + QDPIO::cout << "Total time: " << primme.stats.elapsedTime << std::endl; + } + + if (ret != 0) + { + QDPIO::cerr << "Error: primme returned with nonzero exit status\n"; + QDP_abort(1); + } + + // Check the residuals, |op*v-lambda*v|_2<=|op|*tol + if (evals.size() > 0) + { + auto g5 = getGamma5(op.d.kvdim().at('s')); + auto r = + op(contract(g5.rename_dims({{'j', 's'}}), evecs, "s") + .rename_dims({{'i', 's'}})); + std::vector evals_cmpl(evals.begin(), evals.end()); + contract( + evecs, asTensorView(evals_cmpl).rename_dims({{'i', 'n'}}).scale(-1), "", AddTo, r); + auto rnorm = norm<1>(r, "n"); + for (int i = 0, vol = rnorm.volume(); i < vol; ++i) + { + if (rnorm.get({{i}}) > primme.stats.estimateLargestSVal * primme.eps * 10) + { + QDPIO::cerr << "Error: primme returned eigenpairs with too much error\n"; + QDP_abort(1); + } + } + } + + // Cleanup + primme_free(&primme); + +# if defined(SUPERBBLAS_USE_HIP) + ns_getColorvecs::gpuBlasCheck(hipblasDestroy(gpublas_handle)); +# endif + + // Return + return std::make_tuple(evals, evecs.template cast()); + }; + } + + template EigensolverFun + getInexactEigensolverGD(Operator op, const Options& ops); + template EigensolverFun + getInexactEigensolverGD(Operator op, const Options& ops); + + namespace detail + { + /// Returns an inexact Generalized Davidson. 
+ /// NOTE: this is an eigensolver not a linear solver + /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getInexactGD(Operator op, const Options& ops, + Operator prec_) + { + if (prec_) + throw std::runtime_error("getInexactGD: unsupported input preconditioner"); + + const Operator solver = getSolver(op, getOptions(ops, "solver")); + double tol = getOption(ops, "tol"); + auto eigensolver = getInexactEigensolverGD(solver, ops); + + // Return the solver + return {[=](const Tensor& x, Tensor y) { + auto values_vectors = eigensolver(x.kvdim().at('n'), tol); + std::get<1>(values_vectors).copyTo(y); + }, + solver.i, + solver.d, + nullptr, + solver.order_t, + solver.imgLayout, + solver.domLayout, + DenseOperator(), + solver.preferred_col_ordering, + false /* no Kronecker form */}; + } + + /// Returns a shifted operator + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver from `solvers` and influence the solver construction + + template + Operator getShiftedOp(Operator op, const Options& ops, + Operator prec_) + { + if (prec_) + throw std::runtime_error("getShiftedOp: unsupported input preconditioner"); + + // Get the remainder options + double shift_I = getOption(ops, "shift_I", 0.0); + double shift_ig5 = getOption(ops, "shift_ig5", 0.0); + + // Produced the shifted operator + auto shifted_op = op; + if (shift_I != 0 || shift_ig5 != 0) + { + int ns = op.d.kvdim().at('s'); + auto g5 = getGamma5(ns, op.d.getDev(), op.d.dist); + shifted_op = {[=](const Tensor& x, Tensor y) { + // y = op * x + op(x, y); + + // y += x * shift_I + if (shift_I != 0) + x.scale(shift_I).addTo(y); + + // y += shift_ig5 * i * g5 + if (shift_ig5 != 0) + { + COMPLEX ishift = + static_cast(std::complex{0.0, shift_ig5}); + if (ns == 1) + { + x.scale(ishift).addTo(y); + } + else + { + contract(g5.rename_dims({{'j', 's'}}).scale(ishift), x, "s", + AddTo, y, 
{{'s', 'i'}}); + } + } + }, + op.d, op.i, nullptr, op}; + } + + // Get the solver + Maybe solverOps = getOptionsMaybe(ops, "solver"); + if (solverOps) + return getSolver(shifted_op, solverOps.getSome()); + return shifted_op; + } + + /// Returns the conjugate transpose of an operator + /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getDagger(Operator op, const Options& ops, + Operator prec_) + { + if (prec_) + throw std::runtime_error("getDagger: unsupported input preconditioner"); + + int ns = op.d.kvdim().at('s'); + auto g5 = getGamma5(ns, op.d.getDev(), op.d.dist); + + // Return the solver + return { + [=](const Tensor& x, Tensor y) { + if (ns == 1) + { + x.copyTo(y); + } + else + { + // y = g5 * op * g5 * x + auto y0 = op( + contract(g5.rename_dims({{'j', 's'}}), x, "s").rename_dims({{'i', 's'}})); + contract(g5.rename_dims({{'j', 's'}}), y0, "s", CopyTo, + y.rename_dims({{'s', 'i'}})); + } + }, + op.d, op.i, nullptr, op}; + } + + /// Returns an operator that applies \gamma_5 + /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getG5(Operator op, const Options& ops, + Operator prec_) + { + if (prec_) + throw std::runtime_error("getG5: unsupported input preconditioner"); + + int ns = op.d.kvdim().at('s'); + auto g5 = getGamma5(ns, op.d.getDev(), op.d.dist); + + // Return the solver + return {[=](const Tensor& x, Tensor y) { + if (ns == 1) + { + x.copyTo(y); + } + else + { + // y = g5 * x + contract(g5.rename_dims({{'j', 's'}}), x, "s", CopyTo, + y.rename_dims({{'s', 'i'}})); + } + }, + op.d, + op.i, + nullptr, + op.order_t, + op.domLayout, + op.imgLayout, + getNeighbors(op.d.kvdim(), 0, op.domLayout), + op.preferred_col_ordering, + op.is_kronecker()}; + } + + /// Returns a solver with possible different precision than the operator's + /// + /// \param op: 
operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Operator getCasting(Operator op, const Options& ops, + Operator prec_, SolverSpace solverSpace) + { + // Get the current precision and the requested by the user + enum Precision { Single, Double, Default }; + static const std::map precisionMap{ + {"default", Default}, {"single", Single}, {"float", Single}, {"double", Double}}; + Precision defaultPrecision = std::is_same::value ? Double : Single; + Precision requestedPrecision = + getOption(ops, "precision", precisionMap, Default); + if (requestedPrecision == Default) + requestedPrecision = defaultPrecision; + + // Get the solver options + const Options& solverOps = getOptions(ops, "solver"); + + if (requestedPrecision == Double) + { + return getSolver(op.template cast(), solverOps, prec_.template cast(), + solverSpace) + .template cast(); + } + else + { + return getSolver(op.template cast(), solverOps, prec_.template cast(), + solverSpace) + .template cast(); + } + } + } + + /// Returns an operator that approximate the inverse of a given operator + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver from `solvers` and influence the solver construction + + template + Operator getSolver(const Operator& op, const Options& ops, + const Operator& prec, SolverSpace solverSpace) + { + enum SolverType { + FGMRES, + BICGSTAB, + MR, + GCR, + MG, + EO, + HIE, + DD, + BJ, + SHIFT, + PROJ, + SPINEO, + IGD, + DDAG, + G5, + BLOCKING, + CASTING + }; + static const std::map solverTypeMap{{"fgmres", FGMRES}, + {"bicgstab", BICGSTAB}, + {"mr", MR}, + {"gcr", GCR}, + {"mg", MG}, + {"eo", EO}, + {"hie", HIE}, + {"dd", DD}, + {"bj", BJ}, + {"shift", SHIFT}, + {"proj", PROJ}, + {"spineo", SPINEO}, + {"igd", IGD}, + {"g5", G5}, + {"blocking", BLOCKING}, + {"casting", CASTING}}; + SolverType solverType = getOption(ops, "type", solverTypeMap); + switch (solverType) + { + case 
FGMRES: // flexible GMRES + return detail::getFGMRESSolver(op, ops, prec); + case BICGSTAB: // bicgstab + return detail::getBicgstabSolver(op, ops, prec); + case MR: // minimal residual + return detail::getMRSolver(op, ops, prec); + case GCR: // generalized conjugate residual + return detail::getGCRSolver(op, ops, prec); + case MG: // Multigrid + return detail::getMGPrec(op, ops, prec, solverSpace); + case EO: // even-odd Schur preconditioner + return detail::getEvenOddPrec(op, ops, prec, solverSpace); + case HIE: // hierarchical Schur preconditioner + return detail::getHierarchicalPrec(op, ops, prec); + case DD: // domain decomposition with domains local to processes + return detail::getDomainDecompositionPrec(op, ops, prec); + case BJ: // block Jacobi + return detail::getBlockJacobi(op, ops, prec); + case SHIFT: // shift operator + return detail::getShiftedOp(op, ops, prec); + case PROJ: // projector preconditioner + return detail::getProjPrec(op, ops, prec); + case SPINEO: // even-odd Schur complement on the spin components + return detail::getSpinEvenOddPrec(op, ops, prec); + case IGD: // inexact Generalized Davidson + return detail::getInexactGD(op, ops, prec); + case DDAG: // return the operator conjugate transposed + return detail::getDagger(op, ops, prec); + case G5: // apply \gamma_5 + return detail::getG5(op, ops, prec); + case BLOCKING: // reshape the operator + return detail::getBlocking(op, ops, prec); + case CASTING: // change the precision + return detail::getCasting(op, ops, prec, solverSpace); + } + throw std::runtime_error("This shouldn't happen"); + } + + /// Return an Operator that wraps up a LinearOperator + inline Operator asOperatorView(const LinearOperator& linOp, + bool use_kron_format = true) + { + LatticeFermion a; + auto d = asTensorView(a).toComplex(); + auto blkd = + d.template reshape_dimensions({{"x", "0x"}, {"y", "1y"}, {"z", "2z"}, {"t", "3t"}}, + {{'0', 1}, {'1', 1}, {'2', 1}, {'3', 1}}, true) + .make_eg(); + const auto dim = 
d.kvdim(); + const auto blkdim = blkd.kvdim(); + return { + [&, blkdim](Tensor x, Tensor y) { + Tracker _t("chroma's matvec "); + unsigned int n = x.kvdim().at('n'); + _t.arity = n; + auto tx = x.template reshape_dimensions( + {{"0x", "x"}, {"1y", "y"}, {"2z", "z"}, {"3t", "t"}}, {}, true); + auto ty = tx.make_compatible(); + LatticeFermion x0, y0; + for (unsigned int i = 0; i < n; ++i) + { + tx.kvslice_from_size({{'n', i}}, {{'n', 1}}).copyTo(asTensorView(x0)); + y0 = zero; + linOp(y0, x0, PLUS /* I believe, it's ignored */); + asTensorView(y0).copyTo(ty.kvslice_from_size({{'n', i}}, {{'n', 1}})); + } + ty.template reshape_dimensions( + {{"x", "0x"}, {"y", "1y"}, {"z", "2z"}, {"t", "3t"}}, blkdim, true) + .copyTo(y); + }, + blkd, // domain + blkd, // image + nullptr, // no conjugate + "", // no order_t + XEvenOddLayout, + XEvenOddLayout, + detail::getNeighbors(dim, 1 /* near-neighbors links only */, XEvenOddLayout), + ColumnMajor, // preferred ordering + use_kron_format /* has a Kronecker form */ + }; + } + + // + // High-level chroma operations + // + + /// Constructor + /// \param fermAction: XML for the fermion action + /// \param invParam: XML for the quark propagator + /// \param u: gauge fields + + ChimeraSolver::ChimeraSolver(const GroupXML_t& fermAction, const GroupXML_t& invParam, + const multi1d& u) + { + // Initialize fermion action and state + std::istringstream xml_s(fermAction.xml); + XMLReader fermacttop(xml_s); + QDPIO::cout << "FermAct = " << fermAction.id << std::endl; + S = TheFermionActionFactory::Instance().createObject(fermAction.id, fermacttop, + fermAction.path); + state = S->createState(u); + + // If the inverter is MGPROTON, use this infrastructure + if (invParam.id == std::string("MGPROTON")) + { + QDPIO::cout << "Setting up MGPROTON invertor..." 
<< std::endl; + Tracker _t("setup mgproton"); + + // Parse XML with the inverter options + std::shared_ptr ops = getOptionsFromXML(broadcast(invParam.xml)); + + // Clone the matvec + LinearOperator* fLinOp = S->genLinOp(state); + ColOrdering co = getOption(*ops, "InvertParam/operator_ordering", + getColOrderingMap(), ColumnMajor); + ColOrdering co_blk = getOption(*ops, "InvertParam/operator_block_ordering", + getColOrderingMap(), RowMajor); + Operator linOp = detail::cloneOperator( + asOperatorView(*fLinOp), co, co_blk, detail::ConsiderBlockingSparse, "chroma's operator"); + + // Destroy chroma objects + delete fLinOp; + state = State(); + S = Action(); + + // Construct the solver + op = Operator(getSolver(linOp, getOptions(*ops, "InvertParam"))); + + // Clean cache of operators + detail::cleanEvenOddOperatorsCache(); + + QDPIO::cout << "MGPROTON invertor ready; setup time: " + << detail::tostr(_t.stopAndGetElapsedTime()) << " s" << std::endl; + } + else + { + PP = S->qprop(state, invParam); + } + } + + namespace detail + { + /// Apply the inverse to LatticeColorVec tensors for a list of spins + /// \param PP: invertor + /// \param chi: lattice color tensor on a t_slice, cxyzXn + /// \param t_source: time-slice in chi + /// \param Nt_forward: return the next Nt_forward time-slices after t_source + /// \param Nt_backward: return the previous Nt_backward time-slices before t_source + /// \param spin_sources: list of spins + /// \param max_rhs: maximum number of vectors solved at once + /// \param order_out: coordinate order of the output tensor, a permutation of cSxyztXns where + /// s is the spin source and S is the spin sink + /// \return: tensor cSxyztXns where the first t_slice is the t_source-Nt_backward time-slice of + /// the vectors after the inversion, and goes increasingly until time-source t_source+Nt_forward + + template + Tensor doInversion(const SystemSolver& PP, + const Tensor chi, int t_source, + int first_tslice_out, int n_tslice_out, + const 
std::vector& spin_sources, int max_rhs, + const std::string& order_out = "cSxyztXns") + { + int num_vecs = chi.kvdim()['n']; + Tensor psi( + order_out, + latticeSize( + order_out, + {{'t', n_tslice_out}, {'S', Ns}, {'s', spin_sources.size()}, {'n', num_vecs}}), + chi.getDev()); + + int max_step = std::max(num_vecs, max_rhs); + std::vector> chis(max_step), quark_solns(max_step); + for (int col = 0; col < max_step; col++) + chis[col].reset(new LatticeFermion); + for (int col = 0; col < max_step; col++) + quark_solns[col].reset(new LatticeFermion); + + for (int spin_source : spin_sources) + { + for (int n0 = 0, n_step = std::min(max_rhs, num_vecs); n0 < num_vecs; + n0 += n_step, n_step = std::min(n_step, num_vecs - n0)) + { + for (int n = n0, col = 0; col < n_step; ++n, ++col) + { + // Put the colorvec sources for the t_source on chis for spin `spin_source` + // chis[col][s=spin_source] = chi[n=n0] + *chis[col] = zero; + chi.kvslice_from_size({{'n', n}}, {{'n', 1}}) + .copyTo(SB::asTensorView(*chis[col]) + .kvslice_from_size({{'t', t_source}, {'s', spin_source}})); + + *quark_solns[col] = zero; + } + + // Solve + std::vector res = + PP(std::vector>(quark_solns.begin(), + quark_solns.begin() + n_step), + std::vector>(chis.begin(), + chis.begin() + n_step)); + + for (int n = n0, col = 0; col < n_step; ++n, ++col) + { + // psi[n=n] = quark_solns[col][t=first_tslice+(0:n_tslice_out-1)] + asTensorView(*quark_solns[col]) + .kvslice_from_size({{'t', first_tslice_out}}, {{'t', n_tslice_out}}) + .rename_dims({{'s', 'S'}}) + .copyTo(psi.kvslice_from_size({{'n', n}, {'s', spin_source}})); + } + } + } + + return psi; + } + + /// Apply the inverse to LatticeFermion tensors + /// \param sol: invertor, "linear" operator in cs0123xyztX + /// \param chi: spin-color lattice tensor, csxyztXn + /// \param max_rhs: maximum number of vectors solved at once + /// \return: tensor with the same labels as the input + + template + Tensor doInversion(const Operator& op, + const Tensor& chi, int 
max_rhs) + { + // Get the columns labels, which are the ones not contracted with the operator + std::string order_cols = remove_dimensions(chi.order, op.i.order); + + // Create tensors with full support on the lattice + auto x0 = chi.template reshape_dimensions( + {{"x", "0x"}, {"y", "1y"}, {"z", "2z"}, {"t", "3t"}, {order_cols, "n"}}, + {{'0', 1}, {'1', 1}, {'2', 1}, {'3', 0}}, true); + auto y0 = x0.make_compatible(); + foreachInChuncks( + x0, y0, max_rhs, + [=](Tensor x, Tensor y) { op(x, y); }); + return y0.template reshape_dimensions({{"n", order_cols}}); + } + + /// Apply the inverse to LatticeColorVec tensors for a list of spins + /// \param sol: invertor + /// \param chi: lattice color tensor on a t_slice, cxyzXn + /// \param t_source: time-slice in chi + /// \param Nt_forward: return the next Nt_forward time-slices after t_source + /// \param Nt_backward: return the previous Nt_backward time-slices before t_source + /// \param spin_sources: list of spins + /// \param max_rhs: maximum number of vectors solved at once + /// \param order_out: coordinate order of the output tensor, a permutation of cSxyztXns where + /// s is the spin source and S is the spin sink + /// \return: tensor cSxyztXns where the first t_slice is the t_source-Nt_backward time-slice of + /// the vectors after the inversion, and goes increasingly until time-source t_source+Nt_forward + + template + Tensor doInversion(const Operator& op, + const Tensor chi, int t_source, + int first_tslice_out, int n_tslice_out, + const std::vector& spin_sources, int max_rhs, + const std::string& order_out = "cSxyztXns") + { + int num_vecs = chi.kvdim()['n']; + Tensor psi( + order_out, + latticeSize( + order_out, + {{'t', n_tslice_out}, {'S', Ns}, {'s', spin_sources.size()}, {'n', num_vecs}}), + chi.getDev()); + + // Create tensors with full support on the lattice + int max_step = std::max(num_vecs, max_rhs); + auto aux = chi.template make_compatible( + op.preferred_col_ordering == ColumnMajor ? 
"0123csxyztXn" : "0123ncsxyztX", + {{'n', max_step}, + {'t', Layout::lattSize()[3]}, + {'s', Ns}, + {'0', 1}, + {'1', 1}, + {'2', 1}, + {'3', 1}}); + + for (int spin_source : spin_sources) + { + for (int n0 = 0, n_step = std::min(max_rhs, num_vecs); n0 < num_vecs; + n0 += n_step, n_step = std::min(n_step, num_vecs - n0)) + { + auto aux0 = aux.kvslice_from_size({}, {{'n', n_step}}); + aux0.set_zero(); + chi.kvslice_from_size({{'n', n0}}, {{'n', n_step}}) + .copyTo(aux0.kvslice_from_size({{'t', t_source}, {'s', spin_source}})); + + // Solve + op(aux0) + .kvslice_from_size({{'t', first_tslice_out}}, {{'t', n_tslice_out}}) + .rename_dims({{'s', 'S'}}) + .copyTo(psi.kvslice_from_size({{'n', n0}, {'s', spin_source}})); + } + } + + return psi; + } + + /// Apply the inverse to a list of LatticeFermions + /// \param PP: invertor + /// \param chi: lattice spin-color field tensor, csxyztX + /// \param max_rhs: maximum number of vectors solved at once + /// \return: + template + Tensor doInversion(const SystemSolver& PP, + const Tensor& chi, int max_rhs) + { + detail::check_order_contains(chi.order, "csxyztX"); + std::string n_order = detail::remove_dimensions(chi.order, "csxyztX"); + Coor n_dim = latticeSize(n_order, chi.kvdim()); + int n_vol = (N == 7 ? 
1 : superbblas::detail::volume(n_dim)); + + Tensor r = chi.make_compatible(); // output tensor + int max_step = std::max(1, std::max(n_vol, max_rhs)); + + // Quick exit + if (n_vol == 0) + return r; + + if (N == 7) + { + // For a single vector + LatticeFermion chi0, psi0; + chi.copyTo(asTensorView(chi0)); + SystemSolverResults_t res = PP(psi0, chi0); + asTensorView(psi0).copyTo(r); + } + else + { + // Auxiliary LatticeFermion + std::vector> chis(max_step), quark_solns(max_step); + for (int col = 0; col < max_step; col++) + chis[col].reset(new LatticeFermion); + for (int col = 0; col < max_step; col++) + quark_solns[col].reset(new LatticeFermion); + + Coor n_strides = detail::get_strides(n_dim, superbblas::FastToSlow); + for (int n0 = 0, n_step = std::min(max_rhs, n_vol); n0 < n_vol; + n0 += n_step, n_step = std::min(n_step, n_vol - n0)) + { + for (int n = n0, col = 0; col < n_step; ++n, ++col) + { + // Get the field to copy from the tensor chi + Coor ni = detail::index2coor(n, n_dim, n_strides); + std::map from{}, size{}; + for (int d = 0; d < N - 7; ++d) + from[n_order[d]] = ni[d], size[n_order[d]] = 1; + + // Copy the field into a LatticeFermion + chi.kvslice_from_size(from, size).copyTo(asTensorView(*chis[col])); + + *quark_solns[col] = zero; + } + + // Solve + std::vector res = + PP(std::vector>(quark_solns.begin(), + quark_solns.begin() + n_step), + std::vector>(chis.begin(), + chis.begin() + n_step)); + + for (int n = n0, col = 0; col < n_step; ++n, ++col) + { + // Get the field to copy from the tensor chi + Coor ni = detail::index2coor(n, n_dim, n_strides); + std::map from{}, size{}; + for (int d = 0; d < N - 7; ++d) + from[n_order[d]] = ni[d], size[n_order[d]] = 1; + + // Copy from LatticeFermion to the output tensor + asTensorView(*quark_solns[col]).copyTo(r.kvslice_from_size(from, size)); + } + } + } + + return r; + } + + /// Returns a projector onto the approximate right singular value space of the given operator + /// for the smallest singular values. 
+ /// + /// It can work as a projector on the right of inv(op): + /// P = V*inv(V'*g5*op*V)*V'*g5*op + /// Then: + /// tr P*inv(op) = tr V*inv(V'*g5*op*V)*V'*g5 = \sum_i v_i'*g5*vi/lambda_i + /// + /// It computes a number of the smallest right singular vectors of op as the eigenvectors of + /// inv(g5*op), which is Hermitian. + /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Projector getDeflationProj(Operator op, const Options& ops) + { + // Get options + unsigned int rank = getOption(ops, "rank"); + if (rank == 0) + { + // Return trivial solution + auto almost_proj = Operator{ + [=](const Tensor&, Tensor y) { y.set_zero(); }, + op.d, op.i, nullptr, op}; + + auto VUfun = [=](unsigned int ifrom, unsigned int isize, + char col_label) -> Tensor { return {}; }; + + std::vector lambdas(0); + + return {almost_proj, VUfun, VUfun, lambdas, op}; + } + double tol = getOption(ops, "tol", 0.1); + auto solver = getSolver(op, ops.getValue("solver")); + auto default_eig_ops = DictionaryOption(ops); + auto eigensolver = SB::getInexactEigensolverGD( + solver, ops.getValue("eigensolver", Maybe(default_eig_ops))); + + // Compute the eigenpairs of inv(op) * g5, the right singular vectors of op + auto values_vectors = eigensolver(rank, tol); + auto inv_values = std::get<0>(values_vectors); + auto vectors = std::get<1>(values_vectors).rename_dims({{'n', 'i'}}); + + // Return the projector + int ns = op.d.kvdim().at('s'); + auto g5 = getGamma5(ns, op.d.getDev(), op.d.dist); + auto mult_by_g5 = [=](Tensor v) { + char i = detail::get_free_label(v.order); + return contract(g5.rename_dims({{'j', 's'}, {'i', i}}), v, "s") + .rename_dims({{i, 's'}}); + }; + auto Vt_g5_op_V = contract<2>( + vectors.conj(), mult_by_g5(op(vectors.rename_dims({{'i', 'j'}}))), op.d.order); + auto inv_Vt_g5_op_V = inv(Vt_g5_op_V, "i", "j"); + auto almost_proj = Operator{ + [=](const Tensor& x, Tensor y) { + // Do 
V'*g5*x, dims: i x n + auto Vt_g5_x = contract<2>(vectors.conj(), mult_by_g5(x), op.d.order); + + // Do inv(V'*g5*op*V)*V'*g5*x, dims I x i * i x n = I x n -> i x n + auto lambda_inv_Vt_g5_x = + contract<2>(inv_Vt_g5_op_V.rename_dims({{'i', 'I'}, {'j', 'i'}}), Vt_g5_x, "i") + .rename_dims({{'I', 'i'}}); + + // Do y = V*inv(V'*g5*op*V)*V'*g5*x, dims latt x i * i x n = latt x n + contract(vectors, lambda_inv_Vt_g5_x, "i", CopyTo, y); + }, + op.d, op.i, nullptr, solver}; + + // Return V*inv_Vt_g5_op_V[:,ifrom..ifrom+isize] + auto Vfun = [=](unsigned int ifrom, unsigned int isize, char col_label) { + return contract( + vectors, inv_Vt_g5_op_V.kvslice_from_size({{'j', ifrom}}, {{'j', isize}}), "i") + .rename_dims({{'j', col_label}}); + }; + + // Returns \gamma_5*V[ifrom..ifrom+isize-1] + auto Ufun = [=](unsigned int ifrom, unsigned int isize, char col_label) { + return mult_by_g5(vectors.kvslice_from_size({{'i', ifrom}}, {{'i', isize}}) + .rename_dims({{'i', col_label}})); + }; + + // Return all ones + std::vector lambdas(inv_values.size(), COMPLEX{1}); + + return {almost_proj, Vfun, Ufun, lambdas, op}; + } + + /// Returns a projector onto the approximate right singular value space of the given operator + /// for the smallest singular values. + /// + /// It can work as a projector on the right of inv(op): + /// P = Q*V*inv(V'*g5*Q'*op*Q*V)*V'*g5*Q'*op + /// Then: + /// tr P*inv(op) = tr Q*V*inv(V'*g5*Q'*op*Q*V)*V'*g5*Q' = \sum_i v_i'*g5*vi/lambda_i + /// + /// It computes a number of the smallest right singular vectors of op as the eigenvectors of + /// inv(g5*op), which is Hermitian. 
+ /// + /// \param op: operator to make the inverse of + /// \param ops: options to select the solver and the null-vectors creation + + template + Projector getMGDeflationProj(Operator op, const Options& ops) + { + // Get prolongator and coarse operator + auto prolongator_coarse_spin_splitting = + getProlongatorAndCoarse(op, ops.getValue("prolongator"), FullSpace); + Operator Q = std::get<0>(prolongator_coarse_spin_splitting); + Operator op_c = std::get<1>(prolongator_coarse_spin_splitting); + + // Return the projector for the coarse operator + auto proj_c = getProjector(op_c, ops.getValue("proj")); + + /// Return Q*V*inv(U'*Q'*op*Q*V)*U'*Q' = Q proj_c * Q' + auto almost_proj = Operator{ + [=](const Tensor& x, Tensor y) { + Q(proj_c.V_inv_Ut(Q.tconj()(x)), y); + }, + op.d, + op.i, + nullptr, + op.order_t, + op.imgLayout, + op.domLayout, + DenseOperator(), + op.preferred_col_ordering, + false /* no Kronecker blocking */}; + + // Return Q*V[:,ifrom..ifrom+isize] + auto Vfun = [=](unsigned int ifrom, unsigned int isize, char col_label) { + return Q(proj_c.V(ifrom, isize, col_label)); + }; + + // Returns Q*U[ifrom..ifrom+isize-1] + auto Ufun = [=](unsigned int ifrom, unsigned int isize, char col_label) { + return Q(proj_c.U(ifrom, isize, col_label)); + }; + + return {almost_proj, Vfun, Ufun, proj_c.lambdas, op}; + } + } + + /// Apply the inverse to LatticeColorVec tensors for a list of spins + /// \param PP: invertor + /// \param chi: lattice color tensor on a t_slice, cxyzXn + /// \param t_source: time-slice in chi + /// \param Nt_forward: return the next Nt_forward time-slices after t_source + /// \param Nt_backward: return the previous Nt_backward time-slices before t_source + /// \param spin_sources: list of spins + /// \param max_rhs: maximum number of vectors solved at once + /// \param order_out: coordinate order of the output tensor, a permutation of cSxyztXns where + /// s is the spin source and S is the spin sink + /// \return: tensor cSxyztXns where the 
first t_slice is the t_source-Nt_backward time-slice of + /// the vectors after the inversion, and goes increasingly until time-source t_source+Nt_forward + + template + Tensor + doInversion(const ChimeraSolver& sol, const Tensor chi, int t_source, + int first_tslice_out, int n_tslice_out, const std::vector& spin_sources, + int max_rhs, const std::string& order_out) + { + detail::check_order_contains(order_out, "cSxyztXns"); + if (chi.kvdim()['t'] != 1) + throw std::runtime_error("Expected one time-slice"); + const int num_vecs = chi.kvdim()['n']; + + if (n_tslice_out > Layout::lattSize()[3]) + throw std::runtime_error("Too many tslices"); + + StopWatch snarss1; + snarss1.reset(); + snarss1.start(); + + Tensor r; + if (sol.op) + r = detail::doInversion( + sol.op, chi, t_source, first_tslice_out, n_tslice_out, spin_sources, max_rhs, order_out); + else + r = detail::doInversion( + *sol.PP, chi, t_source, first_tslice_out, n_tslice_out, spin_sources, max_rhs, order_out); + + snarss1.stop(); + QDPIO::cout << "Time to compute inversions for " << spin_sources.size() + << " spin sources and " << num_vecs + << " colorvecs : " << snarss1.getTimeInSeconds() << " secs" << std::endl; + + return r; + } + + template Tensor + doInversion(const ChimeraSolver& sol, const Tensor chi, int t_source, + int first_tslice_out, int n_tslice_out, const std::vector& spin_sources, + int max_rhs, const std::string& order_out); + template Tensor + doInversion(const ChimeraSolver& sol, const Tensor chi, int t_source, + int first_tslice_out, int n_tslice_out, const std::vector& spin_sources, + int max_rhs, const std::string& order_out); + + /// Apply the inverse to a list of LatticeFermions + /// \param sol: Chimera invertor + /// \param chi: lattice spin-color tensor, at least dimensions csxyztX + /// \param max_rhs: maximum number of vectors solved at once + /// \param conjugate: whether to apply the invertor transpose-conjugate + /// \return: tensor with the same ordering as `chi`. 
+ + // template + // Tensor doInversion(const ChimeraSolver& sol, + // const Tensor& chi, int max_rhs = 0, + // Conjugation conj = NotConjugate) + // { + // detail::check_order_contains(chi.order, "csxyztX"); + // const int num_vecs = + // (N == 7 ? 1 + // : detail::volume(chi.kvdim(), detail::remove_dimensions(chi.order, "csxyztX"))); + + // StopWatch snarss1; + // snarss1.reset(); + // snarss1.start(); + + // // Multiply the input by g5 if applied conjugate + // Tensor<2, COMPLEX_CHI> g5; + // if (conj == Conjugate) + // { + // g5 = Gamma(Ns * Ns - 1).cloneOn(chi.getDev()); + // chi.contract(chi, {}, NotConjugate, g5, {{'j', 's'}}, NotConjugate, {{'s', 'i'}}); + // } + + // Tensor r; + // if (sol.op) + // r = detail::doInversion(sol.op, chi, max_rhs); + // else + // r = detail::doInversion(*sol.PP, chi, max_rhs); + + // // Multiply the input by g5 if applied conjugate + // if (conj == Conjugate) + // { + // r.contract(r, {}, NotConjugate, g5, {{'j', 's'}}, NotConjugate, {{'s', 'i'}}); + // } + + // snarss1.stop(); + // QDPIO::cout << "Time to compute " << num_vecs + // << " inversions: " << snarss1.getTimeInSeconds() << " secs" << std::endl; + + // return r; + // } + + /// Apply the inverse to a list of LatticeFermions + /// \param sol: Chimera invertor + /// \param psis: output lattice spin-color tensor, at least dimensions csxyztX + /// \param chis: input lattice spin-color tensor, at least dimensions csxyztX + /// \param max_rhs: maximum number of vectors solved at once + + void doInversion(const ChimeraSolver& sol, MultipleLatticeFermions& psis, + const ConstMultipleLatticeFermions& chis, int max_rhs) + { + StopWatch snarss1; + snarss1.reset(); + snarss1.start(); + + if (max_rhs <= 0) + max_rhs = chis.size(); + + // Do the inversion + if (sol.op) + { + auto op = sol.op; + auto tchi = op.make_compatible_dom("n", {{'n', max_rhs}}); + auto tpsi = op.make_compatible_img("n", {{'n', max_rhs}}); + for (int i = 0, n = std::min(max_rhs, (int)chis.size()); i < 
chis.size(); + i += n, n = std::min((int)chis.size() - i, max_rhs)) + { + // Adjust the size of tchi and tpsi to n + auto this_tchi = tchi.kvslice_from_size({{'n', 0}}, {{'n', n}}); + auto this_tpsi = tpsi.kvslice_from_size({{'n', 0}}, {{'n', n}}); + + // Copy chis into this_tchi + for (int j = 0; j < n; ++j) + asTensorView(*chis[i + j]).copyTo(this_tchi.kvslice_from_size({{'n', j}}, {{'n', 1}})); + + // Do the inversion: this_tpsi = D^{-1} * this_tchi + op(this_tchi, this_tpsi); + + // Copy the solution into psis + for (int j = 0; j < n; ++j) + this_tpsi.kvslice_from_size({{'n', j}}, {{'n', 1}}).copyTo(asTensorView(*psis[i + j])); + } + } + else + { + for (int i = 0, n = std::min(max_rhs, (int)chis.size()); i < chis.size(); + i += n, n = std::min((int)chis.size() - i, max_rhs)) + { + (*sol.PP)(MultipleLatticeFermions(psis.begin() + i, psis.begin() + i + n), + ConstMultipleLatticeFermions(chis.begin() + i, chis.begin() + i + n)); + } + } + + snarss1.stop(); + QDPIO::cout << "Time to compute " << chis.size() + << " inversions: " << snarss1.getTimeInSeconds() << " secs" << std::endl; + } + + /// Apply the inverse to a list of LatticeFermions + /// \param sol: Chimera invertor + /// \param psis: output lattice spin-color tensor, at least dimensions csxyztX + /// \param chis: input lattice spin-color tensor, at least dimensions csxyztX + /// \param max_rhs: maximum number of vectors solved at once + + Operator getOperator(const ChimeraSolver& sol, int max_rhs) + { + if (sol.op) + return sol.op; + + LatticeFermion a; + auto blkd = + asTensorView(a) + .toComplex() + .template reshape_dimensions({{"x", "0x"}, {"y", "1y"}, {"z", "2z"}, {"t", "3t"}}, + {{'0', 1}, {'1', 1}, {'2', 1}, {'3', 1}}, true) + .make_eg(); + auto PP_handle = sol.PP; + return { + [=](Tensor x, Tensor y) { + Tracker _t("chroma's inversion "); + detail::doInversion(*PP_handle, x, max_rhs).copyTo(y); + }, + blkd, // domain + blkd, // image + nullptr, // no conjugate + "", // no order_t + XEvenOddLayout, 
+ XEvenOddLayout, + DenseOperator(), + ColumnMajor, // preferred ordering + false /* no Kronecker form */ + }; + } + + /// Returns a projector onto the smallest singular space on an operator + /// \param op: operator to make the projector onto + /// \param ops: options to select the projector and influence the projector construction + + template + Projector getProjector(const Operator& op, const Options& ops) + { + enum ProjectorType { DEFL, MGDEFL }; + static const std::map projectorTypeMap{{"defl", DEFL}, + {"mg", MGDEFL}}; + ProjectorType projectorType = getOption(ops, "type", projectorTypeMap); + switch (projectorType) + { + case DEFL: // plain deflation + return detail::getDeflationProj(op, ops); + case MGDEFL: // Multigrid deflation + return detail::getMGDeflationProj(op, ops); + } + throw std::runtime_error("This shouldn't happen"); + } + + /// Constructor + /// \param fermAction: XML for the fermion action + /// \param projParam: XML for a projector onto the quark propagator + /// \param u: gauge fields + + ChimeraProjector::ChimeraProjector(const GroupXML_t& fermAction, const GroupXML_t& projParam, + const multi1d& u) + { + // Initialize fermion action and state + std::istringstream xml_s(fermAction.xml); + XMLReader fermacttop(xml_s); + QDPIO::cout << "FermAct = " << fermAction.id << std::endl; + S = TheFermionActionFactory::Instance().createObject(fermAction.id, fermacttop, + fermAction.path); + state = S->createState(u); + + // If the inverter is MGPROTON, use this infrastructure + if (projParam.id == std::string("MGPROTON")) + { + QDPIO::cout << "Setting up MGPROTON projector..." 
<< std::endl; + Tracker _t("setup mgproton projector"); + + // Parse XML with the inverter options + std::shared_ptr ops = getOptionsFromXML(broadcast(projParam.xml)); + + // Clone the matvec + LinearOperator* fLinOp = S->genLinOp(state); + ColOrdering co = getOption(*ops, "Projector/operator_ordering", + getColOrderingMap(), ColumnMajor); + ColOrdering co_blk = getOption(*ops, "Projector/operator_block_ordering", + getColOrderingMap(), RowMajor); + Operator linOp = detail::cloneOperator( + asOperatorView(*fLinOp), co, co_blk, detail::ConsiderBlockingSparse, "chroma's operator"); + + // Destroy chroma objects + delete fLinOp; + state = State(); + S = Action(); + + // Construct the projector + op = getProjector(linOp, getOptions(*ops, "Projector")); + + // Clean cache of operators + detail::cleanEvenOddOperatorsCache(); + + QDPIO::cout << "MGPROTON projector ready; setup time: " + << detail::tostr(_t.stopAndGetElapsedTime()) << " s" << std::endl; + } + else + { + chroma_proj = S->projector(state, projParam); + } + } + + /// Apply the oblique projector V*inv(U^H*op*V)*U^H*op + /// \param proj: Chimera projector + /// \param psis: output lattice spin-color tensor, at least dimensions csxyztX + /// \param chis: input lattice spin-color tensor, at least dimensions csxyztX + /// \param max_rhs: maximum number of vectors solved at once + + void doVUAObliqueProjector(const ChimeraProjector& proj, MultipleLatticeFermions& psis, + const ConstMultipleLatticeFermions& chis, int max_rhs) + { + if (chis.size() == 0) + return; + + StopWatch snarss1; + snarss1.reset(); + snarss1.start(); + + if (max_rhs <= 0) + max_rhs = chis.size(); + + // Do the projection + if (proj.op.op) + { + auto tchi = proj.op.op.make_compatible_dom("n", {{'n', max_rhs}}); + auto tpsi = proj.op.op.make_compatible_img("n", {{'n', max_rhs}}); + for (int i = 0, n = std::min(max_rhs, (int)chis.size()); i < chis.size(); + i += n, n = std::min((int)chis.size() - i, max_rhs)) + { + // Adjust the size of tchi and 
tpsi to n + auto this_tchi = tchi.kvslice_from_size({{'n', 0}}, {{'n', n}}); + auto this_tpsi = tpsi.kvslice_from_size({{'n', 0}}, {{'n', n}}); + + // Copy chis into this_tchi + for (int j = 0; j < n; ++j) + asTensorView(*chis[i + j]).copyTo(this_tchi.kvslice_from_size({{'n', j}}, {{'n', 1}})); + + // Do the projection: this_tpsi = V * inv(U'*D*V) * U' * D * this_tchi + proj.op.V_inv_Ut(proj.op.op(this_tchi), this_tpsi); + + // Copy the solution into psis + for (int j = 0; j < n; ++j) + this_tpsi.kvslice_from_size({{'n', j}}, {{'n', 1}}).copyTo(asTensorView(*psis[i + j])); + } + } + else + { + for (int i = 0, n = std::min(max_rhs, (int)chis.size()); i < chis.size(); + i += n, n = std::min((int)chis.size() - i, max_rhs)) + { + proj.chroma_proj->VUAObliqueProjector( + MultipleLatticeFermions(psis.begin() + i, psis.begin() + i + n), + ConstMultipleLatticeFermions(chis.begin() + i, chis.begin() + i + n)); + } + } + + snarss1.stop(); + QDPIO::cout << "Time to compute " << chis.size() + << " projection: " << snarss1.getTimeInSeconds() << " secs" << std::endl; + } + + /// Get the rank of the oblique projector V*inv(U^H*op*V)*U^H*op + /// \param proj: Chimera projector + + unsigned int getProjectorRank(const ChimeraProjector& proj) + { + if (proj.op.op) + return proj.op.lambdas.size(); + return proj.chroma_proj->rank(); + } + + /// Get the left basis of the oblique projector V*inv(U^H*op*V)*U^H*op + /// \param proj: Chimera projector + /// \param from: index of the first basis column + /// \param psis: output lattice spin-color tensor, at least dimensions csxyztX + + void getV(const ChimeraProjector& proj, unsigned int from, MultipleLatticeFermions& psis) + { + if (!proj.op.op) + { + for (unsigned int i = 0; i < psis.size(); ++i) + proj.chroma_proj->V(from + i, *psis[i]); + } + else + { + auto V = proj.op.V(from, psis.size(), 'i'); + for (unsigned int i = 0; i < psis.size(); ++i) + V.kvslice_from_size({{'i', i}}, {{'i', 1}}).copyTo(asTensorView(*psis[i])); + } + } + + /// 
Get the right basis of the oblique projector V*inv(U^H*op*V)*U^H*op + /// \param proj: Chimera projector + /// \param from: index of the first basis column + /// \param psis: output lattice spin-color tensor, at least dimensions csxyztX + + void getU(const ChimeraProjector& proj, unsigned int from, MultipleLatticeFermions& psis) + { + if (!proj.op.op) + { + for (unsigned int i = 0; i < psis.size(); ++i) + proj.chroma_proj->U(from + i, *psis[i]); + } + else + { + auto U = proj.op.U(from, psis.size(), 'i'); + for (unsigned int i = 0; i < psis.size(); ++i) + U.kvslice_from_size({{'i', i}}, {{'i', 1}}).copyTo(asTensorView(*psis[i])); + } + } + + /// Get U_i^H*op*V_i for the oblique projector V*inv(U^H*op*V)*U^H*op + /// \param proj: Chimera projector + /// \param index: index of the U and V basis + + DComplex getLambda(const ChimeraProjector& proj, unsigned int index) + { + DComplex r; + if (!proj.op.op) + { + proj.chroma_proj->lambda(index, r); + } + else + { + auto l = proj.op.lambdas[index]; + r.elem().elem().elem() = QDP::RComplex(std::real(l), std::imag(l)); + } + return r; + } + } +} + +#endif // BUILD_SB diff --git a/lib/util/ferm/mgproton.h b/lib/util/ferm/mgproton.h new file mode 100644 index 0000000000..50216a0ce9 --- /dev/null +++ b/lib/util/ferm/mgproton.h @@ -0,0 +1,148 @@ +// -*- C++ -*- +/*! 
\file + * \brief Multigrid prototype next + * + * Hadron spectrum calculations utilities + */ + +#ifndef __INCLUDE_MGPROTON__ +#define __INCLUDE_MGPROTON__ + +#include "chromabase.h" +#include "util/ferm/superb_contractions.h" +#include "util/ferm/superb_options.h" + +#include + +#ifdef BUILD_SB +namespace Chroma +{ + namespace SB + { + /// Either a Chroma solver or a superb solver + struct ChimeraSolver { + /// Action type + using Action = Handle< + FermionAction, multi1d>>; + + /// Action + Action S; + + /// State type + using State = + Handle, multi1d>>; + + /// State + State state; + + /// Chroma solver (optional) + Handle> PP; + + /// Operator on scxyztX (optional) + Operator op; + + /// Constructor + /// \param fermAction: XML for the fermion action + /// \param invParam: XML for the quark propagator + /// \param u: gauge fields + ChimeraSolver(const GroupXML_t& fermAction, const GroupXML_t& invParam, + const multi1d& u); + }; + + /// Multiple spin-color lattice fields + template + Tensor + doInversion(const ChimeraSolver& sol, const Tensor chi, int t_source, + int first_tslice_out, int n_tslice_out, const std::vector& spin_sources, + int max_rhs, const std::string& order_out = "cSxyztXns"); + + using MultipleLatticeFermions = std::vector>; + using ConstMultipleLatticeFermions = std::vector>; + + void doInversion(const ChimeraSolver& sol, MultipleLatticeFermions& psis, + const ConstMultipleLatticeFermions& chis, int max_rhs = 0); + Operator getOperator(const ChimeraSolver& sol, int max_rhs = 1); + + template + EigensolverFun getInexactEigensolverGD(Operator op, + const Options& ops); + + /// Function that returns V[from..from+size-1] + /// \param from: first index to return + /// \param size: number of vectors to return starting from `from` + /// \param label: label indicating the columns + + template + using VectorFun = std::function(unsigned int, unsigned int, char)>; + + /// Returns the pieces of a projector of the form: + /// V*inv(U'*op*V)*U'*op, where 
U'*op*V = diag(lambda) + /// + /// It can work as a projector on the right of inv(op), that is P*inv(op), where + /// P = V*inv(U'*op*V)*U'*op + /// or it can work as a projector on the left of inv(op), that is inv(op)*Q, where + /// Q = op*V*inv(U'*op*V)*U' + /// + /// Then, the trace of either P*inv(op) or inv(op)*Q is + /// tr V*inv(U'*op*V)*U' = \sum_i u_i'*vi/lambda_i + + template + struct Projector { + /// Function that applies V * inv(U'*op*V) * U' + Operator V_inv_Ut; + + /// Function that returns the i-th left base of the projector + VectorFun V; + + /// Function that returns the i-th right base of the projector + VectorFun U; + + /// Inner products with the operator, lambda_i = U[i]'*op*V[i] + std::vector lambdas; + + /// Function that applies the operator + Operator op; + }; + + /// Chroma or mgproton projector, that is, P*P*x = P*x + + struct ChimeraProjector { + /// Action type + using Action = Handle< + FermionAction, multi1d>>; + + /// Action + Action S; + + /// State type + using State = + Handle, multi1d>>; + + /// State + State state; + + /// Chroma projector (optional) + Handle> chroma_proj; + + /// Operator on scxyztX (optional) + SB::Projector op; + + /// Constructor + /// \param fermAction: XML for the fermion action + /// \param projParam: XML for the projector + /// \param u: gauge fields + ChimeraProjector(const GroupXML_t& fermAction, const GroupXML_t& projParam, + const multi1d& u); + }; + + void doVUAObliqueProjector(const ChimeraProjector& proj, MultipleLatticeFermions& psis, + const ConstMultipleLatticeFermions& chis, int max_rhs = 0); + unsigned int getProjectorRank(const ChimeraProjector& proj); + void getV(const ChimeraProjector& proj, unsigned int from, MultipleLatticeFermions& psis); + void getU(const ChimeraProjector& proj, unsigned int from, MultipleLatticeFermions& psis); + DComplex getLambda(const ChimeraProjector& proj, unsigned int index); + } +} +#endif // BUILD_SB + +#endif // __INCLUDE_MGPROTON__ diff --git 
a/lib/util/ferm/staggered_operators_s.cc b/lib/util/ferm/staggered_operators_s.cc index f2d980c4ad..0fb105bf3e 100644 --- a/lib/util/ferm/staggered_operators_s.cc +++ b/lib/util/ferm/staggered_operators_s.cc @@ -19,13 +19,13 @@ namespace Chroma class Datum{ public: multi1d d; - Real sign ; - void init(int i,int j, int k, int l, Real s){ + REAL sign ; + void init(int i,int j, int k, int l, REAL s){ d.resize(4) ; sign=s ; d[0]=i;d[1]=j;d[2]=k;d[3]=l ; } Datum(){d.resize(4);sign=1.0;} - Datum(int i,int j, int k, int l, Real s){ + Datum(int i,int j, int k, int l, REAL s){ init(i,j,k,l,s); } ~Datum(){} diff --git a/lib/util/ferm/superb_contractions.h b/lib/util/ferm/superb_contractions.h index 466fb38d25..67582abbc5 100644 --- a/lib/util/ferm/superb_contractions.h +++ b/lib/util/ferm/superb_contractions.h @@ -18,12 +18,12 @@ # include "actions/ferm/fermacts/fermact_factory_w.h" # include "actions/ferm/fermacts/fermacts_aggregate_w.h" +# include "meas/hadron/greedy_coloring.h" # include "meas/smear/link_smearing_factory.h" # include "qdp.h" # include "qdp_map_obj_disk_multiple.h" # include "superbblas.h" # include "util/ferm/key_timeslice_colorvec.h" -# include "util/ft/sftmom.h" # include # include # include @@ -32,6 +32,7 @@ # include # include # include +# include # include # include # include @@ -45,6 +46,9 @@ # ifdef BUILD_PRIMME # include +# if defined(SUPERBBLAS_USE_HIP) +# include +# endif # endif namespace Chroma @@ -53,12 +57,28 @@ namespace Chroma namespace SB { - using Index = superbblas::IndexType; - using Complex = std::complex; - using ComplexD = std::complex; - using ComplexF = std::complex; + using Index = superbblas::IndexType; ///< Default index type, `int` for now + using Complex = std::complex; ///< Default chroma complex precision + using ComplexD = std::complex; ///< Complex double + using ComplexF = std::complex; ///< Complex single + + /// Implicit complex type, there's a 2-size dimension for representing + /// the real and the imaginary part, 
which usually has the label `.` + template + struct DIYComplex { + using value_type = T; + }; + + /// Type to represent coordinates template using Coor = superbblas::Coor; + + /// Type to represent tensor layouts, its coordinate has the number of + /// elements to jump to the element with the next coordinate + template + using Stride = superbblas::Coor; + + /// Type of checksum, used by StorageTensor using checksum_type = superbblas::checksum_type; /// Where to store the tensor (see class Tensor) @@ -67,14 +87,29 @@ namespace Chroma OnDefaultDevice ///< on GPU memory if possible }; - /// How to distribute the tensor (see class Tensor) - enum Distribution { - OnMaster, ///< Fully supported on node with index zero - OnEveryone, ///< Distribute the lattice dimensions (x, y, z, t) as chroma does - OnEveryoneReplicated, ///< All nodes have a copy of the tensor - Local ///< Non-collective + /// Whether the file is in a local or a shared filesystem + enum LocalSharedFile { + LocalFSFile, ///< on a local file system + SharedFSFile ///< on a shared file system }; + /// How to distribute the tensor (see class Tensor) + using Distribution = std::string; + /// Fully supported on node with index zero + static const Distribution OnMaster("__OnMaster__"); + /// Distribute the lattice dimensions (x, y, z, t) + static const Distribution OnEveryone("tzyx"); + /// Distribute the lattice dimensions (x, y, z, t) as chroma does + static const Distribution OnEveryoneAsChroma("__OnEveryonAsChroma__"); + /// All nodes have a copy of the tensor + static const Distribution OnEveryoneReplicated("__OnEveryoneReplicated__"); + /// Local (single process) and non-collective + static const Distribution Local(""); + /// Only the local process has support and non-collective + static const Distribution Glocal("__glocal__"); + /// Distribute along the labels, one at a time + static const Distribution OnEveryoneCompact("__OnEveryoneCompact__"); + /// Whether complex conjugate the elements before 
contraction (see Tensor::contract) enum Conjugation { NotConjugate, Conjugate }; @@ -84,18 +119,99 @@ namespace Chroma /// Whether to copy or add the values into the destination tensor (see Tensor::doAction) enum Action { CopyTo, AddTo }; - /// Auxiliary class for initialize Maybe with no value + /// Auxiliary class for initialize Maybe with no value struct None { }; + template ::value> + struct Maybe; + + template + struct is_maybe { + static constexpr bool value = false; + }; + + template <> + struct is_maybe { + static constexpr bool value = true; + }; + + template + struct is_maybe> { + static constexpr bool value = true; + }; + + /// Class for optional values + template + struct Maybe { + /// Whether the value is set + bool has_value; + + /// The value + T value; + + /// Constructor without a value + Maybe() : has_value{false}, value{} + { + } + + /// Constructor without a value + Maybe(None) : Maybe() + { + } + + /// Constructor with a value + template ::value && std::is_convertible::value), + bool>::type = true> + Maybe(const Q& t) : has_value{true}, value{T(t)} + { + } + + /// Return whether it has been initialized with a value + bool hasSome() const + { + return has_value; + } + + /// Return whether it has been initialized with a value + explicit operator bool() const noexcept + { + return has_value; + } + + /// Return the value if it has been initialized with some + T getSome() const + { + if (has_value) + return value; + throw std::runtime_error("Maybe::getSome: value isn't set"); + } + + /// Return the value if it has been initialized with some; otherwise return `def` + T getSome(T def) const + { + if (has_value) + return getSome(); + else + return def; + } + }; + /// Class for optional values template - struct Maybe { - /// opt_val.first is whether a value was set, and opt_val.second has the value if that's the case - std::pair opt_val; + struct Maybe { + /// Whether the value is set + bool has_value; + + /// The value type: change references by 
pointers + using Tvalue = typename std::remove_reference::type*; + + /// The value + Tvalue value; /// Constructor without a value - Maybe() : opt_val{false, {}} + Maybe() : has_value{false}, value{} { } @@ -105,30 +221,35 @@ namespace Chroma } /// Constructor with a value - template - Maybe(Q t) : opt_val{true, T(t)} + Maybe(const T& t) : has_value{true}, value{&t} { } /// Return whether it has been initialized with a value bool hasSome() const { - return opt_val.first; + return has_value; + } + + /// Return whether it has been initialized with a value + explicit operator bool() const noexcept + { + return has_value; } /// Return the value if it has been initialized with some T getSome() const { - if (opt_val.first) - return opt_val.second; - throw std::runtime_error("W!"); + if (has_value) + return *value; + throw std::runtime_error("Maybe::getSome: value isn't set"); } /// Return the value if it has been initialized with some; otherwise return `def` T getSome(T def) const { - if (opt_val.first) - return opt_val.second; + if (has_value) + return getSome(); else return def; } @@ -144,6 +265,15 @@ namespace Chroma .count(); } + /// Wrapper around superbblas time tracking + struct Tracker : public superbblas::detail::tracker { + Tracker(const std::string& funcName) + : superbblas::detail::tracker(funcName, superbblas::detail::Cpu{}, + true /* time it anyway */) + { + } + }; + namespace detail { /// Throw an error if it is not a valid order, that is, if some label is repeated @@ -176,8 +306,8 @@ namespace Chroma enum Throw_kvcoors { NoThrow, ThrowOnUnmatchLabel, ThrowOnMissing }; - template - Coor kvcoors(const std::string& order, const std::map& m, Index missing = 0, + template + Coor kvcoors(const std::string& order, const std::map& m, Index missing = 0, Throw_kvcoors t = ThrowOnUnmatchLabel) { detail::check_order(order); @@ -255,9 +385,9 @@ namespace Chroma // Return the equivalent value of the coordinate `v` in the interval [0, dim[ for a periodic // dimension 
with length `dim`. - inline int normalize_coor(int v, int dim) + inline int normalize_coor(int coor, int dim) { - return (v + dim * (v < 0 ? -v / dim + 1 : 0)) % dim; + return (dim == 0 ? 0 : (coor + dim * (coor < 0 ? -coor / dim + 1 : 0)) % dim); } // Return the equivalent value of the coordinate `v` in the interval [0, dim[ for a periodic @@ -276,6 +406,34 @@ namespace Chroma { using namespace superbblas::detail; + /// Return whether a character is in a string + /// \param s: the string + /// \param c: the character + + inline bool is_in(const std::string& s, char c) + { + return std::find(s.begin(), s.end(), c) != s.end(); + } + + inline std::map update_kvcoor(const std::map& kvcoor, const remap& m) + { + std::map r; + for (auto const& it : kvcoor) + r[m.at(it.first)] = it.second; + return r; + } + + /// Return the inverse map + /// \param map: from domain labels to image labels + + inline remap reverse(const remap& map) + { + remap o; + for (const auto& it : map) + o.insert({it.second, it.first}); + return o; + } + // Throw an error if `order` does not contain a label in `should_contain` inline void check_order_contains(const std::string& order, const std::string& should_contain) { @@ -318,19 +476,25 @@ namespace Chroma return union_dimensions(order, "", remove_dims); } - template - std::string update_order(std::string order, const remap& m) + inline std::string update_order(std::string order, const remap& m) { - for (std::size_t i = 0; i < N; ++i) + for (std::size_t i = 0; i < order.size(); ++i) { auto it = m.find(order[i]); if (it != m.end()) order[i] = it->second; } - check_order(order); return order; } + template + std::string update_order_and_check(std::string order, const remap& m) + { + std::string new_order = update_order(order, m); + check_order(new_order); + return new_order; + } + template Coor remove_coor(Coor v, std::size_t pos) { @@ -386,78 +550,388 @@ namespace Chroma return v; } - // Return a context on either the host or the device - inline 
std::shared_ptr getContext(DeviceHost dev) + /// Return a character not given + /// \param used_labels: labels not to use + + inline char get_free_label(const std::string& used_labels) + { + for (char c = '0'; true; ++c) + { + if (used_labels.find(c) == std::string::npos) + return c; + if (c == std::numeric_limits::max()) + break; + } + throw std::runtime_error("get_free_labels: out of labels"); + } + + /// Return a string version of the number in scientific notation + /// \param v: number to convert + /// \param prec: number of digits to print + + inline std::string tostr(double v, unsigned int prec = 2) + { + std::stringstream ss; + ss << std::scientific << std::setprecision(prec) << v; + return ss.str(); + } + + /// Return a map to transform given labels into another ones + /// \param labels: labels to remap + /// \param used_labels: labels not to use, besides `labels` + + inline remap getNewLabels(const std::string& labels, std::string used_labels) + { + remap r; + used_labels += labels; + for (unsigned int i = 0; i < labels.size(); ++i) + { + char c = get_free_label(used_labels); + r[labels[i]] = c; + used_labels.push_back(c); + } + return r; + } + + // Return an array with an order, used by superbblas + + template + std::array to_sb_order(const std::string& order) + { + if (order.size() != N) + throw std::runtime_error( + "to_sb_order: the given string doesn't match the template parameter"); + std::array c; + std::copy_n(order.begin(), N, c.begin()); + return c; + } + + /// Return the volume associated to an order + /// \param m: size of each dimension + /// \param labels: labels to consider + + inline std::size_t volume(const std::map& m, const std::string& labels) + { + if (labels.size() == 0) + return 0; + std::size_t vol = 1; + for (char c : labels) + vol *= (std::size_t)m.at(c); + return vol; + } + + enum CoorType { From, Size }; + + /// Return whether two from-size ranges are compatible on common dimensions + /// \param o0: order for the first 
coordinates + /// \param from0: first coordinate on the first range + /// \param size0: range size on the first range + /// \param o0: order for the second coordinates + /// \param from1: first coordinate on the second range + /// \param size1: range size on the second range + /// \param labelsToCompare: labels to compare + + template + bool compatibleRanges(const std::string& o0, const Coor& from0, const Coor& size0, + const std::string& o1, const Coor& from1, const Coor& size1, + const std::string& labelsToCompare) + { + if (o0.size() != N0 || o1.size() != N1) + throw std::runtime_error("compatibleRanges: invalid size of input ordering"); + std::map> mfs0; + for (unsigned int i = 0; i < N0; ++i) + if (std::find(labelsToCompare.begin(), labelsToCompare.end(), o0[i]) != + labelsToCompare.end()) + mfs0[o0[i]] = {{from0[i], size0[i]}}; + for (unsigned int i = 0; i < N1; ++i) + { + if (mfs0.count(o1[i]) == 0) + continue; + auto fs0 = mfs0.at(o1[i]); + if (fs0[0] != from1[i] || fs0[1] != size1[i]) + return false; + } + return true; + } + + /// Coarse a range given a blocking + /// \param fs: range to block + /// \param blocking: blocking on each coordinate + + template + std::array, 2> coarse_range(const std::array, 2>& fs, const Coor& blocking) + { + std::array, 2> r; + for (unsigned int i = 0; i < N; ++i) + { + r[0][i] = fs[0][i] / blocking[i] * blocking[i]; + r[1][i] = (fs[0][i] + fs[1][i] + blocking[i] - 1) / blocking[i] * blocking[i] - r[0][i]; + } + return r; + } + + /// Split a dimension into another dimensions + /// \param pos: dimension to split + /// \param c: coordinate to transform + /// \param new_dim: dimensions of the new tensor + /// \param t: either `From` (first element) or `Size` (number of elements in each dimension) + + template + Coor split_dimension(std::size_t pos, const Coor& c, const Coor& new_dim, + CoorType t) + { + constexpr std::size_t Nnew = Nout + 1 - N; + Coor r; + std::copy_n(c.begin(), pos, r.begin()); + Index stride = 1; + for 
(unsigned int k = 0; k < Nnew; ++k) + { + if (!(t == From || c[pos] < stride || c[pos] % stride == 0)) + throw std::runtime_error("split_dimension: Not supporting for this partition"); + if (t == From) + r[pos + k] = (c[pos] / stride) % new_dim[pos + k]; + else + r[pos + k] = std::min((c[pos] + stride - 1) / stride, new_dim[pos + k]); + stride *= new_dim[pos + k]; + } + if (t == Size && new_dim[pos + Nnew - 1] != std::numeric_limits::max() && + c[pos] > stride) + throw std::runtime_error("split_dimension: dimension shorter than it should be"); + std::copy_n(c.begin() + pos + 1, N - pos - 1, r.begin() + pos + Nnew); + return r; + } + + /// Collapse several dimensions into another dimension + /// \param pos: first dimension to collapse + /// \param c: coordinate to transform + /// \param old_dim: dimensions of the old tensor + /// \param t: either `From` (first element) or `Size` (number of elements in each dimension) + + template + Coor collapse_dimensions(std::size_t pos, const Coor& c, const Coor& old_dim, + CoorType t) + { + if (pos >= N || pos >= Nout) + throw std::runtime_error("collapse_dimensions: invalid pos"); + constexpr std::size_t Ncol = N + 1 - Nout; // number of dimensions to collapse + Coor r; + std::copy_n(c.begin(), pos, r.begin()); + Index stride = 1, i = (t == From ? 
0 : 1); + bool odd_dim_watched = false; + for (unsigned int k = 0; k < Ncol; ++k) + { + if (t == Size && c[pos + k] > 0 && c[pos + k] != old_dim[pos + k]) + { + if (odd_dim_watched) + throw std::runtime_error( + "collapse_dimensions: unsupported to collapse a range with holes"); + odd_dim_watched = true; + } + if (t == From) + i += c[pos + k] * stride; + else + i *= c[pos + k]; + stride *= old_dim[pos + k]; + } + r[pos] = i; + std::copy_n(c.begin() + pos + Ncol, N - pos - Ncol, r.begin() + pos + 1); + return r; + } + + enum ReshapeDimensionsError { + Success, ///< success + CollapseRangeWithHoles, ///< unsupported to collapse a range with holes + SizeNotDivisibleByPartition, ///< unsupported size for the partition + NewDimensionIsTooShort ///< new dimension shorter than it should be + }; + + /// Reshape several dimensions into another dimension + /// \param ncollapse: number of dimensions to collapse starting from each old dimension + /// \param nsplit: number of dimensions to split starting from each old dimension + /// \param old_dim: dimensions of the old tensor + /// \param new_dim: maximum dimension size for the new tensor + /// \param t: either `From` (first element) or `Size` (number of elements in each dimension) + /// \param c: coordinate to transform + + template + std::pair> + reshape_dimensions(const Coor& ncollapse, const Coor& nsplit, const Coor& old_dim, + const Coor& new_dim, CoorType t, const Coor& c) + { + Coor r; + unsigned int ri = 0; + for (unsigned int ci = 0; ci < N; ++ci) + { + if (ncollapse[ci] == 1 && nsplit[ci] == 1) + { + r[ri++] = c[ci]; + } + else + { + // Collapse the dimensions from it[0] up to it[0]+it[1]-1 + Index idx = 0; + { + Index stride = 1; + idx = (t == From ? 
0 : 1); + bool odd_dim_watched = false; + for (unsigned int k = 0; k < ncollapse[ci]; ++k) + { + if (t == Size && c[ci + k] > 0 && c[ci + k] != old_dim[ci + k]) + { + if (odd_dim_watched) + return {CollapseRangeWithHoles, {}}; + odd_dim_watched = true; + } + if (t == From) + idx += c[ci + k] * stride; + else + idx *= c[ci + k]; + stride *= old_dim[ci + k]; + } + } + + // Split the new dimension into it[2] new dimensions + { + Index stride = 1; + for (unsigned int k = 0; k < nsplit[ci]; ++k) + { + if (!(t == From || idx < stride || idx % stride == 0)) + return {SizeNotDivisibleByPartition, {}}; + if (t == From) + r[ri + k] = (idx / stride) % new_dim[ri + k]; + else + r[ri + k] = std::min((idx + stride - 1) / stride, new_dim[ri + k]); + stride *= new_dim[ri + k]; + } + if (t == Size && new_dim[ri + nsplit[ci] - 1] != std::numeric_limits::max() && + idx > stride) + return {NewDimensionIsTooShort, {}}; + } + + ri += nsplit[ci]; + ci += ncollapse[ci] - 1; + } + } + + // Return the new coordinates + return {Success, r}; + } + + /// Destroy function + using DestroyFun = std::function; + + /// Return a list of destroy callbacks to execute just before finishing chroma + + inline std::vector& getDestroyList() + { + static std::vector list; + return list; + } + + // Get the cpu context + inline std::shared_ptr& getCpuContext() { - // Creating GPU context can be expensive; so do it once - static std::shared_ptr cudactx; static std::shared_ptr cpuctx; if (!cpuctx) + { cpuctx = std::make_shared(superbblas::createCpuContext()); + getDestroyList().push_back([] { getCpuContext().reset(); }); + } + return cpuctx; + } + + // Return a context on either the host or the device + inline std::shared_ptr& getGpuContext() + { + // Creating GPU context can be expensive; so do it once + static std::shared_ptr cudactx; - switch (dev) +# ifdef SUPERBBLAS_USE_GPU + if (!cudactx) { - case OnHost: return cpuctx; - case OnDefaultDevice: -# ifdef QDP_IS_QDPJIT - if (!cudactx) - { - int dev = -1; -# 
ifdef SUPERBBLAS_USE_CUDA - superbblas::detail::cudaCheck(cudaGetDevice(&dev)); -# elif defined(SUPERBBLAS_USE_HIP) - superbblas::detail::hipCheck(hipGetDevice(&dev)); + int dev = -1; +# if defined(QDP_IS_QDPJIT) + // When using QDP-JIT, the GPU device to use is already selected +# ifdef SUPERBBLAS_USE_CUDA + superbblas::detail::gpuCheck(cudaGetDevice(&dev)); +# elif defined(SUPERBBLAS_USE_HIP) + superbblas::detail::gpuCheck(hipGetDevice(&dev)); +# else +# error unsupported GPU platform +# endif # else -# error superbblas was not build with support for GPUs -# endif - - // Workaround on a potential issue in qdp-jit: avoid passing through the pool allocator - if (jit_config_get_max_allocation() == 0) + // When not using QDP-JIT, select the GPU device based on either the local + // MPI rank or the global MPI rank and assuming that consecutive MPI ranks + // tends to be on the same node. + const char* l = std::getenv("SB_NUM_GPUS_ON_NODE"); + if (l) + { + dev = Layout::nodeNumber() % std::atoi(l); + } + else + { + const char* l = std::getenv("SLURM_LOCALID"); + if (l) { - cudactx = std::make_shared(superbblas::createGpuContext(dev)); + dev = std::atoi(l); } else { - cudactx = std::make_shared(superbblas::createGpuContext( - dev, - - // Make superbblas use the same memory allocator for gpu as any other qdp-jit lattice object - [](std::size_t size, superbblas::platform plat) -> void* { - if (size == 0) - return nullptr; - if (plat == superbblas::CPU) - return malloc(size); - void* ptr = nullptr; - QDP_get_global_cache().addDeviceStatic(&ptr, size, true); - assert(superbblas::detail::getPtrDevice(ptr) >= 0); - return ptr; - }, - - // The corresponding deallocator - [](void* ptr, superbblas::platform plat) { - if (ptr == nullptr) - return; - if (plat == superbblas::CPU) - free(ptr); - else - QDP_get_global_cache().signoffViaPtr(ptr); - })); + QDPIO::cerr << "Please set SB_NUM_GPUS_ON_NODE or SLURM_LOCALID" << std::endl; + QDP_abort(1); } } - return cudactx; -# else - 
return cpuctx; -# endif +# endif + + // Workaround on a potential issue in qdp-jit: avoid passing through the pool allocator +# if defined(QDP_IS_QDPJIT) + if (jit_config_get_max_allocation() != 0) + { + // Make superbblas use the same memory allocator for gpu as any other qdp-jit lattice object + superbblas::getCustomAllocator() = [](std::size_t size, + superbblas::platform plat) -> void* { + if (size == 0) + return nullptr; + if (plat == superbblas::CPU) + return malloc(size); + void* ptr = nullptr; + QDP_get_global_cache().addDeviceStatic(&ptr, size, true); + assert(superbblas::detail::getPtrDevice(ptr) >= 0); + return ptr; + }; + + // The corresponding deallocator + superbblas::getCustomDeallocator() = [](void* ptr, superbblas::platform plat) { + if (ptr == nullptr) + return; + if (plat == superbblas::CPU) + free(ptr); + else + QDP_get_global_cache().signoffViaPtr(ptr); + }; + } +# endif // defined(QDP_IS_QDPJIT) + cudactx = std::make_shared(superbblas::createGpuContext(dev)); + getDestroyList().push_back([] { getGpuContext().reset(); }); } - throw std::runtime_error("Unsupported `DeviceHost`"); + return cudactx; +# else // SUPERBBLAS_USE_GPU + return getCpuContext(); +# endif // SUPERBBLAS_USE_GPU + } + + // Return a context on either the host or the device + inline const std::shared_ptr& getContext(DeviceHost dev) + { + return dev == OnHost ? getCpuContext() : getGpuContext(); } /// Return if two devices are the same inline bool is_same(DeviceHost a, DeviceHost b) { -# ifdef QDP_IS_QDPJIT +# ifdef SUPERBBLAS_USE_GPU return a == b; # else // Without gpus, OnHost and OnDefaultDevice means on cpu. 
@@ -465,6 +939,55 @@ namespace Chroma # endif } + /// is_complex::value is true if `T` is complex + + template + struct is_complex : std::false_type { + }; + + template + struct is_complex> : std::true_type { + }; + + /// real_type::type is T::value_type if T is complex or DIYComplex; otherwise it is T + + template + struct real_type { + using type = T; + }; + + template + struct real_type> { + using type = T; + }; + + template + struct real_type> { + using type = T; + }; + + /// is_diycomplex::value is true if `T` is DIYComplex + + template + struct is_diycomplex : std::false_type { + }; + + template + struct is_diycomplex> : std::true_type { + }; + + /// base_type::type is T excepting for base_type>::type that is T + + template + struct base_type { + using type = T; + }; + + template + struct base_type> { + using type = T; + }; + /// Return x if conjugate is false and conj(x) otherwise /// \param x: value to conjugate @@ -485,16 +1008,64 @@ namespace Chroma { std::string r(N, 0); for (std::size_t i = 0; i < N; ++i) - r[i] = i % 128; + r[i] = (i + 1) % 128; return r; } - /// Stores the subtensor supported on each node (used by class Tensor) - template - struct TensorPartition { - public: - using PartitionStored = std::vector>; - Coor dim; ///< Dimensions of the tensor + /// Return a map with pairs from elements withdraw from two lists + /// ita_begin, first element for the first pair + /// ita_end, first element not to include + /// itb_begin, second element for the first pair + + template + std::map zip(ITA_BEGIN ita_begin, ITA_END ita_end, ITB_BEGIN itb_begin) + { + std::map m; + while (ita_begin != ita_end) + { + m[*ita_begin] = *itb_begin; + ita_begin++; + itb_begin++; + } + return m; + } + + /// Return whether the tensor isn't local or on master or replicated + inline bool isDistributedOnEveryone(const Distribution& dist) + { + return dist != OnMaster && dist != OnEveryoneReplicated && dist != Local && dist != Glocal; + } + + /// Return whether the tensor is 
local (not collective) + inline bool is_distribution_local(const Distribution& dist) + { + return dist == Local || dist == Glocal; + } + + /// Return whether the tensor isn't local or on master or replicated + inline Distribution compatible_replicated_distribution(const Distribution& dist) + { + if (dist == Local || dist == Glocal) + return dist; + return OnEveryoneReplicated; + } + + /// Return whether the tensor isn't local or on master or replicated + inline Distribution + compatible_oneveryone_distribution(const Distribution& dist, + const Distribution& everyone_dist = OnEveryone) + { + if (dist == Local || dist == Glocal) + return dist; + return everyone_dist; + } + + /// Stores the subtensor supported on each node (used by class Tensor) + template + struct TensorPartition { + public: + using PartitionStored = std::vector>; + Coor dim; ///< Dimensions of the tensor PartitionStored p; ///< p[i] = {first coordinate, size} of tensor on i-th node bool isLocal; ///< Whether the partition is non-collective @@ -507,15 +1078,35 @@ namespace Chroma { detail::check_order(order); isLocal = false; - switch (dist) + if (dist == OnMaster) + { + p = all_tensor_on_master(dim); + } + else if (dist == OnEveryoneReplicated) + { + p = all_tensor_replicated(dim); + } + else if (dist == OnEveryoneAsChroma) + { + p = partitioning_chroma_compatible(order, dim); + } + else if (dist == Local) { - case OnMaster: p = all_tensor_on_master(dim); break; - case OnEveryone: p = partitioning_chroma_compatible(order, dim); break; - case OnEveryoneReplicated: p = all_tensor_replicated(dim); break; - case Local: p = local(dim); isLocal = true; - break; + } + else if (dist == Glocal) + { + p = all_tensor_glocal(dim); + } + else if (dist.size() > OnEveryoneCompact.size() && + dist.substr(0, OnEveryoneCompact.size()) == OnEveryoneCompact) + { + p = partitioning_distributed_compact(order, dim, dist.substr(OnEveryoneCompact.size())); + } + else + { + p = partitioning_distributed(order, dim, dist); } } 
@@ -524,11 +1115,16 @@ namespace Chroma /// \param p: partition /// \praam isLocal: whether the tensor is local - TensorPartition(Coor dim, const PartitionStored& p, bool isLocal = false) + TensorPartition(Coor dim, const PartitionStored& p, bool isLocal) : dim(dim), p(p), isLocal(isLocal) { } + /// Empty constructor + TensorPartition() : dim{}, p{}, isLocal{false} + { + } + /// Return the volume of the tensor supported on this node std::size_t localVolume() const { @@ -583,6 +1179,69 @@ namespace Chroma return (isLocal ? 0 : Layout::nodeNumber()); } + /// Return the maximum local volume supported supported by a process + + std::size_t maxLocalVolume() const + { + std::size_t maxLocalVol = 0; + for (const auto& it : p) + maxLocalVol = std::max(maxLocalVol, superbblas::detail::volume(it[1])); + return maxLocalVol; + } + + /// Return whether other partition is compatible with this one + + template + bool is_compatible(const std::string& o0, const TensorPartition& t, + const std::string& o1, const std::string& labelToCompare) const + { + if (t.p.size() != p.size()) + return false; + for (unsigned int i = 0; i < p.size(); ++i) + if (!compatibleRanges(o0, p[i][0], p[i][1], o1, t.p[i][0], t.p[i][1], labelToCompare)) + return false; + + return true; + } + + /// Make a partition compatible with a given one + + template + TensorPartition make_compatible(const std::string& o, const std::string& o1, + const Coor& new_dim, + const std::string& labelsToCompare) const + { + if (o.size() != N || o1.size() != N1) + throw std::runtime_error( + "make_compatible: one the given orders does not match the expected length"); + + typename TensorPartition::PartitionStored r( + p.size(), std::array, 2>{Coor{{}}, new_dim}); + for (unsigned int i = 0; i < p.size(); ++i) + { + for (unsigned int j = 0; j < N1; ++j) + { + if (std::find(labelsToCompare.begin(), labelsToCompare.end(), o1[j]) != + labelsToCompare.end()) + { + int j0 = std::find(o.begin(), o.end(), o1[j]) - o.begin(); + r[i][0][j] = 
p[i][0][j0]; + r[i][1][j] = p[i][1][j0]; + } + if (superbblas::detail::volume(r[i][1]) == 0) + r[i] = std::array, 2>{Coor{{}}, Coor{{}}}; + } + } + TensorPartition new_t{new_dim, r, isLocal}; + + // TODO: Fusing different tensor partitions isn't trivial. If both partitions differ in the number of + // active processes (processes with nonzero support), then the current implementation will fail + if (!is_compatible(o, new_t, o1, labelsToCompare)) + throw std::runtime_error("make_compatible is broken and you hit a corner case"); + + return new_t; + } + /// Insert a new non-distributed dimension TensorPartition insert_dimension(std::size_t pos, std::size_t dim_size) const @@ -591,7 +1250,7 @@ namespace Chroma r.reserve(p.size()); for (const auto& i : p) r.push_back({insert_coor(i[0], pos, 0), insert_coor(i[1], pos, dim_size)}); - return TensorPartition{insert_coor(dim, pos, dim_size), r}; + return TensorPartition{insert_coor(dim, pos, dim_size), r, isLocal}; } /// Remove a non-distributed dimension @@ -602,28 +1261,114 @@ namespace Chroma r.reserve(p.size()); for (const auto& i : p) r.push_back({remove_coor(i[0], pos), remove_coor(i[1], pos)}); - return TensorPartition{remove_coor(dim, pos), r}; + return TensorPartition{remove_coor(dim, pos), r, isLocal}; } /// Split a dimension into a non-distributed dimension and another dimension - TensorPartition split_dimension(std::size_t pos, Index step) const + template 0), bool>::type = true> + TensorPartition split_dimension(std::size_t pos, const Coor& new_dim) const { - typename TensorPartition::PartitionStored r; + typename TensorPartition::PartitionStored r; + r.reserve(p.size()); + for (const auto& i : p) + r.push_back({detail::split_dimension(pos, i[0], new_dim, From), + detail::split_dimension(pos, i[1], new_dim, Size)}); + return TensorPartition{detail::split_dimension(pos, dim, new_dim, Size), r, + isLocal}; + } + + /// Coarse the ranges on each process + + template 0), bool>::type = true> + TensorPartition 
coarse_support(const Coor& blocking) const + { + typename TensorPartition::PartitionStored r; + r.reserve(p.size()); + for (const auto& i : p) + r.push_back(detail::coarse_range(i, blocking)); + return TensorPartition{dim, r, isLocal}; + } + + /// Collapse several dimensions into another dimension + + template 0), bool>::type = true> + TensorPartition collapse_dimensions(std::size_t pos) const + { + typename TensorPartition::PartitionStored r; + r.reserve(p.size()); + for (const auto& i : p) + r.push_back({detail::collapse_dimensions(pos, i[0], dim, From), + detail::collapse_dimensions(pos, i[1], dim, Size)}); + return TensorPartition{detail::collapse_dimensions(pos, dim, dim, Size), r, + isLocal}; + } + + /// Reshape several dimensions into another dimension + /// \param ncollapse: number of dimensions to collapse starting from each old dimension + /// \param nsplit: number of dimensions to split starting from each old dimension + /// \param new_dim: maximum dimension size for the new tensor + + template 0 && Nout > 0), bool>::type = true> + std::pair> + reshape_dimensions(const Coor& ncollapse, const Coor& nsplit, + const Coor& new_dim) const + { + auto new_dim_aux = + detail::reshape_dimensions(ncollapse, nsplit, dim, new_dim, Size, dim); + if (new_dim_aux.first != Success) + return {new_dim_aux.first, {}}; + typename TensorPartition::PartitionStored r; + r.reserve(p.size()); + for (const auto& i : p) + { + auto new_from = + detail::reshape_dimensions(ncollapse, nsplit, dim, new_dim, From, i[0]); + auto new_size = + detail::reshape_dimensions(ncollapse, nsplit, dim, new_dim, Size, i[1]); + if (new_from.first != Success) + return {new_from.first, {}}; + if (new_size.first != Success) + return {new_size.first, {}}; + r.push_back({new_from.second, new_size.second}); + } + return {Success, TensorPartition{new_dim_aux.second, r, isLocal}}; + } + + /// Extend the support of distributed dimensions by one step in each direction + + TensorPartition extend_support(Coor m) 
const + { + typename TensorPartition::PartitionStored r; + r.reserve(p.size()); + for (const auto& i : p) + { + superbblas::PartitionItem fs; + for (unsigned int j = 0; j < N; ++j) + { + fs[1][j] = std::min(i[1][j] + 2 * m[j], dim[j]); + fs[0][j] = (fs[1][j] < dim[j] ? (i[0][j] - m[j] + dim[j]) % dim[j] : 0); + } + r.push_back(fs); + } + return TensorPartition{dim, r, isLocal}; + } + + /// Return a subpartition given a range + + TensorPartition get_subpartition(const Coor& from, const Coor& size) const + { + typename TensorPartition::PartitionStored r; r.reserve(p.size()); for (const auto& i : p) { - if (i[1][pos] % step != 0 && i[1][pos] > step) - throw std::runtime_error("Unsupported splitting a dimension with an uneven lattice " - "portions in all processes"); - r.push_back( - {insert_coor(replace_coor(i[0], pos, i[0][pos] % step), pos + 1, i[0][pos] / step), - insert_coor(replace_coor(i[1], pos, std::min(i[1][pos], step)), pos + 1, - (i[1][pos] + step - 1) / step)}); + Coor lfrom, lsize; + superbblas::detail::intersection(i[0], i[1], from, size, dim, lfrom, lsize); + r.push_back(superbblas::detail::volume(lsize) == 0 + ? 
std::array, 2>{Coor{{}}, Coor{{}}} + : std::array, 2>{normalize_coor(lfrom - from, dim), lsize}); } - return TensorPartition{ - insert_coor(replace_coor(dim, pos, std::min(dim[pos], step)), pos + 1, dim[pos] / step), - r}; + return TensorPartition{size, r, isLocal}; } /// Return a partition with the local portion of the tensor @@ -634,6 +1379,15 @@ namespace Chroma localSize(), PartitionStored(1, superbblas::PartitionItem{{{}, localSize()}}), true}; } + /// Return a partition with the local portion of the tensor + + TensorPartition get_glocal_partition() const + { + PartitionStored r(p.size()); + r[MpiProcRank()] = p[MpiProcRank()]; + return TensorPartition{dim, r, isLocal}; + } + /// Return a copy of this tensor with a compatible distribution to be contracted with the given tensor /// \param order: labels for this distribution /// \param t: given tensor distribution @@ -663,6 +1417,9 @@ namespace Chroma msize[ordert[i]] = t.p[pi][1][i]; for (std::size_t i = 0; i < N; ++i) r[pi][1][i] = msize[order[i]]; + + if (superbblas::detail::volume(r[pi][1]) == 0) + r[pi] = std::array, 2>{Coor{{}}, Coor{{}}}; } return TensorPartition{dim, r, isLocal}; } @@ -696,14 +1453,25 @@ namespace Chroma static PartitionStored all_tensor_replicated(Coor dim) { int nprocs = Layout::numNodes(); - // Set the first coordinate of the tensor supported on each prop to zero and the size - // to dim + // Set the range size to given dim PartitionStored fs(nprocs); for (auto& it : fs) it[1] = dim; return fs; } + /// Return a partitioning where only the current node has support for the tensor + /// \param dim: dimension size for the tensor + + static PartitionStored all_tensor_glocal(Coor dim) + { + int nprocs = Layout::numNodes(); + // Set the first coordinate of the tensor supported on each prop to zero and the size + PartitionStored fs(nprocs); + fs[Layout::nodeNumber()][1] = dim; + return fs; + } + /// Return a partitioning for a tensor of `dim` dimension onto a grid of processes /// \param 
order: dimension labels (use x, y, z, t for lattice dimensions) /// \param dim: dimension size for the tensor @@ -764,9 +1532,81 @@ namespace Chroma break; } } + + // Normalize + if (superbblas::detail::volume(fs[rank][1]) == 0) + fs[rank] = std::array, 2>{Coor{{}}, Coor{{}}}; } return fs; } + + /// Return a partitioning for a tensor of `dim` dimension onto a grid of processes + /// \param order: dimension labels + /// \param dim: dimension size for the tensor + /// \param order: labels to distribute + + static PartitionStored partitioning_distributed(const std::string& order, + const Coor& dim, + const std::string& dist_labels) + { + Coor dist_dim = dim; + + // Update the dimension x with the even-odd label X + { + const auto& itX = std::find(order.begin(), order.end(), 'X'); + const auto& itx = std::find(order.begin(), order.end(), 'x'); + if (itX != order.end() && itx != order.end()) + { + dist_dim[itx - order.begin()] *= dim[itX - order.begin()]; + } + } + + // Avoid splitting even and odds components for xyzt + const std::string even_odd_labels = "xyzt"; + for (unsigned int i = 0; i < even_odd_labels.size(); ++i) + { + const auto& it = std::find(order.begin(), order.end(), even_odd_labels[i]); + if (it != order.end() && dist_dim[it - order.begin()] % 2 == 0) + dist_dim[it - order.begin()] /= 2; + } + + int num_procs = Layout::numNodes(); + auto procs = superbblas::partitioning_distributed_procs(order.c_str(), dist_dim, + dist_labels.c_str(), num_procs); + return superbblas::basic_partitioning(order.c_str(), dim, procs, dist_labels.c_str(), + num_procs); + } + + /// Return a partitioning for a tensor of `dim` dimension onto a grid of processes + /// \param order: dimension labels + /// \param dim: dimension size for the tensor + /// \param order: labels to distribute + + static PartitionStored partitioning_distributed_compact(const std::string& order, + const Coor& dim, + const std::string& dist_labels) + { + Coor procs; + for (unsigned int i = 0; i < N; ++i) 
procs[i] = 1; + + // Update the dimension x with the even-odd label X + { + int num_procs = Layout::numNodes(); // remaining processes + for (int dist_label_index = 0; dist_label_index < dist_labels.size(); + ++dist_label_index) + { + const auto& it = std::find(order.begin(), order.end(), dist_labels[dist_label_index]); + if (it == order.end()) + continue; + + procs[it - order.begin()] = std::min(dim[it - order.begin()], num_procs); + num_procs = num_procs / procs[it - order.begin()]; + } + } + + return superbblas::basic_partitioning(order.c_str(), dim, procs, dist_labels.c_str(), + Layout::numNodes()); + } }; template @@ -783,6 +1623,15 @@ namespace Chroma template struct NaN; + /// Specialization for int + template <> + struct NaN { + static int get() + { + return std::numeric_limits::min(); + } + }; + /// Specialization for float template <> struct NaN { @@ -810,6 +1659,15 @@ namespace Chroma } }; + /// Specialization for DIYComplex + template + struct NaN> { + static T get() + { + return NaN::get(); + } + }; + /// Return if a float, double, and std::complex is finite template struct IsFinite { @@ -842,18 +1700,6 @@ namespace Chroma return s; } - template - Ostream& operator<<(Ostream& s, Distribution dist) - { - switch (dist) - { - case OnMaster: s << "OnMaster"; break; - case OnEveryone: s << "OnEveryone"; break; - case OnEveryoneReplicated: s << "OnEveryoneReplicated"; break; - } - return s; - } - template Ostream& operator<<(Ostream& s, std::complex o) { @@ -869,6 +1715,9 @@ namespace Chroma return s; } + template + Ostream& operator<<(Ostream& s, const std::array& o); + template Ostream& operator<<(Ostream& s, const std::vector& o) { @@ -915,16 +1764,6 @@ namespace Chroma log(1, ss.str()); } - /// is_complex::value is true if `T` is complex - - template - struct is_complex : std::false_type { - }; - - template - struct is_complex> : std::true_type { - }; - template ::value, bool>::type = true> T safe_div(A a, B b) @@ -951,33 +1790,122 @@ namespace Chroma 
}(); return v; } + + /// Data allocation + template + struct Allocation { + /// Allocation + T* ptr; + + /// Context of the allocation + std::shared_ptr ctx; + + /// Unfinished operations on the allocation + std::vector pending_operations; + + /// Deallocate the pointer on destruction + bool destroy_ptr; + + /// Allocate n elements of type T with the given context + /// \param n: number of elements to allocate + /// \param ctx: context of the allocation + + Allocation(std::size_t n, const std::shared_ptr& ctx) + : ptr(superbblas::allocate(n, *ctx)), ctx(ctx), destroy_ptr(true) + { + } + + /// User given pointer that will not be deallocated automatically + /// \param ptr: pointer to an allocation + /// \param ctx: context of the allocation + + Allocation(T* ptr, const std::shared_ptr& ctx) + : ptr(ptr), ctx(ctx), destroy_ptr(false) + { + } + + /// Destructor + + ~Allocation() + { + finish_pending_operations(); + if (destroy_ptr) + superbblas::deallocate(ptr, *ctx); + } + + /// Return the pointer + + T* data() + { + finish_pending_operations(); + return ptr; + } + + /// Append a pending operation + /// \param req: pending operation to finish later + /// NOTE: finish the operation for pointers not managed; those come from Chroma objects + /// and Chroma users may get unexpected results if they access the Chroma object + /// while the proxy Tensor object created with `asTensorView` is still alive. 
+ + void append_pending_operation(const superbblas::Request& req) + { + if (destroy_ptr) + { + if (req) + pending_operations.push_back(req); + } + else + { + superbblas::wait(req); + } + } + + /// Finish all pending operations + void finish_pending_operations() + { + for (const auto& i : pending_operations) + superbblas::wait(i); + pending_operations.clear(); + } + }; } + enum CopyingTrash { dontCopyingTrash, doCopyingTrash }; + /// Class for operating dense tensors template struct Tensor { - static_assert(superbblas::supported_type::value, "Not supported type"); + using value_type = typename detail::base_type::type; + static_assert(superbblas::supported_type::value, "Not supported type"); + + /// Allocation type + /// NOTE: the complex types decay to the base type, which is needed by `toFakeReal` + using Allocation = detail::Allocation::type>; public: - std::string order; ///< Labels of the tensor dimensions - Coor dim; ///< Length of the tensor dimensions - std::shared_ptr ctx; ///< Tensor storage information (device/host) - std::shared_ptr data; ///< Pointer to the tensor storage + std::string order; ///< Labels of the tensor dimensions + Coor dim; ///< Length of the tensor dimensions + std::shared_ptr allocation; ///< Tensor storage std::shared_ptr> - p; ///< Distribution of the tensor among the processes - Distribution dist; ///< Whether the tensor is stored on the cpu or a device - Coor from; ///< First active coordinate in the tensor - Coor size; ///< Number of active coordinates on each dimension - Coor strides; ///< Displacement for the next element along every direction - T scalar; ///< Scalar factor of the tensor - bool conjugate; ///< Whether the values are implicitly conjugated + p; ///< Distribution of the tensor among the processes + Distribution dist; ///< Whether the tensor is stored on the cpu or a device + Coor from; ///< First active coordinate in the tensor + Coor size; ///< Number of active coordinates on each dimension + Stride strides; ///< 
Displacement for the next element along every direction + value_type scalar; ///< Scalar factor of the tensor for unconjugated + bool conjugate; ///< Whether the values are implicitly conjugated + bool eg; ///< Whether this tensor is an example + bool unordered_writing; ///< Whether to allow execution of writing operations + /// in a different order than the one issue + char complexLabel; /// label for the dimension having the real and the imaginary part + /// (only used when `T` is DIYComplex /// Return a string describing the tensor /// \param ptr: pointer to the memory allocation /// \return: the string representing the tensor - std::string repr(T* ptr = nullptr) const + std::string repr(value_type* ptr = nullptr) const { using namespace detail::repr; std::stringstream ss; @@ -985,8 +1913,8 @@ namespace Chroma if (ptr) ss << "data:" << ptr << ", "; std::size_t sizemb = p->localVolume() * sizeof(T) / 1024 / 1024; - ss << "order:" << order << ", dim:" << dim << ", dist:" << dist - << ", local_storage:" << sizemb << " MiB}"; + ss << "order:" << order << ", from:" << from << ", size:" << size << ", dim:" << dim + << ", dist:" << dist << ", local_storage:" << sizemb << " MiB}"; return ss.str(); } @@ -998,10 +1926,11 @@ namespace Chroma /// \param dist: how to distribute the tensor, see `Distribution` Tensor(const std::string& order, Coor dim, DeviceHost dev = OnDefaultDevice, - Distribution dist = OnEveryone) + Distribution dist = OnEveryone, char complexLabel = 0) : Tensor(order, dim, dev, dist, std::make_shared>( - detail::TensorPartition(order, dim, dist))) + detail::TensorPartition(order, dim, dist)), + false /*= unordered_writing */, complexLabel) { } @@ -1009,16 +1938,18 @@ namespace Chroma Tensor() : order(detail::getTrivialOrder(N)), - dim{}, - ctx(detail::getContext(OnHost)), + dim{{}}, p(std::make_shared>( - detail::TensorPartition(detail::getTrivialOrder(N), {}, OnEveryoneReplicated))), + detail::TensorPartition(detail::getTrivialOrder(N), {{}}, 
OnEveryoneReplicated))), dist(OnEveryoneReplicated), - from{}, - size{}, - strides{}, + from{{}}, + size{{}}, + strides{{}}, scalar{0}, - conjugate{false} + conjugate{false}, + eg{false}, + unordered_writing{false}, + complexLabel{0} { } @@ -1028,19 +1959,23 @@ namespace Chroma /// \param dev: where to allocate the content on the GPU if available (`OnDefaultDevice`) /// or on CPU always (`OnHost`) /// \param dist: how to distribute the tensor, see `Distribution` + /// \param ptr: pointer to the first element Tensor(const std::string& order, Coor dim, DeviceHost dev, Distribution dist, - std::shared_ptr data) + value_type* ptr, char complexLabel = 0) : order(order), dim(dim), - ctx(detail::getContext(dev)), - data(data), + allocation(std::make_shared((typename detail::real_type::type*)ptr, + detail::getContext(dev))), dist(dist), - from{}, + from{{}}, size(dim), - strides(detail::get_strides(dim, superbblas::FastToSlow)), + strides(detail::get_strides(dim, superbblas::FastToSlow)), scalar{1}, - conjugate{false} + conjugate{false}, + eg{false}, + unordered_writing{false}, + complexLabel{complexLabel} { checkOrder(); @@ -1053,34 +1988,38 @@ namespace Chroma /// Internal constructor, used by `toFakeReal` /// \param order: dimension labels of the tensor /// \param dim: size for each dimension - /// \param ctx: superbblas context - /// \param data: memory allocation + /// \param allocation: allocation /// \param p: partition of the tensor among the processes /// \param dist: how to distribute the tensor, see `Distribution` /// \param from: coordinate of the first element in this view /// \param size: elements in each direction in this view /// \param scalar: scalar factor of this view /// \param conjugate: whether the elements are implicitly conjugated - - Tensor(const std::string& order, Coor dim, std::shared_ptr ctx, - std::shared_ptr data, std::shared_ptr> p, - Distribution dist, Coor from, Coor size, T scalar, bool conjugate) + /// \param eg: whether the tensor is an 
example gratia + /// \param unordered_writing: whether its allow to apply the writings in different order + /// \param complexLabel: complexity label + + Tensor(const std::string& order, Coor dim, std::shared_ptr allocation, + std::shared_ptr> p, Distribution dist, Coor from, + Coor size, value_type scalar, bool conjugate, bool eg, bool unordered_writing, + char complexLabel) : order(order), dim(dim), - ctx(ctx), - data(data), + allocation(allocation), p(p), dist(dist), from(normalize_coor(from, dim)), size(size), - strides(detail::get_strides(dim, superbblas::FastToSlow)), + strides(detail::get_strides(dim, superbblas::FastToSlow)), scalar(scalar), - conjugate(conjugate) + conjugate(conjugate), + eg(eg), + unordered_writing(unordered_writing), + complexLabel(complexLabel) { checkOrder(); } - protected: /// Internal constructor, used by `make_suitable_for_contraction` /// \param order: dimension labels of the tensor /// \param dim: size for each dimension @@ -1088,31 +2027,29 @@ namespace Chroma /// \param p: partition of the tensor among the processes Tensor(const std::string& order, Coor dim, DeviceHost dev, Distribution dist, - std::shared_ptr> p) + std::shared_ptr> p, bool unordered_writing, + char complexLabel) : order(order), dim(dim), - ctx(detail::getContext(dev)), + /// NOTE: the extra two factor shouldn't apply for DIYComplex + allocation(std::make_shared( + p->localVolume() * (detail::is_complex::value ? 
2u : 1u), detail::getContext(dev))), p(p), dist(dist), - from{}, + from{{}}, size(dim), - strides(detail::get_strides(dim, superbblas::FastToSlow)), + strides(detail::get_strides(dim, superbblas::FastToSlow)), scalar{1}, - conjugate{false} + conjugate{false}, + eg{false}, + unordered_writing{unordered_writing}, + complexLabel{complexLabel} { checkOrder(); - superbblas::Context ctx0 = *ctx; - std::string s = repr(); - detail::log(1, "allocating " + s); - T* ptr = superbblas::allocate(p->localVolume(), *ctx); detail::log_mem(); - data = std::shared_ptr(ptr, [=](const T* ptr) { - superbblas::deallocate(ptr, ctx0); - detail::log(1, "deallocated " + s); - detail::log_mem(); - }); } + protected: /// Internal constructor, used by functions making slices, eg. `kvslice_from_size` /// \param order: dimension labels of the tensor /// \param from: coordinate of the first element in this view @@ -1121,15 +2058,17 @@ namespace Chroma Tensor(const Tensor& t, const std::string& order, Coor from, Coor size) : order(order), dim(t.dim), - ctx(t.ctx), - data(t.data), + allocation(t.allocation), p(t.p), dist(t.dist), from(normalize_coor(from, t.dim)), size(size), strides(t.strides), scalar{t.scalar}, - conjugate{t.conjugate} + conjugate{t.conjugate}, + eg{t.eg}, + unordered_writing{t.unordered_writing}, + complexLabel{t.complexLabel} { checkOrder(); } @@ -1138,18 +2077,20 @@ namespace Chroma /// \param scalar: scalar factor of this view /// \param conjugate: whether the elements are implicitly conjugated - Tensor(const Tensor& t, T scalar, bool conjugate) + Tensor(const Tensor& t, value_type scalar, bool conjugate) : order(t.order), dim(t.dim), - ctx(t.ctx), - data(t.data), + allocation(t.allocation), p(t.p), dist(t.dist), from(t.from), size(t.size), strides(t.strides), scalar{scalar}, - conjugate{conjugate} + conjugate{conjugate}, + eg{false}, + unordered_writing{t.unordered_writing}, + complexLabel{t.complexLabel} { checkOrder(); } @@ -1166,7 +2107,7 @@ namespace Chroma explicit 
operator bool() const noexcept { - return superbblas::detail::volume(size) > 0; + return volume() > 0; } /// Return whether the view doesn't start at the origin or doesn't encompass the whole original tensor @@ -1180,7 +2121,7 @@ namespace Chroma bool isSubtensor() const { - return (from != Coor{} || size != dim); + return (from != Coor{{}} || size != dim); } /// Return the first coordinate supported by the tensor @@ -1215,6 +2156,22 @@ namespace Chroma return d; } + /// Return the allocated dimensions of the tensor + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}); + /// t.kvslice_from_size({}, {{'s',1}}).kvdim(); // is {{'c',Nc},{'s',1}} + /// t.kvslice_from_size({}, {{'s',1}}).alloc_kvdim(); // is {{'c',Nc},{'s',2}} + + std::map alloc_kvdim() const + { + std::map d; + for (unsigned int i = 0; i < N; ++i) + d[order[i]] = dim[i]; + return d; + } + /// Return the number of the elements in the tensor /// /// Example: @@ -1228,40 +2185,161 @@ namespace Chroma return superbblas::detail::volume(size); } - /// Get an element of the tensor - /// \param coor: coordinates of the element to get - /// \return: the value of the element at the coordinate - /// - /// NOTE: - /// - operation allowed only for tensors supported on the CPU and replicated on every process (or local) - /// - the operation is slow, avoid in critical performance operations + /// Return the number of local elements in the tensor to this process /// /// Example: /// - /// Tensor<2,Complex> t("cs", {{Nc,Ns}}, OnHost, OnEveryoneReplicated); - /// t.set({0,1}, 1.0); // set the element with c=0 and s=1 to 1.0 - /// t.get({0,1}); // get the element with c=0 and s=1 - /// - /// Tensor<5,double> q("xyztX", latticeSize<5>("xyztX"), OnHost); - /// q.getLocal().set({0,0,0,0,0}, 1.0); // set the first local element in this process to 1.0 - /// q.getLocal().get({0,0,0,0,0}); // get the first local element in this process + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}); + /// t.localVolume(); // is 
Nc*Ns if t replicated among all processes - T get(Coor coor) const + std::size_t localVolume() const { - if (ctx->plat != superbblas::CPU) - throw std::runtime_error( - "Unsupported to `get` elements from tensors not stored on the host"); - if (dist == OnEveryone) + return p->localVolume(); + } + + /// Return the product of the size for each given label + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}); + /// t.volume("c"); // is Nc + + std::size_t volume(const std::string& labels) const + { + const auto d = kvdim(); + std::size_t vol = 1; + for (char l : labels) + vol *= d.at(l); + return vol; + } + + /// Return whether the tensor is an example + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}); + /// t.volume(); // is Nc*Ns + /// t.kvslice_from_size({}, {{'s',1}}).volume(); // is Nc*1 + + bool is_eg() const + { + return eg; + } + + /// Return if the given tensor has the same distribution as this + /// \param w: tensor to compare with + + template + bool isDistributedAs(Tensor w, Maybe labels = none) const + { + return p->is_compatible(order, *w.p, w.order, labels.getSome(order)); + } + + /// Return if the given tensor the same length for the shared dimensions + /// \param w: tensor to compare with + /// \param labels: dimension labels to compare if given; all labels otherwise + + template + bool is_compatible(Tensor w, Maybe labels = none) const + { + std::string labels_to_compare = labels.getSome(order); + auto dims = kvdim(); + auto wdims = w.kvdim(); + for (char c : labels_to_compare) + if (wdims.count(c) == 1 && wdims.at(c) != dims.at(c)) + return false; + return true; + } + + /// Return the allocation is managed by superbblas + + bool is_managed() const + { + if (!allocation) + return true; + return allocation->destroy_ptr; + } + + /// Return the pointer to the first local element + /// NOTE: there will be no pending writing operations + + value_type* data() const + { + if (!allocation) + return nullptr; + + // If the 
pointer isn't managed by supperbblas, it may be managed by Chroma + // and we make sure that all operations from Chroma side are finished + if (!is_managed()) + superbblas::syncLegacyStream(ctx()); + + return (value_type*)allocation->data(); + } + + /// Return the pointer to the first local element + /// NOTE: there may be pending writing operations if `unordered_writing` is true + + value_type* data_for_writing() const + { + if (!allocation) + return nullptr; + + // If the pointer isn't managed by supperbblas, it may be managed by Chroma + // and we make sure that all operations from Chroma side are finished + if (!allocation->destroy_ptr) + superbblas::syncLegacyStream(ctx()); + + if (unordered_writing) + return (value_type*)allocation->ptr; + + return (value_type*)allocation->data(); + } + + /// Return the allocation context + + superbblas::Context& ctx() const + { + if (!allocation) + return *detail::getContext(OnHost); + return *allocation->ctx; + } + + /// Get an element of the tensor + /// \param coor: coordinates of the element to get + /// \return: the value of the element at the coordinate + /// + /// NOTE: + /// - operation allowed only for tensors supported on the CPU and replicated on every process (or local) + /// - the operation is slow, avoid in critical performance operations + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}, OnHost, OnEveryoneReplicated); + /// t.set({0,1}, 1.0); // set the element with c=0 and s=1 to 1.0 + /// t.get({0,1}); // get the element with c=0 and s=1 + /// + /// Tensor<5,double> q("xyztX", latticeSize<5>("xyztX"), OnHost); + /// q.getLocal().set({0,0,0,0,0}, 1.0); // set the first local element in this process to 1.0 + /// q.getLocal().get({0,0,0,0,0}); // get the first local element in this process + + value_type get(Coor coor) const + { + if (ctx().plat != superbblas::CPU) + throw std::runtime_error( + "Unsupported to `get` elements from tensors not stored on the host"); + if (dist != 
OnEveryoneReplicated && dist != Local && dist != Glocal) throw std::runtime_error( "Unsupported to `get` elements on a distributed tensor; change the distribution to " - "be supported on master, replicated among all nodes, or local"); + "`OnEveryoneReplicated` or local or glocal"); + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); // coor[i] = coor[i] + from[i] for (unsigned int i = 0; i < N; ++i) coor[i] = normalize_coor(normalize_coor(coor[i], size[i]) + from[i], dim[i]); return detail::cond_conj(conjugate, - data.get()[detail::coor2index(coor, dim, strides)] * scalar); + data()[detail::coor2index(coor, dim, strides)] * scalar); } /// Set an element of the tensor @@ -1280,22 +2358,285 @@ namespace Chroma /// Tensor<5,double> q("xyztX", latticeSize<5>("xyztX"), OnHost); /// q.getLocal().set({0,0,0,0,0}, 1.0); // set the first local element in this process to 1.0 - void set(Coor coor, T v) + void set(Coor coor, value_type v) { - if (ctx->plat != superbblas::CPU) - throw std::runtime_error( - "Unsupported to `get` elements from tensors not stored on the host"); - if (dist == OnEveryone) + if (dist != OnEveryoneReplicated && dist != Local && dist != Glocal) throw std::runtime_error( "Unsupported to `set` elements on a distributed tensor; change the distribution to " - "be supported on master, replicated among all nodes, or local"); + "`OnEveryoneReplicated` or local or glocal"); + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); - // coor[i] = coor[i] + from[i] - for (unsigned int i = 0; i < N; ++i) - coor[i] = normalize_coor(normalize_coor(coor[i], size[i]) + from[i], dim[i]); + if (ctx().plat == superbblas::CPU) + { + // coor[i] = coor[i] + from[i] + for (unsigned int i = 0; i < N; ++i) + coor[i] = normalize_coor(normalize_coor(coor[i], size[i]) + from[i], dim[i]); + + data_for_writing()[detail::coor2index(coor, dim, strides)] = + detail::cond_conj(conjugate, v) / scalar; + } + else + { + 
Tensor<1, T>(std::string(1, order[0]), Coor<1>{1}, OnHost, OnEveryoneReplicated, &v) + .copyTo(this->slice_from_size(coor, superbblas::detail::ones())); + } + } + + /// Modify the content this tensor with the result of a function on each element + /// \param func: function () -> value_type + /// \param threaded: whether to run threaded + + template + void fillWithCPUFuncNoArgs(Func func, bool threaded = true) + { + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + auto t = isSubtensor() ? cloneOn(OnHost) : make_sure(none, OnHost); + std::size_t vol = t.getLocal().volume(); + value_type* ptr = t.data_for_writing(); + + if (threaded) + { +# ifdef _OPENMP +# pragma omp parallel for schedule(static) +# endif + for (std::size_t i = 0; i < vol; ++i) + ptr[i] = func(); + } + else + { + for (std::size_t i = 0; i < vol; ++i) + ptr[i] = func(); + } + + t.copyTo(*this); + } + + /// Fill the tensor with the value of the function applied to each element + /// \param func: function (Coor) -> value_type + /// \param threaded: whether to run threaded + + template + void fillCpuFunCoor(Func func, bool threaded = true) + { + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + using superbblas::detail::operator+; + + auto t = isSubtensor() ? 
cloneOn(OnHost) : make_sure(none, OnHost); + std::size_t vol = t.getLocal().volume(); + value_type* ptr = t.data_for_writing(); + /// Number of elements in each direction for the local part + Coor local_size = t.getLocal().size; + /// Stride for the local volume + Stride local_stride = + superbblas::detail::get_strides(local_size, superbblas::FastToSlow); + /// Coordinates of first elements stored locally + Coor local_from = t.p->localFrom(); + + if (threaded) + { +# ifdef _OPENMP +# pragma omp parallel for schedule(static) +# endif + for (std::size_t i = 0; i < vol; ++i) + { + // Get the global coordinates + Coor c = normalize_coor( + superbblas::detail::index2coor(i, local_size, local_stride) + local_from, t.dim); + ptr[i] = func(c); + } + } + else + { + for (std::size_t i = 0; i < vol; ++i) + { + Coor c = normalize_coor( + superbblas::detail::index2coor(i, local_size, local_stride) + local_from, t.dim); + ptr[i] = func(c); + } + } + + t.copyTo(*this); + } + + /// Return a new tensor with the value of the function applied to each element + /// \param func: function value_type -> value_type for Tr + /// \param threaded: whether to run threaded + + template + Tensor transformWithCPUFun(Func func, bool threaded = true) const + { + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); - data.get()[detail::coor2index(coor, dim, strides)] = - detail::cond_conj(conjugate, v) / scalar; + auto t = isSubtensor() ? 
cloneOn(OnHost) : make_sure(none, OnHost); + auto r = t.template make_compatible(); + assert(!r.isSubtensor() && !t.isSubtensor()); + std::size_t vol = t.getLocal().volume(); + value_type* tptr = t.data(); + typename Tensor::value_type* rptr = r.data(); + + if (threaded) + { +# ifdef _OPENMP +# pragma omp parallel for schedule(static) +# endif + for (std::size_t i = 0; i < vol; ++i) + rptr[i] = func(tptr[i]); + } + else + { + for (std::size_t i = 0; i < vol; ++i) + rptr[i] = func(tptr[i]); + } + + return r.make_sure(none, getDev()); + } + + /// Return a new tensor with the value of the function applied to each element + /// \param func: function (Coor, value_type) -> value_type for Tr + /// \param threaded: whether to run threaded + + template + Tensor transformWithCPUFunWithCoor(FuncWithCoor func, bool threaded = true) const + { + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + using superbblas::detail::operator+; + + auto t = isSubtensor() ? cloneOn(OnHost) : make_sure(none, OnHost); + auto r = t.template make_compatible(); + assert(!r.isSubtensor() && !t.isSubtensor()); + std::size_t vol = t.getLocal().volume(); + value_type* tptr = t.data(); + typename Tensor::value_type* rptr = r.data(); + /// Number of elements in each direction for the local part + Coor local_size = t.getLocal().size; + /// Stride for the local volume + Stride local_stride = + superbblas::detail::get_strides(local_size, superbblas::FastToSlow); + /// Coordinates of first elements stored locally + Coor local_from = t.p->localFrom(); + + if (threaded) + { +# ifdef _OPENMP +# pragma omp parallel for schedule(static) +# endif + for (std::size_t i = 0; i < vol; ++i) + { + // Get the global coordinates + Coor c = normalize_coor( + superbblas::detail::index2coor(i, local_size, local_stride) + local_from, t.dim); + rptr[i] = func(c, tptr[i]); + } + } + else + { + for (std::size_t i = 0; i < vol; ++i) + { + // Get the global coordinates + Coor c = 
normalize_coor( + superbblas::detail::index2coor(i, local_size, local_stride) + local_from, t.dim); + rptr[i] = func(c, tptr[i]); + } + } + + return r.make_sure(none, getDev()); + } + + /// Return the coordinates of the first element returning true by the given function + /// \param func: function (value_type) -> bool + + template + Maybe> find(Func func) const + { + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + using superbblas::detail::operator+; + + auto t_ = make_sure(none, OnHost, OnMaster); + auto t = isSubtensor() ? t_.cloneOn(OnHost) : t_; + assert(!t.isSubtensor()); + std::size_t vol = t.getLocal().volume(); + value_type* tptr = t.data(); + /// Number of elements in each direction for the local part + Coor local_size = t.getLocal().size; + /// Stride for the local volume + Stride local_stride = + superbblas::detail::get_strides(local_size, superbblas::FastToSlow); + /// Coordinates of first elements stored locally + Coor local_from = t.p->localFrom(); + + Maybe> r = none; + for (std::size_t i = 0; i < vol; ++i) + { + if (func(detail::cond_conj(t.conjugate, tptr[i]))) + { + // Get the global coordinates + Coor c = normalize_coor( + superbblas::detail::index2coor(i, local_size, local_stride) + local_from, t.dim); + r = Maybe>(c); + break; + } + } + + return broadcast(r); + } + + /// Apply the function to each tensor element + /// \param func: function value_type -> void + /// \param threaded: whether to run threaded + + template + void foreachWithCPUFun(Func func, bool threaded = true) const + { + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + auto t = isSubtensor() ? 
cloneOn(OnHost) : make_sure(none, OnHost); + assert(!t.isSubtensor()); + std::size_t vol = t.getLocal().volume(); + value_type* tptr = t.data(); + + if (threaded) + { +# ifdef _OPENMP +# pragma omp parallel for schedule(static) +# endif + for (std::size_t i = 0; i < vol; ++i) + func(detail::cond_conj(t.conjugate, tptr[i])); + } + else + { + for (std::size_t i = 0; i < vol; ++i) + func(detail::cond_conj(t.conjugate, tptr[i])); + } + } + + /// Set all elements with the given value + /// \param v: the new value for all the elements + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}, OnHost, OnEveryoneReplicated); + /// t.set({0,1}, 1.0); // set the element with c=0 and s=1 to 1.0 + /// + /// Tensor<5,double> q("xyztX", latticeSize<5>("xyztX"), OnHost); + /// q.getLocal().set({0,0,0,0,0}, 1.0); // set the first local element in this process to 1.0 + + void set(value_type v) + { + if (std::norm(v) == 0) + set_zero(); + else + fillWithCPUFuncNoArgs([=]() { return v; }); } /// Return a new tensors with the dimension labels renamed @@ -1310,7 +2651,8 @@ namespace Chroma Tensor rename_dims(const SB::remap& m) const { - return Tensor(*this, detail::update_order(order, m), this->from, this->size); + return Tensor(*this, detail::update_order_and_check(order, m), this->from, + this->size); } /// Return a slice of the tensor starting at coordinate `kvfrom` and taking `kvsize` elements @@ -1319,7 +2661,7 @@ namespace Chroma /// /// \param kvfrom: dictionary with the index of the first element in each direction /// \param kvsize: dictionary with the number of elements in each direction - /// \return: new view of the tensor + /// \return: new view of the tensor /// /// Example: /// @@ -1367,6 +2709,67 @@ namespace Chroma return Tensor(*this, order, this->from + from, size); } + /// Return a similar tensor keeping the same distribution + /// \param new_order: dimension labels of the new tensor + /// \param kvsize: override the length of the given dimensions + /// 
\param new_dev: device + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}); + /// // Create a new tensor as a collection of three `t` tensors + /// Tensor<3,Complex> q = t.make_compatible<3>("csn", {{'n',3}}); + /// // Create a tensor like q but with allocation on host + /// Tensor<3,Complex> v = q.make_compatible(none, {}, OnHost); + + template + Tensor make_compatible(const Maybe& new_order = none, + const std::map& kvsize = {}, + Maybe new_dev = none) const + { + std::map new_kvdim = kvdim(); + for (const auto& it : kvsize) + new_kvdim[it.first] = it.second; + std::string new_order_ = new_order.getSome(order); + auto new_dim = kvcoors(new_order_, new_kvdim, 0, ThrowOnMissing); + std::string same_dim_labels; + auto dim_ = kvdim(); + for (char c : new_order_) + if (dim_.count(c) == 1 && dim_.at(c) == new_kvdim.at(c)) + same_dim_labels.push_back(c); + return Tensor(new_order_, new_dim, new_dev.getSome(getDev()), dist, + std::make_shared>( + p->get_subpartition(from, size) + .make_compatible(order, new_order_, new_dim, same_dim_labels)), + false /* unordered_writing */, complexLabel); + } + + /// Return a tensor on the same device and following the same distribution + /// \param new_order: dimension labels of the new tensor + /// \param remaining_char: placeholder for the remaining dimensions + /// \param kvsize: override the length of the given dimensions + /// \param new_dev: device + /// \param new_dist: distribution + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}); + /// // Create a new tensor as a collection of three `t` tensors + /// Tensor<3,Complex> q = t.make_compatible<3>("%n", '%', "", {{'n',3}}); + /// // Create a tensor like q but without the dimension c + /// Tensor<2,Complex> v = q.make_compatible<2>("%", '%', "c"); + + template + Tensor + make_compatible(const std::string& new_order, char remaining_char, + const std::string& remove_dims = "", const std::map& kvsize = {}, + Maybe new_dev = none) const + { + 
return make_compatible( + detail::remove_dimensions(get_order_for_reorder(new_order, remaining_char), remove_dims), + kvsize, new_dev); + } + /// Return a tensor on the same device and following the same distribution /// \param new_order: dimension labels of the new tensor /// \param kvsize: override the length of the given dimensions @@ -1390,8 +2793,9 @@ namespace Chroma for (const auto& it : kvsize) new_kvdim[it.first] = it.second; std::string new_order_ = new_order.getSome(order); - return Tensor(new_order_, kvcoors(new_order_, new_kvdim, 0, ThrowOnMissing), - new_dev.getSome(getDev()), new_dist.getSome(dist)); + auto new_dim = kvcoors(new_order_, new_kvdim, 0, ThrowOnMissing); + return Tensor(new_order_, new_dim, new_dev.getSome(getDev()), + new_dist.getSome(dist), complexLabel); } /// Return a tensor on the same device and following the same distribution @@ -1409,20 +2813,15 @@ namespace Chroma /// // Create a tensor like q but without the dimension c /// Tensor<2,Complex> v = q.like_this<2>("%", '%', "c"); - template Tensor like_this(const std::string& new_order, char remaining_char, const std::string& remove_dims = "", const std::map& kvsize = {}, Maybe new_dev = none, Maybe new_dist = none) const { - std::map new_kvdim = kvdim(); - for (const auto& it : kvsize) - new_kvdim[it.first] = it.second; - std::string new_order_ = - detail::remove_dimensions(get_order_for_reorder(new_order, remaining_char), remove_dims); - return Tensor(new_order_, kvcoors(new_order_, new_kvdim, 0, ThrowOnMissing), - new_dev.getSome(getDev()), new_dist.getSome(dist)); + return like_this( + detail::remove_dimensions(get_order_for_reorder(new_order, remaining_char), remove_dims), + kvsize, new_dev, new_dist); } /// Return a copy of this tensor, possibly with a new precision `nT` @@ -1451,19 +2850,52 @@ namespace Chroma template Tensor cloneOn(DeviceHost new_dev) const { - Tensor r = like_this(none, {}, new_dev); + if (is_eg()) + throw std::runtime_error("Invalid operation from an 
example tensor"); + + Tensor r = dist != Glocal ? like_this(none, {}, new_dev) + : make_compatible(none, {}, new_dev); + r.conjugate = conjugate; copyTo(r); return r; } - private: - /// Return the new ordering based on a partial reordering - /// \param new_order: new dimension labels order - /// \param remaining_char: if it isn't the null char, placeholder for the dimensions not given + /// Return a template of this tensor /// - /// If the dimension labels order does not match the current order, return a copy of this - /// tensor with that ordering. If the given order does not contain all dimensions, only the - /// involved dimensions are permuted. + /// Example: + /// + /// Tensor<2,std::complex> t("cs", {{Nc,Ns}}); + /// Tensor<2,std::complex> t_eg = t.make_eg(); + /// Tensor<2,std::complex> q = t_eg.like_this(); // create a new tensor like t + + Tensor make_eg() const + { + return Tensor( + order, size, + std::make_shared(Allocation(std::size_t(0), detail::getContext(getDev()))), + std::make_shared>(p->get_subpartition(from, size)), dist, {{}}, + size, value_type{1}, false /* not conjugate */, true /* is eg */, + false /* ordered writing */, complexLabel); + } + + /// Return this tensor but allowing consecutive writing operations (from `copyTo`, `contract`...) + /// to apply non-atomically and in different order than issued. In return, this may reduce the + /// latency impact by overlapping communications with other operations. + + Tensor make_writing_nonatomic() const + { + Tensor t = *this; + t.unordered_writing = true; + return t; + } + + /// Return the new ordering based on a partial reordering + /// \param new_order: new dimension labels order + /// \param remaining_char: if it isn't the null char, placeholder for the dimensions not given + /// + /// If the dimension labels order does not match the current order, return a copy of this + /// tensor with that ordering. 
If the given order does not contain all dimensions, only the + /// involved dimensions are permuted. std::string get_order_for_reorder(const std::string& new_order, char remaining_char = 0) const { @@ -1496,7 +2928,6 @@ namespace Chroma return new_order1; } - public: /// Return a copy of this tensor with the given ordering /// \param new_order: new dimension labels order /// \param remaining_char: if it isn't the null char, placeholder for the dimensions not given @@ -1510,16 +2941,20 @@ namespace Chroma std::string new_order1 = get_order_for_reorder(new_order, remaining_char); if (order == new_order1) return *this; - Tensor r = like_this(new_order1); - copyTo(r); + Tensor r = make_compatible(new_order1); + if (is_eg()) + r = r.make_eg(); + else + copyTo(r); + r.unordered_writing = unordered_writing; return r; } /// Return whether the tensor has complex components although being stored with a non-complex type `T` - bool isFakeReal() const + static constexpr bool isFakeReal() { - return order.find('.') != std::string::npos; + return detail::is_diycomplex::value; } /// Check that the dimension labels are valid @@ -1529,24 +2964,24 @@ namespace Chroma // Check that all labels are different there are N detail::check_order(order); - /// Throw exception if this a fake real tensor but with a complex type `T` - if (isFakeReal() && detail::is_complex::value) - throw std::runtime_error("Invalid tensor: it is fake real and complex!"); + /// Throw exception if this a diycomplex tensor but there's no complexity label + if (isFakeReal() && order.find(complexLabel) == std::string::npos) + throw std::runtime_error("checkOrder: DIYComplex tensor missing the complexity label"); for (auto s : size) if (s < 0) std::runtime_error("Invalid tensor size: it should be positive"); } - /// Return a fake real view of this tensor + /// Return a view of this tensor with an extra label for the real and the imaginary parts + /// \param complexLabel: label to represent the real and the imaginary 
part - template ::value, bool>::type = true> - Tensor toFakeReal() const + template + typename std::enable_if::value && detail::is_complex::value, + Tensor>>::type + toFakeReal(char complexLabel = '.') const { - assert(!isFakeReal()); - - std::string new_order = "." + order; + std::string new_order = std::string(1, complexLabel) + order; Coor new_from = {0}; std::copy_n(from.begin(), N, new_from.begin() + 1); Coor new_size = {2}; @@ -1556,30 +2991,52 @@ namespace Chroma if (std::fabs(std::imag(scalar)) != 0) throw std::runtime_error( "Unsupported conversion to fake real tensors with an implicit complex scale"); - using new_T = typename T::value_type; - new_T new_scalar = std::real(scalar); - auto this_data = data; - auto new_data = - std::shared_ptr((new_T*)data.get(), [=](const new_T* ptr) { (void)this_data; }); + typename T::value_type new_scalar = std::real(scalar); auto new_p = std::make_shared>(p->insert_dimension(0, 2)); - return Tensor(new_order, new_dim, ctx, new_data, new_p, dist, new_from, - new_size, new_scalar, conjugate); + return Tensor>( + new_order, new_dim, allocation, new_p, dist, new_from, new_size, new_scalar, conjugate, + eg, unordered_writing, complexLabel); + } + + template + typename std::enable_if::value && !detail::is_complex::value, + Tensor>>::type + toFakeReal(char complexLabel = '.') const + { + return Tensor>(order, dim, allocation, p, dist, from, size, scalar, + conjugate, eg, unordered_writing, complexLabel); + } + + template + typename std::enable_if::value, Tensor>::type + toFakeReal(char newComplexLabel = 0) const + { + if (newComplexLabel == 0 || newComplexLabel == complexLabel) + return *this; + return rename_dims({{complexLabel, newComplexLabel}}); } - template ::value, bool>::type = true> - Tensor> toComplex(bool allow_cloning = true) const + /// Return a view or a copy of this tensor where the real and the imaginary parts are together in an std::complex + /// \param allow_cloning: whether to allow reordering to put the 
complexity label first + + template + typename std::enable_if::value, + Tensor>>::type + toComplex(bool allow_cloning = true) const { - assert(isFakeReal() && kvdim()['.'] == 2); + std::size_t dot_pos = order.find(complexLabel); + if (dot_pos == std::string::npos) + std::runtime_error("toComplex: invalid complex label"); + if (from[dot_pos] != 0 || size[dot_pos] != 2 || dim[dot_pos] != 2) + std::runtime_error("toComplex: not supported on a slice of the complex label"); - std::size_t dot_pos = order.find('.'); std::string new_order = detail::remove_coor(order, dot_pos); if (dot_pos != 0) { - if (allow_cloning) - return reorder("." + new_order).toComplex(false); + if (allow_cloning || is_eg()) + return reorder(std::string(1, complexLabel) + new_order).toComplex(false); else throw std::runtime_error("Not allow to create a new tensor in `toComplex`"); } @@ -1587,37 +3044,67 @@ namespace Chroma Coor new_from = detail::remove_coor(from, dot_pos); Coor new_size = detail::remove_coor(size, dot_pos); Coor new_dim = detail::remove_coor(dim, dot_pos); - using new_T = std::complex; - new_T new_scalar = new_T{scalar}; - auto this_data = data; - auto new_data = - std::shared_ptr((new_T*)data.get(), [=](const new_T* ptr) { (void)this_data; }); + std::complex new_scalar{scalar}; auto new_p = std::make_shared>(p->remove_dimension(dot_pos)); - return Tensor(new_order, new_dim, ctx, new_data, new_p, dist, new_from, - new_size, new_scalar, conjugate); + return Tensor>( + new_order, new_dim, allocation, new_p, dist, new_from, new_size, new_scalar, conjugate, + eg, unordered_writing, 0 /* no complex label */); } - template ::value, bool>::type = true> - Tensor toFakeReal() const + template + typename std::enable_if::value, Tensor>::type + toComplex(bool allow_cloning = true) const { - assert(isFakeReal()); + (void)allow_cloning; return *this; } - template ::value, bool>::type = true> - Tensor toComplex(bool allow_cloning = true) const + /// Return a copy of the tensor with the values 
conjugated if the tensor is implicitly conjugated + + template + typename std::enable_if::value && detail::is_complex::value, + Tensor>::type + make_conjugate_explicit() const { - (void)allow_cloning; - assert(!isFakeReal()); - return *this; + if (!conjugate) return *this; + return toFakeReal().make_conjugate_explicit().toComplex(); } - /// Return a fake real view of this tensor + template + typename std::enable_if::value && !detail::is_complex::value, + Tensor>::type + make_conjugate_explicit() const + { + if (!conjugate) return *this; + auto t = make_compatible(); + auto this_unconj = conj(); + auto old_scalar = this_unconj.scalar; + this_unconj.scalar = value_type{1}; + this_unconj.kvslice_from_size({{complexLabel, 0}}, {{complexLabel, 1}}) + .copyTo(t.kvslice_from_size({{complexLabel, 0}}, {{complexLabel, 1}})); + this_unconj.kvslice_from_size({{complexLabel, 1}}, {{complexLabel, 1}}).scale(-1) + .copyTo(t.kvslice_from_size({{complexLabel, 1}}, {{complexLabel, 1}})); + return t.scale(old_scalar); + } + + template + typename std::enable_if::value && !detail::is_complex::value, + Tensor>::type + make_conjugate_explicit() const + { + if (!conjugate) return *this; + return conj(); + } + + /// Split a dimension into another dimensions + /// \param dim_label: dimension to split + /// \param new_labels: the labels of the new dimensions + /// \param new_dim: number of elements in each new labels - Tensor split_dimension(char dim_label, std::string new_labels, Index step) const + template 0), bool>::type = true> + Tensor split_dimension(char dim_label, std::string new_labels, + const std::map& new_dim) const { using namespace detail; @@ -1631,47 +3118,307 @@ namespace Chroma throw std::runtime_error(ss.str()); } - // Check the other arguments + // Check the length of the output tensor + if (N + new_labels.size() - 1 != Nout) + throw std::runtime_error( + "split_dimension: `new_labels` doesn't match the output tensor dimensions!"); + + // Check that the size is divisible 
by the new partition + if (new_labels.size() == 0) + { + if (dim[pos] > 1) + throw std::runtime_error("Unsupported remove a dimension that isn't singlet; clone " + "this object before doing the operator"); + } + + // Set the new characteristics of the tensor + std::string new_order = std::string(order.begin(), order.begin() + pos) + new_labels + + std::string(order.begin() + pos + 1, order.end()); + Coor d; + std::copy_n(dim.begin(), pos, d.begin()); + for (unsigned int i = 0; i < new_labels.size(); ++i) + d[pos + i] = new_dim.at(new_labels[i]); + std::copy_n(dim.begin() + pos + 1, N - pos - 1, d.begin() + pos + new_labels.size()); + + // Transform the partition + auto new_p = std::make_shared>(p->split_dimension(pos, d)); + + return Tensor(new_order, new_p->dim, allocation, new_p, dist, + detail::split_dimension(pos, from, d, From), + detail::split_dimension(pos, size, d, Size), scalar, conjugate, eg, + unordered_writing, complexLabel); + } + + /// Split a dimension into another dimensions + /// \param dim_label: dimension to split + /// \param new_labels: the labels of the new dimensions + /// \param step: length of the first label in `new_labels` + + Tensor split_dimension(char dim_label, const std::string& new_labels, + Index step) const + { if (new_labels.size() != 2) - throw std::runtime_error("`new_labels` should have two labels!"); + throw std::runtime_error( + "split_dimension: invalid `new_labels`, it should have size two"); + if (alloc_kvdim().at(dim_label) == 1) + step = 1; + if (alloc_kvdim().at(dim_label) % step != 0) + throw std::runtime_error( + "split_dimension: invalid `step`, it should divide the dimension size"); + return split_dimension( + dim_label, new_labels, + {{new_labels[0], step}, {new_labels[1], alloc_kvdim().at(dim_label) / step}}); + } + + /// Collapse several dimensions into a new one + /// \param dim_label: dimension to split + /// \param new_labels: the labels of the new dimensions + /// \param new_dim: number of elements in each 
new labels + + template 0), bool>::type = true> + Tensor collapse_dimensions(std::string labels, char new_label, + bool allow_copy = false) const + { + using namespace detail; + + // Check that all `labels` are together in `order` + auto s_labels = std::search(order.begin(), order.end(), labels.begin(), labels.end()); + if (s_labels == order.end()) + { + if (!allow_copy) + throw std::runtime_error( + "collapse_dimensions: invalid labels to collapse or they are " + " not appear together in the same ordering and copying is not allow"); + + // Find the position of the first label + std::string new_order; + for (char c : order) + { + if (std::find(labels.begin(), labels.end(), c) != labels.end()) + break; + new_order.push_back(c); + } + new_order += labels + "%"; + return reorder(new_order, '%').template collapse_dimensions(labels, new_label); + } - if (step < 1) - throw std::runtime_error("`step` cannot be zero or negative"); + // Check the length of the output tensor + if (N - labels.size() + 1 != Nout) + throw std::runtime_error( + "collapse_dimensions: `labels` doesn't match the output tensor dimensions!"); - if (size[pos] % step != 0 && size[pos] > step) - throw std::runtime_error("Not supporting `split_dimension` for this lattice dimensions"); + // Lets put the new dimension on the first dimension to collapse + std::size_t pos = s_labels - order.begin(); // Set the new characteristics of the tensor - std::string new_order = - insert_coor(replace_coor(order, pos, new_labels[0]), pos + 1, new_labels[1]); - Coor new_from = - insert_coor(replace_coor(from, pos, from[pos] % step), pos + 1, from[pos] / step); - Coor new_size = insert_coor(replace_coor(size, pos, std::min(size[pos], step)), - pos + 1, (size[pos] + step - 1) / step); - Coor new_dim = insert_coor(replace_coor(dim, pos, std::min(dim[pos], step)), pos + 1, - (dim[pos] + step - 1) / step); + std::string new_order = std::string(order.begin(), order.begin() + pos) + + std::string(1, new_label) + + 
std::string(order.begin() + pos + labels.size(), order.end()); + + // Transform the partition + auto new_p = std::make_shared>( + p->template collapse_dimensions(pos)); + + return Tensor(new_order, new_p->dim, allocation, new_p, dist, + detail::collapse_dimensions(pos, from, dim, From), + detail::collapse_dimensions(pos, size, dim, Size), scalar, + conjugate, eg, unordered_writing, complexLabel); + } + + /// Rearrange several dimensions into new ones + /// \param m: maps from suborder of the current tensor to new orders + /// \param allow_copy: whether to allow to return a reordered copy of the current tensor + + template 0), bool>::type = true> + Tensor reshape_dimensions(const std::map& m, + const std::map& new_dim, + bool allow_copy = true) const + { + using namespace detail; + + // Check that all suborders in `m` are together in `order` + std::string old_order = order; + for (const auto& it : m) + { + if (it.first.size() == 0 || it.second.size() == 0) + throw std::runtime_error("reshape_dimensions: invalid map element with empty string"); + + auto s_labels = std::search(order.begin(), order.end(), it.first.begin(), it.first.end()); + if (s_labels == order.end()) + { + if (!allow_copy) + throw std::runtime_error( + "reshape_dimensions: invalid labels to reshape or they do " + " not appear together in the same way as in the tensor and copying is not allow"); + + // Find the position of the first label to reshape and enforce the given subordering + std::string old_order0; + for (char c : old_order) + { + if (std::find(it.first.begin(), it.first.end(), c) != it.first.end()) + break; + old_order0.push_back(c); + } + old_order = old_order0 + it.first + remove_dimensions(old_order, old_order0 + it.first); + } + } + if (old_order != order) + return reorder(old_order).template reshape_dimensions(m, new_dim, true); + + // Check the length of the output tensor + int nout = N; + for (const auto& it : m) + nout += (int)it.second.size() - (int)it.first.size(); + if (nout != 
Nout) + throw std::runtime_error("reshape_dimensions: the resulting tensor after the changes " + "given in `m` doesn't match the output tensor's dimensions!"); + + // Compute the new order + std::string new_order = order; + for (const auto& it : m) + { + auto s_first = std::find(new_order.begin(), new_order.end(), it.first.front()); + new_order = std::string(new_order.begin(), s_first) + it.second + + std::string(s_first + it.first.size(), new_order.end()); + } + + // Compute the dimensions of the new tensor + auto new_dim0 = kvdim(); + for (const auto& it : new_dim) + new_dim0[it.first] = it.second; + + // The last label on the new subordering is optional + for (const auto& it : m) + if (new_dim.count(it.second.back()) == 0) + new_dim0[it.second.back()] = std::numeric_limits::max(); + + // Compute the number of dimensions to collapse and to split + std::map m_ncollapse, m_nsplit; + for (const auto& it : m) + { + m_ncollapse[it.first.front()] = it.first.size(); + m_nsplit[it.first.front()] = it.second.size(); + } + Coor ncollapse = kvcoors(order, m_ncollapse, 1); + Coor nsplit = kvcoors(order, m_nsplit, 1); + auto d_aux = detail::reshape_dimensions( + ncollapse, nsplit, dim, kvcoors(new_order, new_dim0, 0, ThrowOnMissing), Size, dim); + if (d_aux.first != Success) + throw std::runtime_error( + "reshape_dimensions: invalid reshape, most likely some new dimension is too short"); + auto d = d_aux.second; // new dimensions + + // Transform the partition + auto new_p_aux = p->template reshape_dimensions(ncollapse, nsplit, d); + auto new_from = detail::reshape_dimensions(ncollapse, nsplit, dim, d, From, from); + auto new_size = detail::reshape_dimensions(ncollapse, nsplit, dim, d, Size, size); + + // Whether a compatible partition can be made that doesn't require a copy of the tensors' data + bool success = + (new_p_aux.first == Success && new_from.first == Success && new_size.first == Success); + + // Return the new tensor + if (!allow_copy && !success) + { + throw 
std::runtime_error("reshape_dimensions: unsupported reshape without copying"); + } + else if (success) + { + // Return a tensor with this data but a different shape + auto new_p = std::make_shared>(new_p_aux.second); + return Tensor(new_order, new_p->dim, allocation, new_p, dist, new_from.second, + new_size.second, scalar, conjugate, eg, unordered_writing, + complexLabel); + } + else if (new_size.first != Success) + { + // This shouldn't happen + throw std::runtime_error("reshape_dimensions: something is wrong..."); + } + else + { + // Try the other way around + Tensor r(new_order, new_size.second, getDev(), dist, complexLabel); + r.scalar = scalar; + r.conjugate = conjugate; + r.unordered_writing = unordered_writing; + if (eg) + return r.make_eg(); + std::map reverse_m; + for (const auto& it : m) + reverse_m[it.second] = it.first; + copyTo(r.template reshape_dimensions(reverse_m, kvdim(), false)); + return r; + } + } + + /// Append a dimension with size one + /// \param new_label: label for the new dimension + + template 0), bool>::type = true> + Tensor append_dimension(char new_label) const + { + std::string last_label{order.back()}; + return reshape_dimensions({{last_label, last_label + std::string(1, new_label)}}, + {{new_label, 1}}, false); + } + + /// Coarse the support range of the tensor on each process + /// \param blocking: blocking for each dimension + + template 0), bool>::type = true> + Tensor coarse_support(const std::map& blocking) const + { + // Get the blocking and check that it divides each diension + auto c_blk = kvcoors(order, blocking, 1); + for (std::size_t i = 0; i < N; ++i) + if (dim[i] % c_blk[i] != 0) + throw std::runtime_error( + "coarse_support: the given blocking isn't dividing the tensor dimensions"); + + // Transform the partition + auto new_p = std::make_shared>(p->coarse_support(c_blk)); - auto new_p = - std::make_shared>(p->split_dimension(pos, step)); + // Create output tensor + Tensor r(order, dim, getDev(), dist, new_p, 
unordered_writing, complexLabel); + r.from = from; + r.size = size; + r.conjugate = conjugate; - return Tensor(new_order, new_dim, ctx, data, new_p, dist, new_from, new_size, - scalar, conjugate); + // Return it + if (is_eg()) + return r.make_eg(); + copyTo(r); + return r; } /// Copy/add this tensor into the given one /// NOTE: if this tensor or the given tensor is fake real, force both to be fake real - template ::value != detail::is_complex::value, bool>::type = true> - void doAction(Action action, Tensor w) const + void doAction(Action action, Tensor w, Tensor m = {}, + Tensor wm = {}, const std::string& uneven_mask_labels = "", + CopyingTrash copying_trash = dontCopyingTrash) const { - toFakeReal().doAction(action, w.toFakeReal()); + if (m || wm) + throw std::runtime_error( + "doAction: unsupported mixing real and complex types with masks"); + toFakeReal().doAction(action, w.toFakeReal(), {}, {}, uneven_mask_labels, dontCopyingTrash); } /// Return the local support of this tensor Tensor getLocal() const { + // Shortcut for empty and local tensors + if (!*this || dist == Local) + return *this; + + // Finish writing operations: local tensor will not be able to finish pending writing operations + data(); + // Compute the size of the intersection of the current view and the local support Coor lfrom, lsize; superbblas::detail::intersection(p->localFrom(), p->localSize(), from, size, dim, lfrom, @@ -1682,64 +3429,333 @@ namespace Chroma return Tensor{}; using superbblas::detail::operator-; - return Tensor(order, p->localSize(), ctx, data, + return Tensor(order, p->localSize(), allocation, std::make_shared>(p->get_local_partition()), - Local, normalize_coor(from - p->localFrom(), dim), lsize, scalar, conjugate); + Local, normalize_coor(from - p->localFrom(), dim), lsize, scalar, + conjugate, eg, false /* ordered writing */, complexLabel); + } + + /// Return the local support of this tensor as a subset of the global tensor + Tensor getGlocal() const + { + // Shortcut 
for empty and local tensors + if (!*this || dist == Glocal || dist == Local) + return *this; + + // Finish writing operations: local tensor will not be able to finish pending writing operations + data(); + + return Tensor(order, dim, allocation, + std::make_shared>(p->get_glocal_partition()), + Glocal, from, size, scalar, conjugate, eg, + false /* ordered writing */, complexLabel); } /// Set zero void set_zero() { - T* ptr = this->data.get(); - MPI_Comm comm = (dist == OnMaster || dist == Local ? MPI_COMM_SELF : MPI_COMM_WORLD); + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + value_type* ptr = data_for_writing(); + MPI_Comm comm = + (dist == OnMaster || dist == Local || dist == Glocal ? MPI_COMM_SELF : MPI_COMM_WORLD); + auto p_disp = (dist == Glocal ? p->MpiProcRank() : 0); if (dist != OnMaster || Layout::nodeNumber() == 0) - superbblas::copy(T{0}, p->p.data(), 1, order.c_str(), from, size, (const T**)&ptr, - &*ctx, p->p.data(), 1, order.c_str(), from, &ptr, &*ctx, comm, - superbblas::FastToSlow, superbblas::Copy); + { + superbblas::copy(value_type{0}, p->p.data() + p_disp, 1, order.c_str(), from, size, + dim, (const value_type**)&ptr, nullptr, &ctx(), + p->p.data() + p_disp, 1, order.c_str(), from, dim, &ptr, nullptr, + &ctx(), comm, superbblas::FastToSlow, superbblas::Copy); + // Force synchronization in superbblas stream if the allocation isn't managed by superbblas + if (!is_managed()) + superbblas::sync(ctx()); + } } - /// Copy/Add this tensor into the given one - template + bool is_distributed_like(Tensor v) const + { + return order == v.order && from == v.from && size == v.size && dim == v.dim && + p->p == v.p->p; + } + + /// Return whether the given tensor has the same distribution as this one + template ::type = true> + bool is_distributed_like(Tensor) const + { + return false; + } + + /// Return whether the given tensor has the same shape, distribution, type, and implicit scalar + /// and conjugacy. 
+ /// \param v: tensor to compare + + bool is_like(Tensor v) const + { + return order == v.order && from == v.from && size == v.size && dim == v.dim && + scalar == v.scalar && conjugate == v.conjugate && dist == v.dist && p->p == v.p->p; + } + + template < + std::size_t Nv, typename Tv, + typename std::enable_if<(N != Nv || !std::is_same::value), bool>::type = true> + bool is_like(Tensor) const + { + return false; + } + + /// Return whether the given tensor has the same memory allocation as this one + /// \param v: tensor to compare + + template + bool has_same_allocation(Tensor v) const + { + // Compare the allocation pointer, not the actual allocation.ptr; we are making sure that + // the two allocations are on the same device in this way + return allocation == v.allocation; + } + + template ::type, + typename detail::real_type::type>::value, + bool>::type = true> + bool has_same_allocation(Tensor) const + { + return false; + } + + /// Return whether the given tensor has the same distribution as this one + Tensor create_mask() const + { + Tensor m{order, dim, getDev(), dist, p, false /* ordered writing */, + 0 /* no complex label */}; + m.set_zero(); + m.from = from; + m.size = size; + m.conjugate = conjugate; + return m; + } + + /// Copy/Add this tensor into the given one; and copy only where the values of the mask are nonzero if given + template ::value == detail::is_complex::value, bool>::type = true> - void doAction(Action action, Tensor w) const + detail::is_complex::value == detail::is_complex::value && + detail::is_diycomplex::value == detail::is_diycomplex::value, + bool>::type = true> + void doAction(Action action, Tensor w, Tensor m = {}, + Tensor wm = {}, const std::string& uneven_mask_labels = "", + CopyingTrash copying_trash = dontCopyingTrash) const { + if (is_eg() || w.is_eg() || (m && m.is_eg()) || (wm && wm.is_eg())) + throw std::runtime_error("Invalid operation from an example tensor"); + Coor wsize = kvcoors(order, w.kvdim(), 1, NoThrow); for 
(unsigned int i = 0; i < N; ++i) - if (size[i] > wsize[i]) + if (size[i] > wsize[i] && !detail::is_in(uneven_mask_labels, order[i])) throw std::runtime_error("The destination tensor is smaller than the source tensor"); + if (m || wm) + for (unsigned int i = 0; i < N; ++i) + if (size[i] != wsize[i] && !detail::is_in(uneven_mask_labels, order[i])) + throw std::runtime_error("copying with masks tensor with different dimensions"); - if (action == AddTo && w.scalar != Tw{1}) - throw std::runtime_error("Not allowed to add to a tensor whose implicit scalar factor is not one"); - - if (conjugate != w.conjugate) - throw std::runtime_error("Not allowed to copy or add tensor with different implicit conjugacy"); + if (action == AddTo && w.scalar != decltype(w.scalar){1}) + throw std::runtime_error( + "Not allowed to add to a tensor whose implicit scalar factor is not one"); - if ((dist == Local && w.dist != Local) || (dist != Local && w.dist == Local)) + if (conjugate != w.conjugate && + (detail::is_complex::value || detail::is_diycomplex::value)) { - getLocal().doAction(action, w.getLocal()); + auto this_conj = (conjugate ? *this : conj()).make_conjugate_explicit(); + auto new_this = (conjugate ? this_conj : this_conj.conj()); + new_this.doAction(action, w, m, wm, uneven_mask_labels); return; } - T* ptr = this->data.get(); - Tw* w_ptr = w.data.get(); - MPI_Comm comm = - ((dist == OnMaster && w.dist == OnMaster) || dist == Local ? 
MPI_COMM_SELF - : MPI_COMM_WORLD); - if (dist != OnMaster || w.dist != OnMaster || Layout::nodeNumber() == 0) + bool some_is_local = + dist == Local || w.dist == Local || (m && m.dist == Local) || (wm && wm.dist == Local); + bool some_isnt_local = + dist != Local || w.dist != Local || (m && m.dist != Local) || (wm && wm.dist != Local); + if (some_is_local && some_isnt_local) + throw std::runtime_error( + "Not allowed to copy or add a non-local tensor into a local tensor or vice versa"); + + // Transform to a local copy when both tensors have the same shape and distribution + if (action == CopyTo && is_like(w)) + { + if (some_isnt_local) + { + auto this_local = getLocal(); + auto w_local = w.getLocal(); + if (w_local && this_local) + this_local.doAction(action, w_local, m.getLocal(), wm.getLocal(), uneven_mask_labels); + return; + } + if (some_is_local && has_same_allocation(w) && !m && !wm) + return; + } + + // Check if some dimension size doesn't match + if (m || wm) + { + std::map new_size; + bool v_has_new_size = false, w_has_new_size = false; + for (unsigned int i = 0; i < N; ++i) + { + new_size[order[i]] = std::max(size[i], wsize[i]); + if (size[i] != wsize[i]) + { + v_has_new_size |= new_size[order[i]] != size[i]; + w_has_new_size |= new_size[order[i]] != wsize[i]; + } + } + for (const auto& it : m.kvdim()) + if (new_size.count(it.first) == 0) + new_size[it.first] = it.second; + for (const auto& it : w.kvdim()) + if (new_size.count(it.first) == 0) + new_size[it.first] = it.second; + for (const auto& it : wm.kvdim()) + if (new_size.count(it.first) == 0) + new_size[it.first] = it.second; + if (v_has_new_size || w_has_new_size) + { + auto v0 = *this; + auto w0 = w; + auto m0 = m; + auto wm0 = wm; + if (v_has_new_size) + { + v0 = w_has_new_size ? 
make_compatible(none, new_size) + : w0.template make_compatible(order, new_size); + copyTo(v0, doCopyingTrash); + if (m) + { + m0 = v0.template make_compatible(m.order, new_size); + m0.set_zero(); + m.copyTo(m0); + } + } + if (w_has_new_size) + { + w0 = v0.template make_compatible(w.order, new_size); + w.copyTo(w0, doCopyingTrash); + if (wm) + { + wm0 = w0.template make_compatible(wm.order, new_size); + wm0.set_zero(); + wm.copyTo(wm0); + } + } + v0.doAction(action, w0, m0, wm0); + if (w_has_new_size) + w0.kvslice_from_size({}, w.kvdim()).doAction(CopyTo, w); + return; + } + } + + // Compute masks + float *m0ptr = nullptr, *m1ptr = nullptr; + Tensor m0; + Tensor m1; + if (m || wm) + { + if (m) + { + if (is_distributed_like(m)) + { + m0 = m; + } + else + { + m0 = create_mask(); + m.copyTo(m0); + } + } + + if (wm) + { + if (w.is_distributed_like(wm)) + { + m1 = wm; + } + else + { + m1 = w.create_mask(); + wm.copyTo(m1); + } + } + + if (m && !wm) + m0.copyTo(m1); + if (!m && wm) + m1.copyTo(m0); + + m0ptr = m0.data(); + m1ptr = m1.data(); + } + + // Get the pointers to data + value_type* ptr = data(); + typename decltype(w)::value_type* w_ptr = w.data_for_writing(); + + // Shortcuts for who is involved in the operation + bool do_operation = true; + int p_disp = 0; + MPI_Comm comm = MPI_COMM_WORLD; + // a) if the origin and destination tensors have full support on the master node + // and the destination tensor is only supported on the master node, the operation + // only happens on the master node + if ((dist == OnMaster || dist == OnEveryoneReplicated) && w.dist == OnMaster) + { + do_operation = Layout::nodeNumber() == 0; + comm = MPI_COMM_SELF; + } + // b) if the origin and destination tensors are replicated on every node or they are + // local, the operation happens locally on each node + else if ((dist == OnEveryoneReplicated && w.dist == OnEveryoneReplicated) || dist == Local) + { + comm = MPI_COMM_SELF; + } + // c) any is glocal + if (dist == Glocal || w.dist == 
Glocal) { + comm = MPI_COMM_SELF; + p_disp = p->MpiProcRank(); + } + + if (do_operation) { - superbblas::copy( - detail::safe_div(scalar, w.scalar), p->p.data(), 1, order.c_str(), from, size, - (const T**)&ptr, &*ctx, w.p->p.data(), 1, w.order.c_str(), w.from, &w_ptr, &*w.ctx, - comm, superbblas::FastToSlow, action == CopyTo ? superbblas::Copy : superbblas::Add); + superbblas::Request req; + superbblas::copy(detail::safe_div(scalar, w.scalar), + p->p.data() + p_disp, 1, order.c_str(), from, size, dim, + (const value_type**)&ptr, (const float**)&m0ptr, &ctx(), + w.p->p.data() + p_disp, 1, w.order.c_str(), w.from, w.dim, &w_ptr, + (const float**)&m1ptr, &w.ctx(), comm, superbblas::FastToSlow, + action == CopyTo ? superbblas::Copy : superbblas::Add, + &req /*, copying_trash == doCopyingTrash*/); + w.allocation->append_pending_operation(req); + // Force synchronization in superbblas stream if the destination allocation isn't managed by superbblas + if (!w.is_managed()) + superbblas::sync(w.ctx()); } } /// Copy this tensor into the given one template - void copyTo(Tensor w) const + void copyTo(Tensor w, CopyingTrash copying_trash = dontCopyingTrash) const { - doAction(CopyTo, w); + doAction(CopyTo, w, {}, {}, "", copying_trash); + } + + /// Copy this tensor into the given one but only the elements where the mask is nonzero + template + void copyToWithMask(Tensor w, Tensor m, Tensor wm, + const std::string uneven_mask_labels = "") const + { + doAction(CopyTo, w, m, wm, uneven_mask_labels); } // Add `this` tensor into the given one @@ -1762,12 +3778,13 @@ namespace Chroma Coor vsize = kvcoors(order, v.kvdim(), 0, NoThrow); for (unsigned int i = 0; i < N; ++i) if (vsize[i] != 0 && vsize[i] != size[i]) - throw std::runtime_error("Invalid tensor contractions: one of the dimensions does not match"); + throw std::runtime_error( + "Invalid tensor contractions: one of the dimensions does not match"); auto new_p = std::make_shared>( p->make_suitable_for_contraction(order, *v.p, 
v.order)); - Tensor r(order, dim, getDev(), OnEveryone, new_p); + Tensor r(order, dim, getDev(), OnEveryone, new_p, unordered_writing, complexLabel); copyTo(r); return r; } @@ -1775,10 +3792,16 @@ namespace Chroma // Contract the dimensions with the same label in `v` and `w` than do not appear on `this` tensor. template void contract(Tensor v, const remap& mv, Conjugation conjv, Tensor w, - const remap& mw, Conjugation conjw, const remap& mr = {}, T beta = T{0}) + const remap& mw, Conjugation conjw, const remap& mr = {}, + value_type beta = value_type{0}) { - // If either v or w is on OnDevice, force both to be on device - if (v.ctx->plat != w.ctx->plat) + if (is_eg() || v.is_eg() || w.is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + // NOTE: Superbblas tensor contraction is shit and does not deal with contracting a host and + // device tensor (for now) + // a) If either v or w is on OnDevice, force both to be on device + if (v.ctx().plat != w.ctx().plat) { if (v.getDev() != OnDefaultDevice) v = v.cloneOn(OnDefaultDevice); @@ -1786,13 +3809,8 @@ namespace Chroma w = w.cloneOn(OnDefaultDevice); } - // Superbblas tensor contraction is shit and those not deal with subtensors or contracting a host and - // device tensor (for now) - if (v.isSubtensor()) - v = v.clone(); - if (w.isSubtensor()) - w = w.clone(); - if (isSubtensor() || getDev() != v.getDev()) + // b) Do arrangements if the input tensors are on a different device than the result tensor + if (getDev() != v.getDev()) { Tensor aux = std::norm(beta) == 0 ? 
like_this(none, {}, v.getDev()) : cloneOn(v.getDev()); @@ -1801,73 +3819,271 @@ namespace Chroma return; } - if ((v.dist == Local) != (w.dist == Local) || (w.dist == Local) != (dist == Local)) - throw std::runtime_error( - "One of the contracted tensors or the output tensor is local and others are not!"); - - if ((v.dist == OnMaster && w.dist == OnEveryone) || - (v.dist == OnEveryone && w.dist == OnMaster)) - throw std::runtime_error("Incompatible layout for contractions: one of the tensors is on " - "the master node and the other is distributed"); - - if ((v.dist == OnMaster && w.dist == OnEveryoneReplicated) || - (v.dist == OnEveryoneReplicated && w.dist == OnMaster)) - { - contract(v.make_sure(none, none, OnMaster), mv, conjv, w.make_sure(none, none, OnMaster), - mw, conjw, mr, beta); - return; - } - - if (v.dist == OnEveryone && w.dist == OnEveryoneReplicated) - w = w.make_suitable_for_contraction(v); - - if (v.dist == OnEveryoneReplicated && w.dist == OnEveryone) - v = v.make_suitable_for_contraction(w); - - T* v_ptr = v.data.get(); - T* w_ptr = w.data.get(); - T* ptr = this->data.get(); - std::string orderv_ = detail::update_order(v.order, mv); - std::string orderw_ = detail::update_order(w.order, mw); - std::string order_ = detail::update_order(order, mr); + if ((v.dist == Local) != (w.dist == Local) || (w.dist == Local) != (dist == Local) || + (v.dist == Glocal) != (w.dist == Glocal) || (w.dist == Glocal) != (dist == Glocal)) + throw std::runtime_error("contract: one of the contracted tensors or the output tensor " + "is local/glocal and others are not!"); + + MPI_Comm comm = (dist == Local || dist == Glocal ? MPI_COMM_SELF : MPI_COMM_WORLD); + auto p_disp = (dist == Glocal ? p->MpiProcRank() : 0); + + value_type* v_ptr = v.data(); + value_type* w_ptr = w.data(); + value_type* ptr = std::norm(beta) == 0 ? 
data_for_writing() : data(); + std::string orderv_ = detail::update_order_and_check(v.order, mv); + std::string orderw_ = detail::update_order_and_check(w.order, mw); + std::string order_ = detail::update_order_and_check(order, mr); bool conjv_ = (((conjv == Conjugate) xor v.conjugate) xor conjugate); bool conjw_ = (((conjw == Conjugate) xor w.conjugate) xor conjugate); + superbblas::Request req; superbblas::contraction( detail::cond_conj(conjv_, v.scalar) * detail::cond_conj(conjw_, w.scalar) / scalar, // - v.p->p.data(), 1, orderv_.c_str(), conjv_, (const T**)&v_ptr, &*v.ctx, // - w.p->p.data(), 1, orderw_.c_str(), conjw_, (const T**)&w_ptr, &*w.ctx, // - detail::cond_conj(conjugate, beta), p->p.data(), 1, order_.c_str(), &ptr, &*ctx, - MPI_COMM_WORLD, superbblas::FastToSlow); + v.p->p.data() + p_disp, v.from, v.size, v.dim, 1, orderv_.c_str(), conjv_, + (const value_type**)&v_ptr, &v.ctx(), // + w.p->p.data() + p_disp, w.from, w.size, w.dim, 1, orderw_.c_str(), conjw_, + (const value_type**)&w_ptr, &w.ctx(), // + detail::cond_conj(conjugate, beta), p->p.data() + p_disp, from, size, dim, 1, + order_.c_str(), &ptr, &ctx(), comm, superbblas::FastToSlow, &req); + allocation->append_pending_operation(req); + + // Force synchronization in superbblas stream if the destination allocation isn't managed by superbblas + if (!is_managed()) + superbblas::sync(ctx()); } - /// Return a view of this tensor where the elements are scaled by the given argument - /// \param s: scaling factor - /// \return: a new view (it doesn't create a copy of the tensor) + /// Compute the Cholesky factor of `v' and contract its inverse with `w` + /// \param v: tensor to compute the Cholesky factor + /// \param order_rows: labels that are rows of the matrices to factor + /// \param order_cols: labels that are columns of the matrices to factor + /// \param w: the other tensor to contract - Tensor scale(T s) const + template + void cholInv(Tensor v, const std::string& order_rows, const std::string& 
order_cols, + Tensor w) { - return Tensor(*this, scalar * detail::cond_conj(conjugate, s), conjugate); - } + if (is_eg() || v.is_eg() || w.is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); - /// Return a view of this tensor where the elements are conjuated - /// \return: a new view (it doesn't create a copy of the tensor) - - Tensor conj() const - { - return Tensor(*this, scalar, !conjugate); - } + // Conjugacy isn't supported + if (v.conjugate || w.conjugate || conjugate) + throw std::runtime_error("cholInv: Unsupported implicit conjugate tensors"); - void release() - { - dim = {}; - data.reset(); - p.reset(); - ctx.reset(); - from = {}; - size = {}; - strides = {}; - scalar = T{0}; + // If either v or w is on OnDevice, force both to be on device + if (v.ctx().plat != w.ctx().plat) + { + if (v.getDev() != OnDefaultDevice) + v = v.cloneOn(OnDefaultDevice); + if (w.getDev() != OnDefaultDevice) + w = w.cloneOn(OnDefaultDevice); + } + + // Superbblas tensor contraction is shit and those not deal with subtensors or contracting a host and + // device tensor (for now) + if (v.isSubtensor()) + v = v.clone(); + if (w.isSubtensor()) + w = w.clone(); + if (isSubtensor() || getDev() != v.getDev()) + { + Tensor aux = make_compatible(none, {}, v.getDev()); + aux.cholInv(v, order_rows, order_cols, w); + aux.copyTo(*this); + return; + } + + // v is going to be modified and is reference, make a clone + if (v.allocation.use_count() > 1) + v = v.clone(); + + if ((v.dist == Local) != (w.dist == Local) || (w.dist == Local) != (dist == Local) || + (v.dist == Glocal) != (w.dist == Glocal) || (w.dist == Glocal) != (dist == Glocal)) + throw std::runtime_error("cholInv: one of the contracted tensors or the output tensor " + "is local/glocal and others are not!"); + + if (detail::isDistributedOnEveryone(v.dist) && w.dist == OnEveryoneReplicated) + w = w.make_suitable_for_contraction(v); + + if (v.dist == OnEveryoneReplicated && 
detail::isDistributedOnEveryone(w.dist)) + v = v.make_suitable_for_contraction(w); + + if (std::fabs(std::imag(v.scalar)) != 0 || std::real(v.scalar) < 0) + throw std::runtime_error("cholInv: unsupported a negative or imaginary scale"); + + MPI_Comm comm = (dist == Local || dist == Glocal ? MPI_COMM_SELF : MPI_COMM_WORLD); + auto p_disp = (dist == Glocal ? p->MpiProcRank() : 0); + + value_type* v_ptr = v.data(); + value_type* w_ptr = w.data(); + value_type* ptr = data_for_writing(); + superbblas::cholesky(v.p->p.data() + p_disp, v.dim, 1, v.order.c_str(), &v_ptr, + order_rows.c_str(), order_cols.c_str(), &v.ctx(), comm, + superbblas::FastToSlow); + superbblas::trsm( + w.scalar / std::sqrt(v.scalar) / scalar, // + v.p->p.data() + p_disp, v.dim, 1, v.order.c_str(), (const value_type**)&v_ptr, + order_rows.c_str(), order_cols.c_str(), + &v.ctx(), // + w.p->p.data() + p_disp, w.dim, 1, w.order.c_str(), (const value_type**)&w_ptr, + &w.ctx(), // + p->p.data() + p_disp, dim, 1, order.c_str(), &ptr, &ctx(), comm, superbblas::FastToSlow); + + // Force synchronization in superbblas stream if the destination allocation isn't managed by superbblas + if (!is_managed()) + superbblas::sync(ctx()); + } + + /// Compute the inverse of `v' + /// \param v: tensor to compute the Cholesky factor + /// \param order_rows: labels that are rows of the matrices to factor + /// \param order_cols: labels that are columns of the matrices to factor + /// \param w: the other tensor to contract + + template + void inv(Tensor v, const std::string& order_rows, const std::string& order_cols) + { + if (is_eg() || v.is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + if (isSubtensor() || scalar != T{1} || conjugate) + { + Tensor aux = make_compatible(); + aux.inv(v, order_rows, order_cols); + aux.copyTo(*this); + return; + } + + v.copyTo(*this); + + MPI_Comm comm = (dist == Local || dist == Glocal ? MPI_COMM_SELF : MPI_COMM_WORLD); + auto p_disp = (dist == Glocal ? 
p->MpiProcRank() : 0); + + value_type* ptr = data_for_writing(); + superbblas::inversion(p->p.data() + p_disp, dim, 1, order.c_str(), &ptr, + order_rows.c_str(), order_cols.c_str(), &ctx(), comm, + superbblas::FastToSlow); + + // Force synchronization in superbblas stream if the destination allocation isn't managed by superbblas + if (!is_managed()) + superbblas::sync(ctx()); + } + + /// Solve the linear systems within tensor `v' and right-hand-sides `w` + /// \param v: tensor to compute the Cholesky factor + /// \param order_rows: labels that are rows of the matrices to factor + /// \param order_cols: labels that are columns of the matrices to factor + /// \param w: the other tensor to contract + + template + void solve(Tensor v, const std::string& order_rows, const std::string& order_cols, + Tensor w) + { + if (is_eg() || v.is_eg() || w.is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + // Conjugacy isn't supported + if (v.conjugate || w.conjugate || conjugate) + throw std::runtime_error("solve: Unsupported implicit conjugate tensors"); + + // If either v or w is on OnDevice, force both to be on device + if (v.ctx().plat != w.ctx().plat) + { + if (v.getDev() != OnDefaultDevice) + v = v.cloneOn(OnDefaultDevice); + if (w.getDev() != OnDefaultDevice) + w = w.cloneOn(OnDefaultDevice); + } + + // Superbblas tensor contraction is shit and those not deal with subtensors or contracting a host and + // device tensor (for now) + if (v.isSubtensor()) + v = v.clone(); + if (w.isSubtensor()) + w = w.clone(); + if (isSubtensor() || getDev() != v.getDev()) + { + Tensor aux = make_compatible(none, {}, v.getDev()); + aux.solve(v, order_rows, order_cols, w); + aux.copyTo(*this); + return; + } + + if ((v.dist == Local) != (w.dist == Local) || (w.dist == Local) != (dist == Local) || + (v.dist == Glocal) != (w.dist == Glocal) || (w.dist == Glocal) != (dist == Glocal)) + throw std::runtime_error("solve: one of the contracted tensors or the output 
tensor " + "is local/glocal and others are not!"); + + // Help superbblas to get the same verbatim value in all processes for the same tensor element in all + // replicated copies + // TODO: check whether superbblas does this already + if ((v.dist == OnMaster || v.dist == OnEveryoneReplicated) || + (w.dist == OnMaster && w.dist == OnEveryoneReplicated)) + { + v = v.make_sure(none, none, OnMaster); + w = w.make_sure(none, none, OnMaster); + } + + if (detail::isDistributedOnEveryone(v.dist) && w.dist == OnEveryoneReplicated) + w = w.make_suitable_for_contraction(v); + + if (v.dist == OnEveryoneReplicated && detail::isDistributedOnEveryone(w.dist)) + v = v.make_suitable_for_contraction(w); + + MPI_Comm comm = (dist == Local || dist == Glocal ? MPI_COMM_SELF : MPI_COMM_WORLD); + auto p_disp = (dist == Glocal ? p->MpiProcRank() : 0); + + value_type* v_ptr = v.data(); + value_type* w_ptr = w.data(); + value_type* ptr = data_for_writing(); + superbblas::gesm( + w.scalar / v.scalar / scalar, // + v.p->p.data() + p_disp, v.dim, 1, v.order.c_str(), (const value_type**)&v_ptr, + order_rows.c_str(), order_cols.c_str(), + &v.ctx(), // + w.p->p.data() + p_disp, w.dim, 1, w.order.c_str(), (const value_type**)&w_ptr, + &w.ctx(), // + p->p.data() + p_disp, dim, 1, order.c_str(), &ptr, &ctx(), comm, superbblas::FastToSlow); + + // Force synchronization in superbblas stream if the destination allocation isn't managed by superbblas + if (!is_managed()) + superbblas::sync(ctx()); + } + + /// Return a view of this tensor where the elements are scaled by the given argument + /// \param s: scaling factor + /// \return: a new view (it doesn't create a copy of the tensor) + + Tensor scale(value_type s) const + { + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + return Tensor(*this, scalar * detail::cond_conj(conjugate, s), conjugate); + } + + /// Return a view of this tensor where the elements are conjugated + /// \return: a new view (it doesn't create 
a copy of the tensor) + + Tensor conj() const + { + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + // NOTE: don't conjugate `scalar`: it's a value associated to the allocation, NOT the view + return Tensor(*this, scalar, !conjugate); + } + + void release() + { + dim = {{}}; + allocation.reset(); + p.reset(); + from = {{}}; + size = {{}}; + strides = {{}}; + scalar = value_type{0}; conjugate = false; + eg = false; + unordered_writing = false; + complexLabel = 0; } // Return whether the current view is contiguous in memory @@ -1877,7 +4093,7 @@ namespace Chroma if (dist != OnMaster && dist != Local) return false; - if (superbblas::detail::volume(size) > 0 && N > 1) + if (volume() > 0 && N > 1) { bool non_full_dim = false; // some dimension is not full for (unsigned int i = 0; i < N - 1; ++i) @@ -1908,8 +4124,19 @@ namespace Chroma if (new_order.getSome(order) != order || !detail::is_same(new_dev.getSome(getDev()), getDev()) || new_dist.getSome(dist) != dist) { - Tensor r = like_this(new_order, {}, new_dev, new_dist); - copyTo(r); + Tensor r = new_dist.getSome(dist) != dist + ? like_this(new_order, {}, new_dev, new_dist) + : make_compatible(new_order, {}, new_dev); + if (is_eg()) + { + r = r.make_eg(); + } + else + { + r.conjugate = conjugate; + r.unordered_writing = unordered_writing; + copyTo(r); + } return r; } else @@ -1924,8 +4151,113 @@ namespace Chroma Maybe new_dev = none, Maybe new_dist = none) const { - Tensor r = like_this(new_order, {}, new_dev, new_dist); - copyTo(r); + Tensor r = new_dist.getSome(dist) != dist + ? 
like_this(new_order, {}, new_dev, new_dist) + : make_compatible(new_order, {}, new_dev); + if (is_eg()) + { + r = r.make_eg(); + } + else + { + r.conjugate = conjugate; + r.unordered_writing = unordered_writing; + copyTo(r); + } + return r; + } + + /// Return a copy of this tensor in a different type or this tensor if the type coincides + /// \tparam Tn: new precision + + template ::value, bool>::type = true> + Tensor cast() const + { + return *this; + } + + template ::value && + detail::is_diycomplex::value == + detail::is_diycomplex::value, + bool>::type = true> + Tensor cast() const + { + auto r = make_compatible(); + if (is_eg()) + { + r = r.make_eg(); + } + else + { + r.conjugate = conjugate; + r.unordered_writing = unordered_writing; + copyTo(r); + } + return r; + } + + template ::value && detail::is_diycomplex::value && + !detail::is_diycomplex::value && + std::is_same::type, Tn>::value, + bool>::type = true> + Tensor cast() const + { + return Tensor(order, dim, allocation, p, dist, from, size, scalar, conjugate, eg, + unordered_writing, 0 /* no complexity label */); + } + + /// Return a compatible tensor in a different type or this tensor if the type coincides + /// \tparam Tn: new precision + + template ::value, bool>::type = true> + Tensor cast_like() const + { + return *this; + } + + template ::value, bool>::type = true> + Tensor cast_like() const + { + auto r = make_compatible(); + if (is_eg()) + { + r = r.make_eg(); + } + else + { + r.conjugate = conjugate; + r.unordered_writing = unordered_writing; + } + return r; + } + + /// Extend the support of each dimension by the given amount in each direction + /// \param m: amount to extend the support for each process + /// \return a new tensor with the extension + + Tensor extend_support(const std::map& m) const + { + Tensor r{ + order, + dim, + getDev(), + dist, + std::make_shared>(p->extend_support(kvcoors(order, m, 0))), + unordered_writing, + complexLabel}; + r.from = from; + r.size = size; + r.strides = 
strides; + r.scalar = scalar; + r.conjugate = conjugate; + if (is_eg()) + r = r.make_eg(); + else + copyTo(r); return r; } @@ -1934,7 +4266,7 @@ namespace Chroma DeviceHost getDev() const { # ifdef SUPERBBLAS_USE_GPU - return (ctx->plat != superbblas::CPU ? OnDefaultDevice : OnHost); + return (ctx().plat != superbblas::CPU ? OnDefaultDevice : OnHost); # else return OnDefaultDevice; # endif @@ -1942,24 +4274,31 @@ namespace Chroma void binaryRead(BinaryReader& bin) { - if (ctx->plat != superbblas::CPU) + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + if (ctx().plat != superbblas::CPU) throw std::runtime_error("Only supported to read on `OnHost` tensors"); if (dist != OnMaster) throw std::runtime_error("Only supported to read on `OnMaster` tensors"); if (!isContiguous()) throw std::runtime_error("Only supported contiguous views in memory"); - if (scalar != T{1} || conjugate) - throw std::runtime_error("Not allowed for tensor with a scale not being one or implicitly conjugated"); + if (scalar != value_type{1} || conjugate) + throw std::runtime_error( + "Not allowed for tensor with a scale not being one or implicitly conjugated"); // Only on primary node read the data - std::size_t vol = superbblas::detail::volume(size); + std::size_t vol = volume(); std::size_t disp = detail::coor2index(from, dim, strides); std::size_t word_size = sizeof(typename detail::WordType::type); - bin.readArrayPrimaryNode((char*)&data.get()[disp], word_size, sizeof(T) / word_size * vol); + bin.readArrayPrimaryNode((char*)&data_for_writing()[disp], word_size, + sizeof(T) / word_size * vol); } void binaryWrite(BinaryWriter& bin) const { + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + // If the writing is collective, the root process needs to hold the whole tensor if (!bin.isLocal() && dist != OnMaster) throw std::runtime_error("For collective writing, the tensor should be `OnMaster`"); @@ -1972,7 +4311,7 @@ 
namespace Chroma throw std::runtime_error("Not allowed for tensors implicitly conjugated"); // If the tensor has an implicit scale, view, or is not on host, make a copy - if (scalar != T{1} || isSubtensor() || ctx->plat != superbblas::CPU) + if (scalar != value_type{1} || isSubtensor() || ctx().plat != superbblas::CPU) { cloneOn(OnHost).binaryWrite(bin); return; @@ -1981,31 +4320,35 @@ namespace Chroma // Write the local data std::size_t vol = p->localVolume(); std::size_t word_size = sizeof(typename detail::WordType::type); - bin.writeArrayPrimaryNode((char*)data.get(), word_size, sizeof(T) / word_size * vol); + bin.writeArrayPrimaryNode((char*)data(), word_size, sizeof(T) / word_size * vol); } void print(const std::string& name) const { - std::stringstream ss; - auto t = toComplex(); - auto t_host = t.like_this(none, {}, OnHost, OnMaster); - t.copyTo(t_host); - if (Layout::nodeNumber() == 0) + if (is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + auto t_host = + toComplex() + .make_sure(none, OnHost, detail::compatible_oneveryone_distribution(dist, OnMaster)) + .getLocal(); + if (t_host) { + assert(!t_host.isSubtensor()); using namespace detail::repr; - ss << "% " << repr(data.get()) << std::endl; + std::stringstream ss; + ss << "% " << repr(data()) << std::endl; ss << "% dist=" << p->p << std::endl; ss << name << "=reshape(["; - std::size_t vol = superbblas::detail::volume(size); + std::size_t vol = volume(); for (std::size_t i = 0; i < vol; ++i) { - //using detail::repr::operator<<; ss << " "; - detail::repr::operator<<(ss, t_host.data.get()[i]); + detail::repr::operator<<(ss, t_host.data()[i]); } ss << "], [" << size << "]);" << std::endl; + detail::log(1, ss.str()); } - detail::log(1, ss.str()); } # if 0 /// Get where the tensor is stored @@ -2033,6 +4376,139 @@ namespace Chroma # endif }; + /// Copy v.kvslice_from_size(dir) into w.kvslice_from_size(dir) for every dir in disps + /// assuming that v and w have even-odd layout 
(if 'X' == 2) and the displacements `disps` + /// are given in natural coordinates. + /// + /// \param v: origin tensor + /// \param w: destination tensor + /// \param label_mu: if given, copy each displacement into a separate coordinate. + /// \param disps: displacements in natural coordinates + /// \param real_dims: even-odd dimension of the original lattice + /// \param even_mask: mask into the elements with even x natural coordinate + /// \param odd_mask: mask into the elements with odd x natural coordinate + + template + void latticeCopyToWithMask(const Tensor& v, const Tensor& w, char label_mu, + const std::vector>& disps, + const std::map& real_dims, + const Tensor& mask_even, const Tensor& mask_odd) + { + // Shortcuts + if (disps.size() == 0) + return; + + // Make sure that v is distributed as w + if (!w.isDistributedAs(v, "xyztX")) + { + auto v_ = w.template make_compatible(v.order, v.kvdim()); + v.copyTo(v_); + latticeCopyToWithMask(v_, w, label_mu, disps, real_dims, mask_even, mask_odd); + return; + } + + // Get the number of colors on the original lattice + const auto dim = v.kvdim(); + int real_maxX = real_dims.count('X') == 1 ? real_dims.at('X') : dim.at('X'); + + // Preallocate the masks for v and w + Tensor v_mask = v.create_mask(); + Tensor w_mask = w.create_mask(); + + for (unsigned int mu = 0; mu < disps.size(); ++mu) + { + const auto& dir = disps[mu]; + int sumdir = std::accumulate(dir.begin(), dir.end(), int{0}); + + for (int x = 0; x < 2; ++x) + { + // Nat coor (x+dirx,Y+diry,Z+dirz,T+dirt) to even-odd coordinate + std::map from{{'x', x / real_maxX}}; + std::map to{{'X', sumdir}, + {'x', (dir[0] + dim.at('x') * real_maxX + x) / real_maxX}, + {'y', dir[1]}, + {'z', dir[2]}, + {'t', dir[3]}}; + + // Restrict the destination tensor to label_mu if given + auto w_mu = (label_mu == 0 ? w : w.kvslice_from_size({{label_mu, mu}}, {{label_mu, 1}})); + + auto mask_mu = (x == 0 ? 
mask_even : mask_odd).kvslice_from_size(from, {}); + auto v_mask_mu = v_mask.kvslice_from_size(to, {}); + auto w_mask_mu = + (label_mu == 0 ? w_mask : w_mask.kvslice_from_size({{label_mu, mu}}, {{label_mu, 1}})); + w_mask_mu = w_mask_mu.kvslice_from_size(to, {}); + mask_mu.copyTo(v_mask_mu); + mask_mu.copyTo(w_mask_mu); + v.kvslice_from_size(to, {}).copyToWithMask(w_mu.kvslice_from_size(to, {}), v_mask_mu, + w_mask_mu); + } // x + } // mu + } + + /// Return an identity matrix + /// \param dim: length for each of the row dimensions + /// \param m: labels map from the row to the column dimensions and other dimensions + + template + Tensor identity(const std::map& dim, const remap& m, + const Distribution& dist = OnEveryone) + { + using value_type = typename detail::base_type::type; + + // Get the order for the rows + std::string orows; + for (const auto& it : m) + orows.push_back(it.first); + + // Get the order for the columns + std::string ocols = detail::update_order(orows, m); + + // Get the extra dimensions + std::string ot; + for (const auto& it : dim) + if (m.count(it.first) == 0) + ot.push_back(it.first); + ot = detail::remove_dimensions(ot, ocols); + + // Get the dimensions of the identity tensor + std::map iden_dim; + for (const auto& it : dim) { + iden_dim[it.first] = (detail::is_in(ot, it.first) ? 1 : it.second); + if (detail::is_in(orows, it.first)) iden_dim[m.at(it.first)] = it.second; + } + + // Create the identity tensor + const std::string order = orows + ocols + ot; + Tensor iden{order, kvcoors(order, iden_dim, 0, ThrowOnMissing), OnHost, + detail::compatible_replicated_distribution(dist)}; + iden.set_zero(); + if (iden.getLocal()) + { + value_type* p = iden.getLocal().data(); + for (unsigned int i = 0, vol = detail::volume(dim, orows); i < vol; ++i) + p[vol * i + i] = value_type{1}; + } + + // Get the dimensions of the returned tensor + std::map t_dim; + for (const auto& it : dim) { + t_dim[it.first] = (!detail::is_in(ot, it.first) ? 
1 : it.second); + if (detail::is_in(orows, it.first)) t_dim[m.at(it.first)] = 1; + } + Tensor t{order, kvcoors(order, t_dim, 0, ThrowOnMissing), OnDefaultDevice, dist}; + t.set(1); + + std::map r_dim = dim; + for (const auto& it : dim) + if (detail::is_in(orows, it.first)) r_dim[m.at(it.first)] = it.second; + Tensor r{order, kvcoors(order, r_dim, 0, ThrowOnMissing), OnDefaultDevice, dist}; + + kronecker(t, iden, r); + return r; + } + /// Contract some dimension of the given tensors /// \param v: one tensor to contract /// \param w: the other tensor to contract @@ -2041,6 +4517,8 @@ namespace Chroma /// \param r: optional given tensor where to put the resulting contraction /// \param mr: map from the given `r` to the labels of the contraction /// \param beta: scale on `r` if the `action` in `AddTo` + /// \param dev: device for the resulting tensor if `action` isn't given + /// \param dist: distribution of the resulting tensor if `action` isn't given /// /// Example: /// @@ -2054,31 +4532,53 @@ namespace Chroma /// contract<2>(t, q.rename_dims({{'s','S'},{'S','s'}}).conj(), "s", CopyTo, r3, {{'s','S'}}); // r2 = q * s^* template - Tensor contract(Tensor v, Tensor w, const std::string& labels_to_contract, - Maybe action = none, Maybe> r = none, - const remap& mr = {}, T beta = T{1}) + Tensor + contract(const Tensor& v, Tensor w, const std::string& labels_to_contract, + Maybe action = none, Tensor r = Tensor{}, const remap& mr = {}, + typename detail::base_type::type beta = typename detail::base_type::type{1}, + Maybe dev = none, Maybe dist = none) { - if (action.hasSome() != r.hasSome()) - throw std::runtime_error("Invalid default value"); + // Check arguments + if (action.hasSome() != (bool)r) + throw std::runtime_error( + "contract: invalid argument, if `action` is given, `r` should be given also"); + if ((dev.hasSome() || dist.hasSome()) && action.hasSome()) + throw std::runtime_error( + "contract: invalid argument, if `action` is given, `dev` and `dist` shouldn't 
be given"); // Compute the labels of the output tensor: v.order + w.order - labels_to_contract std::string rorder = detail::union_dimensions(v.order, w.order, labels_to_contract); if (Nr != rorder.size()) throw std::runtime_error( "contract: The dimension of the output tensor does not match the template argument"); - if (r && union_dimensions(rorder, r.getSome().order) != rorder) + if ((bool)r && detail::union_dimensions(rorder, r.order) != rorder) throw std::runtime_error("contract: The given output tensor has an unexpected ordering"); + // If any of the input tensors is glocal, make sure both are + if ((v.dist == Glocal) != (w.dist == Glocal)) + { + Tensor v0 = v; + Tensor w0 = w; + if (v.dist != Glocal) + v0 = v.getGlocal(); + if (w.dist != Glocal) + w0 = w.getGlocal(); + return contract(v0, w0, labels_to_contract, action, r, mr, beta, dev, dist); + } + // If the output tensor is not given create a new one Tensor r0; if (!r) { - r0 = v.like_this(rorder, w.kvdim()); + r0 = (v.dist != Glocal && (dev.hasSome() || dist.hasSome())) + ? v.template like_this(rorder, w.kvdim(), dev, dist) + : (v.volume() >= w.volume() ? 
v.template make_compatible(rorder, w.kvdim()) + : w.template make_compatible(rorder, v.kvdim())); beta = 0; } else { - r0 = r.getSome(); + r0 = r; } // Correct beta for the action @@ -2091,103 +4591,517 @@ namespace Chroma return r0; } - template - void* getQDPPtr(const T& t) + /// Contract some dimension of the given tensors + /// \param v: one tensor to contract + /// \param w: the other tensor to contract + /// \param labels_to_contract: labels dimensions to contract from `v` and `w` + /// \param dev: device for the resulting tensor + /// \param dist: distribution of the resulting tensor + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}), q("Ss", {{Ns,Ns}}); + /// Tensor<2,Complex> r0 = contract<2>(t, q, "s"); // r0 dims are "cS" + /// Tensor<3,Complex> r1 = contract<3>(t, q, ""); // r1 dims are "csS" + /// Tensor<3,Complex> r2 = contract<3>(t, q, "", OnMaster); // r2 is supported on master + + template + Tensor contract(const Tensor& v, Tensor w, + const std::string& labels_to_contract, Maybe dev, + Maybe dist) { -# ifdef QDP_IS_QDPJIT - multi1d v(1); - v[0] = t.getId(); - void* r = QDP_get_global_cache().get_dev_ptrs(v)[0]; - assert(superbblas::detail::getPtrDevice(r) >= 0); - return r; -# else - return t.getF(); -# endif + return contract(v, w, labels_to_contract, none, {}, {}, 0, dev, dist); } - template - using LatticeColorVectorT = OLattice, Nc>>>; + /// Contract some dimension of the given tensors + /// \param v: one tensor to contract + /// \param w: the other tensor to contract + /// \param labels_to_contract: map of labels dimensions to contract from `v` to `w` + /// \param action: either to copy or add to the given output tensor if given + /// \param r: optional given tensor where to put the resulting contraction + /// \param mr: map from the given `r` to the labels of the contraction + /// \param beta: scale on `r` if the `action` in `AddTo` + /// \param dev: device for the resulting tensor if `action` isn't given + /// \param 
dist: distribution of the resulting tensor if `action` isn't given + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}), q("St", {{Ns,Ns}}); + /// Tensor<2,Complex> r0 = contract<2>(t, q, {{'s','t'}}); // r0 dims are "cS" + /// Tensor<2,Complex> r2("cS", {{Nc,Ns}}); + /// contract<2>(t, q, {{'s','t'}}, CopyTo, r2); // r2 = q * s + /// Tensor<2,Complex> r3("cs", {{Nc,Ns}}); + /// contract<2>(t, q, {{'s','t'}}, CopyTo, r3, {{'s','S'}}); // r2 = q * s + /// contract<2>(t, q.rename_dims({{'s','S'},{'S','s'}}).conj(), {{'s','t'}}, CopyTo, r3, {{'s','S'}}); // r2 = q * s^* - template - Tensor> asTensorView(const LatticeColorVectorT& v) + template + Tensor + contract(const Tensor& v, Tensor w, const remap& labels_to_contract, + Maybe action = none, Tensor r = Tensor{}, const remap& mr = {}, + typename detail::base_type::type beta = typename detail::base_type::type{1}, + Maybe dev = none, Maybe dist = none) { - using Complex = std::complex; - Complex* v_ptr = reinterpret_cast(v.getF()); - return Tensor("cxyztX", latticeSize("cxyztX"), OnHost, OnEveryone, - std::shared_ptr(v_ptr, [](Complex*) {})); + // Remap the labels to contract from v and w + std::string labels_to_contract_v; + for (const auto& it : labels_to_contract) + labels_to_contract_v.push_back(it.first); + remap mv = detail::getNewLabels(labels_to_contract_v, v.order + w.order); + remap mw; + for (const auto it : mv) + mw[labels_to_contract.at(it.first)] = it.second; + std::string labels_to_contract_str; + for (const auto it : mv) + labels_to_contract_str.push_back(it.second); + return contract(v.rename_dims(mv), w.rename_dims(mw), labels_to_contract_str, action, r, mr, + beta, dev, dist); } -# ifndef QDP_IS_QDPJIT - inline Tensor asTensorView(const LatticeFermion& v) - { - Complex* v_ptr = reinterpret_cast(v.getF()); - return Tensor("csxyztX", latticeSize("csxyztX"), OnHost, OnEveryone, - std::shared_ptr(v_ptr, [](Complex*) {})); - } -# else - inline Tensor asTensorView(const LatticeFermion& v) 
- { - REAL* v_ptr = reinterpret_cast(getQDPPtr(v)); - return Tensor("xyztXsc.", latticeSize("xyztXsc."), OnDefaultDevice, - OnEveryone, std::shared_ptr(v_ptr, [](REAL*) {})); - } -# endif + /// Do the Kronecker product of two tensors + /// \param v: one tensor to contract + /// \param w: the other tensor to contract + /// \param r: optional given tensor where to put the resulting contraction -# ifndef QDP_IS_QDPJIT - inline Tensor asTensorView(const LatticeComplex& v) - { - Complex* v_ptr = reinterpret_cast(v.getF()); - return Tensor("xyztX", latticeSize("xyztX"), OnHost, OnEveryone, - std::shared_ptr(v_ptr, [](Complex*) {})); - } -# else - inline Tensor asTensorView(const LatticeComplex& v) + template + Tensor kronecker(const Tensor& v, const Tensor& w, + Tensor r = Tensor{}) { - REAL* v_ptr = reinterpret_cast(getQDPPtr(v)); - return Tensor("xyztX.", latticeSize("xyztX."), OnDefaultDevice, - OnEveryone, std::shared_ptr(v_ptr, [](REAL*) {})); - } + // Make sure that no dimension in common has size larger than one in both tensors + auto v_kvdim = v.kvdim(); + auto w_kvdim = w.kvdim(); + for (const auto& it : v_kvdim) + if (it.second > 1 && w_kvdim.count(it.first) > 0 && w_kvdim.at(it.first) > 1) + throw std::runtime_error( + "kronecker: input tensors have a common dimension with size larger than one"); + + // Renamed all dimensions in w to avoid a common label between the tensors + remap w_m = detail::getNewLabels(w.order, v.order); + + // Do the contraction + auto k = contract(v, w.rename_dims(w_m), ""); + + // The labels of the output tensor are the union of the input tensors labels + std::string rorder = detail::union_dimensions(v.order, w.order); + + // The output tensor has the maximum size of the input tensors + auto rdims = v_kvdim; + for (const auto& it : w_kvdim) + if (rdims.count(it.first) == 0 || rdims.at(it.first) == 1) + rdims[it.first] = it.second; + + // For the common labels, rename the singleton one + remap k_m; + for (const auto& it : w_kvdim) + { 
+ if (v_kvdim.count(it.first) == 1 && it.second > 1) + { + k_m[w_m.at(it.first)] = it.first; + k_m[it.first] = w_m.at(it.first); + } + else if (v_kvdim.count(it.first) == 0) + { + k_m[w_m.at(it.first)] = it.first; + } + } + + // If the output tensor is not given create a new one + if (!r) + { + auto r0 = v.template make_compatible(rorder, rdims); + k.rename_dims(k_m).copyTo(r0); + return r0; + } + else + { + k.rename_dims(k_m).copyTo(r); + return r; + } + } + + /// Compute the norm along some dimensions + /// \param v: tensor + /// \param order_t: labels not to contract (optional) + /// \param order_rows: labels to contract (optional, either order_rows or order_t + /// should be provided) + /// + /// Example: + /// + /// Tensor<2,Complex> t("cs", {{Nc,Ns}}), q("Ss", {{Ns,Ns}}); + /// Tensor<2,Complex> r0 = contract<2>(t, q, "s"); // r0 dims are "cS" + /// Tensor<3,Complex> r1 = contract<3>(t, q, ""); // r1 dims are "csS" + /// Tensor<2,Complex> r2("cS", {{Nc,Ns}}); + /// contract<2>(t, q, "s", CopyTo, r2); // r2 = q * s + /// Tensor<2,Complex> r3("cs", {{Nc,Ns}}); + /// contract<2>(t, q, "s", CopyTo, r3, {{'s','S'}}); // r2 = q * s + /// contract<2>(t, q.rename_dims({{'s','S'},{'S','s'}}).conj(), "s", CopyTo, r3, {{'s','S'}}); // r2 = q * s^* + + template + Tensor::type> norm(const Tensor& v, + Maybe order_t = none, + Maybe order_rows = none) + { + if (!order_t.hasSome() && !order_rows.hasSome()) + throw std::runtime_error( + "norm: invalid input, give at least either `order_t` or `order_rows`"); + + // Compute the labels to contract + std::string rorder = order_rows.hasSome() + ? order_rows.getSome() + : detail::remove_dimensions(v.order, order_t.getSome()); + std::string torder = + order_t.hasSome() ? 
order_t.getSome() : detail::remove_dimensions(v.order, rorder); + + // Allocate the output on the host and spread the result to every process + auto r = contract(v.conj(), v, rorder, OnHost, + detail::compatible_replicated_distribution(v.dist)) + .reorder(torder); + + // Do the square root and return the result + using Treal = typename detail::real_type::type; + return r.template transformWithCPUFun( + [](const typename detail::base_type::type& t) { return std::sqrt(std::real(t)); }); + } + + /// Compute the Cholesky factor of `v' and contract its inverse with `w` + /// \param v: tensor to compute the Cholesky factor + /// \param order_rows: labels that are rows of the matrices to factor + /// \param order_cols: labels that are columns of the matrices to factor + /// \param w: the other tensor to contract + /// \param labels_to_contract: labels dimensions to contract from `v` and `w` + /// \param action: either to copy or add to the given output tensor if given (only `CopyTo' supported) + /// \param r: optional given tensor where to put the resulting contraction + + template + Tensor cholInv(const Tensor& v, const std::string& order_rows, + const std::string& order_cols, const Tensor& w, + const std::string& labels_to_contract, Maybe action = none, + Tensor r = {}) + { + if (action.hasSome() != (bool)r) + throw std::runtime_error("Invalid default value"); + + // Compute the labels of the output tensor: v.order + w.order - labels_to_contract + std::string rorder = detail::union_dimensions(v.order, w.order, labels_to_contract); + if (Nr != rorder.size()) + throw std::runtime_error( + "cholInv: The dimension of the output tensor does not match the template argument"); + if (r && detail::union_dimensions(rorder, r.order) != rorder) + throw std::runtime_error("cholInv: The given output tensor has an unexpected ordering"); + + // If the output tensor is not given create a new one + Tensor r0; + if (!r) + { + r0 = v.template like_this(rorder, w.kvdim()); + } + else + { + 
r0 = r; + } + + // For now, only `CopyTo' action is supported + if (action.hasSome() && action.getSome() != CopyTo) + throw std::runtime_error("cholInv: unsupported action"); + + // Do the contraction + r0.cholInv(std::move(v), order_rows, order_cols, w); + + return r0; + } + + /// Solve the linear systems within tensor `v' and right-hand-sides `w` + /// \param v: tensor to compute the Cholesky factor + /// \param order_rows: labels that are rows of the matrices to factor + /// \param order_cols: labels that are columns of the matrices to factor + /// \param w: the other tensor to contract + /// \param labels_to_contract: labels dimensions to contract from `v` and `w` + /// \param action: either to copy or add to the given output tensor if given (only `CopyTo' supported) + /// \param r: optional given tensor where to put the resulting contraction + + template + Tensor solve(const Tensor& v, const std::string& order_rows, + const std::string& order_cols, const Tensor& w, + const std::string& labels_to_contract, Maybe action = none, + Tensor r = {}) + { + if (action.hasSome() != (bool)r) + throw std::runtime_error("solve: Invalid default value"); + + // Compute the labels of the output tensor: v.order + w.order - labels_to_contract + std::string rorder = detail::union_dimensions(v.order, w.order, labels_to_contract); + if (Nr != rorder.size()) + throw std::runtime_error( + "solve: The dimension of the output tensor does not match the template argument"); + if (r && detail::union_dimensions(rorder, r.order) != rorder) + throw std::runtime_error("solve: The given output tensor has an unexpected ordering"); + if (Nlabels != labels_to_contract.size()) + throw std::runtime_error( + "solve: The length of `order_rows` does not match the template argument `Nrows`"); + if (order_rows.size() != order_cols.size()) + throw std::runtime_error("solve: unsupported ordering for the matrix"); + + // If the output tensor is not given create a new one + Tensor r0; + if (!r) + { + r0 = 
v.template like_this(rorder, w.kvdim()); + } + else + { + r0 = r; + } + + // For now, only `CopyTo' action is supported + if (action.hasSome() && action.getSome() != CopyTo) + throw std::runtime_error("solve: unsupported action"); + + // Compute the solution + r0.solve(v, order_rows, order_cols, w); + + // Check the solution + if (superbblas::getDebugLevel() > 0) + { + auto res = w.clone().scale(-1); + remap m{}; + for (unsigned int i = 0; i < order_rows.size(); ++i) + { + m[order_rows[i]] = order_cols[i]; + m[order_cols[i]] = order_rows[i]; + } + contract(v, r0.rename_dims(m), labels_to_contract, AddTo, res.rename_dims(m)); + auto wnorms = norm(w, none, order_cols); + auto rnorms = norm(res, none, order_cols); + double err = 0; + for (int i = 0, i1 = wnorms.volume(); i < i1; ++i) + err = std::max(err, (double)rnorms.data()[i] / wnorms.data()[i]); + QDPIO::cout << "solve error: " << detail::tostr(err) << std::endl; + auto eps = std::sqrt(std::numeric_limits::type>::epsilon()); + if (err > eps) + throw std::runtime_error(std::string("solve: too much error in dense solution, ") + + detail::tostr(err)); + } + + return r0; + } + + /// Invert the matrices + /// \param v: tensor to compute the inversion + /// \param order_rows: labels that are rows of the matrices to factor + /// \param order_cols: labels that are columns of the matrices to factor + /// \param r: optional given tensor where to put the resulting contraction + + template + Tensor inv(const Tensor& v, const std::string& order_rows, + const std::string& order_cols, Tensor r = {}) + { + if (r && detail::union_dimensions(v.order, r.order) != v.order) + throw std::runtime_error("inv: The given output tensor has an unexpected ordering"); + if (order_rows.size() != order_cols.size()) + throw std::runtime_error("inv: unsupported ordering for the matrix"); + + // If the output tensor is not given create a new one + Tensor r0; + if (!r) + { + r0 = v.make_compatible(); + } + else + { + r0 = r; + } + + // Compute the 
solution + r0.inv(v, order_rows, order_cols); + + // Check the solution + if (superbblas::getDebugLevel() > 0) + { + remap m{}; + for (unsigned int i = 0; i < order_rows.size(); ++i) + m[order_rows[i]] = order_cols[i]; + auto dim = v.kvdim(); + char c = detail::get_free_label(v.order); + dim[c] = 1; + auto res = identity(dim, m).scale(-1); + contract(v, r0.split_dimension(order_rows[0], std::string({c, order_rows[0]}), 1), m, + AddTo, res); + auto err = norm<1>(res, std::string(1, c)).get({0}); + QDPIO::cout << "inv error: " << detail::tostr(err) << std::endl; + auto eps = std::sqrt(std::numeric_limits::type>::epsilon()); + if (err > eps) + throw std::runtime_error(std::string("inv: too much error in dense solution, ") + + detail::tostr(err)); + } + + return r0; + } + + /// Elementwise division + /// \param v: numerator + /// \param w: denominator + + template + Tensor div(const Tensor& v, const Tensor& w) + { + auto r = v.make_compatible(none, {}, OnHost); + v.copyTo(r); + auto w0 = r.make_compatible(); + w.copyTo(w0); + auto r_local = r.getLocal(); + auto w0_local = w0.getLocal(); + if (r_local) + { + auto rptr = r_local.data(); + auto w0ptr = w0_local.data(); + for (std::size_t i = 0, vol = r_local.volume(); i < vol; ++i) { + auto w0i = + detail::cond_conj(r_local.conjugate != w0_local.conjugate, w0ptr[i] * w0_local.scalar); + rptr[i] = std::norm(w0i) == 0 ? 
T{0} : rptr[i] / w0i; + } + } + return r; + } + + /// Compute the maximum for a small tensor + /// \param v: tensor + + template + typename detail::base_type::type + max(Tensor v, typename detail::base_type::type init = + std::numeric_limits::type>::lowest()) + { + using value_type = typename detail::base_type::type; + v = v.make_sure(none, OnHost, detail::compatible_replicated_distribution(v.dist)); + if (v.isSubtensor()) + v = v.clone(); + value_type r = init; + v = v.getLocal(); + value_type* p = v.data(); + for (unsigned int i = 0, vol = v.volume(); i < vol; ++i) + r = std::max(r, p[i]); + return r; + } + + /// Elementwise product + /// \param v: numerator + /// \param w: denominator + + template + Tensor mult(const Tensor& v, const Tensor& w) + { + auto r = v.make_compatible(none, {}, OnHost); + v.copyTo(r); + auto w0 = r.make_compatible(); + w.copyTo(w0); + auto r_local = r.getLocal(); + auto w0_local = w0.getLocal(); + if (r_local) + { + auto rptr = r_local.data(); + auto w0ptr = w0_local.data(); + for (std::size_t i = 0, vol = r_local.volume(); i < vol; ++i) + rptr[i] = rptr[i] * detail::cond_conj(r_local.conjugate != w0_local.conjugate, + w0ptr[i] * w0_local.scalar); + } + return r; + } + + template + void* getQDPPtr(const T& t) + { +# if defined(QDP_IS_QDPJIT) && defined(SUPERBBLAS_USE_GPU) + multi1d v(1); + v[0] = t.getId(); + void* r = QDP_get_global_cache().get_dev_ptrs(v)[0]; + assert(superbblas::detail::getPtrDevice(r) >= 0); + return r; +# else + return t.getF(); +# endif + } + + template + using LatticeColorVectorT = OLattice, Nc>>>; + + template + Tensor> asTensorView(const LatticeColorVectorT& v) + { + using Complex = std::complex; + Complex* v_ptr = reinterpret_cast(v.getF()); + return Tensor("cxyztX", latticeSize("cxyztX"), OnHost, + OnEveryoneAsChroma, v_ptr); + } + +# if !defined(QDP_IS_QDPJIT) || !defined(SUPERBBLAS_USE_GPU) + inline Tensor asTensorView(const LatticeFermion& v) + { + Complex* v_ptr = reinterpret_cast(v.getF()); + return 
Tensor("csxyztX", latticeSize("csxyztX"), OnHost, + OnEveryoneAsChroma, v_ptr); + } +# else + inline Tensor> asTensorView(const LatticeFermion& v) + { + REAL* v_ptr = reinterpret_cast(getQDPPtr(v)); + return Tensor>("xyztXsc.", latticeSize("xyztXsc."), + OnDefaultDevice, OnEveryoneAsChroma, v_ptr, '.'); + } +# endif + +# if !defined(QDP_IS_QDPJIT) || !defined(SUPERBBLAS_USE_GPU) + inline Tensor asTensorView(const LatticeComplex& v) + { + Complex* v_ptr = reinterpret_cast(v.getF()); + return Tensor("xyztX", latticeSize("xyztX"), OnHost, + OnEveryoneAsChroma, v_ptr); + } +# else + inline Tensor> asTensorView(const LatticeComplex& v) + { + REAL* v_ptr = reinterpret_cast(getQDPPtr(v)); + return Tensor>("xyztX.", latticeSize("xyztX."), + OnDefaultDevice, OnEveryoneAsChroma, v_ptr, '.'); + } # endif -# ifndef QDP_IS_QDPJIT +# if !defined(QDP_IS_QDPJIT) || !defined(SUPERBBLAS_USE_GPU) inline Tensor asTensorView(const LatticeColorMatrix& v) { Complex* v_ptr = reinterpret_cast(v.getF()); return Tensor("jixyztX", latticeSize("jixyztX", {{'i', Nc}, {'j', Nc}}), OnHost, - OnEveryone, std::shared_ptr(v_ptr, [](Complex*) {})); + OnEveryoneAsChroma, v_ptr); } # else - inline Tensor asTensorView(const LatticeColorMatrix& v) + inline Tensor> asTensorView(const LatticeColorMatrix& v) { REAL* v_ptr = reinterpret_cast(getQDPPtr(v)); - return Tensor( + return Tensor>( "xyztXji.", latticeSize("xyztXji.", {{'i', Nc}, {'j', Nc}}), OnDefaultDevice, - OnEveryone, std::shared_ptr(v_ptr, [](REAL*) {})); + OnEveryoneAsChroma, v_ptr, '.'); } # endif inline Tensor asTensorView(const LatticeColorVectorSpinMatrix& v) { Complex* v_ptr = reinterpret_cast(v.getF()); - return Tensor( - "cjixyztX", latticeSize("cjixyztX", {{'i', Ns}, {'j', Ns}}), OnHost, OnEveryone, - std::shared_ptr(v_ptr, [](Complex*) {})); + return Tensor("cjixyztX", + latticeSize("cjixyztX", {{'i', Ns}, {'j', Ns}}), + OnHost, OnEveryoneAsChroma, v_ptr); } template Tensor<1, COMPLEX> asTensorView(std::vector& v, Distribution dist = 
OnEveryoneReplicated) { - return Tensor<1, COMPLEX>("i", Coor<1>{Index(v.size())}, OnHost, dist, - std::shared_ptr(v.data(), [](COMPLEX*) {})); + return Tensor<1, COMPLEX>("i", Coor<1>{Index(v.size())}, OnHost, dist, v.data()); } - inline Tensor<2, Complex> asTensorView(SpinMatrix& smat) + inline Tensor<2, Complex> asTensorView(SpinMatrix& smat, + const Distribution& dist = OnEveryoneReplicated) { Complex* v_ptr = reinterpret_cast(smat.getF()); - return Tensor<2, Complex>("ji", Coor<2>{Ns, Ns}, OnHost, OnEveryoneReplicated, - std::shared_ptr(v_ptr, [](Complex*) {})); + return Tensor<2, Complex>("ji", Coor<2>{Ns, Ns}, OnHost, + detail::compatible_replicated_distribution(dist), v_ptr); } inline SpinMatrix SpinMatrixIdentity() @@ -2203,11 +5117,12 @@ namespace Chroma } template - Tensor<2, COMPLEX> Gamma(int gamma, DeviceHost dev = OnDefaultDevice) + Tensor<2, COMPLEX> Gamma(int gamma, DeviceHost dev = OnDefaultDevice, + const Distribution& dist = OnEveryoneReplicated) { SpinMatrix g = QDP::Gamma(gamma) * SpinMatrixIdentity(); - Tensor<2, COMPLEX> r("ij", {Ns, Ns}, dev, OnEveryoneReplicated); - asTensorView(g).copyTo(r); + Tensor<2, COMPLEX> r("ij", {Ns, Ns}, dev, detail::compatible_replicated_distribution(dist)); + asTensorView(g, dist).copyTo(r); return r; } @@ -2226,322 +5141,3466 @@ namespace Chroma return std::string(dest.begin(), dest.end()); } - template - struct StorageTensor { - static_assert(superbblas::supported_type::value, "Not supported type"); + /// Broadcast a string from process zero + inline int broadcast(int s) + { + std::vector v(1, s), dest(1, 0); + asTensorView(v, OnMaster).copyTo(asTensorView(dest)); + return dest[0]; + } - public: - std::string filename; ///< Storage file - std::string metadata; ///< metadata - std::string order; ///< Labels of the tensor dimensions - Coor dim; ///< Length of the tensor dimensions - Sparsity sparsity; ///< Sparsity of the storage - std::shared_ptr - ctx; ///< Superbblas storage handler - Coor from; ///< First 
active coordinate in the tensor - Coor size; ///< Number of active coordinates on each dimension - T scalar; ///< Scalar factor of the tensor + /// Broadcast a string from process zero + template + Coor broadcast(const Coor& c) + { + std::vector v(c.begin(), c.end()), dest(N); + asTensorView(v, OnMaster).copyTo(asTensorView(dest)); + Coor r; + std::copy_n(dest.begin(), N, r.begin()); + return r; + } - // Empty constructor - StorageTensor() - : filename{}, - metadata{}, - order(detail::getTrivialOrder(N)), - dim{}, - sparsity(Dense), - ctx{}, - from{}, - size{}, - scalar{0} + /// Broadcast a string from process zero + template + Maybe broadcast(const Maybe& c) + { + int has_something = broadcast(c.hasSome() ? 1 : 0); + if (has_something == 1) + return Maybe(broadcast(c ? c.getSome() : T{})); + return none; + } + + /// Return a tensor with local support; the first dimension is the process index + template + Tensor local_support_tensor(const std::string& order, Coor dim, + DeviceHost dev = OnDefaultDevice) + { + char proc_label = detail::get_free_label(order); + std::string this_order = std::string{proc_label} + order; + auto this_dim = detail::insert_coor(dim, 0, Layout::numNodes()); + return Tensor(this_order, this_dim, dev, std::string{proc_label}, + std::make_shared>(detail::TensorPartition( + this_order, this_dim, std::string{proc_label})), + false /*= unordered_writing */, 0 /* no complexLabel*/); + } + + /// Broadcast a string from process zero + inline int global_max(int s) + { + auto r = local_support_tensor("i", {1}, OnHost); + r.getLocal().set({{0, 0}}, s); + return max(r.make_sure(none, none, OnEveryoneReplicated)); + } + + /// Class for operating sparse tensors + /// \tparam ND: number of domain dimensions + /// \tparam NI: number of image dimensions + /// \tparam T: datatype + /// + /// The class may support several variants of Column Sparse Row (CSR) format for representing + /// sparse matrices, but for now only Block Sparse Row (BSR) with the same 
number of nonzeros + /// on all rows is supported. Superbblas has some support for blocked ELL (BSR but with a negative + /// column index for the unused blocks in a row), but most of the methods of this class aren't ready + /// for that. + /// + /// Besides, this class implements an extension of the BSR in which the nonzero blocks are the result + /// of the tensor product of two matrices one of them being constant among all edges on the same + /// direction. This extension is referred as BSR Kronecker. When `kron_data` is given, the nonzeros + /// should be ordered such that the nonzero blocks with the same `u` label are multiplied by the nonzero + /// block in `kron_data` with that `u`. + + template + struct SpTensor { + using value_type = typename detail::base_type::type; + static_assert(superbblas::supported_type::value, "Not supported type"); + + public: + Tensor d; ///< Tensor example for the domain + Tensor i; ///< Tensor example for the image + Coor blkd; ///< blocking for the domain + Coor blki; ///< blocking for the image + Coor krond; ///< Kronecker blocking for the domain + Coor kroni; ///< Kronecker blocking for the image + Tensor ii; ///< Number of blocks in each row + Tensor jj; ///< Coordinate of the first element on each block + Tensor data; ///< Nonzero values + Tensor kron; ///< Nonzero values for the Kronecker values + std::shared_ptr handle; ///< suparbblas sparse tensor handle + value_type scalar; ///< Scalar factor of the tensor + bool isImgFastInBlock; ///< whether the BSR blocks are in row-major + unsigned int nblockd; ///< Number of blocked domain dimensions + unsigned int nblocki; ///< Number of blocked image dimensions + unsigned int nkrond; ///< Number of Kronecker blocked domain dimensions + unsigned int nkroni; ///< Number of Kronecker blocked image dimensions + + /// Low-level constructor with the Kronecker BSR extension + SpTensor(Tensor d, Tensor i, Coor blkd, Coor blki, Coor krond, + Coor kroni, Tensor ii, Tensor jj, + Tensor 
data, Tensor kron_data, value_type scalar, + bool isImgFastInBlock, unsigned int nblockd, unsigned int nblocki, + unsigned int nkrond, unsigned int nkroni) + : d(d.make_eg()), + i(i.make_eg()), + blkd(blkd), + blki(blki), + krond(krond), + kroni(kroni), + ii(ii), + jj(jj), + data(data), + kron(kron_data), + scalar(scalar), + isImgFastInBlock(isImgFastInBlock), + nblockd(nblockd), + nblocki(nblocki), + nkrond(nkrond), + nkroni(nkroni) { } - // Create storage construct - StorageTensor(const std::string& filename, const std::string& metadata, - const std::string& order, Coor dim, Sparsity sparsity = Dense, - checksum_type checksum = checksum_type::NoChecksum) - : filename(filename), - metadata(metadata), - order(order), - dim(dim), - sparsity(sparsity), - from{}, - size{dim}, - scalar{1} + /// Low-level constructor without the Kronecker BSR extension + SpTensor(Tensor d, Tensor i, Coor blkd, Coor blki, Tensor ii, + Tensor jj, Tensor data, value_type scalar, + bool isImgFastInBlock, unsigned int nblockd, unsigned int nblocki) + : SpTensor(d, i, blkd, blki, detail::ones(), detail::ones(), ii, jj, data, + Tensor(), scalar, isImgFastInBlock, nblockd, nblocki, 0, 0) { - checkOrder(); - superbblas::Storage_handle stoh; - superbblas::create_storage(dim, superbblas::FastToSlow, filename.c_str(), - metadata.c_str(), metadata.size(), checksum, - MPI_COMM_WORLD, &stoh); - ctx = std::shared_ptr( - stoh, [=](superbblas::detail::Storage_context_abstract* ptr) { - superbblas::close_storage(ptr, MPI_COMM_WORLD); - }); - - // If the tensor to store is dense, create the block here; otherwise, create the block on copy - if (sparsity == Dense) - { - superbblas::PartitionItem p{Coor{}, dim}; - superbblas::append_blocks(&p, 1, stoh, MPI_COMM_WORLD, superbblas::FastToSlow); - } } - // Open storage construct - StorageTensor(const std::string& filename, bool read_order = true, - const Maybe& order_tag = none) - : filename(filename), sparsity(Sparse), from{}, scalar{1} - { - // Read 
information from the storage + /// Return a string describing the tensor + /// \param ptr: pointer to the memory allocation + /// \return: the string representing the tensor + + std::string repr() const + { + using namespace detail::repr; + std::stringstream ss; + ss << "SpTensor{"; + if (data.data()) + ss << "data:" << data.data() << ", "; + std::size_t sizemb = (ii.getLocal().volume() * sizeof(int) + // + jj.getLocal().volume() * sizeof(int) + // + data.getLocal().volume() * sizeof(value_type)) / + 1024 / 1024; + ss << "domain_order: " << d.order << ", domain_dim:" << d.dim << "image_order: " << i.order + << ", image_dim:" << i.dim << ", local_storage:" << sizemb << " MiB}"; + return ss.str(); + } + + /// Constructor + /// \param d: example tensor for the domain + /// \param i: example tensor for the image + /// \param nblockd: the first `nblockd` domain labels will be blocked + /// \param nblocki: the first `nblocki` image labels will blocked + /// \param num_neighbors: number of nonzeros for each blocked row + + SpTensor(Tensor d, Tensor i, unsigned int nblockd, unsigned int nblocki, + unsigned int nkrond, unsigned int nkroni, unsigned int num_neighbors, + bool isImgFastInBlock = false) + : d{d.make_eg()}, + i{i.make_eg()}, + scalar{value_type{1}}, + isImgFastInBlock{isImgFastInBlock}, + nblockd(nblockd), + nblocki(nblocki), + nkrond(nkrond), + nkroni(nkroni) + { + // Check that the examples are on the same device + if (d.getDev() != i.getDev()) + throw std::runtime_error("Please give example vectors on the same device"); + + // Check that `d` and `i` are not subtensors + if (this->d.isSubtensor() || this->i.isSubtensor()) + throw std::runtime_error("unsupported subtensors for domain/image distributions"); + + // Check that the domain and image labels are different and do not contain `u` or `~` + detail::check_order(i.order + d.order + std::string("u~")); + + // Get the blocking and the Kronecker blocking + krond = blkd = kvcoors(d.order, d.kvdim()); + for 
(unsigned int i = nblockd; i < ND; ++i) + blkd[i] = 1; + kroni = blki = kvcoors(i.order, i.kvdim()); + for (unsigned int i = nblocki; i < NI; ++i) + blki[i] = 1; + for (unsigned int i = 0; i < ND; ++i) + if (i < nblockd || i >= nblockd + nkrond) + krond[i] = 1; + for (unsigned int i = 0; i < NI; ++i) + if (i < nblocki || i >= nblocki + nkroni) + kroni[i] = 1; + + // Create the tensor containing the number of neighbors for each blocking + std::map nonblki; + for (unsigned int j = 0; j < NI; ++j) + nonblki[i.order[j]] = i.size[j] / blki[j] / kroni[j]; + ii = i.template make_compatible(none, nonblki); + ii.set(num_neighbors); + + // Create the tensor containing the domain coordinates of the first nonzero in each block + jj = ii.template make_compatible(std::string("~u") + i.order, + {{'~', (int)ND}, {'u', (int)num_neighbors}}); + + // Compute the data dimensions as + // image_blocked_dims + domain_dims + u + image_nonblocked_dims, for isImgFastInBlock + // domain_blocked_dims + image_blocked_dims + u + image_nonblockd_dims otherwise + std::map data_dims; + for (unsigned int j = 0; j < NI; ++j) + data_dims[i.order[j]] = i.size[j] / kroni[j]; + for (unsigned int i = 0; i < ND; ++i) + data_dims[d.order[i]] = blkd[i]; + data_dims['u'] = num_neighbors; + std::string data_order = + (isImgFastInBlock + ? 
std::string(i.order.begin(), i.order.begin() + nblocki + nkroni) + d.order + : std::string(d.order.begin(), d.order.begin() + nblockd + nkrond) + + std::string(i.order.begin(), i.order.begin() + nblocki + nkroni) + + std::string(d.order.begin() + nblockd + nkrond, d.order.end())) + + std::string("u") + std::string(i.order.begin() + nblocki + nkroni, i.order.end()); + data = ii.template make_compatible(data_order, data_dims); + + // Compute the Kronecker dimensions as `data` + if (nkrond + nkroni > 0) + { + std::map kron_dims; + for (unsigned int i = 0; i < ND; ++i) + kron_dims[d.order[i]] = krond[i]; + for (unsigned int j = 0; j < NI; ++j) + kron_dims[i.order[j]] = kroni[j]; + kron_dims['u'] = num_neighbors; + std::string kron_order = data_order; + kron = data.like_this(kron_order, kron_dims, none, + detail::compatible_replicated_distribution(data.dist)); + } + + std::string nonblock_img_labels(i.order.begin() + nblocki + nkroni, i.order.end()); + if (!ii.isDistributedAs(this->i, nonblock_img_labels) || + !ii.isDistributedAs(jj, nonblock_img_labels) || + !ii.isDistributedAs(data, nonblock_img_labels)) + throw std::runtime_error("SpTensor: the dense tensors representing the sparse tensor " + "have incompatible distributions"); + } + + /// Empty constructor + + SpTensor() + : blki{{}}, + blkd{{}}, + kroni{{}}, + krond{{}}, + scalar{0}, + isImgFastInBlock{false}, + nblockd{0}, + nblocki{0}, + nkrond{0}, + nkroni{0} + { + } + + /// Return whether the tensor is not empty + + explicit operator bool() const noexcept + { + return (bool)d; + } + + /// Return whether the sparse tensor has Kronecker form + + bool is_kronecker() const noexcept + { + return (bool)kron; + } + + /// Construct the sparse operator + void construct() + { + if ((ii.dist != OnEveryone && ii.dist != OnEveryoneAsChroma && ii.dist != Local && + ii.dist != Glocal) || + ii.dist != jj.dist || ii.dist != data.dist || + (kron && kron.dist != detail::compatible_replicated_distribution(ii.dist))) + throw 
std::runtime_error("SpTensor::construct: unexpected distribution of the data"); + + // Superbblas needs the column coordinates to be local + // Remove the local domain coordinates to jj + const auto localFrom = d.p->localFrom(); + const auto domDim = d.dim; + auto localjj = + jj.template transformWithCPUFunWithCoor([&](const Coor& c, const int& t) { + return (t - localFrom[c[0]] + domDim[c[0]]) % domDim[c[0]]; + }); + + std::string nonblock_img_labels(i.order.begin() + nblocki + nkroni, i.order.end()); + if (!ii.isDistributedAs(this->i, nonblock_img_labels) || + !ii.isDistributedAs(localjj, nonblock_img_labels) || + !ii.isDistributedAs(data, nonblock_img_labels)) + throw std::runtime_error("SpTensor: the dense tensors representing the sparse tensor " + "have incompatible distributions"); + int* iiptr = ii.data(); + Coor* jjptr = (Coor*)localjj.data(); + // NOTE: despite jj being a vector of `int`, superbblas will use jj as a vector of Coor, so check that the alignment + if (localjj.getLocal().volume() > 0 && + superbblas::detail::align(alignof(Coor), sizeof(int), jjptr, sizeof(int)) == + nullptr) + throw std::runtime_error("Ups! Look into this"); + const value_type* ptr = data.data(); + const value_type* kron_ptr = kron.data(); + MPI_Comm comm = (ii.dist == Local ? 
MPI_COMM_SELF : MPI_COMM_WORLD); + superbblas::BSR_handle* bsr = nullptr; + if (nkrond == 0 && nkroni == 0) + { + superbblas::create_bsr( + i.p->p.data(), i.dim, d.p->p.data(), d.dim, 1, blki, blkd, isImgFastInBlock, &iiptr, + &jjptr, &ptr, &data.ctx(), comm, superbblas::FastToSlow, &bsr); + } + else + { + superbblas::create_kron_bsr( + i.p->p.data(), i.dim, d.p->p.data(), d.dim, 1, blki, blkd, kroni, krond, + isImgFastInBlock, &iiptr, &jjptr, &ptr, &kron_ptr, &data.ctx(), comm, + superbblas::FastToSlow, &bsr); + } + handle = std::shared_ptr( + bsr, [=](superbblas::BSR_handle* bsr) { destroy_bsr(bsr); }); + } + + /// Return a local support of the tensor + + SpTensor getLocal() const + { + // Shortcut for empty and local tensors + if (!*this || ii.dist == Local) + return *this; + + // Create the returning tensor + SpTensor r{d.getLocal(), + i.getLocal(), + nblockd, + nblocki, + nkrond, + nkroni, + (unsigned int)jj.kvdim().at('u'), + isImgFastInBlock}; + + r.ii = ii.getLocal(); + const auto localFrom = d.p->localFrom(); + const auto domDim = d.dim; + r.jj = jj.getLocal().template transformWithCPUFunWithCoor( + [&](const Coor& c, const int& t) { + return (t - localFrom[c[0]] + domDim[c[0]]) % domDim[c[0]]; + }); + r.data = data.getLocal(); + r.kron = kron.getLocal(); + + if (is_constructed()) + r.construct(); + + return r; + } + + /// Return a local support of the tensor + + SpTensor getGlocal() const + { + // Shortcut for empty and local tensors + if (!*this || ii.dist == Local) + return *this; + + // Create the returning tensor + SpTensor r{d.getGlocal(), + i.getGlocal(), + nblockd, + nblocki, + nkrond, + nkroni, + (unsigned int)jj.kvdim().at('u'), + isImgFastInBlock}; + + r.ii = ii.getGlocal(); + r.jj = jj.getGlocal(); + r.data = data.getGlocal(); + r.kron = kron.getGlocal(); + + if (is_constructed()) + r.construct(); + + return r; + } + + /// Split a dimension into another dimensions + /// \param dom_dim_label: dominion dimension to split + /// \param 
dom_new_labels: the labels of the new dominion dimensions + /// \param dom_step: length of the first label in `dom_new_labels` + /// \param img_dim_label: image dimension to split + /// \param img_new_labels: the labels of the image new dimensions + /// \param img_step: length of the first label in `img_new_labels` + + SpTensor + split_dimension(char dom_dim_label, const std::string& dom_new_labels, Index dom_step, + char img_dim_label, const std::string& img_new_labels, Index img_step) const + { + if (dom_new_labels.size() != 2) + throw std::runtime_error( + "split_dimension: invalid `dom_new_labels`, it should have size two"); + if (img_new_labels.size() != 2) + throw std::runtime_error( + "split_dimension: invalid `dom_new_labels`, it should have size two"); + if (d.kvdim().at(dom_dim_label) % dom_step != 0) + throw std::runtime_error( + "split_dimension: invalid `dom_step`, it should divide the dimension size"); + if (i.kvdim().at(img_dim_label) % img_step != 0) + throw std::runtime_error( + "split_dimension: invalid `img_step`, it should divide the dimension size"); + + std::string::size_type d_pos = d.order.find(dom_dim_label); + std::string::size_type i_pos = i.order.find(img_dim_label); + + if (blkd[d_pos] > 1 && (blkd[d_pos] % dom_step != 0 || krond[d_pos] % dom_step != 0)) + throw std::runtime_error( + "split_dimension: invalid `dom_step`, it should divide the block size"); + if (blki[i_pos] > 1 && (blki[i_pos] % img_step != 0 || kroni[i_pos] % img_step != 0)) + throw std::runtime_error( + "split_dimension: invalid `img_step`, it should divide the block size"); + + // Transform the distribution of the domain and the image spaces + // NOTE: blocking does not operate well with range intersection in the sense that + // intersection(range_a, range_b) != unblock(intersection(block(range_a), block(range_b))). + // A way to guarantee that is by enforcing that the first coordinate and the size of all ranges are + // divisible by the blocking. 
That's enforced by `coarse_support`. + // FIXME: enforce that all contracted dense tensors with this sparse tensor have divisible partitions by + // the blocking. + + auto new_d = d.coarse_support({{dom_dim_label, dom_step}}) + .split_dimension(dom_dim_label, dom_new_labels, dom_step); + auto new_i = i.split_dimension(img_dim_label, img_new_labels, img_step); + + int new_blkd_pos = blkd[d_pos] == 1 ? 1 : dom_step; + auto new_blkd = detail::insert_coor(blkd, d_pos, new_blkd_pos); + new_blkd[d_pos + 1] /= new_blkd_pos; + int new_blki_pos = blki[i_pos] == 1 ? 1 : img_step; + auto new_blki = detail::insert_coor(blki, i_pos, new_blki_pos); + new_blki[i_pos + 1] /= new_blki_pos; + + // Create the returning tensor + SpTensor r{ + new_d, + new_i, + nblockd + (d_pos < nblockd ? 1 : 0), + nblocki + (i_pos < nblocki ? 1 : 0), + nkrond + (nblockd <= d_pos && d_pos < nblockd + nkrond ? 1 : 0), + nkroni + (nblocki <= i_pos && i_pos < nblocki + nkroni ? 1 : 0), + (unsigned int)jj.kvdim().at('u'), + isImgFastInBlock}; + + ii.split_dimension(img_dim_label, img_new_labels, img_step).copyTo(r.ii); + auto new_jj = r.jj.make_compatible(none, {}, OnHost); + jj.split_dimension(img_dim_label, img_new_labels, img_step) + .copyTo(new_jj.kvslice_from_size({}, {{'~', (int)ND}})); + { + auto local_new_jj = new_jj.getLocal(); + int* p = local_new_jj.data(); + auto new_dom_dim = detail::insert_coor(d.size, d_pos, dom_step); + new_dom_dim[d_pos + 1] /= dom_step; + std::size_t i1 = local_new_jj.volume() / (ND + 1); +# ifdef _OPENMP +# pragma omp parallel for schedule(static) +# endif + for (std::size_t i = 0; i < i1; ++i) + { + Coor c; + std::copy_n(p + (ND + 1) * i, ND, c.begin()); + Coor new_c = detail::split_dimension(d_pos, c, new_dom_dim, detail::From); + std::copy_n(new_c.begin(), ND + 1, p + (ND + 1) * i); + } + } + new_jj.copyTo(r.jj); + + data.split_dimension(dom_dim_label, dom_new_labels, dom_step) + .split_dimension(img_dim_label, img_new_labels, img_step) + .copyTo(r.data); + + if 
(kron) + { + kron.split_dimension(dom_dim_label, dom_new_labels, dom_step) + .split_dimension(img_dim_label, img_new_labels, img_step) + .copyTo(r.kron); + } + + if (is_constructed()) + r.construct(); + + return r; + } + + /// Return a slice of the tensor starting at coordinate `dom_kvfrom`, `img_kvfrom` and taking + /// `dom_kvsize`, `img_kvsize` elements in each direction. The missing dimensions in `*_kvfrom` + /// are set to zero and the missing directions in `*_kvsize` are set to the size of the tensor. + /// + /// \param dom_kvfrom: dictionary with the index of the first element in each domain direction + /// \param dom_kvsize: dictionary with the number of elements in each domain direction + /// \param img_kvfrom: dictionary with the index of the first element in each domain direction + /// \param img_kvsize: dictionary with the number of elements in each domain direction + /// \return: a copy of the tensor + + SpTensor kvslice_from_size(const std::map& dom_kvfrom = {}, + const std::map& dom_kvsize = {}, + const std::map& img_kvfrom = {}, + const std::map& img_kvsize = {}) const + { + // Check that the object is not local or glocal (FIXME) + if (ii.dist == Local || ii.dist == Glocal) + throw std::runtime_error("SpTensor::kvslice_from_size: unexpected distribution of the data"); + + // Check that we aren't slicing the blocking dimensions + bool fail = false; + std::string o_blk_d = std::string(d.order.begin(), d.order.begin() + nblockd); + for (auto& it : dom_kvfrom) + if (detail::is_in(o_blk_d, it.first) && it.second != 0) + fail = true; + auto dim_d = d.kvdim(); + for (auto& it : dom_kvsize) + if (detail::is_in(o_blk_d, it.first) && it.second != dim_d.at(it.first)) + fail = true; + + std::string o_blk_i = std::string(i.order.begin(), i.order.begin() + nblocki); + for (auto& it : img_kvfrom) + if (detail::is_in(o_blk_i, it.first) && it.second != 0) + fail = true; + auto dim_i = i.kvdim(); + for (auto& it : img_kvsize) + if (detail::is_in(o_blk_i, it.first) && 
it.second != dim_i.at(it.first)) + fail = true; + + if (fail) + throw std::runtime_error( + "SpTensor::kvslice_from_size: unsupported slicing on blocked dimensions"); + + // We aren't free to redistribute `d` and `i`, because the support of the domain + // in each process depends on the image support + auto new_d = d.kvslice_from_size(dom_kvfrom, dom_kvsize).make_eg(); + auto new_i = i.kvslice_from_size(img_kvfrom, img_kvsize).make_eg(); + + // Get the nonzeros in the slice + auto ii_slice = ii.kvslice_from_size(img_kvfrom, img_kvsize).cloneOn(OnHost); + auto new_ii = ii_slice.make_compatible(none, {}, OnHost); + new_ii.set_zero(); + auto jj_slice = jj.kvslice_from_size(img_kvfrom, img_kvsize).cloneOn(OnHost); + auto new_jj = jj_slice.make_compatible(none, {}, OnHost); + auto new_jj_mask = jj_slice.template make_compatible( + detail::remove_dimensions(jj_slice.order, "~"), {}, OnHost); + new_jj_mask.set_zero(); + unsigned int num_neighbors = jj.kvdim().at('u'); + + if (ii_slice.isSubtensor() || new_ii.isSubtensor() || jj_slice.isSubtensor() || + new_jj.isSubtensor() || new_jj_mask.isSubtensor()) + { + throw std::runtime_error("This shouldn't happen"); + } + if (!ii_slice.is_compatible(jj_slice) || !ii_slice.is_compatible(new_ii) || + !ii_slice.is_compatible(new_jj) || !ii_slice.is_compatible(new_jj_mask)) + { + throw std::runtime_error("kvslice_from_size: hit corner case, sorry"); + } + + Tensor<1, float> dirs("u", {(int)num_neighbors}, OnHost, OnMaster); + dirs.set_zero(); + auto dirs_local = dirs.getLocal(); + { + Coor from_dom = kvcoors(d.order, dom_kvfrom); + std::map updated_dom_kvsize = d.kvdim(); + Coor size_dom = kvcoors(d.order, updated_dom_kvsize); + for (const auto& it : dom_kvsize) + updated_dom_kvsize[it.first] = it.second; + Coor updated_size_dom = kvcoors(d.order, updated_dom_kvsize); + auto ii_slice_local = ii_slice.getLocal(); + int* ii_slice_ptr = ii_slice_local.data(); + auto jj_slice_local = jj_slice.getLocal(); + int* jj_slice_ptr = 
jj_slice_local.data(); + auto new_ii_local = new_ii.getLocal(); + int* new_ii_ptr = new_ii_local.data(); + auto new_jj_local = new_jj.getLocal(); + int* new_jj_ptr = new_jj_local.data(); + auto new_jj_mask_local = new_jj_mask.getLocal(); + float* new_jj_mask_ptr = new_jj_mask_local.data(); + Coor size_nnz = d.size; + Tensor<1, float> dirs_global("u", {(int)num_neighbors}, OnHost, OnMaster); + dirs_global.set_zero(); + auto dirs_local = dirs_global.getLocal(); + for (unsigned int i = nblockd + nkrond; i < ND; ++i) + size_nnz[i] = 1; + for (std::size_t i = 0, i_acc = 0, i1 = new_ii_local.volume(); i < i1; + i_acc += ii_slice_ptr[i], ++i) + { + for (unsigned int j = i_acc, j1 = i_acc + ii_slice_ptr[i]; j < j1; ++j) + { + Coor from_nnz; + std::copy_n(jj_slice_ptr + j * ND, ND, from_nnz.begin()); + Coor lfrom, lsize; + superbblas::detail::intersection(from_dom, updated_size_dom, from_nnz, size_nnz, + d.dim, lfrom, lsize); + if (superbblas::detail::volume(lsize) == 0) + continue; + + using superbblas::detail::operator-; + Coor new_from_nnz = normalize_coor(from_nnz - from_dom, size_dom); + std::copy_n(new_from_nnz.begin(), ND, new_jj_ptr + (i_acc + new_ii_ptr[i]) * ND); + new_ii_ptr[i]++; + new_jj_mask_ptr[j] = 1; + if (dirs_local) + dirs_local.data()[j - i_acc] = 1; + } + if (i > 0 && new_ii_ptr[i] != new_ii_ptr[0]) + throw std::runtime_error("SpTensor::kvslice_from_size: unsupported slices ending up " + "in different number of nonzero values in each row"); + if (i > 0 && kron) + { + for (unsigned int j = i_acc, j1 = i_acc + ii_slice_ptr[i]; j < j1; ++j) + if (new_jj_mask_ptr[j] != new_jj_mask_ptr[j - i_acc]) + throw std::runtime_error("SpTensor::kvslice_from_size unsupported slices ending " + "up in selecting different directions for each row"); + } + } + + // Make sure that all nodes with support have the same number of neighbors + if (Layout::nodeNumber() == 0 && new_ii_local.volume() == 0) + throw std::runtime_error("kvslice_from_size: unsupported distribution, 
master process " + "should have support on the origin tensor"); + if (new_ii_local.volume() > 0) + num_neighbors = new_ii_ptr[0]; + int global_num_neighbors = broadcast(num_neighbors); + if (new_ii_local.volume() > 0 && global_num_neighbors != num_neighbors) + throw std::runtime_error("SpTensor::kvslice_from_size: unsupported distribution"); + num_neighbors = global_num_neighbors; + + dirs = dirs_global.make_sure(none, OnDefaultDevice, + detail::compatible_replicated_distribution(i.dist)); + } + + // Create the returning tensor + SpTensor r{new_d, new_i, nblockd, nblocki, + nkrond, nkroni, num_neighbors, isImgFastInBlock}; + new_ii.copyTo(r.ii); + new_jj.kvslice_from_size({}, {{'u', num_neighbors}}).copyTo(r.jj); + + auto data_mask = data.create_mask(); + data_mask.set_zero(); + std::map blk_m; + auto data_dim = data.kvdim(); + for (unsigned int i = 0; i < ND; ++i) + blk_m[d.order[i]] = data_dim.at(d.order[i]); + for (unsigned int i = 0; i < NI; ++i) + blk_m[this->i.order[i]] = (i < nblocki ? 
data_dim.at(this->i.order[i]) : 1); + auto data_blk = data.template like_this( + "%", '%', "u", blk_m, none, detail::compatible_replicated_distribution(new_d.dist)); + data_blk.set(1); + kronecker(new_jj_mask, data_blk) + .copyTo(data_mask.kvslice_from_size(img_kvfrom, img_kvsize)); + auto r_data_mask = r.data.create_mask(); + r_data_mask.set(1); + r.data.set(detail::NaN::get()); + data.kvslice_from_size(img_kvfrom, img_kvsize) + .copyToWithMask(r.data, data_mask.kvslice_from_size(img_kvfrom, img_kvsize), r_data_mask, + "u"); + + if (kron) + { + auto kron_mask = kron.create_mask(); + kron_mask.set_zero(); + auto blk_m = kron.kvdim(); + blk_m.erase('u'); + auto kron_blk = kron.template like_this( + "%", '%', "u", blk_m, none, detail::compatible_replicated_distribution(new_d.dist)); + kron_blk.set(1); + kronecker(dirs, kron_blk) + .copyTo(kron_mask.kvslice_from_size(img_kvfrom, img_kvsize)); + auto r_kron_mask = r.kron.create_mask(); + r_kron_mask.set(1); + r.kron.set(detail::NaN::get()); + kron.kvslice_from_size(img_kvfrom, img_kvsize) + .copyToWithMask(r.kron, kron_mask.kvslice_from_size(img_kvfrom, img_kvsize), + r_kron_mask, "u"); + } + + if (is_constructed()) + r.construct(); + + // Do a test + if (superbblas::getDebugLevel() > 0) + { + auto x0 = d.template like_this("%n", '%', "", {{'n', 2}}); + x0.set_zero(); + urand(x0.kvslice_from_size(dom_kvfrom, dom_kvsize), -1, 1); + auto y0 = i.template like_this("%n", '%', "", {{'n', 2}}); + contractWith(x0, {}, y0, {}); + y0 = y0.kvslice_from_size(img_kvfrom, img_kvsize); + + auto y = r.i.template like_this("%n", '%', "", {{'n', 2}}); + if (!is_constructed()) + r.construct(); + r.contractWith(x0.kvslice_from_size(dom_kvfrom, dom_kvsize), {}, y, {}); + + y0.scale(-1).addTo(y); + auto norm0 = norm<1>(y0, "n"); + auto normdiff = norm<1>(y, "n"); + double max_err = 0; + for (int i = 0, vol = normdiff.volume(); i < vol; ++i) + max_err = std::max(max_err, (double)normdiff.get({{i}}) / norm0.get({{i}})); + QDPIO::cout << 
"kvslice_from_size error: " << detail::tostr(max_err) << std::endl; + } + + return r; + } + + /// Return a slice of the tensor starting at coordinate `dom_kvfrom`, `img_kvfrom` and taking + /// `dom_kvsize`, `img_kvsize` elements in each direction. The missing dimensions in `*_kvfrom` + /// are set to zero and the missing directions in `*_kvsize` are set to the size of the tensor. + /// + /// \param f: f(dom_coor, img_coor) return whether the nonzero block starting at the given blocks + /// will be on the returning matrix. + /// \param dom_kvfrom: dictionary with the index of the first element in each domain direction + /// \param dom_kvsize: dictionary with the number of elements in each domain direction + /// \param img_kvfrom: dictionary with the index of the first element in each domain direction + /// \param img_kvsize: dictionary with the number of elements in each domain direction + /// \return: a copy of the tensor + + template + SpTensor + kvslice_from_size_no_test(const F& f, const std::map& dom_kvfrom = {}, + const std::map& dom_kvsize = {}, + const std::map& img_kvfrom = {}, + const std::map& img_kvsize = {}) const + { + // Check that we aren't slicing the blocking dimensions + bool fail = false; + std::string o_blk_d = std::string(d.order.begin(), d.order.begin() + nblockd); + for (auto& it : dom_kvfrom) + if (detail::is_in(o_blk_d, it.first) && it.second != 0) + fail = true; + auto dim_d = d.kvdim(); + for (auto& it : dom_kvsize) + if (detail::is_in(o_blk_d, it.first) && it.second != dim_d.at(it.first)) + fail = true; + + std::string o_blk_i = std::string(i.order.begin(), i.order.begin() + nblocki); + for (auto& it : img_kvfrom) + if (detail::is_in(o_blk_i, it.first) && it.second != 0) + fail = true; + auto dim_i = i.kvdim(); + for (auto& it : img_kvsize) + if (detail::is_in(o_blk_i, it.first) && it.second != dim_i.at(it.first)) + fail = true; + + if (fail) + throw std::runtime_error( + "SpTensor::kvslice_from_size: unsupported slicing on blocked 
dimensions"); + + // We aren't free to redistribute `d` and `i`, because the support of the domain + // in each process depends on the image support + auto new_d = d.kvslice_from_size(dom_kvfrom, dom_kvsize).make_eg(); + auto new_i = i.kvslice_from_size(img_kvfrom, img_kvsize).make_eg(); + + // Get the nonzeros in the slice + auto ii_slice = ii.kvslice_from_size(img_kvfrom, img_kvsize).cloneOn(OnHost); + auto new_ii = ii_slice.make_compatible(none, {}, OnHost); + new_ii.set_zero(); + auto jj_slice = jj.kvslice_from_size(img_kvfrom, img_kvsize).cloneOn(OnHost); + auto new_jj = jj_slice.make_compatible(none, {}, OnHost); + new_jj.set_zero(); + auto new_jj_mask = jj_slice.template make_compatible( + detail::remove_dimensions(jj_slice.order, "~"), {}, OnHost); + new_jj_mask.set_zero(); + unsigned int num_neighbors = jj.kvdim().at('u'); + + if (ii_slice.isSubtensor() || new_ii.isSubtensor() || jj_slice.isSubtensor() || + new_jj.isSubtensor() || new_jj_mask.isSubtensor()) + { + throw std::runtime_error("This shouldn't happen"); + } + if (!ii_slice.is_compatible(jj_slice) || !ii_slice.is_compatible(new_ii) || + !ii_slice.is_compatible(new_jj) || !ii_slice.is_compatible(new_jj_mask)) + { + throw std::runtime_error("kvslice_from_size: hit corner case, sorry"); + } + + Tensor<1, float> dirs("u", {(int)num_neighbors}, OnHost, + detail::compatible_replicated_distribution(i.dist)); + auto next_jj_mask = new_jj_mask; + { + Coor from_dom = kvcoors(d.order, dom_kvfrom); + std::map updated_dom_kvsize = d.kvdim(); + Coor size_dom = kvcoors(d.order, updated_dom_kvsize); + for (const auto& it : dom_kvsize) + updated_dom_kvsize[it.first] = it.second; + Coor updated_size_dom = kvcoors(d.order, updated_dom_kvsize); + auto ii_slice_local = ii_slice.getLocal(); + int* ii_slice_ptr = ii_slice_local.data(); + auto jj_slice_local = jj_slice.getLocal(); + int* jj_slice_ptr = jj_slice_local.data(); + auto new_ii_local = new_ii.getLocal(); + int* new_ii_ptr = new_ii_local.data(); + auto 
new_jj_local = new_jj.getLocal(); + int* new_jj_ptr = new_jj_local.data(); + auto new_jj_mask_local = new_jj_mask.getLocal(); + float* new_jj_mask_ptr = new_jj_mask_local.data(); + Coor size_nnz = d.size; + for (unsigned int i = nblockd + nkrond; i < ND; ++i) + size_nnz[i] = 1; + auto dirs_global = + local_support_tensor("u", Coor<1>{(int)num_neighbors}, OnHost); + dirs_global.set_zero(); + auto dirs_global_local = dirs_global.getLocal(); + int max_nnz_per_row = 0; + using superbblas::detail::operator+; + Coor from_img = kvcoors(i.order, img_kvfrom) + this->i.p->localFrom(); + Coor local_img_size = new_ii_local.size; + for (int i = 0; i < nblocki + nkroni; ++i) + local_img_size[i] = 1; + Stride img_local_stride = + superbblas::detail::get_strides(local_img_size, superbblas::FastToSlow); + + // Find the maximum number of nonzeros per row and the active directions + // in case of using the Kronecker format + for (std::size_t i = 0, i_acc = 0, i1 = new_ii_local.volume(); i < i1; + i_acc += ii_slice_ptr[i], ++i) + { + Coor row_global_coor = normalize_coor( + superbblas::detail::index2coor(i, local_img_size, img_local_stride) + from_img, + this->i.dim); + + int nnz_per_row = 0; + for (unsigned int j = i_acc, j1 = i_acc + ii_slice_ptr[i]; j < j1; ++j) + { + Coor from_nnz; + std::copy_n(jj_slice_ptr + j * ND, ND, from_nnz.begin()); + Coor lfrom, lsize; + superbblas::detail::intersection(from_dom, updated_size_dom, from_nnz, size_nnz, + d.dim, lfrom, lsize); + if (superbblas::detail::volume(lsize) == 0 || !f(lfrom, row_global_coor)) + continue; + + dirs_global_local.data()[j - i_acc] = 1; + nnz_per_row++; + } + max_nnz_per_row = std::max(max_nnz_per_row, nnz_per_row); + } + + std::vector new_dirs_idx(num_neighbors); + if (!kron) + { + // Get the maximum number of nonzeros in a row + max_nnz_per_row = global_max(max_nnz_per_row); + } + else + { + // Gather the directions present in all processes, and update `global_dirs_local` such that + // the direction is present if it 
is on some process. + auto dirs_collective = + dirs_global.make_sure(none, none, detail::compatible_replicated_distribution(i.dist)); + dirs.set_zero(); + for (int i = 0; i < num_neighbors; ++i) + for (int proc = 0; proc < Layout::numNodes(); ++proc) + if (dirs_collective.data()[proc + i * Layout::numNodes()] > 0) + dirs.data()[i] = 1; + + // Map the old direction indices into the new directions, and count the maximum number + // of nonzeros per rows as the total amount of different directions + max_nnz_per_row = 0; + for (int i = 0; i < num_neighbors; ++i) + if (dirs.data()[i] > 0) + new_dirs_idx[i] = max_nnz_per_row++; + } + + // Collect the nonzeros + next_jj_mask = new_jj_mask.make_compatible(none, {{'u', max_nnz_per_row}}); + next_jj_mask.set_zero(); + auto next_jj_mask_local = next_jj_mask.getLocal(); + float* next_jj_mask_ptr = next_jj_mask_local.data(); + for (std::size_t i = 0, i_acc = 0, i1 = new_ii_local.volume(); i < i1; + i_acc += ii_slice_ptr[i], ++i) + { + using superbblas::detail::operator+; + Coor row_global_coor = normalize_coor( + superbblas::detail::index2coor(i, local_img_size, img_local_stride) + from_img, + this->i.dim); + + unsigned int new_nnz_in_row = 0; + for (unsigned int j = i_acc, j1 = i_acc + ii_slice_ptr[i]; j < j1; ++j) + { + Coor from_nnz; + std::copy_n(jj_slice_ptr + j * ND, ND, from_nnz.begin()); + Coor lfrom, lsize; + superbblas::detail::intersection(from_dom, updated_size_dom, from_nnz, size_nnz, + d.dim, lfrom, lsize); + if (superbblas::detail::volume(lsize) == 0 || !f(lfrom, row_global_coor)) + { + // If using Kronecker format and the direction is active on the new matrix, don't jump to next iteration, + // although the nonzero doesn't belong to the new matrix + if (!kron || dirs.data()[j - i_acc] == 0) + continue; + } + else + { + // Copy the coordinates of the nonzero to new matrix + using superbblas::detail::operator-; + Coor new_from_nnz = normalize_coor(from_nnz - from_dom, size_dom); + std::copy_n(new_from_nnz.begin(), 
ND, new_jj_ptr + (i_acc + new_nnz_in_row) * ND); + new_jj_mask_ptr[j] = 1; + next_jj_mask_ptr[max_nnz_per_row * i + new_dirs_idx[j - i_acc]] = 1; + } + new_nnz_in_row++; + } + new_ii_ptr[i] = max_nnz_per_row; + } + + num_neighbors = max_nnz_per_row; + } + + // Create the returning tensor + SpTensor r{new_d, new_i, nblockd, nblocki, + nkrond, nkroni, num_neighbors, isImgFastInBlock}; + new_ii.copyTo(r.ii); + new_jj.kvslice_from_size({}, {{'u', num_neighbors}}).copyTo(r.jj); + + auto data_mask = data.create_mask(); + data_mask.set_zero(); + std::map blk_m; + auto data_dim = data.kvdim(); + for (unsigned int i = 0; i < ND; ++i) + blk_m[d.order[i]] = data_dim.at(d.order[i]); + for (unsigned int i = 0; i < NI; ++i) + blk_m[this->i.order[i]] = (i < nblocki ? data_dim.at(this->i.order[i]) : 1); + auto data_blk = data.template like_this( + "%", '%', "u", blk_m, none, detail::compatible_replicated_distribution(new_d.dist)); + data_blk.set(1); + kronecker(new_jj_mask, data_blk) + .copyTo(data_mask.kvslice_from_size(img_kvfrom, img_kvsize)); + auto r_data_mask = r.data.create_mask(); + kronecker(next_jj_mask, data_blk).copyTo(r_data_mask); + r.data.set_zero(); + data.kvslice_from_size(img_kvfrom, img_kvsize) + .copyToWithMask(r.data, data_mask.kvslice_from_size(img_kvfrom, img_kvsize), r_data_mask, + "u"); + + if (kron) + { + auto kron_mask = kron.create_mask(); + kron_mask.set_zero(); + auto blk_m = kron.kvdim(); + blk_m.erase('u'); + auto kron_blk = kron.template like_this( + "%", '%', "u", blk_m, none, detail::compatible_replicated_distribution(new_d.dist)); + kron_blk.set(1); + kronecker(dirs, kron_blk) + .copyTo(kron_mask.kvslice_from_size(img_kvfrom, img_kvsize)); + auto r_kron_mask = r.kron.create_mask(); + r_kron_mask.set(1); + r.kron.set(detail::NaN::get()); + kron.kvslice_from_size(img_kvfrom, img_kvsize) + .copyToWithMask(r.kron, kron_mask.kvslice_from_size(img_kvfrom, img_kvsize), + r_kron_mask, "u"); + } + + if (is_constructed()) + r.construct(); + + return r; + 
} + + /// Return a slice of the tensor starting at coordinate `dom_kvfrom`, `img_kvfrom` and taking + /// `dom_kvsize`, `img_kvsize` elements in each direction. The missing dimensions in `*_kvfrom` + /// are set to zero and the missing directions in `*_kvsize` are set to the size of the tensor. + /// + /// \param f: f(dom_coor, img_coor) returns whether the nonzero block starting at the given + /// coordinates will be on the returning matrix. + /// \param dom_kvfrom: dictionary with the index of the first element in each domain direction + /// \param dom_kvsize: dictionary with the number of elements in each domain direction + /// \param img_kvfrom: dictionary with the index of the first element in each domain direction + /// \param img_kvsize: dictionary with the number of elements in each domain direction + /// \return: a copy of the tensor + + template + SpTensor kvslice_from_size(const F& f, const std::map& dom_kvfrom = {}, + const std::map& dom_kvsize = {}, + const std::map& img_kvfrom = {}, + const std::map& img_kvsize = {}) const + { + // Do the slice + auto r = kvslice_from_size_no_test(f, dom_kvfrom, dom_kvsize, img_kvfrom, img_kvsize); + + // Do a test + if (superbblas::getDebugLevel() > 0) + { + // Do the slice + auto rcomp = kvslice_from_size_no_test( + [=](const Coor& cdom, const Coor& cimg) { return !f(cdom, cimg); }, dom_kvfrom, + dom_kvsize, img_kvfrom, img_kvsize); + + auto x0 = d.template like_this("%n", '%', "", {{'n', 2}}); + x0.set_zero(); + urand(x0.kvslice_from_size(dom_kvfrom, dom_kvsize), -1, 1); + auto y0 = i.template like_this("%n", '%', "", {{'n', 2}}); + contractWith(x0, {}, y0, {}); + y0 = y0.kvslice_from_size(img_kvfrom, img_kvsize); + + auto y = r.i.template like_this("%n", '%', "", {{'n', 2}}); + if (!is_constructed()) + r.construct(); + r.contractWith(x0.kvslice_from_size(dom_kvfrom, dom_kvsize), {}, y, {}); + auto ycomp = rcomp.i.template like_this("%n", '%', "", {{'n', 2}}); + if (!is_constructed()) + rcomp.construct(); + 
rcomp.contractWith(x0.kvslice_from_size(dom_kvfrom, dom_kvsize), {}, ycomp, {}); + ycomp.addTo(y); + + y0.scale(-1).addTo(y); + auto norm0 = norm<1>(y0, "n"); + auto normdiff = norm<1>(y, "n"); + double max_err = 0; + for (int i = 0, vol = normdiff.volume(); i < vol; ++i) + max_err = std::max(max_err, (double)normdiff.get({{i}}) / norm0.get({{i}})); + QDPIO::cout << "kvslice_from_size error: " << detail::tostr(max_err) << std::endl; + } + + return r; + } + + /// Reorder the domain and image orders + /// \param new_dom_order: new ordering for the domain + /// \param new_img_order: new ordering for the image + /// \param remaining_char: if it isn't the null char, placeholder for the remaining dimensions + + SpTensor reorder(const std::string& new_dom_order, + const std::string& new_img_order, char remaining_char = 0) const + { + if (remaining_char == '~' || remaining_char == 'u') + throw std::runtime_error("reorder: invalid remaining char, it shouldn't be `~` or `u`"); + + std::string new_dom_order0 = d.get_order_for_reorder(new_dom_order, remaining_char); + std::string new_img_order0 = i.get_order_for_reorder(new_img_order, remaining_char); + auto new_d = d.reorder(new_dom_order0); + auto new_i = i.reorder(new_img_order0); + + Coor d_perm = superbblas::detail::find_permutation( + detail::to_sb_order(d.order), detail::to_sb_order(new_dom_order0)); + Coor i_perm = superbblas::detail::find_permutation( + detail::to_sb_order(i.order), detail::to_sb_order(new_img_order0)); + auto new_blkd = superbblas::detail::reorder_coor(blkd, d_perm); + auto new_blki = superbblas::detail::reorder_coor(blki, i_perm); + auto new_krond = superbblas::detail::reorder_coor(krond, d_perm); + auto new_kroni = superbblas::detail::reorder_coor(kroni, i_perm); + + // Check the blocking + for (unsigned int i = 0; i < ND; ++i) + if ((i < nblockd && new_blkd[i] != new_d.size[i]) || (i >= nblockd && new_blkd[i] != 1)) + throw std::runtime_error("reorder: invalid domain reordering, it is mixing 
blocking " + "and nonblocking dimensions"); + for (unsigned int i = 0; i < NI; ++i) + if ((i < nblocki && new_blki[i] != new_i.size[i]) || (i >= nblocki && new_blki[i] != 1)) + throw std::runtime_error("reorder: invalid image reordering, it is mixing blocking " + "and nonblocking dimensions"); + for (unsigned int i = 0; i < ND; ++i) + if ((i >= nblockd && i < nblockd + nkrond && new_krond[i] != new_d.size[i]) || + ((i < nblockd || i >= nblockd + nkrond) && new_krond[i] != 1)) + throw std::runtime_error("reorder: invalid domain reordering, it is mixing blocking " + "and nonblocking dimensions"); + for (unsigned int i = 0; i < NI; ++i) + if ((i >= nblocki && i < nblocki + nkroni && new_kroni[i] != new_i.size[i]) || + ((i < nblocki || i >= nblocki + nkroni) && new_kroni[i] != 1)) + throw std::runtime_error("reorder: invalid image reordering, it is mixing blocking " + "and nonblocking dimensions"); + + auto new_ii = ii.reorder(new_img_order0); + auto new_jj = jj.reorder(std::string("~u") + new_img_order0); + if (new_jj.order != jj.order) + { + auto host_new_jj = new_jj.make_sure(none, OnHost); + auto local_new_jj = host_new_jj.getLocal(); + int* new_p = local_new_jj.data(); + for (std::size_t i = 0, i1 = local_new_jj.volume(); i < i1; i += ND) + { + Coor c; + std::copy_n(new_p + i, ND, c.begin()); + Coor new_c = superbblas::detail::reorder_coor(c, d_perm); + std::copy_n(new_c.begin(), ND, new_p + i); + } + host_new_jj.copyTo(new_jj); + } + + std::string data_order = + (isImgFastInBlock + ? std::string(new_i.order.begin(), new_i.order.begin() + nblocki + nkroni) + + new_d.order + : std::string(new_d.order.begin(), new_d.order.begin() + nblockd + nkrond) + + std::string(new_i.order.begin(), new_i.order.begin() + nblocki + nkroni) + + std::string(new_d.order.begin() + nblockd + nkrond, new_d.order.end())) + + std::string("u") + std::string(new_i.order.begin() + nblocki + nkrond, new_i.order.end()); + auto new_data = data.reorder(data_order); + auto new_kron = kron ? 
kron.reorder(data_order) : kron; + + SpTensor r(new_d, new_i, new_blkd, new_blki, new_krond, new_kroni, new_ii, + new_jj, new_data, new_kron, scalar, isImgFastInBlock, nblockd, + nblocki, nkrond, nkroni); + if (is_constructed()) + r.construct(); + + return r; + } + + /// Return a view of this tensor with an extra label for the real and the imaginary parts + + template + typename std::enable_if::value && detail::is_complex::value, + SpTensor>>::type + toFakeReal() const + { + using newT = DIYComplex; + + // Get the new domain and image + char d_complexLabel = detail::get_free_label(d.order + i.order); + auto new_d = d.toFakeReal(d_complexLabel); + char i_complexLabel = detail::get_free_label(new_d.order + i.order); + auto new_i = i.toFakeReal(i_complexLabel); + + // Create the returning tensor + bool is_kron = (nkrond > 0 || nkroni > 0); + SpTensor r{new_d, + new_i, + nblockd + 1, + nblocki + 1, + is_kron ? nkrond + 1 : 0, + is_kron ? nkroni + 1 : 0, + (unsigned int)jj.kvdim().at('u'), + isImgFastInBlock}; + + // Copy the data to the new tensor + // a) same number of nonzeros per row + ii.copyTo(r.ii); + // b) the nonzero blocks start at the same position + r.jj.kvslice_from_size({}, {{'~', 1}}).set(0); + jj.copyTo(r.jj.kvslice_from_size({{'~', 1}}, {{'~', ND}})); + // c) each element in data xr+xi*i -> [xr -xi; xi xr] + auto data0 = data.toFakeReal(d_complexLabel); + data0.kvslice_from_size({}, {{d_complexLabel, 1}}) + .copyTo(r.data.kvslice_from_size({}, {{d_complexLabel, 1}, {i_complexLabel, 1}})); + data0.kvslice_from_size({}, {{d_complexLabel, 1}}) + .copyTo(r.data.kvslice_from_size({{d_complexLabel, 1}, {i_complexLabel, 1}}, + {{d_complexLabel, 1}, {i_complexLabel, 1}})); + data0.kvslice_from_size({{d_complexLabel, 1}}, {{d_complexLabel, 1}}) + .scale(-1) + .copyTo(r.data.kvslice_from_size({{d_complexLabel, 1}}, + {{d_complexLabel, 1}, {i_complexLabel, 1}})); + data0.kvslice_from_size({{d_complexLabel, 1}}, {{d_complexLabel, 1}}) + 
.copyTo(r.data.kvslice_from_size({{i_complexLabel, 1}}, + {{d_complexLabel, 1}, {i_complexLabel, 1}})); + // d) each element in kron xr+xi*i -> [xr -xi; xi xr] + if (kron) + { + auto kron0 = kron.toFakeReal(d_complexLabel); + kron0.kvslice_from_size({}, {{d_complexLabel, 1}}) + .copyTo(r.kron.kvslice_from_size({}, {{d_complexLabel, 1}, {i_complexLabel, 1}})); + kron0.kvslice_from_size({}, {{d_complexLabel, 1}}) + .copyTo(r.kron.kvslice_from_size({{d_complexLabel, 1}, {i_complexLabel, 1}}, + {{d_complexLabel, 1}, {i_complexLabel, 1}})); + kron0.kvslice_from_size({{d_complexLabel, 1}}, {{d_complexLabel, 1}}) + .scale(-1) + .copyTo(r.kron.kvslice_from_size({{d_complexLabel, 1}}, + {{d_complexLabel, 1}, {i_complexLabel, 1}})); + kron0.kvslice_from_size({{d_complexLabel, 1}}, {{d_complexLabel, 1}}) + .copyTo(r.kron.kvslice_from_size({{i_complexLabel, 1}}, + {{d_complexLabel, 1}, {i_complexLabel, 1}})); + } + + if (is_constructed()) + r.construct(); + + return r; + } + + template + typename std::enable_if::value || !detail::is_complex::value, + SpTensor>::type + toFakeReal() const + { + return *this; + } + + /// Extend the support of each dimension by the given amount in each direction + /// \param m: amount to extend the support for each process + + SpTensor extend_support(const std::map& m) const + { + std::map md, mi; + for (const auto& it : m) + { + if (detail::is_in(d.order, it.first)) + md[it.first] = it.second; + else if (detail::is_in(i.order, it.first)) + mi[it.first] = it.second; + else + throw std::runtime_error("extend_support: unmatched label"); + } + auto new_d = d.extend_support(md); + auto new_i = i.extend_support(mi); + + // Create the returning tensor + SpTensor r{new_d, + new_i, + nblockd, + nblocki, + nkrond, + nkroni, + (unsigned int)jj.kvdim().at('u'), + isImgFastInBlock}; + + // Populate the new tensor + ii.copyTo(r.ii); + jj.copyTo(r.jj); + data.copyTo(r.data); + if (kron) + kron.copyTo(r.kron); + + if (is_constructed()) + r.construct(); + + 
return r; + } + + /// Return whether the sparse tensor has been constructed + + bool is_constructed() const + { + return (bool)handle; + } + + /// Get where the tensor is stored + + DeviceHost getDev() const + { + return i.getDev(); + } + + // Contract the dimensions with the same label in this tensor and in `v` than do not appear on `w`. + template ::value || !std::is_same::value), bool>::type = true> + void contractWith(Tensor v, const remap& mv, const Tensor& w, + const remap& mw = {}, char power_label = 0) const + { + if (data.is_eg() || v.is_eg() || w.is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + if (!is_constructed()) + throw std::runtime_error("invalid operation on an not constructed tensor"); + + auto w0 = w.template cast_like(); + contractWith(std::move(v).template cast(), mv, w0); + w0.copyTo(w); + } + + template + void contractWith(Tensor v, const remap& mv, Tensor w, const remap& mw = {}, + char power_label = 0) const + { + if (data.is_eg() || v.is_eg() || w.is_eg()) + throw std::runtime_error("Invalid operation from an example tensor"); + + if (!is_constructed()) + throw std::runtime_error("invalid operation on an not constructed tensor"); + + // If either this tensor or v are on OnDevice, force both to be on the same device as this tensor. 
+ if (v.ctx().plat != data.ctx().plat) + { + v = v.cloneOn(getDev()); + } + + if (getDev() != w.getDev()) + { + Tensor aux = w.like_this(none, {}, getDev()); + contractWith(v, mv, aux, mw, power_label); + aux.copyTo(w); + return; + } + + // Check unsupported distributions for contraction + if ((v.dist == Local) != (w.dist == Local) || (v.dist == Glocal) != (w.dist == Glocal) || + (v.dist != Local && data.dist == Local)) + throw std::runtime_error("contractWith: One of the contracted tensors or the output " + "tensor is local and others are not!"); + + // We don't support conjugacy for now + if (v.conjugate || w.conjugate) + throw std::runtime_error("contractWith: unsupported implicit conjugacy"); + + // Check the power label + if (power_label != 0 && detail::is_in(v.order, power_label) && + v.kvdim().at(power_label) > 1) + throw std::runtime_error("contractWith: `power_label` for `v` does not have size one"); + if (power_label != 0 && !detail::is_in(w.order, power_label)) + throw std::runtime_error("contractWith: `power_label` isn't in `w`"); + + value_type* v_ptr = v.data(); + value_type* w_ptr = w.data_for_writing(); + std::string orderv = detail::update_order_and_check(v.order, mv); + std::string orderw = detail::update_order_and_check(w.order, mw); + superbblas::bsr_krylov( + scalar * v.scalar / w.scalar, handle.get(), i.order.c_str(), d.order.c_str(), // + v.p->p.data(), 1, orderv.c_str(), v.from, v.size, v.dim, (const value_type**)&v_ptr, // + T{0}, w.p->p.data(), orderw.c_str(), w.from, w.size, w.dim, power_label, + (value_type**)&w_ptr, // + &data.ctx(), v.dist == Local ? 
MPI_COMM_SELF : MPI_COMM_WORLD, superbblas::FastToSlow, + nullptr, v.dist == Glocal); + + // Force synchronization in superbblas stream if the destination allocation isn't managed by superbblas + if (!w.is_managed()) + superbblas::sync(data.ctx()); + } + + void print(const std::string& name) const + { + std::stringstream ss; + + auto ii_host = ii.make_sure(none, OnHost, OnMaster).getLocal(); + auto jj_host = jj.make_sure(none, OnHost, OnMaster).getLocal(); + auto data_host = data.make_sure(none, OnHost, OnMaster).getLocal(); + auto kron_host = kron.make_sure(none, OnHost, OnMaster).getLocal(); + assert(!ii_host.isSubtensor() && !jj_host.isSubtensor() && !data_host.isSubtensor() && + !kron_host.isSubtensor()); + + std::size_t volblki = superbblas::detail::volume(blki); + std::size_t volblkj = superbblas::detail::volume(blkd); + std::size_t volkroni = superbblas::detail::volume(kroni); + std::size_t volkronj = superbblas::detail::volume(krond); + std::size_t volbi = volblki * volkroni; + std::size_t volbj = volblkj * volkronj; + + auto ii_host_ptr = ii_host.data(); + auto jj_host_ptr = jj_host.data(); + auto data_host_ptr = data_host.data(); + auto kron_host_ptr = kron_host.data(); + + // Print general tensor description in a matlab comment + if (ii_host) + ss << "% " << repr() << std::endl; + + // If using the Kronecker format, reconstruct the nonzeros explicitly + if (kron_host) + { + // Print the Kronecker tensor (the spin-spin tensors) + int num_neighbors = data.kvdim().at('u'); + ss << name << "_kron=reshape(["; + for (std::size_t i = 0, kron_vol = kron_host.volume(); i < kron_vol; ++i) + detail::repr::operator<<(ss << " ", kron_host_ptr[i]); + ss << "], [" << (isImgFastInBlock ? volkroni : volkronj) << " " + << (isImgFastInBlock ? 
volkronj : volkroni) << " " << num_neighbors << "]);" + << std::endl; + + // Print the data (the color-color tensors) + std::size_t numblks = data.volume() / volblki / volblkj; + ss << name << "_data0=reshape(["; + for (std::size_t i = 0, data_vol = data_host.volume(); i < data_vol; ++i) + detail::repr::operator<<(ss << " ", data_host_ptr[i]); + ss << "], [" << (isImgFastInBlock ? volblki : volblkj) << " " + << (isImgFastInBlock ? volblkj : volblki) << " " << numblks << "]);" << std::endl; + + // Preallocate and populate data in explicit format (no Kronecker format) + ss << name << "_data=zeros([" << (isImgFastInBlock ? volbi : volbj) << " " + << (isImgFastInBlock ? volbj : volbi) << " " << numblks << "]);" << std::endl; + ss << "for i=1:" << numblks << std::endl; + ss << " " << name << "_data(:,:,i)=kron(squeeze(" << name << "_kron(:,:,mod(i-1," + << num_neighbors << ")+1)), squeeze(" << name << "_data0(:,:,i)));" << std::endl; + ss << "end" << std::endl; + } + + // Only master node prints + if (ii_host) + { + // Print for non-Kronecker variant + ss << name << "=sparse(["; + + // Print the row indices + for (std::size_t i = 0, iivol = ii_host.volume(); i < iivol; ++i) + { + for (unsigned int neighbor = 0, num_neighbors = ii_host_ptr[i]; + neighbor < num_neighbors; ++neighbor) + { + if (isImgFastInBlock) + { + for (unsigned int bj = 0; bj < volbj; ++bj) + for (unsigned int bi = 0; bi < volbi; ++bi) + ss << " " << i * volbi + bi + 1; + } + else + { + for (unsigned int bi = 0; bi < volbi; ++bi) + for (unsigned int bj = 0; bj < volbj; ++bj) + ss << " " << i * volbi + bi + 1; + } + } + } + ss << "], ["; + + // Print the column indices + Stride dstrides = + superbblas::detail::get_strides(d.size, superbblas::FastToSlow); + for (std::size_t j = 0, jjvol = jj_host.volume(); j < jjvol; j += ND) + { + Coor j_coor; + std::copy_n(jj_host_ptr + j, ND, j_coor.begin()); + auto j_idx = superbblas::detail::coor2index(j_coor, d.size, dstrides); + if (isImgFastInBlock) + { + for 
(unsigned int bj = 0; bj < volbj; ++bj) + for (unsigned int bi = 0; bi < volbi; ++bi) + ss << " " << j_idx + bj + 1; + } + else + { + for (unsigned int bi = 0; bi < volbi; ++bi) + for (unsigned int bj = 0; bj < volbj; ++bj) + ss << " " << j_idx + bj + 1; + } + } + ss << "], "; + + if (!kron) + { + ss << "["; + + // Print the data values + for (std::size_t i = 0, data_vol = data_host.volume(); i < data_vol; ++i) + detail::repr::operator<<(ss << " ", data_host_ptr[i]); + ss << "]"; + } + else + { + ss << name << "_data(:)"; + } + ss << ");" << std::endl; + } + + detail::log(1, ss.str()); + } + + /// Return a copy of the tensor in a different precision + /// + /// \tparam Q: new precision + + template ::value, bool>::type = true> + SpTensor cast() const + { + return *this; + } + + template ::value, bool>::type = true> + SpTensor cast() const + { + SpTensor r{d.template cast(), + i.template cast(), + blkd, + blki, + krond, + kroni, + ii, + jj, + data.template cast(), + kron.template cast(), + (Q)scalar, + isImgFastInBlock, + nblockd, + nblocki, + nkrond, + nkroni}; + if (is_constructed()) + r.construct(); + return r; + } + }; + + /// Return a sparse identity matrix with the same dimensions as the given one + /// \param sp: sparse matrix given + /// \param m: map from rows (image) to columns (domain) + + template + SpTensor getSparseIdentity(const SpTensor& sp, const remap& m) + { + SpTensor r(sp.d, sp.i, sp.nblockd, sp.nblocki, sp.nkrond, sp.nkroni, 1, + sp.isImgFastInBlock); + r.ii.set(1); + int tilde_pos = std::find(r.jj.order.begin(), r.jj.order.end(), '~') - r.jj.order.begin(); + int u_pos = std::find(r.jj.order.begin(), r.jj.order.end(), 'u') - r.jj.order.begin(); + if (tilde_pos != 0 || u_pos != 1) + throw std::runtime_error("getSparseIdentity: unsupported ordering"); + std::array perm; + const auto rev_m = detail::reverse(m); + for (unsigned int i = 0; i < ND; ++i) + perm[i] = + std::find(r.i.order.begin(), r.i.order.end(), rev_m.at(r.d.order[i])) - 
r.i.order.begin(); + r.jj.fillCpuFunCoor([&](const Coor& c) { return c[perm[c[tilde_pos]] + 2]; }); + + // Remove nonblocking dimensions from m + auto m_nonblk = m; + for (int i = sp.nblocki + sp.nkroni; i < NI; ++i) + m_nonblk.erase(sp.i.order[i]); + + identity(r.data.kvdim(), m_nonblk).copyTo(r.data); + if (r.kron) + identity(r.kron.kvdim(), m_nonblk).copyTo(r.kron); + + r.construct(); + return r; + } + + template + struct StorageTensor { + static_assert(superbblas::supported_type::value, "Not supported type"); + + public: + std::string filename; ///< Storage file + std::string metadata; ///< metadata + std::string order; ///< Labels of the tensor dimensions + Coor dim; ///< Length of the tensor dimensions + Sparsity sparsity; ///< Sparsity of the storage + std::shared_ptr + ctx; ///< Superbblas storage handler + Coor from; ///< First active coordinate in the tensor + Coor size; ///< Number of active coordinates on each dimension + T scalar; ///< Scalar factor of the tensor + LocalSharedFile filesystem_type; ///< whether the file is in a local/share filesystem + + // Empty constructor + StorageTensor() + : filename{}, + metadata{}, + order(detail::getTrivialOrder(N)), + dim{{}}, + sparsity(Dense), + ctx{}, + from{{}}, + size{{}}, + scalar{0}, + filesystem_type(LocalFSFile) + { + } + + // Create storage construct + StorageTensor(const std::string& filename, const std::string& metadata, + const std::string& order, Coor dim, Sparsity sparsity = Dense, + checksum_type checksum = checksum_type::NoChecksum, + LocalSharedFile filesystem_type = SharedFSFile) + : filename(filename), + metadata(metadata), + order(order), + dim(dim), + sparsity(sparsity), + from{{}}, + size{dim}, + scalar{1}, + filesystem_type(filesystem_type) + { + checkOrder(); + std::string use_filename = + filename + (filesystem_type == SharedFSFile + ? std::string() + : std::string(".part_") + std::to_string(Layout::nodeNumber())); + MPI_Comm comm = filesystem_type == SharedFSFile ? 
MPI_COMM_WORLD : MPI_COMM_SELF; + superbblas::Storage_handle stoh; + superbblas::create_storage( + dim, superbblas::FastToSlow, use_filename.c_str(), metadata.c_str(), metadata.size(), + checksum, comm, &stoh); + ctx = std::shared_ptr( + stoh, [=](superbblas::detail::Storage_context_abstract* ptr) { + superbblas::close_storage(ptr, comm); + }); + + // If the tensor to store is dense, create the block here; otherwise, create the block on copy + if (sparsity == Dense) + { + superbblas::PartitionItem p{Coor{{}}, dim}; + superbblas::append_blocks(&p, 1, dim, stoh, comm, superbblas::FastToSlow); + } + } + + // Open storage construct + StorageTensor(const std::string& filename, bool read_order = true, + const Maybe& order_tag = none) + : filename(filename), sparsity(Sparse), from{{}}, scalar{1}, filesystem_type(SharedFSFile) + { + // Read information from the storage superbblas::values_datatype values_dtype; std::vector metadatav; std::vector dimv; superbblas::read_storage_header(filename.c_str(), superbblas::FastToSlow, values_dtype, metadatav, dimv, MPI_COMM_WORLD); - // Check that storage tensor dimension and value type match template arguments - if (dimv.size() != N) + // Check that storage tensor dimension and value type match template arguments + if (dimv.size() != N) + throw std::runtime_error( + "The storage tensor dimension does not match the template parameter N"); + if (superbblas::detail::get_values_datatype() != values_dtype) + throw std::runtime_error("Storage type does not match template argument T"); + + // Fill out the information of this class with storage header information + std::copy(dimv.begin(), dimv.end(), dim.begin()); + size = dim; + metadata = std::string(metadatav.begin(), metadatav.end()); + + // Read the order + if (read_order) + { + std::istringstream is(metadata); + XMLReader xml_buf(is); + read(xml_buf, order_tag.getSome("order"), order); + checkOrder(); + } + + superbblas::Storage_handle stoh; + superbblas::open_storage(filename.c_str(), 
false /* don't allow writing */, + MPI_COMM_WORLD, &stoh); + ctx = std::shared_ptr( + stoh, [=](superbblas::detail::Storage_context_abstract* ptr) { + superbblas::close_storage(ptr, MPI_COMM_WORLD); + }); + } + + protected: + // Construct a slice/scale storage + StorageTensor(const StorageTensor& t, const std::string& order, Coor from, Coor size, + T scalar) + : filename(t.filename), + metadata(t.metadata), + order(order), + dim(t.dim), + ctx(t.ctx), + sparsity(t.sparsity), + from(normalize_coor(from, t.dim)), + size(size), + scalar{t.scalar}, + filesystem_type(t.filesystem_type) + { + checkOrder(); + } + + public: + /// Return whether the tensor is not empty + explicit operator bool() const noexcept + { + return superbblas::detail::volume(size) > 0; + } + + // Return the dimensions of the tensor + std::map kvdim() const + { + std::map d; + for (unsigned int i = 0; i < N; ++i) + d[order[i]] = size[i]; + return d; + } + + /// Rename dimensions + StorageTensor rename_dims(const SB::remap& m) const + { + return StorageTensor(*this, detail::update_order_and_check(order, m), this->from, + this->size); + } + + // Return a slice of the tensor starting at coordinate `kvfrom` and taking `kvsize` elements in each direction. + // The missing dimension in `kvfrom` are set to zero and the missing direction in `kvsize` are set to the active size of the tensor. + StorageTensor kvslice_from_size(const std::map& kvfrom = {}, + const std::map& kvsize = {}) const + { + std::map updated_kvsize = this->kvdim(); + for (const auto& it : kvsize) + updated_kvsize[it.first] = it.second; + return slice_from_size(kvcoors(order, kvfrom), kvcoors(order, updated_kvsize)); + } + + // Return a slice of the tensor starting at coordinate `from` and taking `size` elements in each direction. 
+ StorageTensor slice_from_size(Coor from, Coor size) const + { + for (unsigned int i = 0; i < N; ++i) + { + if (size[i] > this->size[i]) + throw std::runtime_error( + "The size of the slice cannot be larger than the original tensor"); + if (normalize_coor(from[i], this->size[i]) + size[i] > this->size[i] && + this->size[i] != this->dim[i]) + throw std::runtime_error( + "Unsupported to make a view on a non-contiguous range on the tensor"); + } + + using superbblas::detail::operator+; + return StorageTensor(*this, order, this->from + from, size, scalar); + } + + StorageTensor scale(T s) const + { + return StorageTensor(*this, order, from, scalar * s); + } + + void release() + { + dim = {{}}; + ctx.reset(); + from = {{}}; + size = {{}}; + scalar = T{0}; + filename = ""; + metadata = ""; + filesystem_type = LocalFSFile; + } + + /// Check that the dimension labels are valid + + void checkOrder() const + { + // Check that all labels are different there are N + detail::check_order(order); + + for (auto s : size) + if (s < 0) + std::runtime_error("Invalid tensor size: it should be positive"); + } + + /// Preallocate space for the storage file + /// \param size: expected final file size in bytes + + void preallocate(std::size_t size) + { + superbblas::preallocate_storage(ctx.get(), size); + } + + /// Save content from the storage into the given tensor + template ::value == detail::is_complex::value, bool>::type = true> + void copyFrom(const Tensor& w) const + { + Coor wsize = kvcoors(order, w.kvdim(), 1, NoThrow); + for (unsigned int i = 0; i < N; ++i) + if (wsize[i] > size[i]) + throw std::runtime_error("The destination tensor is smaller than the source tensor"); + + if (detail::is_distribution_local(w.dist) && filesystem_type != LocalFSFile) + throw std::runtime_error("A local tensor cannot be stored on a global tensor storage"); + + MPI_Comm comm = filesystem_type == SharedFSFile ? 
MPI_COMM_WORLD : MPI_COMM_SELF; + auto w0 = w; + auto w0_p = w0.p->p.data(); + std::size_t w0_p_size = w0.p->p.size(); + if (filesystem_type == LocalFSFile && w.dist != Local) + { + w0 = w.getGlocal(); + w0_p = w0.p->p.data() + Layout::nodeNumber(); + w0_p_size = 1; + } + + // If the storage is sparse, add blocks for the new content + if (sparsity == Sparse) + { + superbblas::append_blocks(w0_p, w0_p_size, w0.order.c_str(), w0.from, + w0.size, w0.dim, order.c_str(), from, ctx.get(), comm, + superbblas::FastToSlow); + } + + Tw* w_ptr = w0.data(); + superbblas::save(detail::safe_div(w.scalar, scalar), w0_p, 1, + w0.order.c_str(), w0.from, w0.size, w0.dim, (const Tw**)&w_ptr, + &w0.ctx(), order.c_str(), from, ctx.get(), comm, + superbblas::FastToSlow); + } + + /// Load content from the storage into the given tensor + template ::value == detail::is_complex::value, bool>::type = true> + void copyTo(const Tensor& w) const + { + Coor wsize = kvcoors(order, w.kvdim(), 1, NoThrow); + for (unsigned int i = 0; i < N; ++i) + if (size[i] > wsize[i]) + throw std::runtime_error("The destination tensor is smaller than the source tensor"); + + if (filesystem_type == LocalFSFile && !detail::is_distribution_local(w.dist)) + throw std::runtime_error("Unsupported a collective tensor from reading from a local file"); + + MPI_Comm comm = filesystem_type == SharedFSFile ? 
MPI_COMM_WORLD : MPI_COMM_SELF; + auto w0 = w; + auto w0_p = w0.p->p.data(); + std::size_t w0_p_size = w0.p->p.size(); + if (filesystem_type == LocalFSFile && w.dist != Local) + { + w0 = w.getGlocal(); + w0_p = w0.p->p.data() + Layout::nodeNumber(); + w0_p_size = 1; + } + Tw* w_ptr = w0.data_for_writing(); + superbblas::load(detail::safe_div(scalar, w.scalar), ctx.get(), + order.c_str(), from, size, w0_p, 1, w0.order.c_str(), + w0.from, w0.dim, &w_ptr, &w0.ctx(), comm, + superbblas::FastToSlow, superbblas::Copy); + if (!w0.is_managed()) + superbblas::sync(w0.ctx()); + } + }; + + /// Return a tensor filled with the value of the function applied to each element + /// \param order: dimension labels, they should start with "xyztX" + /// \param from: coordinates of the first element + /// \param size: length of each tensor dimension + /// \param dim: length of each global dimension (which is usually equal to size) + /// \param dev: either OnHost or OnDefaultDevice + /// \param func: function (Coor) -> COMPLEX + /// \param zero_is_even: (optional) whether the first element (`from`) is an even site (usually true) + + template + Tensor fillLatticeField(const std::string& order, const std::map& from, + const std::map& size, + const std::map& dim, DeviceHost dev, Func func, + bool zero_is_even = true, + const Distribution& dist = OnEveryone) + { + using superbblas::detail::operator+; + + static_assert(N >= 5, "The minimum number of dimensions should be 5"); + if (order.size() < 5 || order.compare(0, 5, "xyztX") != 0) + throw std::runtime_error("Wrong `order`, it should start with xyztX"); + + // Get final object dimension + Coor dim_c = latticeSize(order, dim); + std::map size0 = dim; + for (const auto& it : size) + size0[it.first] = it.second; + Coor size_c = latticeSize(order, size0); + Coor from_c = kvcoors(order, from); + + // Populate the tensor on CPU + Tensor r(order, size_c, OnHost, dist); + Coor local_latt_size = r.p->localSize(); // local dimensions for xyztX + 
Stride stride = + superbblas::detail::get_strides(local_latt_size, superbblas::FastToSlow); + Coor local_latt_from = + r.p->localFrom(); // coordinates of first elements stored locally for xyztX + local_latt_from = local_latt_from + from_c; + std::size_t vol = superbblas::detail::volume(local_latt_size); + Index nX = r.kvdim()['X']; + COMPLEX* ptr = r.data(); + int d = (zero_is_even ? 0 : 1); + +# ifdef _OPENMP +# pragma omp parallel for schedule(static) +# endif + for (std::size_t i = 0; i < vol; ++i) + { + // Get the global coordinates + Coor c = normalize_coor( + superbblas::detail::index2coor(i, local_latt_size, stride) + local_latt_from, dim_c); + + // Translate even-odd coordinates to natural coordinates + Coor coor; + coor[0] = c[0] * nX + (c[1] + c[2] + c[3] + c[4] + d) % nX; // x + coor[1] = c[1]; // y + coor[2] = c[2]; // z + coor[3] = c[3]; // t + std::copy_n(c.begin() + 5, N - 5, coor.begin() + 4); + + // Call the function + ptr[i] = func(coor); + } + + return r.make_sure(none, dev); + } + + /// + /// Operators + /// + + /// Ordering of matrices + + enum ColOrdering { + RowMajor, ///< row-major ordering, the fastest index is the column + ColumnMajor, ///< row-major ordering, the fastest index is the row + }; + + /// Operator's layout + enum OperatorLayout { + NaturalLayout, ///< natural ordering + XEvenOddLayout, ///< X:(x+y+z+t)%2, x:x/2, y:y, z:z, t:t + XEvenOddLayoutZeroOdd, ///< X:(x+y+z+t+1)%2, x:x/2, y:y, z:z, t:t + EvensOnlyLayout ///< x:x/2, y:y, z:z, t:t for all (x+y+z+t)%2==0 + }; + + /// Return whether the layout is XEvenOddLayout or XEvenOddLayoutZeroOdd + /// \param layout: layout to test + + namespace detail + { + inline bool isEvenOddLayout(OperatorLayout layout) + { + return layout == XEvenOddLayout || layout == XEvenOddLayoutZeroOdd; + } + } + + /// Representation of an operator, function of type tensor -> tensor where the input and the + /// output tensors have the same dimensions + + template + using OperatorFun = + std::function&, 
Tensor)>; + + /// Representation of an operator, function of type tensor -> tensor where the output tensor + /// has powers of the operator applied to the input tensor + + template + using OperatorPowerFun = + std::function&, Tensor, char)>; + + /// Representation of an eigensolver, function of type Operator, int -> {std::vector, tensor} + /// where the output tensor + + template + using EigensolverFun = + std::function, Tensor>(int, double)>; + + /// Displacements of each site nonzero edge for every operator's site + + using NaturalNeighbors = std::vector>; + + /// Matrix to contract on each direction + + template + using SpinMatrixDir = std::map, Tensor<2, T>>; + + /// Special value to indicate that the operator is dense + + inline const NaturalNeighbors& DenseOperator() + { + static const NaturalNeighbors dense{{{(char)0, 0}}}; + return dense; + } + + /// Representation of a function that takes and returns tensors with the same labels, although the + /// dimensions may be different. + + template + struct Operator { + /// Function that the operators applies (optional) + OperatorFun fop; + /// Example tensor for the input tensor (domain) + Tensor d; + /// Example tensor for the output tensor (image) + Tensor i; + /// Function to apply when conjugate transposed (optional) + OperatorFun fop_tconj; + /// Labels that distinguish different operator instantiations + std::string order_t; + /// Operator's domain space layout + OperatorLayout domLayout; + /// Operator's image space layout + OperatorLayout imgLayout; + /// Neighbors for each site in this operator + NaturalNeighbors neighbors; + /// Preferred ordering + ColOrdering preferred_col_ordering; + /// Operator based on sparse tensor (optional) + SpTensor sp; + /// Sparse tensor map from image labels to domain labels (optional) + remap rd; + /// Operator maximum power support (optional) + unsigned int max_power; + /// Whether the spin-color block nonzeros are the tensor product of spin-spin and color-color matrices 
+ bool kron; + /// Identity + std::shared_ptr id; + + /// Empty constructor + Operator() + { + } + + /// Constructor + Operator(const OperatorFun& fop, Tensor d, Tensor i, + const OperatorFun& fop_tconj, const std::string& order_t, + OperatorLayout domLayout, OperatorLayout imgLayout, NaturalNeighbors neighbors, + ColOrdering preferred_col_ordering, bool kron, const std::string& id = "") + : fop(fop), + d(d), + i(i), + fop_tconj(fop_tconj), + order_t(order_t), + domLayout(domLayout), + imgLayout(imgLayout), + neighbors(neighbors), + preferred_col_ordering(preferred_col_ordering), + sp{}, + rd{}, + max_power{0}, + kron(kron), + id(std::make_shared(id)) + { + } + + /// Constructor for a power-supported function + Operator(const SpTensor& sp, const remap& rd, unsigned int max_power, + Tensor d, Tensor i, const std::string& order_t, + OperatorLayout domLayout, OperatorLayout imgLayout, NaturalNeighbors neighbors, + ColOrdering preferred_col_ordering, const std::string& id = "") + : fop{}, + d(d), + i(i), + fop_tconj{}, + order_t(order_t), + domLayout(domLayout), + imgLayout(imgLayout), + neighbors(neighbors), + preferred_col_ordering(preferred_col_ordering), + sp{sp}, + rd{rd}, + max_power{max_power}, + kron(sp.is_kronecker()), + id(std::make_shared(id)) + { + } + + /// Constructor from other operator + Operator(const OperatorFun& fop, Tensor d, Tensor i, + const OperatorFun& fop_tconj, const Operator& op, + const std::string& id = "") + : fop(fop), + d(d), + i(i), + fop_tconj(fop_tconj), + order_t(op.order_t), + domLayout(op.domLayout), + imgLayout(op.imgLayout), + neighbors(op.neighbors), + preferred_col_ordering(op.preferred_col_ordering), + sp{}, + rd{}, + max_power{0}, + kron(op.is_kronecker()), + id(std::make_shared(id)) + { + } + + /// Return the local support of this tensor as a subset of the global tensor + Operator getGlocal() const + { + return fop ? 
Operator{fop, d.getGlocal(), i.getGlocal(), fop_tconj, *this} + : Operator{ + sp, rd, max_power, d.getGlocal(), i.getGlocal(), + order_t, domLayout, imgLayout, neighbors, preferred_col_ordering}; + } + + /// Return whether the operator is not empty + explicit operator bool() const noexcept + { + return (bool)d; + } + + /// Return the transpose conjugate of the operator + Operator tconj() const + { + if (sp || !fop_tconj) + throw std::runtime_error("Operator does not have conjugate transpose form"); + return { + fop_tconj, i, d, fop, order_t, imgLayout, domLayout, neighbors, preferred_col_ordering, + kron}; + } + + /// Return whether the operator has transpose conjugate + bool has_tconj() const + { + return !sp && fop_tconj; + } + + /// Return whether the spin-color nonzero blocks are the tensor product of two matrices + bool is_kronecker() const + { + return sp ? sp.is_kronecker() : kron; + } + + /// Return compatible domain tensors + /// \param col_order: order for the columns + /// \param m: column dimension size + + template + Tensor make_compatible_dom(const std::string& col_order, + const std::map& m) const + { + return d.template make_compatible( + preferred_col_ordering == ColumnMajor ? std::string("%") + col_order : col_order + "%", + '%', "", m); + } + + /// Return compatible image tensors + /// \param col_order: order for the columns + /// \param m: column dimension size + + template + Tensor make_compatible_img(const std::string& col_order, + const std::map& m) const + { + return i.template make_compatible( + preferred_col_ordering == ColumnMajor ? 
std::string("%") + col_order : col_order + "%", + '%', "", m); + } + + /// Apply the operator + template + Tensor operator()(const Tensor& t) const + { + // The `t` labels that are not in `d` are the column labels + std::string cols = detail::remove_dimensions(t.order, d.order); // t.order - d.order + + if (sp) + { + remap mcols = detail::getNewLabels(cols, sp.d.order + sp.i.order); + auto y = make_compatible_img(cols, t.kvdim()); + if (t.dist == Glocal) + y = y.getGlocal(); + sp.contractWith(t.rename_dims(mcols), rd, y.rename_dims(mcols), {}); + return y; + } + else + { + auto x = + t.template collapse_dimensions(cols, 'n', true).template make_sure(); + auto y = make_compatible_img("n", {{'n', x.kvdim()['n']}}); + if (t.dist == Glocal) + y = y.getGlocal(); + fop(x, y); + return y.template split_dimension('n', cols, t.kvdim()).template make_sure(); + } + } + + /// Apply the operator + template + void operator()(const Tensor& x, Tensor y, char power_label = 0) const + { + // The `x` labels that are not in `d` are the column labels + std::string cols_and_power = + detail::remove_dimensions(x.order, d.order); // x.order - d.order + std::string cols = + power_label == 0 ? cols_and_power + : detail::remove_dimensions(cols_and_power, std::string(1, power_label)); + int power = power_label == 0 ? 1 : y.kvdim().at(power_label); + if (power <= 0) + return; + + if (sp) + { + remap mcols = detail::getNewLabels(cols_and_power, sp.d.order + sp.i.order); + if (power == 1) + { + sp.contractWith(x.rename_dims(mcols), rd, y.rename_dims(mcols), {}); + } + else + { + char power_label0 = mcols.count(power_label) == 1 ? mcols.at(power_label) : power_label; + auto x0 = x.rename_dims(mcols); + auto y0 = y.rename_dims(mcols); + int power0 = (max_power == 0 ? 
power : std::min(power, (int)max_power)); + sp.contractWith(x0, rd, y0.kvslice_from_size({}, {{power_label0, power0}}), {}, + power_label0); + for (int i = power0, ni = std::min((int)max_power, power - i); i < power; + i += ni, ni = std::min((int)max_power, power - i)) + { + sp.contractWith(y0.kvslice_from_size({{power_label0, i - 1}}, {{power_label0, 1}}), + rd, y0.kvslice_from_size({{power_label0, i}}, {{power_label0, ni}}), + {}, power_label0); + } + } + } + else + { + if (power_label == 0) + { + auto x0 = x.template collapse_dimensions(cols_and_power, 'n', true) + .template cast(); + auto y0 = y.template collapse_dimensions(cols_and_power, 'n', true) + .template cast_like(); + fop(x0, y0); + y0.copyTo(y); + } + else if (power > 0) + { + operator()(x.kvslice_from_size({}, {{power_label, 1}}), + y.kvslice_from_size({}, {{power_label, 1}})); + for (int i = 1, p = power; i < p; ++i) + operator()(y.kvslice_from_size({{power_label, i - 1}}, {{power_label, 1}}), + y.kvslice_from_size({{power_label, i}}, {{power_label, 1}})); + } + } + } + + /// Return this operator with an implicit different type + /// \tparam T: new implicit precision + + template ::value, bool>::type = true> + Operator cast() const + { + return *this; + } + + template ::value, bool>::type = true> + Operator cast() const + { + if (!*this) + return {}; + + if (sp) + { + return Operator(sp.template cast(), rd, max_power, d.template cast(), + i.template cast(), order_t, domLayout, imgLayout, neighbors, + preferred_col_ordering); + } + else + { + const Operator op = *this, + op_tconj = has_tconj() ? tconj() : Operator{}; + return Operator( + [=](const Tensor& x, Tensor y) { op(x, y); }, + d.template cast(), i.template cast(), + op_tconj ? 
[=](const Tensor& x, Tensor y) { op_tconj(x, y); } + : OperatorFun{}, + order_t, domLayout, imgLayout, neighbors, preferred_col_ordering, is_kronecker()); + } + } + + /// Return a slice of the tensor starting at coordinate `dom_kvfrom`, `img_kvfrom` and taking + /// `dom_kvsize`, `img_kvsize` elements in each direction. The missing dimensions in `*_kvfrom` + /// are set to zero and the missing directions in `*_kvsize` are set to the size of the tensor. + /// + /// \param dom_kvfrom: dictionary with the index of the first element in each domain direction + /// \param dom_kvsize: dictionary with the number of elements in each domain direction + /// \param img_kvfrom: dictionary with the index of the first element in each domain direction + /// \param img_kvsize: dictionary with the number of elements in each domain direction + /// \return: a copy of the tensor or an implicit operator + + Operator kvslice_from_size(const std::map& dom_kvfrom = {}, + const std::map& dom_kvsize = {}, + const std::map& img_kvfrom = {}, + const std::map& img_kvsize = {}) const + { + if (!*this) + return {}; + + // Update the eg layouts and the data layouts + auto new_d = d.kvslice_from_size(dom_kvfrom, dom_kvsize); + auto new_i = i.kvslice_from_size(img_kvfrom, img_kvsize); + OperatorLayout new_domLayout = + (new_d.kvdim().at('X') != d.kvdim().at('X') && detail::isEvenOddLayout(domLayout) + ? EvensOnlyLayout + : domLayout); + OperatorLayout new_imgLayout = + (new_i.kvdim().at('X') != i.kvdim().at('X') && detail::isEvenOddLayout(imgLayout) + ? EvensOnlyLayout + : imgLayout); + if (sp) + { + return Operator(sp.kvslice_from_size(detail::update_kvcoor(dom_kvfrom, rd), + detail::update_kvcoor(dom_kvsize, rd), + img_kvfrom, img_kvsize), + rd, max_power, new_d, new_i, order_t, new_domLayout, + new_imgLayout, neighbors, preferred_col_ordering); + } + else + { + const Operator op = *this, + op_tconj = has_tconj() ? 
tconj() : Operator{}; + return Operator( + [=](const Tensor& x, Tensor y) { + auto x0 = op.d.template like_this( + op.preferred_col_ordering == ColumnMajor ? "%n" : "n%", '%', "", + {{'n', x.kvdim().at('n')}}); + x0.set_zero(); + x.copyTo(x0.kvslice_from_size(dom_kvfrom, dom_kvsize)); + auto y0 = op.i.template like_this( + op.preferred_col_ordering == ColumnMajor ? "%n" : "n%", '%', "", + {{'n', x.kvdim().at('n')}}); + op(x0, y0); + y0.kvslice_from_size(img_kvfrom, img_kvsize).copyTo(y); + }, + new_d, new_i, + op_tconj ? [=](const Tensor& x, Tensor y) { + auto x0 = op_tconj.d.template like_this( + op_tconj.preferred_col_ordering == ColumnMajor ? "%n" : "n%", '%', "", + {{'n', x.kvdim().at('n')}}); + x0.set_zero(); + x.copyTo(x0.kvslice_from_size(img_kvfrom, img_kvsize)); + auto y0 = op_tconj.i.template like_this( + op_tconj.preferred_col_ordering == ColumnMajor ? "%n" : "n%", '%', "", + {{'n', x.kvdim().at('n')}}); + op_tconj(x0, y0); + y0.kvslice_from_size(dom_kvfrom, dom_kvsize).copyTo(y); + } : OperatorFun{}, + order_t, new_domLayout, new_imgLayout, neighbors, preferred_col_ordering, is_kronecker()); + } + } + + /// Return a slice of the tensor starting at coordinate `dom_kvfrom`, `img_kvfrom` and taking + /// `dom_kvsize`, `img_kvsize` elements in each direction. The missing dimensions in `*_kvfrom` + /// are set to zero and the missing directions in `*_kvsize` are set to the size of the tensor. + /// + /// \param f: f(dom_coor, img_coor) returns whether the nonzero block starting at the given + /// coordinates will be on the returning matrix. 
+ /// \param dom_kvfrom: dictionary with the index of the first element in each domain direction + /// \param dom_kvsize: dictionary with the number of elements in each domain direction + /// \param img_kvfrom: dictionary with the index of the first element in each domain direction + /// \param img_kvsize: dictionary with the number of elements in each domain direction + /// \return: a copy of the tensor or an implicit operator + + template + Operator kvslice_from_size(const F& f, + const std::map& dom_kvfrom = {}, + const std::map& dom_kvsize = {}, + const std::map& img_kvfrom = {}, + const std::map& img_kvsize = {}) const + { + if (!*this) + return {}; + + if (!sp) throw std::runtime_error( - "The storage tensor dimension does not match the template parameter N"); - if (superbblas::detail::get_values_datatype() != values_dtype) - throw std::runtime_error("Storage type does not match template argument T"); + "Operator::kvslice_from_size: unsupported on implicit operators"); + + // Update the eg layouts and the data layouts + auto new_d = d.kvslice_from_size(dom_kvfrom, dom_kvsize); + auto new_i = i.kvslice_from_size(img_kvfrom, img_kvsize); + OperatorLayout new_domLayout = + (new_d.kvdim().at('X') != d.kvdim().at('X') && detail::isEvenOddLayout(domLayout) + ? EvensOnlyLayout + : domLayout); + OperatorLayout new_imgLayout = + (new_i.kvdim().at('X') != i.kvdim().at('X') && detail::isEvenOddLayout(imgLayout) + ? 
EvensOnlyLayout + : imgLayout); + return Operator(sp.kvslice_from_size(f, detail::update_kvcoor(dom_kvfrom, rd), + detail::update_kvcoor(dom_kvsize, rd), + img_kvfrom, img_kvsize), + rd, max_power, new_d, new_i, order_t, new_domLayout, + new_imgLayout, neighbors, preferred_col_ordering); + } + + /// Return an identity operator with the same dimensions as this operator + + Operator get_identiy() const + { + NaturalNeighbors self(1, std::map{}); + if (sp) + { + return Operator{getSparseIdentity(sp, rd), + rd, + 0 /* = max_power, local operator */, + d, + i, + order_t, + domLayout, + imgLayout, + self, + preferred_col_ordering}; + } + else + { + return Operator{ + [&](const Tensor& x, Tensor y) { x.copyTo(y); }, + d, + i, + [&](const Tensor& x, Tensor y) { x.copyTo(y); }, + order_t, + domLayout, + imgLayout, + self, + preferred_col_ordering, + kron}; + } + } + }; + + namespace detail + { + enum BlockingAsSparseDimensions { + ConsiderBlockingSparse, ///< Dimensions 0,1,2,3 will be sparse and part of the lattice + ConsiderBlockingDense ///< Dimensions 0,1,2,3 will be dense and not lattice dimensions + }; + + /// Return the natural lattice dimensions + /// \param dim: dimension for each label + /// \param layout: operator's layout + + inline std::map + getNatLatticeDims(const std::map& dim, OperatorLayout layout, + BlockingAsSparseDimensions blockDims = ConsiderBlockingSparse) + { + int nX = (layout == EvensOnlyLayout ? 2 : (dim.count('X') == 1 ? dim.at('X') : 1)); + if (blockDims == ConsiderBlockingSparse) + { + return std::map{ + {'x', dim.at('x') * (dim.count('0') == 1 ? dim.at('0') : 1) * nX}, + {'y', dim.at('y') * (dim.count('1') == 1 ? dim.at('1') : 1)}, + {'z', dim.at('z') * (dim.count('2') == 1 ? dim.at('2') : 1)}, + {'t', dim.at('t') * (dim.count('3') == 1 ? 
dim.at('3') : 1)}}; + } + else + { + return std::map{ + {'x', dim.at('x') * nX}, {'y', dim.at('y')}, {'z', dim.at('z')}, {'t', dim.at('t')}}; + } + } - // Fill out the information of this class with storage header information - std::copy(dimv.begin(), dimv.end(), dim.begin()); - size = dim; - metadata = std::string(metadatav.begin(), metadatav.end()); + /// Return the neighbors as displacements from origin in natural coordinates + /// \param blocking: blocking for each natural direction + /// \param dim: operator dimensions + /// \param neighbors: operator's neighbors in natural coordinates + /// \param layout: operator's layout - // Read the order - if (read_order) + inline NaturalNeighbors + getNeighborsAfterBlocking(const std::map& blocking, + const std::map& dim, const NaturalNeighbors& neighbors, + OperatorLayout layout) + { + using superbblas::detail::operator/; + using superbblas::detail::operator+; + + // Get the natural dimensions of the lattice + Coor blk = kvcoors("xyzt", blocking, 1); + Coor nat_dims = kvcoors("xyzt", getNatLatticeDims(dim, layout)); + + // Filter out odd neighbors if `Xsubrange` and block them + std::set idx_neighbors; + Coor blk_strides = superbblas::detail::get_strides(blk, superbblas::FastToSlow); + Coor blk_nat_dims = nat_dims / blk; + Coor blk_nat_dims_strides = + superbblas::detail::get_strides(blk_nat_dims, superbblas::FastToSlow); + std::size_t blk_vol = superbblas::detail::volume(blk); + for (const auto& kvcoor : neighbors) { - std::istringstream is(metadata); - XMLReader xml_buf(is); - read(xml_buf, order_tag.getSome("order"), order); - checkOrder(); + Coor c = kvcoors("xyzt", kvcoor); + for (Index i = 0; i < blk_vol; ++i) + { + Coor blk_c = + normalize_coor(c + superbblas::detail::index2coor(i, blk, blk_strides), nat_dims) / + blk; + Index idx_blk_c = + superbblas::detail::coor2index(blk_c, blk_nat_dims, blk_nat_dims_strides); + idx_neighbors.insert(idx_blk_c); + } } - superbblas::Storage_handle stoh; - 
superbblas::open_storage(filename.c_str(), false /* don't allow writing */, - MPI_COMM_WORLD, &stoh); - ctx = std::shared_ptr( - stoh, [=](superbblas::detail::Storage_context_abstract* ptr) { - superbblas::close_storage(ptr, MPI_COMM_WORLD); - }); - } + // Convert the indices into maps + NaturalNeighbors r; + for (Index idx : idx_neighbors) + { + Coor c = superbblas::detail::index2coor(idx, blk_nat_dims, blk_nat_dims_strides); + r.push_back(std::map{{{'x', c[0]}, {'y', c[1]}, {'z', c[2]}, {'t', c[3]}}}); + } - protected: - // Construct a slice/scale storage - StorageTensor(const StorageTensor& t, const std::string& order, Coor from, Coor size, - T scalar) - : filename(t.filename), - metadata(t.metadata), - order(order), - dim(t.dim), - ctx(t.ctx), - sparsity(t.sparsity), - from(normalize_coor(from, t.dim)), - size(size), - scalar{t.scalar} - { - checkOrder(); + return r; } - public: - /// Return whether the tensor is not empty - explicit operator bool() const noexcept - { - return superbblas::detail::volume(size) > 0; - } + /// Return the Manhattan distance of the furthest neighbor + /// \param neighbors: operator's neighbors in natural coordinates + /// \param dim: operator dimensions + /// \param layout: operator's layout - // Return the dimensions of the tensor - std::map kvdim() const + inline unsigned int getFurthestNeighborDistance(const NaturalNeighbors& neighbors, + const std::map& dim, + OperatorLayout layout) { - std::map d; - for (unsigned int i = 0; i < N; ++i) - d[order[i]] = size[i]; - return d; + const auto natdim = getNatLatticeDims(dim, layout); + unsigned int max_distance = 0; + for (const auto& kvcoor : neighbors) + { + unsigned int dist = 0; + for (const auto& it : kvcoor) + { + int label_dim = natdim.at(it.first); + int label_coor = normalize_coor(it.second, label_dim); + dist += std::min(label_coor, label_dim - label_coor); + } + max_distance = std::max(max_distance, dist); + } + + return max_distance; } - /// Rename dimensions - StorageTensor 
rename_dims(const SB::remap& m) const + /// Return the Manhattan distance of the furthest neighbor in an operator + /// \param op: given operator + + template + unsigned int getFurthestNeighborDistance(Operator op) { - return StorageTensor(*this, detail::update_order(order, m), this->from, - this->size); + return getFurthestNeighborDistance(op.neighbors, op.i.kvdim(), op.imgLayout); } - // Return a slice of the tensor starting at coordinate `kvfrom` and taking `kvsize` elements in each direction. - // The missing dimension in `kvfrom` are set to zero and the missing direction in `kvsize` are set to the active size of the tensor. - StorageTensor kvslice_from_size(const std::map& kvfrom = {}, - const std::map& kvsize = {}) const + /// Return the neighbors as displacements from origin in natural coordinates + /// \param dim: operator dimensions + /// \param max_dist_neighbors: the distance of the farthest neighbor for each site + /// \param layout: operator's layout + + inline NaturalNeighbors getNeighbors(const std::map& dim, + unsigned int max_dist_neighbors, OperatorLayout layout) { - std::map updated_kvsize = this->kvdim(); - for (const auto& it : kvsize) - updated_kvsize[it.first] = it.second; - return slice_from_size(kvcoors(order, kvfrom), kvcoors(order, updated_kvsize)); + // Get the natural dimensions of the lattice and all the neighbors up to distance `max_dist_neighbors` + Coor nat_dims = kvcoors("xyzt", getNatLatticeDims(dim, layout)); + std::vector> neighbors = Coloring::all_neighbors(max_dist_neighbors, nat_dims); + + // Filter out odd neighbors if the layout is `EvensOnlyLayout` + std::set idx_neighbors; + Stride strides = + superbblas::detail::get_strides(nat_dims, superbblas::FastToSlow); + for (const auto& c : neighbors) + { + if (layout == EvensOnlyLayout && std::accumulate(c.begin(), c.end(), Index{0}) % 2 != 0) + continue; + idx_neighbors.insert(superbblas::detail::coor2index(c, nat_dims, strides)); + } + + // Convert the indices into maps + 
NaturalNeighbors r; + for (std::size_t idx : idx_neighbors) + { + Coor c = superbblas::detail::index2coor(idx, nat_dims, strides); + r.push_back(std::map{{'x', c[0]}, {'y', c[1]}, {'z', c[2]}, {'t', c[3]}}); + } + + return r; } - // Return a slice of the tensor starting at coordinate `from` and taking `size` elements in each direction. - StorageTensor slice_from_size(Coor from, Coor size) const - { - for (unsigned int i = 0; i < N; ++i) + /// Return the color for each site + /// \param dim: operator dimensions + /// \param layout: operator's layout + /// \param neighbors: operator's neighbors in natural coordinates + /// \param power: maximum distance to recover the nonzeros: + /// 0, block diagonal; 1: near-neighbors... + + template + std::pair, std::size_t> + getColors(const std::map& dim, OperatorLayout layout, + const NaturalNeighbors& neighbors, unsigned int power, + const Distribution& dist = OnEveryone) + { + // Unsupported other powers than zero or one + unsigned int max_dist_neighbors = getFurthestNeighborDistance(neighbors, dim, layout); + if (power != 0 && power != max_dist_neighbors) + throw std::runtime_error("getColors: unsupported value for `power`: either zero or the " + "distance to the furthest neighbor"); + + // Compute the coloring + Coor nat_dims = + kvcoors("xyzt", getNatLatticeDims(dim, layout, ConsiderBlockingDense)); + Coloring coloring{power == 0 ? max_dist_neighbors + 1 + : max_dist_neighbors * 2 + 1, // k-distance coloring + nat_dims}; + + // Create a field with the color of each site + std::string order("xyztX"); + for (const auto& it : dim) + if (std::find(order.begin(), order.end(), it.first) == order.end()) + order.push_back(it.first); + auto real_dim = dim; + real_dim['X'] = (layout == EvensOnlyLayout ? 2 : (dim.count('X') == 1 ? 
dim.at('X') : 1)); + auto t = fillLatticeField( + order, {}, real_dim, real_dim, OnDefaultDevice, + [&](Coor c) { + return (float)coloring.getColor({{c[0], c[1], c[2], c[3]}}); + }, + layout == XEvenOddLayout, dist) + .kvslice_from_size({}, {{'X', dim.at('X')}}); + + return {t, coloring.numColors()}; + } + + /// Return a mask for the sites with even or odd x coordinate + /// \param xoddity: 0 for even, 1 for odd x coordinates + /// \param t: return a tensor with this distribution + /// \param layout: tensor's layout + /// NOTE: this implementation may be too slow for large tensors + + template + Tensor<5, float> getXOddityMask_aux(int xoddity, const Tensor& t, OperatorLayout layout) + { + if (xoddity != 0 && xoddity != 1) + throw std::runtime_error("getXOddityMask: invalid input argument `xoddity`"); + auto dim = t.kvdim(); + auto r = t.template make_compatible<5, float>("Xxyzt"); + if (layout == NaturalLayout) { - if (size[i] > this->size[i]) - throw std::runtime_error( - "The size of the slice cannot be larger than the original tensor"); - if (normalize_coor(from[i], this->size[i]) + size[i] > this->size[i] && - this->size[i] != this->dim[i]) - throw std::runtime_error( - "Unsupported to make a view on a non-contiguous range on the tensor"); + if (dim.at('X') != 1 && dim.at('X') != 2) + throw std::runtime_error("getXOddityMask: invalid dimension size `X`"); + int dimX = dim.at('X'); + r.fillCpuFunCoor([&](const Coor<5>& coor) { + return (coor[0] + coor[1] * dimX) % 2 == xoddity ? float{1} : float{0}; + }); + } + else if (isEvenOddLayout(layout)) + { + if (dim.at('X') != 2) + throw std::runtime_error("getXOddityMask: invalid dimension size `X`"); + if (layout == XEvenOddLayoutZeroOdd) + xoddity = (xoddity + 1) % 2; + r.fillCpuFunCoor([&](const Coor<5>& coor) { + return (coor[0] + coor[2] + coor[3] + coor[4]) % 2 == xoddity ? 
float{1} : float{0}; + }); + } + else if (layout == EvensOnlyLayout) + { + if (dim.at('X') != 1) + throw std::runtime_error("getXOddityMask: invalid dimension size `X`"); + r.set(xoddity == 0 ? float{1} : float{0}); } + else + throw std::runtime_error("getXOddityMask: unsupported layout"); - using superbblas::detail::operator+; - return StorageTensor(*this, order, this->from + from, size, scalar); + return r; } - StorageTensor scale(T s) const + /// Return a mask for the sites with even or odd x coordinate + /// \param xoddity: 0 for even, 1 for odd x coordinates + /// \param t: return a tensor with this distribution + /// \param layout: tensor's layout + + template 5), bool>::type = true> + Tensor getXOddityMask(int xoddity, const Tensor& t, OperatorLayout layout) { - return StorageTensor(*this, order, from, scalar * s); + // Create the mask on the lattice components + auto r_lat = getXOddityMask_aux(xoddity, t, layout); + + // Create a matrix of ones to extend the mask onto the other components + auto dim_dense = t.kvdim(); + for (auto& it : dim_dense) + if (is_in("xyztX", it.first)) + it.second = 1; + auto r_dense = t.template like_this( + "%", '%', "xyztX", dim_dense, none, compatible_replicated_distribution(t.dist)); + r_dense.set(1); + + // Contract both to create the output tensor + auto r = t.template make_compatible(); + contract(r_lat, r_dense, "", CopyTo, r); + + return r; } - void release() + template ::type = true> + Tensor getXOddityMask(int xoddity, const Tensor& t, OperatorLayout layout) { - dim = {}; - ctx.reset(); - from = {}; - size = {}; - scalar = T{0}; - filename = ""; - metadata = ""; + // Create the mask on the lattice components + return getXOddityMask_aux(xoddity, t, layout); } - /// Check that the dimension labels are valid + /// Return a copy of the given tensor in natural ordering into an even-odd ordering. 
+ /// + /// \param v: origin tensor - void checkOrder() const + template + Tensor toEvenOddOrdering(const Tensor& v) { - // Check that all labels are different there are N - detail::check_order(order); + // If the tensor is already in even-odd ordering, return it + auto vdim = v.kvdim(); + if (vdim.at('X') == 2) + return v; - for (auto s : size) - if (s < 0) - std::runtime_error("Invalid tensor size: it should be positive"); + // Check that the tensor can be reordered in even-ordering compressed on the x-direction + if (!(vdim.at('x') % 2 == 0 && // + (vdim.at('y') == 1 || vdim.at('y') % 2 == 0) && + (vdim.at('z') == 1 || vdim.at('z') % 2 == 0) && + (vdim.at('t') == 1 || vdim.at('t') % 2 == 0))) + throw std::runtime_error("toEvenOddOrdering: invalid tensor dimensions"); + + // All even/odd coordinate x elements cannot be selected with slicing at once for even-odd + // ordering, so we use arbitrary selection of elements: masks. The approach to convert between + // orderings is to mask all elements with even/odd x coordinate and copy them to a new tensor + // with the target layout. The copying with mask superbblas operation wasn't design to support + // different mask on the origin and destination tensor. But it's going to produce the desired + // effect if the following properties match: + // a) the origin and destination masks are active in all dimensions excepting some dimensions, + // only X in this case; + // b) for all coordinates only one element is active on the excepting dimensions, the even or the odd + // x coordinates in the X dimension in this case; + // c) the excepting dimensions are fully supported on all processes; and + // d) the excepting dimensions are the fastest index and have the same ordering in the origin and + // the destination tensors. 
+ + auto v0 = v.reshape_dimensions({{"Xx", "Xx"}}, {{'X', 2}}).reorder("Xxyzt%", '%'); + auto r = v0.make_compatible(); + for (int oddity = 0; oddity < 2; ++oddity) + { + auto nat_mask = getXOddityMask(oddity, v0, NaturalLayout); + auto eo_mask = getXOddityMask(oddity, r, XEvenOddLayout); + v0.copyToWithMask(r, nat_mask, eo_mask); + } + return r; } - /// Preallocate space for the storage file - /// \param size: expected final file size in bytes + /// Return a copy of the given tensor in even-odd ordering into a natural ordering. + /// + /// \param v: origin tensor - void preallocate(std::size_t size) + template + Tensor toNaturalOrdering(const Tensor& v, int v_oddity = 0) { - superbblas::preallocate_storage(ctx.get(), size); + // If the tensor is already in natural ordering, return it + auto vdim = v.kvdim(); + if (vdim.at('X') == 1) + return v; + + // All even/odd coordinate x elements cannot be selected with slicing at once for even-odd + // ordering, so we use arbitrary selection of elements: masks. The approach to convert between + // orderings is to mask all elements with even/odd x coordinate and copy them to a new tensor + // with the target layout. The copying with mask superbblas operation wasn't design to support + // different mask on the origin and destination tensor. But it's going to produce the desired + // effect if the following properties match: + // a) the origin and destination masks are active in all dimensions excepting some dimensions, + // only X in this case; + // b) for all coordinates only one element is active on the excepting dimensions, the even or the odd + // x coordinates in the X dimension in this case; + // c) the excepting dimensions are fully supported on all processes; and + // d) the excepting dimensions are the fastest index and have the same ordering in the origin and + // the destination tensors. 
+ + auto v0 = v.reorder("Xxyzt%", '%'); + auto r = v0.make_compatible(); + for (int oddity = 0; oddity < 2; ++oddity) + { + auto eo_mask = getXOddityMask( + oddity, v0, v_oddity % 2 == 0 ? XEvenOddLayout : XEvenOddLayoutZeroOdd); + auto nat_mask = getXOddityMask(oddity, r, NaturalLayout); + v0.copyToWithMask(r, eo_mask, nat_mask); + } + return r.reshape_dimensions({{"Xx", "Xx"}}, {{'X', 1}, {'x', vdim.at('x') * 2}}); } - /// Save content from the storage into the given tensor - template ::value == detail::is_complex::value, bool>::type = true> - void copyFrom(Tensor w) const + /// Return a sparse tensor with the content of the given operator + /// \param op: operator to extract the nonzeros from + /// \param power: maximum distance to recover the nonzeros: + /// 0, block diagonal; 1: near-neighbors... + /// \param coBlk: ordering of the nonzero blocks of the sparse operator + /// \param useKronFormat: whether to create a Kronecker BSR variant if the given operator is in that format + /// \return: a pair of a sparse tensor and a remap; the sparse tensor has the same image + /// labels as the given operator and domain labels are indicated by the returned remap. + /// + /// NOTE: Encoding the Dirac-Wilson with the clover term into the Kronecker format gets convoluted. + /// We treat differently the block-diagonal (the self direction) from the others (the x,y,z,t directions). + /// The nonzeros of the latter directions are the addition of two matrices which are the result of + /// the tensor product of a 4x4 (spin matrix) and a 3x3 (color matrix). One of the spin matrices is + /// is the identity always and the other is the same for all nonzeros in a direction. The block-diagonal + /// doesn't follow this pattern but it is block diagonal on the chirality, that is, there are nonzeros only + /// for the combination of spin i,j such that floor(i/2) == floor(j/2). 
+ + template = Nd + 1), bool>::type = true> + std::pair, remap> + cloneUnblockedOperatorToSpTensor(const Operator& op, unsigned int power = 1, + ColOrdering coBlk = RowMajor, bool useKronFormat = true, + const std::string& prefix = "") { - Coor wsize = kvcoors(order, w.kvdim(), 1, NoThrow); - for (unsigned int i = 0; i < N; ++i) - if (wsize[i] > size[i]) - throw std::runtime_error("The destination tensor is smaller than the source tensor"); + using value_type = typename detail::base_type::type; - MPI_Comm comm = (w.dist == Local ? MPI_COMM_SELF : MPI_COMM_WORLD); + log(1, "starting cloneUnblockedOperatorToSpTensor"); - // If the storage is sparse, add blocks for the new content - if (sparsity == Sparse) + Tracker _t(std::string("clone unblocked operator ") + prefix); + + // Unsupported explicitly colorized operators + if (op.d.kvdim().count('X') == 0) + throw std::runtime_error( + "cloneUnblockedOperatorToSpTensor: unsupported not explicitly colored operators"); + + // If the operator is empty, just return itself + if (op.d.volume() == 0 || op.i.volume() == 0) + return {{}, {}}; + + // TODO: add optimizations for multiple operators + if (op.order_t.size() > 0) + throw std::runtime_error("Not implemented"); + + // The spin label if the spin-color matrices are the tensor product of a spin matrix + // and a color matrix + char kronecker_label = op.is_kronecker() && useKronFormat ? 's' : 0; + + // Create the ordering for the domain and the image where the dense dimensions indices run faster than the sparse dimensions. + // If using Kronecker variant, the Kronecker label (the spin) runs the slowest of the dense labels. 
+ // NOTE: assuming that x,y,z,t are the only sparse dimensions; X remains sparse + std::string sparse_labels("xyztX"); + std::string dense_labels = remove_dimensions(op.i.order, sparse_labels); + if (kronecker_label) { - superbblas::append_blocks(w.p->p.data(), w.p->p.size(), w.order.c_str(), w.from, - w.size, order.c_str(), from, ctx.get(), comm, - superbblas::FastToSlow); + dense_labels = remove_dimensions(dense_labels, std::string(1, kronecker_label)) + + std::string(1, kronecker_label); } + remap rd = getNewLabels(op.d.order, op.i.order + "u~0123"); + auto i = + op.i.reorder(dense_labels + std::string("xyztX")) + .like_this(none, {}, OnDefaultDevice, compatible_oneveryone_distribution(op.i.dist)) + .make_eg(); + + // Get the blocking for the domain and the image + std::map blkd, blki; + for (const auto& it : i.kvdim()) + blki[it.first] = (is_in(dense_labels, it.first) ? it.second : 1); + for (const auto& it : blki) + blkd[rd.at(it.first)] = it.second; + + // Check that the Kroneker label is a dense label if given + if (kronecker_label != 0 && std::find(dense_labels.begin(), dense_labels.end(), + kronecker_label) == dense_labels.end()) + throw std::runtime_error("The Kronecker label should be a blocking label"); + + // Construct the probing vectors, which they have as the rows the domain labels and as + // columns the domain blocking dimensions + + constexpr int Nblk = NOp - Nd - 1; + std::map blki_id; + for (char c : dense_labels) + blki_id[c] = blki[c]; + remap rd_id; + for (char c : dense_labels) + rd_id[c] = rd[c]; + auto t_blk = identity(blki_id, rd_id, i.dist); + + // Compute the coloring + auto t_ = getColors(i.kvdim(), op.imgLayout, op.neighbors, power, i.dist); + Tensor colors = t_.first; + unsigned int num_colors = t_.second; + + // The first half of the colors are for even nodes + int maxX = op.i.kvdim().at('X'); + int real_maxX = (op.imgLayout == EvensOnlyLayout ? 
2 : maxX); + + // Get the neighbors + unsigned int max_dist_neighbors = getFurthestNeighborDistance(op); + std::vector> neighbors; + if (power == 0) + { + neighbors.push_back(Coor{{}}); + } + else if (power == max_dist_neighbors) + { + for (const auto& it : op.neighbors) + neighbors.push_back(kvcoors("xyzt", it, 0)); + } + else + throw std::runtime_error("Unsupported power"); - Tw* w_ptr = w.data.get(); - superbblas::save(detail::safe_div(w.scalar, scalar), w.p->p.data(), 1, - w.order.c_str(), w.from, w.size, (const Tw**)&w_ptr, &*w.ctx, - order.c_str(), from, ctx.get(), comm, - superbblas::FastToSlow); + // Extend directions in case of using the Kronecker form + if (kronecker_label != 0 && i.kvdim().at(kronecker_label) != 4) + throw std::runtime_error( + "Unsupported extraction of the Kronecker format from this operator"); + std::vector> spin_matrix; + if (kronecker_label != 0) + { + std::vector> new_neighbors; + int spin = i.kvdim().at(kronecker_label); + for (const auto& dir : neighbors) + { + if (dir == Coor{{}}) + { + // If self direction, create a single matrix on each combination of spins + // with the same chirality + + for (int s = 0; s < spin * spin; ++s) + { + int si = s % spin, sj = s / spin; + Tensor<2, COMPLEX> mat( + std::string(1, kronecker_label) + std::string(1, rd.at(kronecker_label)), + {{spin, spin}}, OnHost, compatible_replicated_distribution(i.dist)); + mat.set_zero(); + mat.set({{si, sj}}, 1); + new_neighbors.push_back(dir); + spin_matrix.push_back(mat); + } + } + else + { + // For the remaining directions, we capture the spin block diagonal on the first term + // and put an empty matrix on the second term so that will be guess further down + Tensor<2, COMPLEX> mat( + std::string(1, kronecker_label) + std::string(1, rd.at(kronecker_label)), + {{spin, spin}}, OnHost, compatible_replicated_distribution(i.dist)); + mat.set_zero(); + for (int s = 0; s < spin; ++s) + mat.set({{s, s}}, 1); + new_neighbors.push_back(dir); + 
spin_matrix.push_back(mat); + new_neighbors.push_back(dir); + spin_matrix.push_back(Tensor<2, COMPLEX>()); + } + } + neighbors = new_neighbors; + } + + // Chose a dense label that is not the spin + const char color_label = 'c'; + + // Extract the kronecker values with probing + Tensor<3, COMPLEX> kron; + std::vector> nonzero_spins; + if (kronecker_label != 0) + { + unsigned int color = 0; + + // Extracting the proving vector + auto t_l = colors.template transformWithCPUFun( + [&](float site_color) { return site_color == color ? value_type{1} : value_type{0}; }); + + // Skip empty masks + // NOTE: the call to split_dimension add a fake dimension that acts as columns + if (std::norm(norm<1>(t_l.split_dimension('X', "Xn", maxX), "n").get({{0}})) == 0) + throw std::runtime_error("Ups! We should do something more sophisticated here"); + + // Contracting the proving vector with the blocking components + auto site_size = blki; + for (char c : dense_labels) + site_size[rd[c]] = (c == kronecker_label ? 
blki[c] : 1); + + // Compute the matvecs + auto mv = + op(contract(t_l, t_blk.kvslice_from_size({}, {{rd[color_label], 1}}), "")); + + // Take a source + auto source_coor = + colors.find([&](float site_color) { return site_color == color; }) + .getSome(); + std::map source; + for (std::size_t i = 0; i < colors.order.size(); ++i) + if (is_in("xyztX", colors.order[i])) + source[colors.order[i]] = source_coor[i]; + + // Find the spin values for each direction + std::string kron_order(3, 0); + kron_order[0] = kronecker_label; + kron_order[1] = rd[kronecker_label]; + kron_order[2] = 'u'; + kron = Tensor<3, COMPLEX>( + kron_order, + Coor<3>{blki[kronecker_label], blki[kronecker_label], (int)neighbors.size()}, + OnDefaultDevice, compatible_replicated_distribution(i.dist)); + kron.set_zero(); + for (int mu = 0; mu < neighbors.size(); ++mu) { + // site = source + neighbors[mu], where the latter is in natural + // coordinates + const auto& coor_dir = neighbors[mu]; + int sumdir = std::accumulate(coor_dir.begin(), coor_dir.end(), int{0}); + std::map site{{'X', source['X'] + sumdir}, + {'x', (source['x'] * real_maxX + coor_dir[0]) / real_maxX}, + {'y', source['y'] + coor_dir[1]}, + {'z', source['z'] + coor_dir[2]}, + {'t', source['t'] + coor_dir[3]}}; + auto site_size = blki; + for (char c : dense_labels) + site_size[rd[c]] = (c == kronecker_label ? blki[c] : 1); + + auto site_data = mv.kvslice_from_size(site, site_size) + .make_sure(none, OnHost, compatible_replicated_distribution(i.dist)); + + if (!spin_matrix[mu]) { + // Search for a nonzero element, we take the largest. 
+ // NOTE: don't take from spin block diagonal matrix, those + // nonzeros are captured already + auto spin_vals = + norm<2>(site_data, std::string(1, kronecker_label) + + std::string(1, rd[kronecker_label])); + int s_ref = 0; + double val_ref = 0; + for (int s = 0; s < blki[kronecker_label] * blki[kronecker_label]; + ++s) { + if (s % blki[kronecker_label] == s / blki[kronecker_label]) + continue; + double val = spin_vals.get( + kvcoors<2>(spin_vals.order, {{kronecker_label, s % blki[kronecker_label]}, + {rd[kronecker_label], s / blki[kronecker_label]}})); + if (val > val_ref) { + s_ref = s; + val_ref = val; + } + } + + // If the direction is empty, remove it! + if (val_ref == 0) { + neighbors.erase(neighbors.begin() + mu); + spin_matrix.erase(spin_matrix.begin() + mu); + mu--; + continue; + } + + nonzero_spins.push_back( + {{kronecker_label, s_ref % blki[kronecker_label]}, + {rd[kronecker_label], s_ref / blki[kronecker_label]}}); + + // Get the values + auto val0 = site_data.get(kvcoors( + site_data.order, {{kronecker_label, s_ref % blki[kronecker_label]}, + {rd[kronecker_label], s_ref / blki[kronecker_label]}, + {color_label, 0}, + {rd.at(color_label), 0}})); + + for (int s = 0; s < blki[kronecker_label] * blki[kronecker_label]; + ++s) { + if (s % blki[kronecker_label] == s / blki[kronecker_label]) + continue; + kron.set(kvcoors<3>(kron.order, {{kronecker_label, s % blki[kronecker_label]}, + {rd[kronecker_label], s / blki[kronecker_label]}, + {'u', mu}}), + site_data.get(kvcoors( + site_data.order, {{kronecker_label, s % blki[kronecker_label]}, + {rd[kronecker_label], s / blki[kronecker_label]}, + {color_label, 0}, + {rd.at(color_label), 0}})) / + val0); + } + } else { + if (std::norm( + norm<1>(contract(spin_matrix[mu].template reshape_dimensions<3>( + {{"s", "su"}}, {{'u', 1}}), + site_data, ""), + "u") + .get(Coor<1>{0})) == 0) + { + neighbors.erase(neighbors.begin() + mu); + spin_matrix.erase(spin_matrix.begin() + mu); + mu--; + continue; + } + + // Copy the 
know spin matrix into sop.kron + spin_matrix[mu].copyTo( + kron.kvslice_from_size({{'u', mu}}, {{'u', 1}})); + + // Set as the reference spin, the first nonzero + for (int s = 0; s < blki[kronecker_label] * blki[kronecker_label]; + ++s) { + if (std::norm(spin_matrix[mu].get( + {{s % blki[kronecker_label], + s / blki[kronecker_label]}})) > 0) { + nonzero_spins.push_back( + {{kronecker_label, s % blki[kronecker_label]}, + {rd[kronecker_label], s / blki[kronecker_label]}}); + break; + } + } + } + } + } + + // Create masks for the elements with even natural x coordinate and with odd natural x coordinate + auto even_x_mask = getXOddityMask(0, i, op.imgLayout); + auto odd_x_mask = getXOddityMask(1, i, op.imgLayout); + auto ones_blk = t_blk.template like_this(); + ones_blk.set(1); + + // Create the sparse tensor + auto d_sop = + (power == 0 ? i + : i.extend_support({{'x', (max_dist_neighbors + real_maxX - 1) / real_maxX}, + {'y', max_dist_neighbors}, + {'z', max_dist_neighbors}, + {'t', max_dist_neighbors}})) + .rename_dims(rd); + const unsigned int Nkron = kronecker_label == 0 ? 0u : 1u; + SpTensor sop{d_sop, + i, + Nblk - Nkron, + Nblk - Nkron, + Nkron, + Nkron, + (unsigned int)neighbors.size(), + coBlk == ColumnMajor}; + + // Copy the kronecker values + if (kronecker_label != 0) + { + kron.kvslice_from_size({}, {{'u', neighbors.size()}}).copyTo(sop.kron); + } + + // Extract the nonzeros with probing + sop.data.set_zero(); // all values may not be populated when using blocking + for (unsigned int color = 0; color < num_colors; ++color) + { + // Generate the proving vectors for the given color + auto t_l = colors.template transformWithCPUFun( + [&](float site_color) { return site_color == color ? 
value_type{1} : value_type{0}; }); + + // Skip empty masks + // NOTE: the call to split_dimension add a fake dimension that acts as columns + if (std::norm(norm<1>(t_l.split_dimension('X', "Xn", maxX), "n") + .get({{0}})) == 0) + continue; + + for (int color_idx = 0; color_idx < blki[color_label]; ++color_idx) { + std::map colorFrom{{rd[color_label], color_idx}}; + std::map colorSize{{rd[color_label], 1}}; + + // Contracting the proving vector with the blocking components + auto probs = + contract(t_l, t_blk.kvslice_from_size(colorFrom, colorSize), ""); + + // Compute the matvecs + auto mv = op(std::move(probs)); + + // Construct an indicator tensor where all blocking dimensions but + // only the nodes colored `color` are copied + auto color_mask = t_l.template transformWithCPUFun( + [](const value_type &t) { return (float)std::real(t); }); + auto sel_x_even = contract( + contract(color_mask, even_x_mask, ""), + ones_blk.kvslice_from_size(colorFrom, colorSize), ""); + auto sel_x_odd = contract( + contract(color_mask, odd_x_mask, ""), + ones_blk.kvslice_from_size(colorFrom, colorSize), ""); + + // Populate the nonzeros by copying pieces from `mv` into sop.data. + // We want to copy only the nonzeros in `mv`, which are `neighbors` + // away from the nonzeros of `probs`. 
+ auto sop_data = sop.data.kvslice_from_size(colorFrom, colorSize); + if (kronecker_label == 0) { + latticeCopyToWithMask(mv, sop_data, 'u', neighbors, + {{'X', real_maxX}}, sel_x_even, sel_x_odd); + } else { + std::map single_spin{{kronecker_label, 1}, + {rd[kronecker_label], 1}}; + for (int dir = 0; dir < neighbors.size(); ++dir) + latticeCopyToWithMask( + mv.kvslice_from_size(nonzero_spins[dir], single_spin), + sop_data.kvslice_from_size({{'u', dir}}, {{'u', 1}}), 'u', + std::vector>(1, neighbors[dir]), + {{'X', real_maxX}}, + sel_x_even.kvslice_from_size(nonzero_spins[dir], + single_spin), + sel_x_odd.kvslice_from_size(nonzero_spins[dir], + single_spin)); + } + } + } + + // Populate the coordinate of the columns, that is, to give the domain coordinates of first nonzero in each + // BSR nonzero block. Assume that we are processing nonzeros block for the image coordinate `c` on the + // direction `dir`, that is, the domain coordinates will be (cx-dirx,cy-diry,cz-dirz,dt-dirt) in natural + // coordinates. But we get the image coordinate `c` in even-odd coordinate, (cX,cx,cy,cz,ct), which has the + // following natural coordinates (cx*2+(cX+cy+cz+ct)%2,cy,cz,ct). After subtracting the direction we get the + // natural coordinates (cx*2+(cX+cy+cz+ct)%2-dirx,cy-diry,cz-dirz,ct-dirt), which corresponds to the following + // even-odd coordinates ((cX-dirx-diry-dirz-dirt)%2,(cx*2+(cX+cy+cz+ct)%2-dirx)/2,cy-diry,cz-dirz,ct-dirt). + + Coor real_dims = kvcoors("xyzt", getNatLatticeDims(i.kvdim(), op.imgLayout)); + int d = op.imgLayout == XEvenOddLayoutZeroOdd ? 
1 : 0; + sop.jj.fillCpuFunCoor([&](const Coor& c) { + // c has order '~u%xyztX' where xyztX were remapped by ri + int domi = c[0]; // the domain label to evaluate, label ~ + int mu = c[1]; // the direction, label u + int base = c[domi + 2]; // the image coordinate value for label `domi` + + // Do nothing for a blocking direction + if (domi < Nblk) + return 0; + + const auto& dir = neighbors[mu]; + + // For labels X and x + if (domi == Nblk || domi == Nblk + Nd) + { + int sumdir = std::accumulate(dir.begin(), dir.end(), int{0}); + if (domi == Nblk + Nd) + return (base + sumdir + real_maxX * Nd) % real_maxX; + int sumyzt = std::accumulate(c.begin() + 2 + Nblk + 1, c.end() - 1, d); + const auto& cX = c[2 + Nblk + Nd]; + return ((base * real_maxX + (cX + sumyzt) % real_maxX + real_dims[0] - dir[0]) / + real_maxX) % + (real_dims[0] / real_maxX); + } + + int latd = domi - Nblk; + return (base - dir[latd] + real_dims[latd]) % real_dims[latd]; + }); + + // Construct the sparse operator + sop.construct(); + + // Return the sparse tensor and the remap from original operator to domain of the sparse tensor + return {sop, rd}; + } + + /// Return a sparse tensor with the content of the given operator + /// \param op: operator to extract the nonzeros from + /// \param power: maximum distance to recover the nonzeros: + /// 0, block diagonal; 1: near-neighbors... + /// \param coBlk: ordering of the nonzero blocks of the sparse operator + /// \param useKronFormat: whether to create a Kronecker BSR variant if the given operator is in that format + /// \return: a pair of a sparse tensor and a remap; the sparse tensor has the same image + /// labels as the given operator and domain labels are indicated by the returned remap. 
+ + template 9), bool>::type = true> + std::pair, remap> + cloneOperatorToSpTensor(const Operator& op, unsigned int power, + ColOrdering coBlk = RowMajor, bool useKronFormat = true, + const std::string& prefix = "") + { + Tracker _t(std::string("clone blocked operator ") + prefix); + + // Unblock the given operator, the code of `cloneUnblockedOperatorToSpTensor` is too complex as it is + auto unblki = op.i + .template reshape_dimensions( + {{"0x", "x"}, {"1y", "y"}, {"2z", "z"}, {"3t", "t"}}, {}, true) + .make_eg(); + auto opdim = op.i.kvdim(); + Operator unblocked_op{ + [=](const Tensor& x, Tensor y) { + op(x.template reshape_dimensions( + {{"x", "0x"}, {"y", "1y"}, {"z", "2z"}, {"t", "3t"}}, opdim, true)) + .template reshape_dimensions( + {{"0x", "x"}, {"1y", "y"}, {"2z", "z"}, {"3t", "t"}}, {}, true) + .copyTo(y); + }, + unblki, + unblki, + nullptr, + op.order_t, + op.domLayout, + op.imgLayout, + op.neighbors, + coBlk, + op.is_kronecker()}; + + // Get a sparse tensor representation of the operator + unsigned int op_dist = getFurthestNeighborDistance(op); + if (op_dist > 1 && power % op_dist != 0) + throw std::runtime_error("cloneOperatorToSpTensor: invalid power value, it isn't " + "divisible by the furthest neighbor distance"); + auto opdims = op.i.kvdim(); + auto t = cloneUnblockedOperatorToSpTensor(unblocked_op, std::min(power, op_dist), coBlk, + useKronFormat, prefix); + remap rd = t.second; + for (const auto& it : + detail::getNewLabels("0123", op.d.order + op.i.order + "0123" + t.first.d.order)) + rd[it.first] = it.second; + int max_op_power = (op_dist == 0 ? 
0 : std::max(power / op_dist, 1u) - 1u); + std::map m_power{}; + for (char c : update_order("xyzt", rd)) + m_power[c] = max_op_power; + auto sop = t.first.extend_support(m_power) + .split_dimension(rd['x'], update_order("0x", rd), opdim.at('0'), 'x', "0x", + opdim.at('0')) + .split_dimension(rd['y'], update_order("1y", rd), opdim.at('1'), 'y', "1y", + opdim.at('1')) + .split_dimension(rd['z'], update_order("2z", rd), opdim.at('2'), 'z', "2z", + opdim.at('2')) + .split_dimension(rd['t'], update_order("3t", rd), opdim.at('3'), 't', "3t", + opdim.at('3')) + .reorder(std::string("%") + update_order("0123xyztX", rd), "%0123xyztX", '%'); + + return {sop, rd}; + } + + template ::type = true> + std::pair, remap> + cloneOperatorToSpTensor(const Operator& op, unsigned int power, + ColOrdering coBlk = RowMajor, bool = true, + const std::string& prefix = "") + { + throw std::runtime_error("trying to clone an unblock operator with a blocking function"); + } + + /// Return an efficient operator application + /// \param op: operator to extract the nonzeros from + /// \param power: maximum distance to recover the nonzeros: + /// 0, block diagonal; 1: near-neighbors... 
+ /// \param co: preferred ordering of dense input and output tensors + /// \param coBlk: ordering of the nonzero blocks of the sparse operator + + template + Operator + cloneOperator(const Operator& op, unsigned int power, ColOrdering co, + ColOrdering coBlk, + BlockingAsSparseDimensions blockingAsSparseDimensions = ConsiderBlockingSparse, + const std::string& prefix = "") + { + // If the operator is empty, just return itself + if (op.d.volume() == 0 || op.i.volume() == 0) + return op; + + // Get a sparse tensor representation of the operator + unsigned int op_dist = getFurthestNeighborDistance(op); + if (op_dist > 1 && power % op_dist != 0) + throw std::runtime_error("cloneOperator: invalid power value"); + remap rd; + SpTensor sop; + if (blockingAsSparseDimensions == ConsiderBlockingSparse) + { + auto t = cloneOperatorToSpTensor(op, power, coBlk, true /* use Kron format if possible */, + prefix); + sop = t.first; + rd = t.second; + } + else + { + auto t = cloneUnblockedOperatorToSpTensor(op, power, coBlk, + true /* use Kron format if possible */, prefix); + sop = t.first; + rd = t.second; + } + + // Construct the operator to return + Operator rop{sop, rd, power, sop.i, sop.i, + op.order_t, op.domLayout, op.imgLayout, op.neighbors, co}; + + // Skip tests if power < op_dist + if (power < op_dist) + return rop; + + // Do a test + Tracker _t(std::string("clone blocked operator (testing) ") + prefix); + for (const auto& test_order : std::vector{"%n", "n%"}) + { + auto x = op.d.template like_this(test_order, '%', "", {{'n', 2}}); + auto y_rop = op.d.template like_this(test_order, '%', "", {{'n', 2}}); + urand(x, -1, 1); + auto y_op = op(x); + for (int nfrom = 0; nfrom < 2; ++nfrom) + { + for (int nsize = 1; nsize <= 2; ++nsize) + { + y_rop.set(detail::NaN::get()); + auto x0 = x.kvslice_from_size({{'n', nfrom}, {'n', nsize}}); + auto y_op0 = y_op.kvslice_from_size({{'n', nfrom}, {'n', nsize}}); + auto y_rop0 = y_rop.kvslice_from_size({{'n', nfrom}, {'n', nsize}}); + 
auto base_norm0 = norm<1>(y_op0, "n"); + rop(x0, y_rop0); // y_rop0 = rop(x0) + y_op0.scale(-1).addTo(y_rop0); + auto error = norm<1>(y_rop0, "n"); + auto eps = + std::sqrt(std::numeric_limits::type>::epsilon()); + for (int i = 0; i < base_norm0.volume(); ++i) + if (error.get({{i}}) > eps * base_norm0.get({{i}})) + throw std::runtime_error("cloneOperator: too much error on the cloned operator"); + } + } + } + + // Test for powers + for (const auto& test_order : std::vector{"%n^", "n%^"}) + { + const int max_power = 3; + auto x = op.d.template like_this(test_order, '%', "", {{'n', 2}, {'^', 1}}); + auto y_op = + op.d.template like_this(test_order, '%', "", {{'n', 2}, {'^', max_power}}); + urand(x, -1, 1); + op(x).copyTo(y_op.kvslice_from_size({}, {{'^', 1}})); + for (unsigned int i = 1; i < max_power; ++i) + op(y_op.kvslice_from_size({{'^', i - 1}}, {{'^', 1}})) + .copyTo(y_op.kvslice_from_size({{'^', i}}, {{'^', 1}})); + auto y_rop = y_op.like_this(); + for (int nfrom = 0; nfrom < 2; ++nfrom) + { + for (int nsize = 1; nsize <= 2; ++nsize) + { + y_rop.set(detail::NaN::get()); + auto x0 = x.kvslice_from_size({{'n', nfrom}, {'n', nsize}}); + auto y_op0 = y_op.kvslice_from_size({{'n', nfrom}, {'n', nsize}}); + auto y_rop0 = y_rop.kvslice_from_size({{'n', nfrom}, {'n', nsize}}); + auto base_norm0 = norm<2>(y_op0, "n^").template collapse_dimensions<1>("n^", 'n'); + rop(x0, y_rop0, '^'); // y_rop0 = {rop(x0), rop(rop(x0)), ...} + y_op0.scale(-1).addTo(y_rop0); + auto error = norm<2>(y_rop0, "n^").template collapse_dimensions<1>("n^", 'n'); + auto eps = + std::sqrt(std::numeric_limits::type>::epsilon()); + for (int i = 0; i < base_norm0.volume(); ++i) + if (error.get({{i}}) > eps * base_norm0.get({{i}})) + throw std::runtime_error( + "cloneOperator: too much error on the cloned operator for the power"); + } + } + } + + return rop; } - /// Load content from the storage into the given tensor - template ::value == detail::is_complex::value, bool>::type = true> - void 
copyTo(Tensor w) const - { - Coor wsize = kvcoors(order, w.kvdim(), 1, NoThrow); - for (unsigned int i = 0; i < N; ++i) - if (size[i] > wsize[i]) - throw std::runtime_error("The destination tensor is smaller than the source tensor"); + /// Return an efficient operator application + /// \param op: operator to extract the nonzeros from + /// \param co: preferred ordering of dense input and output tensors + /// \param coBlk: ordering of the nonzero blocks of the sparse operator - Tw* w_ptr = w.data.get(); - MPI_Comm comm = (w.dist == Local ? MPI_COMM_SELF : MPI_COMM_WORLD); - superbblas::load(detail::safe_div(scalar, w.scalar), ctx.get(), - order.c_str(), from, size, w.p->p.data(), 1, w.order.c_str(), - w.from, &w_ptr, &*w.ctx, comm, superbblas::FastToSlow, - superbblas::Copy); + template + Operator + cloneOperator(const Operator& op, ColOrdering co, ColOrdering coBlk, + BlockingAsSparseDimensions blockingAsSparseDimensions = ConsiderBlockingSparse, + const std::string& prefix = "") + { + return cloneOperator(op, getFurthestNeighborDistance(op), co, coBlk, + blockingAsSparseDimensions, prefix); } - }; - /// Return a tensor filled with the value of the function applied to each element - /// \param order: dimension labels, they should start with "xyztX" - /// \param size: length of each dimension - /// \param dev: either OnHost or OnDefaultDevice - /// \param func: function (Coor) -> COMPLEX + } - template - Tensor fillLatticeField(const std::string& order, const std::map& from, - const std::map& size, - const std::map& dim, DeviceHost dev, Func func) + namespace detail { - using superbblas::detail::operator+; + inline std::mt19937_64& getSeed() + { + // This is quick and dirty and nonreproducible if the lattice is distributed + // among the processes is different ways. 
+ static std::mt19937_64 twister_engine(10 + Layout::nodeNumber()); + return twister_engine; + } + } - static_assert(N >= 5, "The minimum number of dimensions should be 5"); - if (order.size() < 5 || order.compare(0, 5, "xyztX") != 0) - throw std::runtime_error("Wrong `order`, it should start with xyztX"); + /// Modify with complex random uniformly distributed numbers with the real and the imaginary part between [a,b] + /// \param t: tensor to fill with random numbers + /// \param a: minimum random value + /// \param b: maximum random value - // Get final object dimension - Coor dim_c = latticeSize(order, dim); - std::map size0 = dim; - for (const auto& it : size) - size0[it.first] = it.second; - Coor size_c = latticeSize(order, size0); - Coor from_c = kvcoors(order, from); + template ::value, bool>::type = true> + void urand(Tensor t, typename T::value_type a = 0, typename T::value_type b = 1) + { + std::uniform_real_distribution d(a, b); + t.fillWithCPUFuncNoArgs( + [&]() { + return T{d(detail::getSeed()), d(detail::getSeed())}; + }, + false); + } - // Populate the tensor on CPU - Tensor r(order, size_c, OnHost); - Coor local_latt_size = r.p->localSize(); // local dimensions for xyztX - Coor stride = - superbblas::detail::get_strides(local_latt_size, superbblas::FastToSlow); - Coor local_latt_from = - r.p->localFrom(); // coordinates of first elements stored locally for xyztX - //int latt_from_oddity = latt_from[1] + latt_from[2] + latt_from[3] + latt_from[4]; - local_latt_from = local_latt_from + from_c; - std::size_t vol = superbblas::detail::volume(local_latt_size); - Index nX = r.kvdim()['X']; - COMPLEX* ptr = r.data.get(); + /// Modify with random uniformly distributed numbers between [a,b] + /// \param t: tensor to fill with random numbers + /// \param a: minimum random value + /// \param b: maximum random value + + template ::value, bool>::type = true> + void urand(Tensor t, typename detail::base_type::type a = 0, + typename detail::base_type::type b = 1) 
+ { + std::uniform_real_distribution::type> d(a, b); + t.fillWithCPUFuncNoArgs([&]() { return d(detail::getSeed()); }, false); + } -# ifdef _OPENMP -# pragma omp parallel for schedule(static) -# endif - for (std::size_t i = 0; i < vol; ++i) - { - // Get the global coordinates - Coor c = normalize_coor( - superbblas::detail::index2coor(i, local_latt_size, stride) + local_latt_from, dim_c); + /// Modify with complex random normal distributed numbers + /// \param t: tensor to fill with random numbers - // Translate even-odd coordinates to natural coordinates - Coor coor; - coor[0] = c[0] * 2 + (c[1] + c[2] + c[3] + c[4]) % nX; // x - coor[1] = c[1]; // y - coor[2] = c[2]; // z - coor[3] = c[3]; // t - std::copy_n(c.begin() + 5, N - 5, coor.begin() + 4); + template ::value, bool>::type = true> + void nrand(Tensor t) + { + std::normal_distribution d{}; + t.fillWithCPUFuncNoArgs( + [&]() { + return T{d(detail::getSeed()), d(detail::getSeed())}; + }, + false); + } - // Call the function - ptr[i] = func(coor); - } + /// Modify with random normal distributed numbers + /// \param t: tensor to fill with random numbers - return r.make_sure(none, dev); + template ::value, bool>::type = true> + void nrand(Tensor t) + { + std::normal_distribution::type> d{}; + t.fillWithCPUFuncNoArgs([&]() { return d(detail::getSeed()); }, false); } /// Compute a shift of v onto the direction dir @@ -2551,22 +8610,22 @@ namespace Chroma /// \param dir: 0 is x; 1 is y... 
template - Tensor shift(const Tensor v, Index first_tslice, int len, int dir, - Maybe action = none, Maybe> w=none) + Tensor shift(const Tensor& v, Index first_tslice, int len, int dir, + Maybe action = none, Tensor w = {}) { if (dir < 0 || dir >= Nd - 1) throw std::runtime_error("Invalid direction"); - if (action.hasSome() != w.hasSome()) + if (action.hasSome() != (bool)w) throw std::runtime_error("Invalid default value"); // Address zero length case if (len == 0) { - if (!w.hasSome()) + if (!w) return v; - v.doAction(action.getSome(), w.getSome()); - return w.getSome(); + v.doAction(action.getSome(), w); + return w; } // NOTE: chroma uses the reverse convention for direction: shifting FORWARD moves the sites on the negative direction @@ -2577,49 +8636,49 @@ namespace Chroma // If we are not using red-black ordering, return a view where the tensor is shifted on the given direction v = v.kvslice_from_size({{dir_label[dir], -len}}); - if (!w.hasSome()) + if (!w) return v; - v.doAction(action, w.getSome()); - return w.getSome(); + v.doAction(action, w); + return w; # elif QDP_USE_CB2_LAYOUT - // Assuming that v has support on the origin and destination lattice elements - int dimX = v.kvdim()['X']; - if (dimX != 2 && len % 2 != 0) - throw std::runtime_error("Unsupported shift"); - - if (dir != 0) + int dimX = v.kvdim().at('X'); + if (dir != 0 || dimX == 1) { - if (!w.hasSome()) + if (!w) return v.kvslice_from_size({{'X', -len}, {dir_label[dir], -len}}); - v.doAction(action.getSome(), - w.getSome().kvslice_from_size({{'X', len}, {dir_label[dir], len}})); - return w.getSome(); + v.doAction(action.getSome(), w.kvslice_from_size({{'X', len}, {dir_label[dir], len}})); + return w; } else { - int t = v.kvdim()['t']; + auto dims = v.kvdim(); + int t = dims.at('t'); if (t > 1 && t % 2 == 1) throw std::runtime_error( "The t dimension should be zero, one, or even when doing shifting on the X dimension"); - int maxT = std::min(2, t); - auto v_eo = v.split_dimension('y', "Yy", 2) 
- .split_dimension('z', "Zz", 2) + int maxX = dims.at('X'); + int maxY = std::min(2, dims.at('y')); + int maxZ = std::min(2, dims.at('z')); + int maxT = std::min(2, dims.at('t')); + auto v_eo = v.split_dimension('y', "Yy", maxY) + .split_dimension('z', "Zz", maxZ) .split_dimension('t', "Tt", maxT); - Tensor r = w.hasSome() ? w.getSome() : v.like_this(); - auto r_eo = r.split_dimension('y', "Yy", 2) - .split_dimension('z', "Zz", 2) + Tensor r = w ? w : v.like_this(); + auto r_eo = r.split_dimension('y', "Yy", maxY) + .split_dimension('z', "Zz", maxZ) .split_dimension('t', "Tt", maxT); + //.make_writing_nonatomic(); while (len < 0) - len += v.kvdim()['x'] * 2; + len += dims.at('x') * maxX; for (int T = 0; T < maxT; ++T) { - for (int Z = 0; Z < 2; ++Z) + for (int Z = 0; Z < maxZ; ++Z) { - for (int Y = 0; Y < 2; ++Y) + for (int Y = 0; Y < maxY; ++Y) { - for (int X = 0; X < 2; ++X) + for (int X = 0; X < maxX; ++X) { auto v_eo_slice = v_eo.kvslice_from_size({{'X', X}, {'Y', Y}, {'Z', Z}, {'T', T}}, {{'X', 1}, {'Y', 1}, {'Z', 1}, {'T', 1}}); @@ -2651,21 +8710,21 @@ namespace Chroma template Tensor displace(const std::vector>& u, Tensor v, Index first_tslice, int dir, Maybe action = none, - Maybe> w = none) + Tensor w = {}) { if (std::abs(dir) > Nd) throw std::runtime_error("Invalid direction"); - if (action.hasSome() != w.hasSome()) + if (action.hasSome() != (bool)w) throw std::runtime_error("Invalid default value"); // Address the zero direction case if (dir == 0) { - if (!w.hasSome()) + if (!w) return v; - v.doAction(action.getSome(), w.getSome()); - return w.getSome(); + v.doAction(action.getSome(), w); + return w; } int d = std::abs(dir) - 1; // space lattice direction, 0: x, 1: y, 2: z @@ -2675,7 +8734,7 @@ namespace Chroma if (len > 0) { // Do u[d] * shift(x,d) - Tensor r = w.hasSome() ? w.getSome() : v.like_this(); + Tensor r = w ? 
w : v.like_this(); v = shift(std::move(v), first_tslice, len, d); r.contract(std::move(v), {}, NotConjugate, u[d], {{'j', 'c'}}, NotConjugate, {{'c', 'i'}}, action.getSome(CopyTo) == CopyTo ? 0.0 : 1.0); @@ -2741,16 +8800,48 @@ namespace Chroma // r = conj(phases) * displace(u, v, dir) Tensor r = v.like_this("c%xyzXtm", '%'); - r.contract(displace(u, v, first_tslice, -dir), {}, NotConjugate, - asTensorView(phases), {{'i', 'm'}}, Conjugate); + r.contract(displace(u, v, first_tslice, -dir), {}, NotConjugate, asTensorView(phases), + {{'i', 'm'}}, Conjugate); // r = r - phases * displace(u, v, dir) if !ConjUnderAdd else r + phases * displace(u, v, dir) - r.contract(displace(u, v, first_tslice, dir).scale(conjUnderAdd ? 1 : -1), - {}, NotConjugate, asTensorView(phases), {{'i', 'm'}}, NotConjugate, {}, 1.0); + r.contract(displace(u, v, first_tslice, dir).scale(conjUnderAdd ? 1 : -1), {}, NotConjugate, + asTensorView(phases), {{'i', 'm'}}, NotConjugate, {}, 1.0); return r; } + /// Returns the \gamma_5 for a given number of spins + /// \param ns: number of spins + + template + Tensor<2, COMPLEX> getGamma5(int ns, DeviceHost dev = OnDefaultDevice, + const Distribution& dist = OnEveryoneReplicated) + { + const auto& new_dist = detail::compatible_replicated_distribution(dist); + if (ns == 1) + { + Tensor<2, COMPLEX> r("ij", {1, 1}, OnHost, new_dist); + r.set({{0, 0}}, COMPLEX{1}); + return r.make_sure(none, dev); + } + else if (ns == Ns) + { + return SB::Gamma(Ns * Ns - 1).template make_sure(none, dev, new_dist); + } + else if (ns == 2) + { + Tensor<2, COMPLEX> r("ij", {2, 2}, OnHost, new_dist); + r.set_zero(); + r.set({{0, 0}}, COMPLEX{1}); + r.set({{1, 1}}, COMPLEX{-1}); + return r.make_sure(none, dev); + } + else + { + throw std::runtime_error("Error in getGamma5: Unsupported spin number"); + } + } + // template // class Transform : public Tensor { // public: @@ -2896,8 +8987,8 @@ namespace Chroma assert(tnat.p->localVolume() == perm.size() * Nc); unsigned int i1 = 
perm.size(); - const T* x = tnat.data.get(); - T* y = trb.data.get(); + const T* x = tnat.data(); + T* y = trb.data(); # ifdef _OPENMP # pragma omp parallel for schedule(static) @@ -2920,8 +9011,8 @@ namespace Chroma assert(tnat.p->localVolume() == perm.size() * Nc); unsigned int i1 = perm.size(); - T* x = tnat.data.get(); - const T* y = trb.data.get(); + T* x = tnat.data(); + const T* y = trb.data(); # ifdef _OPENMP # pragma omp parallel for schedule(static) @@ -2937,19 +9028,44 @@ namespace Chroma template Tensor getPhase(Coor phase, int tfrom, int tsize, - DeviceHost dev = OnDefaultDevice) + DeviceHost dev = OnDefaultDevice, + const Distribution& dist = OnEveryone) { // Get spatial dimensions of the current lattice Coor dim = latticeSize("xyzX", {}); dim[0] *= dim[3]; - return fillLatticeField<5, T>("xyztX", {{'t', tfrom}}, {{'t', tsize}}, {}, dev, - [=](Coor c) { - typename T::value_type phase_dot_coor = 0; - for (int i = 0; i < Nd - 1; ++i) - phase_dot_coor += c[i] * 2 * M_PI * phase[i] / dim[i]; + return fillLatticeField<5, T>( + "xyztX", {{'t', tfrom}}, {{'t', tsize}}, {}, dev, + [=](Coor c) { + typename T::value_type phase_dot_coor = 0; + for (int i = 0; i < Nd - 1; ++i) + phase_dot_coor += c[i] * 2 * M_PI * phase[i] / dim[i]; + + return T{cos(phase_dot_coor), sin(phase_dot_coor)}; + }, + true /* zero is even */, dist); + } + + /// Return a lattice field with value exp(2*pi*(x./dim)'*phase) for each lattice site x + /// \param phase: integer phase + /// \param dev: device of the returned tensor - return T{cos(phase_dot_coor), sin(phase_dot_coor)}; - }); + template + Tensor getPhaseNatural(Coor phase, DeviceHost dev = OnDefaultDevice, + const Distribution& dist = OnEveryone) + { + // Get spatial dimensions of the current lattice + Coor dim = latticeSize("xyztX", {{'t', 1}}); + dim[0] *= dim[4]; + dim[4] = 1; + auto r = Tensor("xyztX", dim, dev, dist); + r.fillCpuFunCoor([&](const Coor& c) { + typename T::value_type phase_dot_coor = 0; + for (int i = 0; i < 
Nd - 1; ++i) + phase_dot_coor += c[i] * 2 * M_PI * phase[i] / dim[i]; + return T{cos(phase_dot_coor), sin(phase_dot_coor)}; + }); + return r; } # if defined(BUILD_PRIMME) @@ -2969,22 +9085,32 @@ namespace Chroma // chi = -2*N*psi psi.scale(-2 * N).copyTo(chi); - // I have no idea how to do this.... - using MaybeTensor = Maybe>; - for (int mu = 0; mu < N; ++mu) { - displace(u, psi, first_tslice, mu + 1, Action::AddTo, MaybeTensor(chi)); - displace(u, psi, first_tslice, -(mu + 1), Action::AddTo, MaybeTensor(chi)); + displace(u, psi, first_tslice, mu + 1, Action::AddTo, chi); + displace(u, psi, first_tslice, -(mu + 1), Action::AddTo, chi); } } +# if defined(SUPERBBLAS_USE_CUDA) + void inline gpuBlasCheck(cublasStatus_t s) + { + superbblas::detail::gpuBlasCheck(s); + } +# elif defined(SUPERBBLAS_USE_HIP) + void inline gpuBlasCheck(hipblasStatus_t s) + { + if (s != HIPBLAS_STATUS_SUCCESS) + throw std::runtime_error("hipblas error"); + } +# endif + // Auxiliary structure passed to PRIMME's matvec + template struct OperatorAux { - const std::vector> u; // Gauge fields - const Index first_tslice; // global t index - const std::string order; // Laplacian input/output tensor's order + const Operator op; // Operator, most likely cxyztX or csxyztX + const DeviceHost primme_dev; // where primme allocations are }; // Wrapper for PRIMME of `LaplacianOperator` @@ -2995,8 +9121,9 @@ namespace Chroma /// \param blockSize: number of input/output vectors /// \param ierr: output error state (zero means ok) - extern "C" inline void primmeMatvec(void* x, PRIMME_INT* ldx, void* y, PRIMME_INT* ldy, - int* blockSize, primme_params* primme, int* ierr) + template + inline void primmeMatvec(void* x, PRIMME_INT* ldx, void* y, PRIMME_INT* ldy, int* blockSize, + primme_params* primme, int* ierr) { *ierr = -1; try @@ -3005,19 +9132,50 @@ namespace Chroma if (*blockSize > 1 && (*ldx != primme->nLocal || *ldy != primme->nLocal)) throw std::runtime_error("We cannot play with the leading dimensions"); 
- OperatorAux& opaux = *(OperatorAux*)primme->matrix; - Coor size = latticeSize(opaux.order, {{'n', *blockSize}, {'t', 1}}); - Tensor tx(opaux.order, size, OnDefaultDevice, OnEveryone, - std::shared_ptr((ComplexD*)x, [](ComplexD*) {})); - Tensor ty(opaux.order, size, OnDefaultDevice, OnEveryone, - std::shared_ptr((ComplexD*)y, [](ComplexD*) {})); - LaplacianOperator(opaux.u, opaux.first_tslice, ty, tx); + OperatorAux& opaux = *(OperatorAux*)primme->matrix; + const std::string order(opaux.op.d.order + std::string("n")); + auto dim = opaux.op.i.kvdim(); + dim['n'] = *blockSize; + Coor size = kvcoors(order, dim); + Tensor tx(order, size, opaux.primme_dev, opaux.op.i.dist, (ComplexD*)x); + Tensor ty(order, size, opaux.primme_dev, opaux.op.i.dist, (ComplexD*)y); + assert(tx.getLocal().volume() == primme->nLocal * (*blockSize)); + if (dim.count('s') == 0) + { + // ty = op * tx + opaux.op(tx, ty); + } + else + { + // ty = op * g5 * tx + auto g5 = getGamma5(dim['s'], opaux.primme_dev); + opaux.op( + contract(g5.rename_dims({{'j', 's'}}), tx, "s").rename_dims({{'i', 's'}}), ty); + } + assert(ty.allocation->pending_operations.size() == 0); +# if defined(SUPERBBLAS_USE_CUDA) + // Make sure cublas handle operates on legacy stream for primme + gpuBlasCheck(cublasSetStream(*(superbblas::detail::GpuBlasHandle*)primme->queue, 0)); +# endif *ierr = 0; } catch (...) 
{ } } + extern "C" inline void primmeMatvecLaplacian(void* x, PRIMME_INT* ldx, void* y, + PRIMME_INT* ldy, int* blockSize, + primme_params* primme, int* ierr) + { + primmeMatvec(x, ldx, y, ldy, blockSize, primme, ierr); + } + + extern "C" inline void primmeMatvecFermion(void* x, PRIMME_INT* ldx, void* y, PRIMME_INT* ldy, + int* blockSize, primme_params* primme, int* ierr) + { + primmeMatvec(x, ldx, y, ldy, blockSize, primme, ierr); + } + /// Wrapper for PRIMME of a global sum for double /// \param sendBuf: pointer to input vector /// \param recvBuf: pointer to output vector @@ -3056,39 +9214,70 @@ namespace Chroma detail::check_order_contains(order, "cxyztXn"); Tensor all_evecs( order, latticeSize(order, {{'n', n_colorvecs}, {'t', n_tslices}}), - OnDefaultDevice, OnEveryone); - std::vector> all_evals; + OnDefaultDevice, "t"); + Tensor<2, double> all_evals( + "nt", latticeSize<2>("nt", {{'n', n_colorvecs}, {'t', n_tslices}}), OnDefaultDevice, "t"); + + // Distribute ut only on the t direction + std::vector> ut_global(Nd - 1); + for (unsigned int d = 0; d < Nd - 1; d++) + { + ut_global[d] = asTensorView(u[d]) + .kvslice_from_size({{'t', from_tslice}}, {{'t', n_tslices}}) + .toComplex() + .template make_sure("ijxyztX", none, "t"); + } + + // If the 3D laplacian operator is big enough, run it on device + DeviceHost primme_dev = OnHost; +# if defined(SUPERBBLAS_USE_GPU) + primme_dev = OnDefaultDevice; +# endif +# if defined(SUPERBBLAS_USE_HIP) + hipblasHandle_t gpublas_handle; + gpuBlasCheck(hipblasCreate(&gpublas_handle)); +# endif for (Index t = 0; t < n_tslices; ++t) { - // Make a copy of the time-slicing of u[d] also supporting left and right - std::vector> ut(Nd); + // Distribute ut only on the t direction + std::vector> ut(Nd - 1); for (unsigned int d = 0; d < Nd - 1; d++) { - ut[d] = asTensorView(u[d]) - .kvslice_from_size({{'t', from_tslice + t}}, {{'t', 1}}) - .toComplex() - .template make_sure("ijxyztX"); + ut[d] = ut_global[d].kvslice_from_size({{'t', t}}, 
{{'t', 1}}).getLocal(); } + if (!ut[0]) + continue; + + // Create an efficient representation of the laplacian operator + std::string order("cxyztX"); + auto eg = + ut[0].template like_this(order, {{'c', Nc}}, primme_dev, none).make_eg(); + OperatorLayout op_layout = + ((from_tslice + t) % 2 == 0 ? XEvenOddLayout : XEvenOddLayoutZeroOdd); + auto laplacianOp = Chroma::SB::detail::cloneOperator( + Operator{ + [&](Tensor x, Tensor y) { + LaplacianOperator(ut, from_tslice + t, y, x); + }, + eg, eg, nullptr, "", op_layout, op_layout, + detail::getNeighbors(eg.kvdim(), 1 /* near-neighbors links only */, op_layout), + ColumnMajor, false /* no kronecker op */}, + ColumnMajor, RowMajor, Chroma::SB::detail::ConsiderBlockingDense, "laplacian"); + // Create an auxiliary struct for the PRIMME's matvec // NOTE: Please keep 'n' as the slowest index; the rows of vectors taken by PRIMME's matvec has dimensions 'cxyztX', - // and 'n' is the dimension for the columns. - OperatorAux opaux{ut, from_tslice + t, "cxyztXn"}; + // and 'n' is the dimension for the columns. 
+ OperatorAux opaux{laplacianOp, primme_dev}; // Make a bigger structure holding primme_params primme; primme_initialize(&primme); // Get the global and local size of evec - std::size_t n, nLocal; - { - Tensor aux_tensor( - opaux.order, latticeSize(opaux.order, {{'n', 1}, {'t', 1}}), OnDefaultDevice, - OnEveryone); - n = aux_tensor.volume(); - nLocal = aux_tensor.getLocal().volume(); - } + std::size_t n = eg.volume(); + std::size_t nLocal = eg.localVolume(); if (n_colorvecs > n) { @@ -3105,20 +9294,17 @@ namespace Chroma primme.eps = 1e-9; primme.target = primme_largest; - // Set parallel settings - primme.nLocal = nLocal; - primme.numProcs = QDP::Layout::numNodes(); - primme.procID = QDP::Layout::nodeNumber(); - primme.globalSumReal = primmeGlobalSum; - // No preconditioner for my matrix - primme.matrixMatvec = primmeMatvec; + primme.matrixMatvec = primmeMatvecLaplacian; primme.matrix = &opaux; // Set block size - primme.maxBasisSize = 64; - primme.maxBlockSize = 4; - primme.ldOPs = primme.nLocal; + if (n > 128) + { + primme.maxBasisSize = 64; + primme.maxBlockSize = 4; + } + primme.ldOPs = n; // Should set lots of defaults if (primme_set_method(PRIMME_DEFAULT_MIN_TIME, &primme) < 0) @@ -3127,24 +9313,42 @@ namespace Chroma QDP_abort(1); } +# if defined(SUPERBBLAS_USE_GPU) + // Primme block orthogonalization is very slow on gpus + primme.orth = primme_orth_implicit_I; +# endif + // Allocate space for converged Ritz values and residual norms std::vector evals(primme.numEvals); std::vector rnorms(primme.numEvals); + const std::string evecs_order(eg.order + std::string("n")); Tensor evecs( - opaux.order, latticeSize(opaux.order, {{'n', primme.numEvals}, {'t', 1}}), - OnDefaultDevice, OnEveryone); -# if defined(SUPERBBLAS_USE_CUDA) - primme.queue = &*evecs.ctx->cublasHandle; -# elif defined(SUPERBBLAS_USE_HIP) - primme.queue = &*evecs.ctx->hipblasHandle; + evecs_order, latticeSize(evecs_order, {{'n', primme.numEvals}, {'t', 1}}), + primme_dev, eg.dist); + 
assert(evecs.localVolume() == primme.n * primme.numEvals); +# if defined(SUPERBBLAS_USE_GPU) +# if defined(SUPERBBLAS_USE_CUDA) + superbblas::detail::GpuBlasHandle gpublas_handle = + superbblas::detail::getGpuBlasHandle(evecs.ctx().toGpu(0)); + // Make sure cublas handle operates on legacy stream for primme + gpuBlasCheck(cublasSetStream(gpublas_handle, 0)); + +# endif + primme.queue = &gpublas_handle; # endif // Call primme + int ret; # if defined(SUPERBBLAS_USE_GPU) - int ret = cublas_zprimme(evals.data(), evecs.data.get(), rnorms.data(), &primme); -# else - int ret = zprimme(evals.data(), evecs.data.get(), rnorms.data(), &primme); + if (primme_dev == OnDefaultDevice) + { + ret = cublas_zprimme(evals.data(), evecs.data(), rnorms.data(), &primme); + } + else # endif + { + ret = zprimme(evals.data(), evecs.data(), rnorms.data(), &primme); + } if (primme.procID == 0) { @@ -3173,18 +9377,15 @@ namespace Chroma if (evals.size() > 0) { auto r = evecs.like_this(); - LaplacianOperator(opaux.u, opaux.first_tslice, r, evecs); + LaplacianOperator(ut, from_tslice + t, r, evecs); std::vector> evals_cmpl(evals.begin(), evals.end()); - r.contract(evecs, {}, NotConjugate, - asTensorView(evals_cmpl).rename_dims({{'i', 'n'}}).scale(-1), {}, - NotConjugate, {}, 1); - std::vector> norm2_r(evals.size()); - asTensorView(norm2_r) - .rename_dims({{'i', 'n'}}) - .contract(r, {}, Conjugate, r, {}, NotConjugate); - for (const auto& i : norm2_r) + contract( + evecs, asTensorView(evals_cmpl, Local).rename_dims({{'i', 'n'}}).scale(-1), "", AddTo, + r); + auto rnorm = norm<1>(r, "n"); + for (int i = 0, vol = rnorm.volume(); i < vol; ++i) { - if (std::sqrt(std::real(i)) > primme.stats.estimateLargestSVal * primme.eps * 10) + if (rnorm.get({{i}}) > primme.stats.estimateLargestSVal * primme.eps * 10) { QDPIO::cerr << "Error: primme returned eigenpairs with too much error\n"; QDP_abort(1); @@ -3193,11 +9394,27 @@ namespace Chroma } // Copy evecs into all_evecs - 
evecs.copyTo(all_evecs.kvslice_from_size({{'t', t}}, {{'t', 1}})); - all_evals.push_back(evals); + evecs.copyTo(all_evecs.kvslice_from_size({{'t', t}}, {{'t', 1}}).getLocal()); + asTensorView(evals, Local) + .rename_dims({{'i', 'n'}}) + .copyTo(all_evals.kvslice_from_size({{'t', t}}, {{'t', 1}}).getLocal()); + } + +# if defined(SUPERBBLAS_USE_HIP) + gpuBlasCheck(hipblasDestroy(gpublas_handle)); +# endif + + // Broadcast all_evals to everyone + std::vector> all_evals_r; + for (int t = 0; t < n_tslices; ++t) + { + std::vector evals(n_colorvecs); + all_evals.kvslice_from_size({{'t', t}}, {{'t', 1}}) + .copyTo(asTensorView(evals).rename_dims({{'i', 'n'}})); + all_evals_r.push_back(evals); } - return {all_evecs, all_evals}; + return {all_evecs.make_sure(none, none, OnEveryone), all_evals_r}; } # else // BUILD_PRIMME inline std::pair, std::vector>> @@ -3361,15 +9578,10 @@ namespace Chroma } // Compute the 2-norm of colorvecs_s3t and check that no vector is null - - Tensor<2, ComplexD> colorvecs_s3t_norms2("nt", Coor<2>{n_colorvecs, n_tslices}, OnHost, - OnEveryoneReplicated); - colorvecs_s3t_norms2.contract(colorvecs_s3t, {}, Conjugate, colorvecs_s3t, {}, - NotConjugate); - + auto colorvecs_s3t_norms = norm<2>(colorvecs_s3t, "nt"); for (int t = 0; t < n_tslices; ++t) for (int n = 0; n < n_colorvecs; ++n) - if (std::norm(colorvecs_s3t_norms2.get({n, t})) == 0) + if (colorvecs_s3t_norms.get({n, t}) == 0) throw std::runtime_error( "no colorvec exists with key t_slice= " + std::to_string(t + from_tslice) + " colorvec= " + std::to_string(n)); @@ -3377,9 +9589,17 @@ namespace Chroma if (write_fingerprint) { // Compute the colorvecs - auto colorvecs = - ns_getColorvecs::computeColorvecs(u_smr, from_tslice, n_tslices, n_colorvecs, order_) - .first; + auto colorvecs_values = + ns_getColorvecs::computeColorvecs(u_smr, from_tslice, n_tslices, n_colorvecs, order_); + auto colorvecs = colorvecs_values.first; + if (superbblas::getLogLevel() > 0 && Layout::nodeNumber() == 0) + { + 
std::cout << "Printing computed eigenvalues:" << std::endl; + for (int t = 0; t < n_tslices; ++t) + for (int n = 0; n < n_colorvecs; ++n) + std::cout << "Eigenvalue for t= " << (from_tslice + t) % Nt << " : " + << colorvecs_values.second[t][n] << std::endl; + } // We need to phase the individual eigenvectors so that the have the same phase as the // s3t's colorvecs. That is, we need to apply a phase phi[i] to each eigenvector so that @@ -3392,21 +9612,36 @@ namespace Chroma // // Therefore, phi[i] = (colorvecs_s3t[i]^\dagger * colorvecs_s3t[i]) / (colorvecs_s3t[i]^\dagger * colorvecs[i]) - auto ip = colorvecs_s3t_norms2.like_this(); - ip.contract(colorvecs_s3t, {}, Conjugate, colorvecs, {}, NotConjugate); + auto ip = contract<2>(colorvecs_s3t.conj(), colorvecs, + detail::remove_dimensions(colorvecs.order, "nt"), OnHost, + OnEveryoneReplicated) + .reorder("nt"); auto phi = ip.like_this(); + bool error = false; for (int t = 0; t < n_tslices; ++t) { for (int n = 0; n < n_colorvecs; ++n) { - auto phi_i = colorvecs_s3t_norms2.get({n, t}) / ip.get({n, t}); - if (std::fabs(std::fabs(phi_i) - 1) > 1e-4) - throw std::runtime_error( - "The colorvec fingerprint does not correspond to current gates field"); + auto cv_norm = colorvecs_s3t_norms.get({n, t}); + auto phi_i = cv_norm * cv_norm / ip.get({n, t}); + if (std::fabs(std::abs(phi_i) - 1) > 1e-4) + { + error = true; + if (Layout::nodeNumber() == 0) + { + std::cout << "warning: The colorvec fingerprint does not correspond to current " + "gates field: deviation of the phase: " + << std::fabs(std::abs(phi_i) - 1) << " on t slice " + << (from_tslice + t) % Nt << " and vector " << n << std::endl; + } + } phi.set({n, t}, phi_i); } } + if (error) + throw std::runtime_error( + "The colorvec fingerprint does not correspond to current gates field"); // Apply the phase of the colorvecs in s3t to the computed colorvecs colorvecs_s3t.contract(colorvecs, {}, NotConjugate, phi, {}, NotConjugate); @@ -3515,14 +9750,14 @@ namespace Chroma 
template Tensor phaseColorvecs(Tensor colorvecs, int from_tslice, - Coor phase = {}) + Coor phase = {{}}) { // Phase colorvecs if phase != (0,0,0) - if (phase == Coor{}) + if (phase == Coor{{}}) return colorvecs; Tensor tphase = ns_getColorvecs::getPhase( - phase, from_tslice, colorvecs.kvdim()['t'], colorvecs.getDev()); + phase, from_tslice, colorvecs.kvdim()['t'], colorvecs.getDev(), colorvecs.dist); Tensor r = colorvecs.like_this(); r.contract(colorvecs, {}, NotConjugate, tphase, {}, NotConjugate); return r; @@ -3540,11 +9775,11 @@ namespace Chroma /// \return: a tensor containing the eigenvectors template - Tensor getColorvecs(const ColorvecsStorage& sto, - const multi1d& u, int decay_dir, - int from_tslice, int n_tslices, int n_colorvecs, - const Maybe& order = none, - Coor phase = {}, DeviceHost dev = OnDefaultDevice) + Tensor + getColorvecs(const ColorvecsStorage& sto, const multi1d& u, int decay_dir, + int from_tslice, int n_tslices, int n_colorvecs, + const Maybe& order = none, Coor phase = {{}}, + DeviceHost dev = OnDefaultDevice) { StopWatch sw; sw.reset(); @@ -3586,12 +9821,11 @@ namespace Chroma /// match the computed ones, they are the ones stored; this guarantee that the /// that given smearing options were used to generate the colorvecs in `colorvec_file_src` - inline void - createColorvecStorage(const std::string& colorvec_file, GroupXML_t link_smear, - const multi1d& u, int from_tslice, int n_tslices, - int n_colorvecs, bool use_s3t_storage = false, bool fingerprint = false, - Coor phase = {}, - const Maybe>& colorvec_file_src = none) + inline void createColorvecStorage( + const std::string& colorvec_file, GroupXML_t link_smear, const multi1d& u, + int from_tslice, int n_tslices, int n_colorvecs, bool use_s3t_storage = false, + bool fingerprint = false, Coor phase = {{}}, + const Maybe>& colorvec_file_src = none, int max_tslices = 0) { // Check input const int Nt = Layout::lattSize()[3]; @@ -3632,7 +9866,7 @@ namespace Chroma // Open the DB and 
write metada MOD_t mod; StorageTensor sto; - Coor<3> fingerprint_dim{}; + Coor<3> fingerprint_dim{{}}; if (!use_s3t_storage) { @@ -3703,75 +9937,95 @@ namespace Chroma if (colorvec_file_src.getSome({}).size() > 0) colorvecsSto = openColorvecStorage(colorvec_file_src.getSome()); - for (int i_tslice = 0; i_tslice < n_tslices; ++i_tslice, from_tslice = (from_tslice + 1) % Nt) + if (max_tslices == 0) + max_tslices = std::min(n_tslices, Layout::numNodes()); + for (int i_tslice = 0, from_tslice0 = from_tslice, + n_tslices0 = std::min(n_tslices, max_tslices); + i_tslice < n_tslices; + i_tslice += n_tslices0, from_tslice0 = (from_tslice + i_tslice) % Nt, + n_tslices0 = std::min(n_tslices - i_tslice, max_tslices)) { // Compute colorvecs std::string order = "cxyzXtn"; auto colorvecs_and_evals = - ns_getColorvecs::computeColorvecs(u_smr, from_tslice, 1, n_colorvecs, order); + ns_getColorvecs::computeColorvecs(u_smr, from_tslice0, n_tslices0, n_colorvecs, order); + if (superbblas::getLogLevel() > 0 && Layout::nodeNumber() == 0) + { + std::cout << "Printing computed eigenvalues:" << std::endl; + for (int t = 0; t < n_tslices0; ++t) + for (int n = 0; n < n_colorvecs; ++n) + std::cout << "Eigenvalue for t= " << (from_tslice0 + t) % Nt << " : " + << colorvecs_and_evals.second[t][n] << std::endl; + } auto colorvecs = colorvecs_and_evals.first; // Read the eigenvectors from another source if indicated if (colorvec_file_src.getSome({}).size() > 0) { auto colorvecs_src = - getColorvecs(colorvecsSto, u, 3, from_tslice, 1, n_colorvecs); + getColorvecs(colorvecsSto, u, 3, from_tslice0, n_tslices0, n_colorvecs); - Tensor<2, ComplexD> ip("nt", Coor<2>{n_colorvecs, 1}, OnHost, OnEveryoneReplicated); + Tensor<2, ComplexD> ip("nt", Coor<2>{n_colorvecs, n_tslices0}, OnHost, + OnEveryoneReplicated); ip.contract(colorvecs, {}, Conjugate, colorvecs_src, {}, NotConjugate); - for (int n = 0; n < n_colorvecs; ++n) - if (std::fabs(std::fabs(ip.get({n, 0})) - 1) > 1e-4) - throw std::runtime_error( - 
"The given colorvec does not correspond to current gates field and smearing"); + for (int t = 0; t < n_tslices0; ++t) + for (int n = 0; n < n_colorvecs; ++n) + if (std::fabs(std::abs(ip.get({n, t})) - 1) > 1e-4) + throw std::runtime_error( + "The given colorvec does not correspond to current gates field and smearing"); colorvecs = colorvecs_src; } // Phase colorvecs - colorvecs = phaseColorvecs(colorvecs, from_tslice, phase); + colorvecs = phaseColorvecs(colorvecs, from_tslice0, phase); - // Compute the permutation from natural ordering to red-black - std::vector perm = ns_getColorvecs::getPermFromNatToRB(from_tslice); - - // Store the colorvecs in natural order (not in red-black ordering) - if (!use_s3t_storage) + for (int t = 0; t < n_tslices0; ++t) { - // Allocate a single time slice colorvec in natural ordering, as colorvec are stored - Tensor tnat("cxyz", latticeSize("cxyz", {{'x', Layout::lattSize()[0]}}), - OnHost, OnMaster); - - // Allocate a single time slice colorvec in case of using RB ordering - Tensor trb("cxyzX", latticeSize("cxyzX"), OnHost, OnMaster); + // Compute the permutation from natural ordering to red-black + int t0 = (from_tslice0 + t) % Nt; + std::vector perm = ns_getColorvecs::getPermFromNatToRB(t0); - for (int n = 0; n < n_colorvecs; ++n) + // Store the colorvecs in natural order (not in red-black ordering) + if (!use_s3t_storage) { - KeyTimeSliceColorVec_t time_key; - time_key.t_slice = from_tslice; - time_key.colorvec = n; - colorvecs.kvslice_from_size({{'t', 0}, {'n', n}}, {{'t', 1}, {'n', 1}}).copyTo(trb); - ns_getColorvecs::toNat(perm, trb, tnat); - mod.insert(time_key, tnat); + // Allocate a single time slice colorvec in natural ordering, as colorvec are stored + Tensor tnat( + "cxyz", latticeSize("cxyz", {{'x', Layout::lattSize()[0]}}), OnHost, OnMaster); + + // Allocate a single time slice colorvec in case of using RB ordering + Tensor trb("cxyzX", latticeSize("cxyzX"), OnHost, OnMaster); + + for (int n = 0; n < n_colorvecs; ++n) 
+ { + KeyTimeSliceColorVec_t time_key; + time_key.t_slice = t0; + time_key.colorvec = n; + colorvecs.kvslice_from_size({{'t', t}, {'n', n}}, {{'t', 1}, {'n', 1}}).copyTo(trb); + ns_getColorvecs::toNat(perm, trb, tnat); + mod.insert(time_key, tnat); + } } - } - else - { - // Allocate a single time slice colorvec in natural ordering, as colorvec are stored - Tensor tnat("cxyz", latticeSize("cxyz", {{'x', Layout::lattSize()[0]}}), - OnHost, OnMaster); + else + { + // Allocate a single time slice colorvec in natural ordering, as colorvec are stored + Tensor tnat( + "cxyz", latticeSize("cxyz", {{'x', Layout::lattSize()[0]}}), OnHost, OnMaster); - // Allocate a single time slice colorvec in case of using RB ordering - Tensor trb("cxyzX", latticeSize("cxyzX"), OnHost, OnMaster); + // Allocate a single time slice colorvec in case of using RB ordering + Tensor trb("cxyzX", latticeSize("cxyzX"), OnHost, OnMaster); - std::map colorvec_size{}; - if (fingerprint) - colorvec_size = std::map{ - {'x', fingerprint_dim[0]}, {'y', fingerprint_dim[1]}, {'z', fingerprint_dim[2]}}; + std::map colorvec_size{}; + if (fingerprint) + colorvec_size = std::map{ + {'x', fingerprint_dim[0]}, {'y', fingerprint_dim[1]}, {'z', fingerprint_dim[2]}}; - for (int n = 0; n < n_colorvecs; ++n) - { - colorvecs.kvslice_from_size({{'t', 0}, {'n', n}}, {{'t', 1}, {'n', 1}}).copyTo(trb); - ns_getColorvecs::toNat(perm, trb, tnat); - sto.kvslice_from_size({{'t', from_tslice}, {'n', n}}, {{'t', 1}, {'n', 1}}) - .copyFrom(tnat.kvslice_from_size({}, colorvec_size)); + for (int n = 0; n < n_colorvecs; ++n) + { + colorvecs.kvslice_from_size({{'t', t}, {'n', n}}, {{'t', 1}, {'n', 1}}).copyTo(trb); + ns_getColorvecs::toNat(perm, trb, tnat); + sto.kvslice_from_size({{'t', t0}, {'n', n}}, {{'t', 1}, {'n', 1}}) + .copyFrom(tnat.kvslice_from_size({}, colorvec_size)); + } } } } @@ -3784,129 +10038,91 @@ namespace Chroma // High-level chroma operations // - /// Apply the inverse to LatticeColorVec tensors for a list of 
spins - /// \param PP: invertor - /// \param chi: lattice color tensor on a t_slice, cxyzXn - /// \param t_source: time-slice in chi - /// \param Nt_forward: return the next Nt_forward time-slices after t_source - /// \param Nt_backward: return the previous Nt_backward time-slices before t_source - /// \param spin_sources: list of spins - /// \param max_rhs: maximum number of vectors solved at once - /// \param order_out: coordinate order of the output tensor, a permutation of cSxyztXns where - /// s is the spin source and S is the spin sink - /// \return: tensor cSxyztXns where the first t_slice is the t_source-Nt_backward time-slice of - /// the vectors after the inversion, and goes increasingly until time-source t_source+Nt_forward - - template - Tensor doInversion(const SystemSolver& PP, - const Tensor chi, int t_source, - int first_tslice_out, int n_tslice_out, - const std::vector& spin_sources, int max_rhs, - const std::string& order_out = "cSxyztXns") + namespace detail { - detail::check_order_contains(order_out, "cSxyztXns"); - if (chi.kvdim()['t'] != 1) - throw std::runtime_error("Expected one time-slice"); - const int num_vecs = chi.kvdim()['n']; - - if (n_tslice_out > Layout::lattSize()[3]) - throw std::runtime_error("Too many tslices"); - - Tensor psi( - order_out, - latticeSize( - order_out, {{'t', n_tslice_out}, {'S', Ns}, {'s', spin_sources.size()}, {'n', num_vecs}}), - chi.getDev()); - - int max_step = std::max(num_vecs, max_rhs); - std::vector> chis(max_step), quark_solns(max_step); - for (int col = 0; col < max_step; col++) - chis[col].reset(new LatticeFermion); - for (int col = 0; col < max_step; col++) - quark_solns[col].reset(new LatticeFermion); + /// Path Node + struct PathNode { + std::map p; ///< following nodes + int disp_index; ///< if >= 0, the index in the displacement list + }; - StopWatch snarss1; - snarss1.reset(); - snarss1.start(); + /// Return the total amount of steps on each direction over all the displacement entries + /// 
\param disps: input tree of displacements - for (int spin_source : spin_sources) + inline Coor get_total_disps(const PathNode& disps) { - for (int n0 = 0, n_step = std::min(max_rhs, num_vecs); n0 < num_vecs; - n0 += n_step, n_step = std::min(n_step, num_vecs - n0)) + Coor counts{{}}; + for (const auto it : disps.p) { - for (int n = n0, col = 0; col < n_step; ++n, ++col) + if (std::abs(it.first) > 0) { - // Put the colorvec sources for the t_source on chis for spin `spin_source` - // chis[col][s=spin_source] = chi[n=n0] - *chis[col] = zero; - chi.kvslice_from_size({{'n', n}}, {{'n', 1}}) - .copyTo(SB::asTensorView(*chis[col]) - .kvslice_from_size({{'t', t_source}, {'s', spin_source}})); - - *quark_solns[col] = zero; + int dir = std::abs(it.first) - 1; + counts[dir]++; } + } + return counts; + } - // Solve - std::vector res = - PP(std::vector>(quark_solns.begin(), - quark_solns.begin() + n_step), - std::vector>(chis.begin(), - chis.begin() + n_step)); + /// Return the maximum steps on each direction on a single displacement entry + /// \param disps: input tree of displacements - for (int n = n0, col = 0; col < n_step; ++n, ++col) + inline Coor get_max_disp(const PathNode& disps) + { + Coor max_disp{{}}; + for (const auto it : disps.p) + { + Coor disp{{}}; + if (std::abs(it.first) > 0) { - // psi[n=n] = quark_solns[col][t=first_tslice+(0:n_tslice_out-1)] - asTensorView(*quark_solns[col]) - .kvslice_from_size({{'t', first_tslice_out}}, {{'t', n_tslice_out}}) - .rename_dims({{'s', 'S'}}) - .copyTo(psi.kvslice_from_size({{'n', n}, {'s', spin_source}})); + int dir = std::abs(it.first) - 1; + disp[dir] = 1; } + Coor this_max_disp = get_max_disp(it.second); + for (std::size_t i = 0; i < Nd; ++i) + max_disp[i] = std::max(max_disp[i], this_max_disp[i] + disp[i]); } + return max_disp; } - snarss1.stop(); - QDPIO::cout << "Time to compute inversions for " << spin_sources.size() - << " spin sources and " << num_vecs - << " colorvecs : " << snarss1.getTimeInSeconds() << " secs" 
<< std::endl; - - return psi; - } - - namespace detail - { - /// Path Node - struct PathNode { - std::map p; ///< following nodes - int disp_index; ///< if >= 0, the index in the displacement list - }; + /// Return the lattice labels ordered from less to more counts + /// \param counts: counts returned by `get_total_disps` + /// \param extra_count_0_labels: (optional) labels to append after the zero count lattice labels - /// Return the directions that are going to be use and the maximum number of displacements keep in memory - inline void get_tree_mem_stats(const PathNode& disps, std::array& dirs, - unsigned int& max_rhs) + inline std::string get_optimal_lattice_order(const Coor& counts, + const std::string& extra_count_0_labels = "") { - unsigned int max_rhs_sub = 0; - for (const auto it : disps.p) - { - unsigned int max_rhs_sub_it = 0; - get_tree_mem_stats(it.second, dirs, max_rhs_sub_it); - max_rhs_sub = std::max(max_rhs_sub, max_rhs_sub_it); + static_assert(Nd == 4, "only supports 4D"); - if (std::abs(it.first) <= Nd) - dirs[std::abs(it.first) - 1] = true; - } - - if (disps.p.size() == 0) - { - max_rhs = 0; - } - else if (disps.p.size() == 1) + // Make a vector of pairs of {label, count} for label being each direction + std::vector> v(4); + for (int i = 0; i < 4; ++i) { - max_rhs = std::max(1u, max_rhs_sub); + v[i] = {i, counts[i]}; } - else + + // Sort them ascendantly on the number of counts + std::sort(v.begin(), v.end(), [](const std::array& a, const std::array& b) { + return a[1] < b[1]; + }); + + // Order of the labels + const char* lattice_labels = "xyzt"; + std::string order(4, 0); // string of size 4 + for (unsigned int i = 0; i < 4; ++i) + order[i] = lattice_labels[v[i][0]]; + + // Return the order with extra labels inserted + unsigned int first_label_nonzero = 4; + for (unsigned int i = 0; i < 4; ++i) { - max_rhs = 1 + max_rhs_sub; + if (v[i][1] > 0) + { + first_label_nonzero = i; + break; + } } + return std::string(order.begin(), order.begin() + 
first_label_nonzero) + + extra_count_0_labels + std::string(order.begin() + first_label_nonzero, order.end()); } const int path_separator = Nd + 1; @@ -3945,6 +10161,16 @@ namespace Chroma } + /// Callback function for each displacement/derivate, and chunk of time-slices and momenta + /// Arguments of the callback: + /// \param tensor: output tensor with order ijkmt + /// \param disp: index of the displacement/derivative + /// \param first_timeslice: index of the first time-slice in the tensor + /// \param first_mom: index of the first momentum in the tensor + + template + using MomGammaDispContractionFn = std::function, int, int, int)>; + namespace ns_doMomGammaDisp_contractions { using namespace detail; @@ -3952,6 +10178,9 @@ namespace Chroma /// Contract two LatticeFermion with different momenta, gammas, and displacements. /// \param leftconj: left lattice fermion tensor, cSxyzXN /// \param right: right lattice fermion tensor, csxyzXn + /// \param first_tslice: absolute index of the first tslice + /// \param save_from: relative first tslice to save + /// \param save_size: number of tslices to save /// \param disps: tree of displacements/derivatives /// \param deriv: if true, do left-right nabla derivatives /// \param gammas: tensor with spins, QSg @@ -3964,31 +10193,25 @@ namespace Chroma template void doMomGammaDisp_contractions(const std::vector>& u, - const Tensor leftconj, - Tensor right, Index first_tslice, - const PathNode& disps, bool deriv, Tensor<3, COMPLEX> gammas, - const std::vector>& moms, int max_rhs, - Tensor r, std::vector& disp_indices) + const Tensor& leftconj, + Tensor&& right, Index first_tslice, + Index save_from, Index save_size, const PathNode& disps, + bool deriv, Tensor<3, COMPLEX> gammas, + const std::vector>& moms, int first_mom, + const MomGammaDispContractionFn& call) { - max_rhs = std::max(1, max_rhs); - if (disps.disp_index >= 0) { detail::log(1, "contracting for disp_index=" + std::to_string(disps.disp_index)); - // Contract the 
spatial components and the color of the leftconj and right tensors - Tensor aux = - r.template like_this("mNQqnSst%", '%', "gd", {{'S', Ns}, {'Q', Ns}}); - aux.contract(leftconj, {}, Conjugate, right, {}, NotConjugate, - {}); - // Contract the spin components S and Q with the gammas, and put the result on r[d=disp_indices.size()] - Tensor aux0 = - r.template like_this("gmNqnst%", '%', "d"); - aux0.contract(gammas, {}, NotConjugate, aux, {}, NotConjugate); - aux0.copyTo(r.kvslice_from_size({{'d', disp_indices.size()}}, {{'d', 1}})); + // Contract the spatial components (Xxyz) and the color of the leftconj and right tensors, + // and the spin components S and Q with the gammas + Tensor r = + contract(contract(leftconj.conj(), right, "cxyzX"), gammas, "SQ"); - // Annotate on disp_indices the displacement being computed for the current `d` - disp_indices.push_back(disps.disp_index); + int Nt = Layout::lattSize()[3]; + call(r.kvslice_from_size({{'t', save_from}}, {{'t', save_size}}), disps.disp_index, + normalize_coor(first_tslice + save_from, Nt), first_mom); } // Apply displacements on the right and call recursively @@ -4005,8 +10228,8 @@ namespace Chroma : leftRightNabla(u, right, first_tslice, it.first, moms); if (node_disp == disps.p.size() - 1) right.release(); - doMomGammaDisp_contractions(u, leftconj, std::move(right_disp), first_tslice, it.second, - deriv, gammas, moms, max_rhs - num_vecs, r, disp_indices); + doMomGammaDisp_contractions(u, leftconj, std::move(right_disp), first_tslice, save_from, + save_size, it.second, deriv, gammas, moms, first_mom, call); node_disp++; detail::log(1, "pop direction"); } @@ -4016,61 +10239,18 @@ namespace Chroma using CoorMoms = std::vector>; template - using Moms = std::pair, std::vector>>; - - /// Copy several momenta into a single tensor - /// \param decay_dir: something that should be three - /// \param moms: momenta to apply - /// \param first_mom: first momentum to extract - /// \param num_moms: number of momenta to 
extract - /// \param first_tslice: first time-slice to extract - /// \param num_tslice: number of time-slices to extract - /// \param order_out: coordinate order of the output tensor, a permutation of mxyzXt - /// \return: the tensor with the momenta - - template - Moms getMoms(int decay_dir, const SftMom& moms, Maybe first_mom = none, - Maybe num_moms = none, Maybe first_tslice = none, - Maybe num_tslices = none, const std::string& order_out = "mxyzXt") - { - // Copy moms into a single tensor - const int Nt = Layout::lattSize()[decay_dir]; - int tfrom = first_tslice.getSome(0); // first tslice to extract - int tsize = num_tslices.getSome(Nt); // number of tslices to extract - int mfrom = first_mom.getSome(0); // first momentum to extract - int msize = num_moms.getSome(moms.numMom()); // number of momenta to extract - - Tensor momst(order_out, - latticeSize(order_out, {{'t', tsize}, {'m', msize}})); - for (unsigned int mom = 0; mom < msize; ++mom) - { - asTensorView(moms[mfrom + mom]) - .kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}) - .copyTo(momst.kvslice_from_size({{'m', mom}}, {{'m', 1}})); - } - - // Create mom_list - std::vector> mom_list(msize); - for (unsigned int mom = 0; mom < msize; ++mom) - { - for (unsigned int i = 0; i < 3; ++i) - mom_list[mom][i] = moms.numToMom(mfrom + mom)[i]; - } - - return {momst, mom_list}; - } + using Moms = std::pair, std::vector>>; /// Contract two LatticeFermion with different momenta, gammas, and displacements. 
/// \param leftconj: left lattice fermion tensor, cxyzXNQqt /// \param right: right lattice fermion tensor, cxyzXnSst /// \param first_tslice: first time-slice in leftconj and right + /// \param save_from: relative first tslice to save + /// \param save_size: number of tslices to save /// \param moms: momenta to apply - /// \param moms_first: index of the first momenta to apply - /// \param num_moms: number of momenta to apply (if none, apply all of them) /// \param gammas: list of gamma matrices to apply /// \param disps: list of displacements/derivatives /// \param deriv: if true, do left-right nabla derivatives - /// \param max_rhs: maximum number of vectors hold in memory /// \param order_out: coordinate order of the output tensor, a permutation of nNSQmgd where /// q and N (s and n) are the spin and vector from left (right) vectors, m is the momentum /// index, g is the gamma index, and d is the displacement index @@ -4078,59 +10258,43 @@ namespace Chroma /// index in the tensor with an input displacement index. 
template - std::pair, std::vector> doMomGammaDisp_contractions( + void doMomGammaDisp_contractions( const multi1d& u, Tensor leftconj, - Tensor right, Index first_tslice, const SftMom& moms, int first_mom, - Maybe num_moms, const std::vector>& gammas, + Tensor right, Index first_tslice, Index save_from, Index save_size, + const CoorMoms& moms, const std::vector>& gammas, const std::vector>& disps, bool deriv, - const std::string& order_out = "gmNndsqt", Maybe max_active_tslices = none, - DeviceHost dev = OnDefaultDevice) + const MomGammaDispContractionFn& call, + const std::string& order_out = "gmNnsqt", Maybe max_active_tslices = none, + Maybe max_active_moms = none, DeviceHost dev = OnDefaultDevice) { - detail::check_order_contains(order_out, "gmNndsqt"); + detail::check_order_contains(order_out, "gmNnsqt"); detail::check_order_contains(leftconj.order, "cxyzXNQqt"); detail::check_order_contains(right.order, "cxyzXnSst"); if (right.kvdim()['t'] != leftconj.kvdim()['t']) throw std::runtime_error("The t component of `right' and `left' does not match"); - int Nt = right.kvdim()['t']; + int num_tslices = right.kvdim()['t']; - int max_t = max_active_tslices.getSome(Nt); + int max_t = max_active_tslices.getSome(save_size); if (max_t <= 0) - max_t = Nt; + max_t = save_size; + if (max_t > save_size) + max_t = save_size; + int max_moms_in_contraction = max_active_moms.getSome(moms.size()); // Form a tree with the displacement paths detail::PathNode tree_disps = ns_doMomGammaDisp_contractions::get_tree(disps); // Get what directions are going to be used and the maximum number of displacements in memory - std::array active_dirs{}; - unsigned int max_active_disps = 0; - detail::get_tree_mem_stats(tree_disps, active_dirs, max_active_disps); - - // Number of moments to apply - int numMom = num_moms.getSome(moms.numMom()); - if (first_mom + numMom > moms.numMom()) - throw std::runtime_error("Invalid range of momenta"); - - // Allocate output tensor - std::map r_size = {{'t', 
Nt}, - {'n', right.kvdim()['n']}, - {'s', right.kvdim()['s']}, - {'N', leftconj.kvdim()['N']}, - {'q', leftconj.kvdim()['q']}, - {'m', numMom}, - {'g', gammas.size()}, - {'d', disps.size()}}; - for (char c : detail::remove_dimensions(order_out, "gmNndsqt")) - r_size[c] = leftconj.kvdim()[c]; - Tensor r(order_out, kvcoors(order_out, r_size)); - - // Create mom_list - std::vector> mom_list(numMom); - for (unsigned int mom = 0; mom < numMom; ++mom) - { - for (unsigned int i = 0; i < Nd - 1; ++i) - mom_list[mom][i] = moms.numToMom(first_mom + mom)[i]; - } + Coor active_dirs = get_max_disp(tree_disps); + Coor counts = get_total_disps(tree_disps); + + // Check that it is given enough extra time slices for time derivatives + int Nt = Layout::lattSize()[3]; + if (num_tslices < Nt && + (save_from < active_dirs[3] || save_from + save_size + active_dirs[3] < num_tslices)) + throw std::runtime_error("doMomGammaDisp_contractions: not enough time slices given for " + "the requested time derivatives"); // Copy all gammas into a single tensor Tensor<3, COMPLEX> gammast("gQS", {(Index)gammas.size(), Ns, Ns}, dev, OnEveryoneReplicated); @@ -4141,82 +10305,101 @@ namespace Chroma .copyTo(gammast.kvslice_from_size({{'g', g}}, {{'g', 1}})); } - // Iterate over time-slices - std::vector disp_indices; - - for (int tfrom = 0, tsize = std::min(max_t, Nt); tfrom < Nt; - tfrom += tsize, tsize = std::min(max_t, Nt - tfrom)) - { - // Make tsize one or even - if (tsize > 1 && tsize % 2 != 0) - --tsize; - - detail::log(1, "contracting " + std::to_string(tsize) + - " tslices from tslice= " + std::to_string(tfrom)); + Tracker _t(std::string("doMomGammaDisp_contractions")); - disp_indices.resize(0); - - // Copy moms into a single tensor - std::string momst_order = "mxyzXt"; - Tensor momst( - momst_order, latticeSize(momst_order, {{'t', tsize}, {'m', numMom}}), dev); - for (unsigned int mom = 0; mom < numMom; ++mom) - { - asTensorView(moms[first_mom + mom]) - .kvslice_from_size({{'t', tfrom + 
first_tslice}}, {{'t', tsize}}) - .copyTo(momst.kvslice_from_size({{'m', mom}}, {{'m', 1}})); - } + // Avoid distributing the tensors on directions with derivatives to avoid communications in shiftings, + // and on dimensions being contracted; so the only remaining dimension is t + const Distribution dist = OnEveryoneCompact + detail::get_optimal_lattice_order(counts); - // Apply momenta conjugated to the left tensor and rename the spin components s and Q to q and Q, - // and the colorvector component n to N - Tensor moms_left = leftconj.template like_this( - "mQNqc%xyzXt", '%', "", {{'m', numMom}, {'t', tsize}}); - moms_left.contract(std::move(momst), {}, Conjugate, - leftconj.kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}), {}, - NotConjugate); - if (tfrom + tsize >= Nt) - leftconj.release(); + for (int tfrom = save_from, tsize = std::min(max_t, save_size); tfrom < save_from + save_size; + tfrom += tsize, tsize = std::min(max_t, save_from + save_size - tfrom)) + { + // When having derivatives in the time direction, pack extra time slices + int t_extra = active_dirs[3]; + t_extra = std::min(t_extra + tsize, Nt) - tsize; + int first_active_tslice = first_tslice + tfrom - t_extra; + int num_active_tslices = std::min(tsize + t_extra + active_dirs[3], Nt); // Make a copy of the time-slicing of u[d] also supporting left and right std::vector> ut(Nd); - for (unsigned int d = 0; d < Nd - 1; d++) + for (unsigned int d = 0; d < Nd; d++) { - if (!active_dirs[d]) + if (active_dirs[d] == 0) continue; // NOTE: This is going to create a tensor with the same distribution of the t-dimension as leftconj and right - ut[d] = asTensorView(u[d]) - .kvslice_from_size({{'t', first_tslice + tfrom}}, {{'t', tsize}}) - .toComplex() - .make_sure(none, dev); + ut[d] = detail::toNaturalOrdering( + asTensorView(u[d]) + .kvslice_from_size({{'t', first_active_tslice}}, {{'t', num_active_tslices}}) + .toComplex(), + first_active_tslice) + .make_sure(none, dev, dist); } - // Do the thing - auto 
this_right = right.kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}); - if (tfrom + tsize >= Nt) + auto this_right = + detail::toNaturalOrdering( + right.kvslice_from_size({{'t', tfrom - t_extra}}, {{'t', num_active_tslices}}), + first_active_tslice) + .make_sure(none, dev, dist); + if (tfrom + tsize >= save_from + save_size) right.release(); - auto this_r = r.kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}); - if (!deriv) - { - ns_doMomGammaDisp_contractions::doMomGammaDisp_contractions( - ut, std::move(moms_left), std::move(this_right), first_tslice + tfrom, tree_disps, - deriv, gammast, mom_list, 0, this_r, disp_indices); - } - else + + for (int mfrom = 0, msize = std::min(max_moms_in_contraction, (int)moms.size()); + mfrom < moms.size(); + mfrom += msize, msize = std::min(max_moms_in_contraction, (int)moms.size() - mfrom)) { - throw std::runtime_error("Derivatives are not implemented! Sorry!"); - // std::vector ones(moms.numMom(), COMPLEX(1)); - // std::string right_moms_order = std::string(right.order.begin(), right.order.size()) + "m"; - // Tensor right_moms = - // right.like_this(right_moms_order.c_str()); - // right_moms.contract(asTensorView(ones), {{'i', 'm'}}, NotConjugate, std::move(right), {}, - // NotConjugate); - // doMomGammaDisp_contractions(u, gammast_moms_left, right_moms, tree_disps, deriv, mom_list, - // max_rhs, r, disp_indices); + + detail::log(1, "contracting " + std::to_string(tsize) + " tslices from tslice= " + + std::to_string(tfrom) + " and " + std::to_string(msize) + + " momenta from momentum " + std::to_string(mfrom)); + + // Copy moms into a single tensor + std::string momst_order = "mxyzX"; + Tensor momst = + this_right.template make_compatible(momst_order, {{'m', msize}}); + for (int m = 0; m < msize; ++m) + { + ns_getColorvecs::getPhaseNatural(moms[mfrom + m], dev, dist) + .copyTo(momst.kvslice_from_size({{'m', m}}, {{'m', 1}})); + } + + // Apply momenta conjugated to the left tensor and rename the spin components s and Q to q and 
Q, + // and the colorvector component n to N + auto leftconj_nat = + detail::toNaturalOrdering( + leftconj.kvslice_from_size({{'t', tfrom - t_extra}}, {{'t', num_active_tslices}}), + first_active_tslice) + .make_sure(none, dev, dist); + Tensor moms_left = leftconj_nat.template like_this( + "mQNqc%xyzXt", '%', "", {{'m', msize}, {'t', num_active_tslices}}); + moms_left.contract(std::move(momst), {}, Conjugate, std::move(leftconj_nat), {}, + NotConjugate); + if (tfrom + tsize >= save_from + save_size && mfrom + msize >= moms.size()) + leftconj.release(); + + // Do the thing + if (!deriv) + { + auto this_right0 = this_right; + ns_doMomGammaDisp_contractions::doMomGammaDisp_contractions( + ut, moms_left, std::move(this_right0), first_active_tslice, t_extra, tsize, + tree_disps, deriv, gammast, + CoorMoms(moms.begin() + mfrom, moms.begin() + mfrom + msize), mfrom, call); + } + else + { + throw std::runtime_error("Derivatives are not implemented! Sorry!"); + // std::vector ones(moms.numMom(), COMPLEX(1)); + // std::string right_moms_order = std::string(right.order.begin(), right.order.size()) + "m"; + // Tensor right_moms = + // right.like_this(right_moms_order.c_str()); + // right_moms.contract(asTensorView(ones), {{'i', 'm'}}, NotConjugate, std::move(right), {}, + // NotConjugate); + // doMomGammaDisp_contractions(u, gammast_moms_left, right_moms, tree_disps, deriv, mom_list, + // max_rhs, r, disp_indices); + } } } - - return {r, disp_indices}; } /// Callback function for each displacement/derivate, and chunk of time-slices and momenta @@ -4266,11 +10449,11 @@ namespace Chroma template void doMomDisp_colorContractions(const std::vector>& u, - std::array, 3> colorvecs, + std::array, 3>&& colorvecs, Index first_tslice, const PathNode& disps, bool deriv, - int current_colorvec, const Moms moms, + int current_colorvec, const Moms& moms, int first_mom, int max_cols, const std::string& order_out, - DeviceHost dev, Distribution dist, + const DeviceHost& dev, const 
Distribution& dist, const ColorContractionFn& call) { if (disps.disp_index >= 0) @@ -4373,7 +10556,7 @@ namespace Chroma const ColorContractionFn& call, Maybe max_active_tslices = none, Maybe max_active_momenta = none, Maybe max_cols = none, const Maybe& order_out = none, Maybe dev = none, - Maybe dist = none) + Maybe dist_ret = none) { const std::string order_out_str = order_out.getSome("ijkmt"); detail::check_order_contains(order_out_str, "ijkmt"); @@ -4383,9 +10566,8 @@ namespace Chroma detail::PathNode tree_disps = ns_doMomDisp_colorContractions::get_tree(disps); // Get what directions are going to be used and the maximum number of displacements in memory - std::array active_dirs{}; - unsigned int max_active_disps = 0; - detail::get_tree_mem_stats(tree_disps, active_dirs, max_active_disps); + Coor active_dirs = get_max_disp(tree_disps); + Coor counts = get_total_disps(tree_disps); // Check that all tensors have the same number of time int Nt = colorvec.kvdim()['t']; @@ -4399,40 +10581,51 @@ namespace Chroma if (max_active_moms <= 0) max_active_moms = Nmom; + // Avoid distributing the tensors on directions with derivatives to avoid communications in shiftings, + // and on dimensions being contracted; so the only remaining dimension is t + const Distribution dist = + OnEveryoneCompact + detail::get_optimal_lattice_order(counts, deriv ? 
"m" : ""); + // Iterate over time-slices for (int tfrom = 0, tsize = std::min(max_t, Nt); tfrom < Nt; tfrom += tsize, tsize = std::min(max_t, Nt - tfrom)) { - detail::log( - 1, "color contracting " + std::to_string(tsize) + " tslices from tslice= " + - std::to_string(first_tslice + tfrom)); + detail::log(1, "color contracting " + std::to_string(tsize) + + " tslices from tslice= " + std::to_string(first_tslice + tfrom)); // Make a copy of the time-slicing of u[d] also supporting left and right std::vector> ut(Nd); for (unsigned int d = 0; d < Nd - 1; d++) { - if (!active_dirs[d]) + if (active_dirs[d] == 0) continue; // NOTE: This is going to create a tensor with the same distribution of the t-dimension as colorvec and moms - ut[d] = asTensorView(u[d]) - .kvslice_from_size({{'t', first_tslice + tfrom}}, {{'t', tsize}}) - .toComplex(); + ut[d] = detail::toNaturalOrdering( + asTensorView(u[d]) + .kvslice_from_size({{'t', first_tslice + tfrom}}, {{'t', tsize}}) + .toComplex(), + first_tslice + tfrom) + .make_sure(none, dev, dist); } // Get the time-slice for colorvec - auto this_colorvec = colorvec.kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}); + auto this_colorvec = + detail::toNaturalOrdering(colorvec.kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}), + first_tslice + tfrom) + .make_sure(none, dev, dist); // Loop over the momenta for (int mfrom = 0, msize = std::min(max_active_moms, Nmom); mfrom < Nmom; mfrom += msize, msize = std::min(max_active_moms, Nmom - mfrom)) { auto this_moms = - this_colorvec.template like_this("xyzXtm", {{'m', msize}}); + this_colorvec.template make_compatible("xyzXm", {{'m', msize}}); for (int m = 0; m < msize; ++m) - ns_getColorvecs::getPhase(moms[mfrom + m], first_tslice + tfrom, tsize, - this_moms.getDev()) + { + ns_getColorvecs::getPhaseNatural(moms[mfrom + m], this_moms.getDev(), dist) .copyTo(this_moms.kvslice_from_size({{'m', m}}, {{'m', 1}})); + } if (tfrom + tsize >= Nt && mfrom + msize >= Nmom) { @@ -4441,10 +10634,12 @@ 
namespace Chroma std::vector> moms_list(moms.begin() + mfrom, moms.begin() + mfrom + msize); if (!deriv) { + std::array, 3> this_3_colorvec{this_colorvec, this_colorvec, + this_colorvec}; ns_doMomDisp_colorContractions::doMomDisp_colorContractions( - ut, {this_colorvec, this_colorvec, this_colorvec}, first_tslice + tfrom, tree_disps, - deriv, 0, {this_moms, moms_list}, mfrom, max_cols.getSome(0), order_out_str, - dev.getSome(OnDefaultDevice), dist.getSome(OnEveryoneReplicated), call); + ut, std::move(this_3_colorvec), first_tslice + tfrom, tree_disps, deriv, 0, + {this_moms, moms_list}, mfrom, max_cols.getSome(0), order_out_str, + dev.getSome(OnDefaultDevice), dist_ret.getSome(dist), call); } else { @@ -4454,11 +10649,12 @@ namespace Chroma this_colorvec.template like_this("%m", '%', "", {{'m', msize}}); this_colorvec_m.contract(this_colorvec, {}, NotConjugate, asTensorView(ones), {{'i', 'm'}}, NotConjugate); + std::array, 3> this_3_colorvec_m{ + this_colorvec_m, this_colorvec_m, this_colorvec_m}; ns_doMomDisp_colorContractions::doMomDisp_colorContractions( - ut, {this_colorvec_m, this_colorvec_m, this_colorvec_m}, first_tslice + tfrom, - tree_disps, deriv, 0, {this_moms, moms_list}, mfrom, max_cols.getSome(0), - order_out_str, dev.getSome(OnDefaultDevice), dist.getSome(OnEveryoneReplicated), - call); + ut, std::move(this_3_colorvec_m), first_tslice + tfrom, tree_disps, deriv, 0, + {this_moms, moms_list}, mfrom, max_cols.getSome(0), order_out_str, + dev.getSome(OnDefaultDevice), dist_ret.getSome(dist), call); } } } @@ -4490,11 +10686,12 @@ namespace Chroma template void doMomDisp_contractions(const std::vector>& u, - Tensor left, Tensor right, - Index first_tslice, const PathNode& disps, bool deriv, + const Tensor& left, + Tensor&& right, Index first_tslice, + const PathNode& disps, bool deriv, const std::vector>& moms, int first_mom, - const std::string& order_out, DeviceHost dev, Distribution dist, - const ContractionFn& call) + const std::string& order_out, 
const DeviceHost& dev, + const Distribution& dist, const ContractionFn& call) { if (disps.disp_index >= 0) { @@ -4554,7 +10751,7 @@ namespace Chroma Index first_tslice, const std::vector>& disps, bool deriv, const ContractionFn& call, const Maybe& order_out = none, - Maybe dev = none, Maybe dist = none, + Maybe dev = none, Maybe dist_ret = none, int max_tslices_in_contraction = 0, int max_moms_in_contraction = 0) { const std::string order_out_str = order_out.getSome("ijmt"); @@ -4565,9 +10762,8 @@ namespace Chroma detail::PathNode tree_disps = detail::get_tree(disps); // Get what directions are going to be used and the maximum number of displacements in memory - std::array active_dirs{}; - unsigned int max_active_disps = 0; - detail::get_tree_mem_stats(tree_disps, active_dirs, max_active_disps); + Coor active_dirs = get_max_disp(tree_disps); + Coor counts = get_total_disps(tree_disps); // Check that all tensors have the same number of time int Nt = colorvec.kvdim()['t']; @@ -4578,14 +10774,15 @@ namespace Chroma if (max_moms_in_contraction <= 0) max_moms_in_contraction = Nmom; + // Avoid distributing the tensors on directions with derivatives to avoid communications in shiftings, + // and on dimensions being contracted; so the only remaining dimension is t + const Distribution dist = + OnEveryoneCompact + detail::get_optimal_lattice_order(counts, deriv ? 
"m" : ""); + // Iterate over time-slices for (int tfrom = 0, tsize = std::min(Nt, max_tslices_in_contraction); tfrom < Nt; tfrom += tsize, tsize = std::min(max_tslices_in_contraction, Nt - tfrom)) { - // Make tsize one or even - if (tsize > 1 && tsize % 2 != 0) - --tsize; - detail::log(1, "contracting " + std::to_string(tsize) + " tslices from tslice= " + std::to_string(tfrom)); @@ -4593,41 +10790,51 @@ namespace Chroma std::vector> ut(Nd); for (unsigned int d = 0; d < Nd - 1; d++) { - if (!active_dirs[d]) + if (active_dirs[d] == 0) continue; // NOTE: This is going to create a tensor with the same distribution of the t-dimension as colorvec and moms - ut[d] = asTensorView(u[d]) - .kvslice_from_size({{'t', first_tslice + tfrom}}, {{'t', tsize}}) - .toComplex(); + ut[d] = detail::toNaturalOrdering( + asTensorView(u[d]) + .kvslice_from_size({{'t', first_tslice + tfrom}}, {{'t', tsize}}) + .toComplex(), first_tslice + tfrom) + .make_sure(none, dev, dist); } // Get the time-slice for colorvec - auto this_colorvec = colorvec.kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}); + auto this_colorvec_eo = colorvec.kvslice_from_size({{'t', tfrom}}, {{'t', tsize}}); // Apply the phases auto this_colorvec_phase_right = - phaseColorvecs(this_colorvec, first_tslice + tfrom, right_phase); + detail::toNaturalOrdering( + phaseColorvecs(this_colorvec_eo, first_tslice + tfrom, right_phase), + first_tslice + tfrom) + .make_sure(none, dev, dist); auto this_colorvec_phase_left = - phaseColorvecs(this_colorvec, first_tslice + tfrom, left_phase); + detail::toNaturalOrdering( + phaseColorvecs(this_colorvec_eo, first_tslice + tfrom, left_phase), + first_tslice + tfrom) + .make_sure(none, dev, dist); + this_colorvec_eo.release(); // Loop over the momenta for (int mfrom = 0, msize = std::min(max_moms_in_contraction, Nmom); mfrom < Nmom; mfrom += msize, msize = std::min(max_moms_in_contraction, Nmom - mfrom)) { - auto this_moms = - this_colorvec_phase_left.template like_this("xyzXtm", {{'m', 
msize}}); + auto this_moms = this_colorvec_phase_left.template make_compatible( + "xyzXm", {{'m', msize}}); for (int m = 0; m < msize; ++m) - ns_getColorvecs::getPhase(moms[mfrom + m], first_tslice + tfrom, tsize, - this_moms.getDev()) + { + ns_getColorvecs::getPhaseNatural(moms[mfrom + m], this_moms.getDev(), dist) .copyTo(this_moms.kvslice_from_size({{'m', m}}, {{'m', 1}})); - + } // Apply left phase and momenta conjugated to the left tensor // NOTE: look for the minus sign on left_phase in the doc of this function Tensor moms_left = - this_colorvec.template like_this("mc%xyzXt", '%', "", {{'m', msize}}); + this_colorvec_phase_right.template like_this("mc%xyzXt", '%', "", + {{'m', msize}}); moms_left.contract(std::move(this_moms), {}, Conjugate, this_colorvec_phase_left, {}, NotConjugate); @@ -4636,32 +10843,57 @@ namespace Chroma colorvec.release(); } - auto this_moms_coors = std::vector>(moms.begin() + mfrom, - moms.begin() + mfrom + msize); + auto this_moms_coors = + std::vector>(moms.begin() + mfrom, moms.begin() + mfrom + msize); if (!deriv) { + auto this_colorvec_phase_right0 = this_colorvec_phase_right; ns_doMomDisp_contractions::doMomDisp_contractions( - ut, std::move(moms_left), this_colorvec_phase_right, first_tslice + tfrom, tree_disps, - deriv, this_moms_coors, mfrom, order_out_str, dev.getSome(OnDefaultDevice), - dist.getSome(OnEveryoneReplicated), call); + ut, moms_left, std::move(this_colorvec_phase_right0), first_tslice + tfrom, + tree_disps, deriv, this_moms_coors, mfrom, order_out_str, + dev.getSome(OnDefaultDevice), dist_ret.getSome(dist), call); } else { // When using derivatives, each momenta has a different effect std::vector ones(msize, COMPLEX(1)); Tensor this_colorvec_m = - this_colorvec.template like_this("%m", '%', "", {{'m', msize}}); + this_colorvec_phase_right.template like_this("%m", '%', "", {{'m', msize}}); this_colorvec_m.contract(this_colorvec_phase_right, {}, NotConjugate, asTensorView(ones), {{'i', 'm'}}, NotConjugate); 
ns_doMomDisp_contractions::doMomDisp_contractions( - ut, std::move(moms_left), std::move(this_colorvec_m), first_tslice + tfrom, - tree_disps, deriv, this_moms_coors, mfrom, order_out_str, - dev.getSome(OnDefaultDevice), dist.getSome(OnEveryoneReplicated), call); + ut, moms_left, std::move(this_colorvec_m), first_tslice + tfrom, tree_disps, deriv, + this_moms_coors, mfrom, order_out_str, dev.getSome(OnDefaultDevice), + dist_ret.getSome(dist), call); } } } } + /// Call the destroy list and clean superbblas cache + + inline void finish() + { + // Show performance reports + // NOTE: QDPIO::cout doesn't support iomanip format declarations, so use std::cout on master node instead + if (Layout::nodeNumber() == 0) + { + superbblas::reportTimings(std::cout); + superbblas::reportCacheUsage(std::cout); + } + + // Clear internal superbblas caches + superbblas::clearCaches(); + + // Call the destroy list + for (const auto& f : detail::getDestroyList()) + f(); + detail::getDestroyList().clear(); + + // Make sure that no allocation is still around + superbblas::checkForMemoryLeaks(std::cout); + } + /// Return the smallest interval containing the union of two intervals /// \param from0: first element of the first interval /// \param size0: length of the first interval @@ -4752,7 +10984,7 @@ namespace Chroma inline CoorMoms getMomenta(int min_mom2, int max_mom2) { - static_assert(Nd == 4); + static_assert(Nd == 4, "Unsupported number of dimensions"); int max_component = (int)std::sqrt((float)max_mom2) + 1; CoorMoms r; for (int i = -max_component; i <= max_component; ++i) @@ -4773,9 +11005,9 @@ namespace Chroma /// Return a list of momenta as std::vector> from std::vector> /// \param v: list of momenta to transform - inline CoorMoms getMomenta(const std::vector> &v) + inline CoorMoms getMomenta(const std::vector>& v) { - static_assert(Nd == 4); + static_assert(Nd == 4, "Unsupported number of dimensions"); CoorMoms r; for (const auto vi : v) { @@ -4785,6 +11017,7 @@ namespace Chroma 
} return r; } + } } diff --git a/lib/util/ferm/superb_options.h b/lib/util/ferm/superb_options.h new file mode 100644 index 0000000000..f443667b41 --- /dev/null +++ b/lib/util/ferm/superb_options.h @@ -0,0 +1,858 @@ +// -*- C++ -*- +/*! \file + * \brief Alternative self-contained XML parser and processing + */ + +#ifndef __INCLUDE_SUPERB_OPTIONS__ +#define __INCLUDE_SUPERB_OPTIONS__ + +#include "chromabase.h" +#include "util/ferm/superb_contractions.h" +#include +#include +#include + +#ifdef BUILD_SB +namespace Chroma +{ + + namespace SB + { + + namespace detail + { + /// Return the previous lines up this one + /// \param file: content + /// \param char_num: character index of the last line to print + /// \param num_prev_lines: maximum number of previous lines to print + /// \param prefix: string to print previous to each line + + inline std::string get_prev_lines(const std::string& file, std::size_t char_num, + unsigned int num_prev_lines = 0, + const std::string prefix = "") + { + if (char_num > file.size()) + throw std::runtime_error("Erroneous character number"); + std::size_t line_num = std::count(file.begin(), file.begin() + char_num, '\n') + 1; + auto p = file.begin(); + std::string r; + for (unsigned int l = 1; l <= line_num && p != file.end(); ++l) + { + auto p1 = std::find(p, file.end(), '\n'); + if (l + num_prev_lines >= line_num) + r += std::string(p, p1) + std::string("\n"); + p = p1 + (p1 != file.end() ? 
1 : 0); + } + return r; + } + + /// Return the given string with lowercase characters + /// \param s: input string + + inline std::string to_lower(const std::string& s) + { + + std::string valueLower = s; + std::transform(valueLower.begin(), valueLower.end(), valueLower.begin(), + [](unsigned char c) { return std::tolower(c); }); + return valueLower; + } + } + + /// Class for storing options + struct Option { + /// Get track of the path of this option on a set of options + std::string prefix; + /// Content of the file where the option comes from + std::shared_ptr file; + /// First line number in `file` associated to the option + std::size_t char_num; + /// Whether this option has been checked + mutable bool visited; + + Option() : char_num(0), visited(false) + { + } + + protected: + Option(std::shared_ptr file, std::size_t char_num) + : file(file), char_num(char_num), visited(false) + { + } + + /// Copy `prefix`, `file` and `char_num` from a given option + /// \param op: option to copy the info + + void copyFileInfo(const Option& op) + { + prefix = op.prefix; + file = op.file; + char_num = op.char_num; + } + + public: + /// Throw an error + /// \param s: error message + + void throw_error(const std::string& err_msg) const + { + if (file) + { + std::size_t line_num = std::count(file->begin(), file->begin() + char_num, '\n') + 1; + throw std::runtime_error(std::string("Error at prefix `") + prefix + // + "'(l. 
" + std::to_string(line_num) + "): " + // + err_msg + "\n" + // + detail::get_prev_lines(*file, char_num, 5, "| ")); // + } + else + { + throw std::runtime_error("Error at prefix `" + prefix + ": " + err_msg); + } + } + + /// Type of the option + enum Type { None, String, Double, Vector, Dictionary }; + + /// Return the type of this option + virtual Type getType() const + { + throw_error("getType: invalid object, it's abstract"); + throw std::exception{}; // silent no return warning + } + + /// Return if this options isn't None + explicit operator bool() const noexcept + { + return getType() != None; + } + + /// Return the string content of the option + virtual std::string getString() const + { + throw_error("expected the value to be a string"); + throw std::exception{}; // silent no return warning + } + + /// Return the double content of the option + virtual double getDouble() const + { + throw_error("expected the value to be a double"); + throw std::exception{}; // silent no return warning + } + + /// Return the integer content of the option + virtual int getInt() const + { + throw_error("expected the value to be an integer"); + throw std::exception{}; // silent no return warning + } + + /// Return the unsigned integer content of the option + virtual unsigned int getUInt() const + { + throw_error("expected the value to be an unsigned integer"); + throw std::exception{}; // silent no return warning + } + + /// Return the unsigned integer content of the option + virtual bool getBool() const + { + throw_error("expected the value to be a boolean"); + throw std::exception{}; // silent no return warning + } + + /// Return the vector content of the vector + virtual std::vector> getVector() const + { + throw_error("expected the value to be a vector"); + throw std::exception{}; // silent no return warning + } + + /// Return the vector content of the vector + virtual std::vector>& getVector() + { + throw_error("expected the value to be a vector"); + throw std::exception{}; 
// silent no return warning + } + + /// Return the map content of the vector + virtual const std::map>& getDictionary() const + { + throw_error("expected the value to be a dictionary"); + throw std::exception{}; // silent no return warning + } + + /// Return the map content of the vector + virtual std::map>& getDictionary() + { + throw_error("expected the value to be a dictionary"); + throw std::exception{}; // silent no return warning + } + + /// Return the option content on a path + const Option& getValue(const std::string& path, Maybe defaultValue = none, + Maybe expectedType = none, + Maybe fromOption = none, + Maybe originalPath = none) const + { + // If fromOption is none, set this node + Maybe fromOption_{fromOption.getSome(*this)}; + + // If originalPath is none, set the given path + Maybe originalPath_{originalPath.getSome(path)}; + + // Construct a nice error message + const std::string errorHeader = "Error in searching for option `" + + originalPath_.getSome() + "' from option at `" + + fromOption_.getSome().prefix + "': "; + + // If the path is empty or ask for the root node, just return this node + if (path.size() == 0 || path == std::string("/")) + { + if (expectedType && getType() != expectedType.getSome()) + throw std::runtime_error(errorHeader + "Expected another type"); + return *this; + } + + // If that path has a tag but the current option isn't a dictionary, either return + // the default value or throw an error + if (getType() != Dictionary) + { + if (defaultValue) + return defaultValue.getSome(); + throw std::runtime_error(errorHeader + "the element `" + path + "' is not a dictionary"); + } + + // If path starts with `/`, consume it and continue + if (path[0] == '/') + return getValue(std::string(path.begin() + 1, path.end()), defaultValue, expectedType, + fromOption_, originalPath_); + + // Find the name of the tag + auto p = std::find(path.begin(), path.end(), '/'); + std::string fieldName = std::string(path.begin(), p); + + // If the tag 
isn't under the current node, either return the default value or throw an error + auto m = getDictionary(); + if (m.count(fieldName) == 0) + { + if (defaultValue.hasSome()) + return defaultValue.getSome(); + throw std::runtime_error(errorHeader + "the tag `" + fieldName + "' was not found"); + } + + // Otherwise, consume the tag name, and continue + return m[fieldName]->getValue(std::string(p, path.end()), defaultValue, expectedType, + fromOption_, originalPath_); + } + + /// Return the option content on a path + const Option& getValue(const std::string& path, Type expectedType) const + { + return getValue(path, none, expectedType); + } + + /// Return the option content on a path + Maybe getValueMaybe(const std::string& path, + Maybe expectedType = none) const + { + struct AuxNone : Option { + Type getType() const override + { + return None; + } + } defaultOp; + const Option& op = getValue(path, Maybe{defaultOp}, expectedType); + if (&op == &defaultOp) + return none; + return op; + } + + void setPrefix(const std::string& thisPrefix = "") + { // Set the prefix of this option + prefix = thisPrefix; + + switch (getType()) + { + case None: + case String: + case Double: + { + // Do nothing + break; + } + + case Vector: + { + unsigned int i = 0; + for (auto& it : getVector()) + it->setPrefix(thisPrefix + "[" + std::to_string(i++) + "]/"); + break; + } + + case Dictionary: + { + for (auto& it : getDictionary()) + it.second->setPrefix(thisPrefix + it.first + "/"); + break; + } + } + } + }; + + /// Storing a string as the value of an option + struct NoneOption : public Option { + NoneOption() + { + } + NoneOption(std::shared_ptr file, std::size_t char_num) : Option{file, char_num} + { + } + Type getType() const override + { + return None; + } + }; + + /// Storing a string as the value of an option + struct StringOption : public Option { + std::string value; + StringOption(const std::string& s, std::shared_ptr file, std::size_t char_num) + : Option{file, char_num}, value(s) 
+ { + } + StringOption(const std::string& s, Maybe op = none) : value(s) + { + if (op) + copyFileInfo(op.getSome()); + } + + Type getType() const override + { + return String; + } + std::string getString() const override + { + visited = true; + return value; + } + int getInt() const override + { + visited = true; + try + { + return std::stoi(value); + } catch (...) + { + throw_error("expected the value to be an integer"); + } + throw std::exception{}; // silent no return warning + } + unsigned int getUInt() const override + { + visited = true; + try + { + return std::stoul(value); + } catch (...) + { + throw_error("expected the value to be an unsigned integer"); + } + throw std::exception{}; // silent no return warning + } + double getDouble() const override + { + visited = true; + try + { + return std::stod(value); + } catch (...) + { + throw_error("expected the value to be a double"); + } + throw std::exception{}; // silent no return warning + } + bool getBool() const override + { + visited = true; + std::string valueLower = detail::to_lower(value); + if (valueLower == std::string("true")) + return true; + if (valueLower == std::string("false")) + return false; + throw_error("expected the value to be boolean, either `true' or `false'"); + throw std::exception{}; // silent no return warning + } + std::vector> getVector() const override + { + visited = true; + std::vector> v; + for (auto i = value.begin(), w = value.begin(); i != value.end(); ++i) + { + if (std::isspace(*i) || i + 1 == value.end()) + { + if (!std::isspace(*w)) + v.push_back(std::make_shared( + std::string(w, i + 1 == value.end() ? 
i + 1 : i), *this)); + w = i + 1; + } + } + return v; + } + }; + + /// Storing a double as the value of an option + struct DoubleOption : public Option { + double value; + DoubleOption(double d, std::shared_ptr file, std::size_t char_num) + : Option{file, char_num}, value(d) + { + } + DoubleOption(double d, Maybe op = none) : value(d) + { + if (op) + copyFileInfo(op.getSome()); + } + + Type getType() const override + { + return Double; + } + double getDouble() const override + { + visited = true; + return value; + } + int getInt() const override + { + visited = true; + return (int)std::round(value); + } + unsigned int getUInt() const override + { + visited = true; + int r = getInt(); + if (r < 0) + throw std::runtime_error("Error at prefix `" + prefix + + "': expected the value to be an unsigned integer"); + return (unsigned int)r; + } + bool getBool() const override + { + visited = true; + return std::fabs(value) != 0; + } + }; + + /// Storing a vector as the value of an option + struct VectorOption : public Option { + std::vector> value; + VectorOption(const Option& op) + { + copyFileInfo(op); + } + + Type getType() const override + { + return Vector; + } + std::vector> getVector() const override + { + visited = true; + return value; + } + std::vector>& getVector() override + { + visited = true; + return value; + } + }; + + /// Storing a vector as the value of an option + struct DictionaryOption : public Option { + std::map> value; + DictionaryOption(const Option& op) + { + copyFileInfo(op); + } + + Type getType() const override + { + return Dictionary; + } + const std::map>& getDictionary() const override + { + visited = true; + return value; + } + std::map>& getDictionary() override + { + visited = true; + return value; + } + }; + + template + struct GetExtraOption; + + /// \param ops: options into look for + /// \param path: option path + /// \param defaultValue: return value if the options isn't specified + + template + T getOption(const Option& ops, const 
std::string& path, Maybe defaultValue = none) + { + return GetExtraOption::getOption(ops, path, defaultValue); + } + + /// Return a string option given a path + /// \param ops: options into look for + /// \param path: option path + /// \param defaultValue: return value if the options isn't specified + + template <> + inline std::string getOption(const Option& ops, const std::string& path, + Maybe defaultValue) + { + StringOption defaultOp{defaultValue.getSome("")}; + return ops.getValue(path, defaultValue ? Maybe{defaultOp} : none).getString(); + } + + /// Return a double option given a path + /// \param ops: options into look for + /// \param path: option path + /// \param defaultValue: return value if the options isn't specified + + template <> + inline double getOption(const Option& ops, const std::string& path, + Maybe defaultValue) + { + DoubleOption defaultOp{defaultValue.getSome(0.0)}; + return ops.getValue(path, defaultValue ? Maybe{defaultOp} : none).getDouble(); + } + + /// Return an integer option given a path + /// \param ops: options into look for + /// \param path: option path + /// \param defaultValue: return value if the options isn't specified + + template <> + inline int getOption(const Option& ops, const std::string& path, Maybe defaultValue) + { + DoubleOption defaultOp{(double)defaultValue.getSome(0)}; + return ops.getValue(path, defaultValue ? Maybe{defaultOp} : none).getInt(); + } + + /// Return an unsigned integer option given a path + /// \param ops: options into look for + /// \param path: option path + /// \param defaultValue: return value if the options isn't specified + + template <> + inline unsigned int getOption(const Option& ops, const std::string& path, + Maybe defaultValue) + { + DoubleOption defaultOp{(double)defaultValue.getSome(0)}; + return ops.getValue(path, defaultValue ? 
Maybe{defaultOp} : none).getUInt(); + } + + /// Return a boolean option given a path + /// \param ops: options into look for + /// \param path: option path + /// \param defaultValue: return value if the options isn't specified + + template <> + inline bool getOption(const Option& ops, const std::string& path, + Maybe defaultValue) + { + StringOption defaultOp{defaultValue.getSome(false) ? "true" : "false"}; + return ops.getValue(path, defaultValue ? Maybe{defaultOp} : none).getBool(); + } + + /// Return an enum option given a path + /// \param ops: options into look for + /// \param path: option path + /// \param defaultValue: return value if the options isn't specified + + template + Enum getOption(const Option& ops, const std::string& path, const std::map& m, + Maybe defaultValue = none) + { + // Transform the map entries to lowercase + std::map m0; + for (const auto& it : m) + { + std::string k = detail::to_lower(it.first); + if (m0.count(k) == 1) + throw std::runtime_error("getOption: invalid map, case sensitive keys"); + m0[k] = it.second; + } + + // Get value from the options + const std::string defaultStr = "default"; + StringOption defaultOp{defaultStr}; + std::string value = detail::to_lower( + ops.getValue(path, defaultValue ? Maybe{defaultOp} : none).getString()); + + // If no option was given or it's the default, return the default value if a default value is given + if (defaultValue && value == defaultStr) + return defaultValue.getSome(); + + // Check that the value is in the map + if (m0.count(value) == 0) + { + std::string availableOption = defaultValue ? 
defaultStr : std::string{}; + for (const auto& it : m0) + availableOption += std::string(" ") + it.first; + ops.getValue(path).throw_error("unsupported value `" + value + + "'; supported values: " + availableOption); + } + + // Return the enum associated to the value + return m0.at(value); + } + + /// Return a vector of options given a path + /// \param ops: options into look for + /// \param path: option path + /// \param defaultValue: return value if the options isn't specified + + template + struct GetExtraOption> { + static std::vector getOption(const Option& ops, const std::string& path, + Maybe> defaultValue = none) + { + NoneOption defaultOp{}; + const Option& valueOp = + ops.getValue(path, defaultValue ? Maybe{defaultOp} : none); + if (!valueOp) + return defaultValue.getSome(); + std::vector r; + for (const auto& op : valueOp.getVector()) + r.push_back(SB::getOption(*op, "")); + return r; + } + }; + + /// Return an array of options given a path + /// \param ops: options into look for + /// \param path: option path + /// \param defaultValue: return value if the options isn't specified + + template + struct GetExtraOption> { + static std::array getOption(const Option& ops, const std::string& path, + Maybe> defaultValue = none) + { + NoneOption defaultOp{}; + const Option& valueOp = + ops.getValue(path, defaultValue ? 
Maybe{defaultOp} : none); + if (!valueOp) + return defaultValue.getSome(); + std::array r; + unsigned int i = 0; + for (const auto& op : valueOp.getVector()) + if (i < r.size()) + r[i++] = SB::getOption(*op, ""); + if (i != r.size()) + valueOp.throw_error("invalid number of elements: expected " + std::to_string(r.size())); + return r; + } + }; + + /// Return a dictionary option + /// \param ops: options into look for + /// \param path: option path + + inline const Option& getOptions(const Option& ops, const std::string& path) + { + return ops.getValue(path, Option::Dictionary); + } + + /// Return a dictionary option + /// \param ops: options into look for + /// \param path: option path + + inline Maybe getOptionsMaybe(const Option& ops, const std::string& path) + { + return ops.getValueMaybe(path, Option::Dictionary); + } + + using Options = Option; + + /// Returns options from XML + /// \param s: text + + inline std::shared_ptr