diff --git a/src/basis.f90 b/src/basis.f90 index 6fdadd78f..5b9f19358 100644 --- a/src/basis.f90 +++ b/src/basis.f90 @@ -42,6 +42,7 @@ subroutine readbasis(natomxiao,natomstart,natomfinal,nbasisstart,nbasisfinal,ier ! * Allocate arrays whose dimensions depend on NATOM (allocateatoms_ecp) ! * Read the Effective Core Potentials (ECPs), modify the atomic charges ! and the total number of electrons (readecp) + print *, "call readbasis" if (quick_method%ecp) call readecp call quick_open(ibasisfile,basisfilename,'O','F','W',.true.,ierr) CHECK_ERROR(ierr) @@ -56,6 +57,7 @@ subroutine readbasis(natomxiao,natomstart,natomfinal,nbasisstart,nbasisfinal,ier atmbs2=.true. icont=0 quick_method%ffunxiao=.true. + nfrozencore = 0 ! parse the file and find the sizes of things to allocate them in memory do while (iofile == 0 ) @@ -227,6 +229,9 @@ subroutine readbasis(natomxiao,natomstart,natomfinal,nbasisstart,nbasisfinal,ier nshell = nshell + quick_basis%kshell(quick_molspec%iattype(i)) nbasis = nbasis + kbasis(quick_molspec%iattype(i)) nprim = nprim + kcontract(quick_molspec%iattype(i)) + if(quick_method%frzCore)then + nfrozencore = nfrozencore + frozencore(quick_molspec%iattype(i))/2 + endif ! MFCC if(i.eq.natomfinal)nbasisfinal=nbasis diff --git a/src/calMP2.f90 b/src/calMP2.f90 index 15cf62cdc..de1ce3767 100644 --- a/src/calMP2.f90 +++ b/src/calMP2.f90 @@ -9,11 +9,41 @@ subroutine calmp2 use quick_cutoff_module, only: cshell_density_cutoff implicit double precision(a-h,o-z) - double precision cutoffTest,testtmp,testCutoff - integer II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2 + double precision cutoffTest,testtmp,testCutoff,gpuMP2WrapperTimeStart, gpuMP2WrapperTimeEnd + double precision :: gpuMp2cor(1),gpuEmemorysum(1) + integer II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2,NONZEROCOUNT,gpuNstepmp2(1) + integer:: gpuNtemp(1) common /hrrstore/II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2 - integer :: nelec,nelecb + integer :: nelec,nelecb + + if(.not. quick_method%frzCore)then + print *, "Do not use fc approximation!" + endif + print *, "nfrozencore is ", nfrozencore + + +#ifdef CUDA + if(quick_method%bCUDA) then + + call PrtAct(ioutfile,"Begin MP2 Calculation") + + call cpu_time(gpuMP2WrapperTimeStart) + call gpu_mp2_wrapper(quick_qm_struct%co,quick_qm_struct%vec,quick_qm_struct%dense,quick_qm_struct%E,& + cutmatrix,quick_method%integralCutoff,quick_method%primLimit,quick_method%DMCutoff,gpuMp2cor,& + gpuEmemorysum, gpuNstepmp2, gpuNtemp) + call cpu_time(gpuMP2WrapperTimeEnd) + + write(ioutfile,'("CURRENT MEMORY USAGE=",E12.6,"M")') gpuEmemorysum(1) + write(ioutfile,'("TOTAL STEP =",I6)') gpuNstepmp2(1) + write(ioutfile,'("EFFECT INTEGRALS =",i8)') gpuNtemp(1) + + quick_qm_struct%EMP2 = gpuMp2cor(1) + print *, 'cuda calculated mp2cor is', quick_qm_struct%EMP2 + timer_cumer%TMP2 = gpuMP2WrapperTimeEnd-gpuMP2WrapperTimeStart + print '("Total GPU MP2 Wrapper Time = ",f6.3," seconds.")', timer_cumer%TMP2 + endif +#else nelec = quick_molspec%nelec nelecb = quick_molspec%nelecb @@ -23,15 +53,24 @@ subroutine calmp2 quick_qm_struct%EMP2=0.0d0 ! occupied and virtual orbitals number - iocc=Nelec/2 + !change here + !iocc=Nelec/2 + iocc=Nelec/2-nfrozencore ivir=Nbasis-Nelec/2 ! calculate memory usage and determine steps ememorysum=real(iocc*ivir*nbasis*8.0d0/1024.0d0/1024.0d0/1024.0d0) ! actually nstep is step length - nstep=min(int(1.5d0/ememorysum),Nelec/2) - + ! set max mem req for Q3 to larger values + reqmemmax = 1.5d0 + !change here + !nstep=min(int(reqmemmax/ememorysum),Nelec/2-nfrozencore) + nstep=min(int(reqmemmax/ememorysum),iocc) + if(nstep<1)then + nstep=1 + endif + ! if with f orbital if(quick_method%ffunxiao)then nbasistemp=6 @@ -39,18 +78,28 @@ subroutine calmp2 nbasistemp=10 endif + print *, "in calmp2, nstep is", nstep + print *, "in calmp2, reqmemmax is", reqmemmax,& + "nbasistemp is", nbasistemp + ! Allocate some variables allocate(mp2shell(nbasis)) allocate(orbmp2(ivir,ivir)) allocate(orbmp2i331(nstep,nbasis,nbasistemp,nbasistemp,2)) allocate(orbmp2j331(nstep,ivir,nbasistemp,nbasistemp,2)) + !change here + !allocate(orbmp2k331(nstep,iocc,ivir,nbasis)) allocate(orbmp2k331(nstep,iocc,ivir,nbasis)) - ! with nstep(acutally, it represetns step lenght), we can + ! with nstep(acutally, it represetns step lenght), we can ! have no. of steps for mp2 calculation - nstepmp2=nelec/2/nstep + !change here + !nstepmp2=(nelec/2-nfrozencore)/nstep + nstepmp2=iocc/nstep nstepmp2=nstepmp2+1 - if(nstep*(nstepmp2-1).eq.nelec/2)then + !change here + !if(nstep*(nstepmp2-1).eq.(nelec/2-nfrozencore))then + if(nstep*(nstepmp2-1).eq.iocc)then nstepmp2=nstepmp2-1 endif @@ -66,17 +115,28 @@ subroutine calmp2 ttt=MAXVAL(Ycutoff) ! Max Value of Ycutoff + print *, "job step is ", nstepmp2 + print *, "nstep is ", nstep + call flush(6) + do i3new=1,nstepmp2 ! Step counter + print *, "i3new is ", i3new + call flush(6) call cpu_time(timer_begin%TMP2) ntemp=0 ! integer counter - nstepmp2s=(i3new-1)*nstep+1 ! Step start n - nstepmp2f=i3new*nstep ! Step end n + nstepmp2s=(i3new-1)*nstep+1+nfrozencore ! Step start n + nstepmp2f=i3new*nstep+nfrozencore ! Step end n if(i3new.eq.nstepmp2)nstepmp2f=nelec/2 nsteplength=nstepmp2f-nstepmp2s+1 ! Step Lengh, from nstepmp2s to nstepmp2f + + print *, "nstepmp2s is ", nstepmp2s + print *, "nstepmp2f is ", nstepmp2f ! Initial orbmp2k331 + ! change_here + !call initialOrbmp2k331(orbmp2k331,nstep,nbasis,ivir,iocc,nsteplength) call initialOrbmp2k331(orbmp2k331,nstep,nbasis,ivir,iocc,nsteplength) do II=1,jshell do JJ=II,jshell @@ -116,21 +176,20 @@ subroutine calmp2 enddo enddo enddo - + testCutoff=testCutoff*comax if(testCutoff.gt.cutoffmp2)then dnmax=comax ntemp=ntemp+1 - call shellmp2(nstepmp2s,nsteplength) - endif - + call shellmp2(nstepmp2s,nsteplength) + endif + endif enddo enddo - NII1=quick_basis%Qstart(II) NII2=quick_basis%Qfinal(II) NJJ1=quick_basis%Qstart(JJ) @@ -168,9 +227,10 @@ subroutine calmp2 enddo do j33=1,ivir - do k33=1,nelec/2 - atemp=quick_scratch%hold(k33,III) - atemp2=quick_scratch%hold(k33,JJJ) + !do k33=1+nfrozencore,nelec/2 + do k33=1,iocc + atemp=quick_scratch%hold(k33+nfrozencore,III) + atemp2=quick_scratch%hold(k33+nfrozencore,JJJ) do icycle=1,nsteplength orbmp2k331(icycle,k33,j33,JJJ)=orbmp2k331(icycle,k33,j33,JJJ)+ & orbmp2j331(icycle,j33,IIInew,JJJnew,1)*atemp @@ -203,7 +263,8 @@ subroutine calmp2 do icycle=1,nsteplength i3=nstepmp2s+icycle-1 - do k3=i3,nelec/2 + !do k3=i3,nelec/2 + do k3=i3-nfrozencore, iocc do J3=1,nbasis-nelec/2 do L3=1,nbasis-nelec/2 @@ -217,13 +278,15 @@ subroutine calmp2 do J3=1,nbasis-nelec/2 do L3=1,nbasis-nelec/2 - if(k3.gt.i3)then - quick_qm_struct%EMP2=quick_qm_struct%EMP2+2.0d0/(quick_qm_struct%E(i3)+quick_qm_struct%E(k3) & + if(k3.gt.i3-nfrozencore)then + quick_qm_struct%EMP2=quick_qm_struct%EMP2+2.0d0/(quick_qm_struct%E(i3) & + +quick_qm_struct%E(k3+nfrozencore) & -quick_qm_struct%E(j3+nelec/2)-quick_qm_struct%E(l3+nelec/2)) & *orbmp2(j3,l3)*(2.0d0*orbmp2(j3,l3)-orbmp2(l3,j3)) endif - if(k3.eq.i3)then - quick_qm_struct%EMP2=quick_qm_struct%EMP2+1.0d0/(quick_qm_struct%E(i3)+quick_qm_struct%E(k3) & + if(k3.eq.i3-nfrozencore)then + quick_qm_struct%EMP2=quick_qm_struct%EMP2+1.0d0/(quick_qm_struct%E(i3) & + +quick_qm_struct%E(k3+nfrozencore) & -quick_qm_struct%E(j3+nelec/2)-quick_qm_struct%E(l3+nelec/2)) & *orbmp2(j3,l3)*(2.0d0*orbmp2(j3,l3)-orbmp2(l3,j3)) endif @@ -233,10 +296,14 @@ subroutine calmp2 enddo enddo + + print *, "cleaned up!" call cpu_time(timer_end%TMP2) timer_cumer%TMP2=timer_end%TMP2-timer_begin%TMP2+timer_cumer%TMP2 enddo + write (ioutfile,'("EFFECT INTEGRALS =",i8)') ntemp +#endif write (iOutFile,'("SECOND ORDER ENERGY =",F16.9)') quick_qm_struct%EMP2 write (iOutFile,'("EMP2 =",F16.9)') quick_qm_struct%Etot+quick_qm_struct%EMP2 @@ -392,7 +459,8 @@ subroutine MPI_calmp2 if(testCutoff.gt.cutoffmp2)then dnmax=comax ntemp=ntemp+1 - call shellmp2(nstepmp2s,nsteplength) + !call shellmp2(nstepmp2s,nsteplength,Y_Matrix) + call shellmp2(nstepmp2s,nsteplength) endif endif @@ -971,3 +1039,20 @@ subroutine initialOrbmp2ij(orbmp2i331,nstep,nsteplength,nbasis,nbasistemp,nbasis enddo enddo end subroutine initialOrbmp2ij + +subroutine print_matrix(matrix,n) + integer n, i, j + double precision, dimension(n,n),intent(in) :: matrix + + do i=1, n + do j=1, n + write(*, '(f16.8)', advance='no'), matrix(i,j) + !write(*, '(f20.6)'), matrix(i,j) + end do + write(*,'(" ")') + end do + +end subroutine print_matrix + + + diff --git a/src/cuda/Makefile b/src/cuda/Makefile index 1c25d898c..8d2095e54 100644 --- a/src/cuda/Makefile +++ b/src/cuda/Makefile @@ -24,9 +24,9 @@ include $(MAKEIN) LIBXC_CUDA_FLAGS = -I$(libxcfolder) +CUDACOBJ=$(objfolder)/gpu.o $(objfolder)/gpu_type.o $(objfolder)/gpu_get2e.o $(objfolder)/gpu_MP2.o CXXOBJ=$(objfolder)/xc_redistribute.o -CUDACOBJ=$(objfolder)/gpu.o $(objfolder)/gpu_type.o $(objfolder)/gpu_get2e.o CUDAXCOBJ=$(objfolder)/gpu_getxc.o diff --git a/src/cuda/gpu.cu b/src/cuda/gpu.cu index 669dc0f17..87f404294 100644 --- a/src/cuda/gpu.cu +++ b/src/cuda/gpu.cu @@ -285,6 +285,8 @@ extern "C" void gpu_setup_(int* natom, int* nbasis, int* nElec, int* imult, int* upload_para_to_const(); + //for MP2, added here! + upload_para_to_const_MP2(); #ifdef DEBUG cudaEventRecord(end, 0); cudaEventSynchronize(end); @@ -1066,7 +1068,8 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP #endif - gpu -> gpu_cutoff -> YCutoff -> DeleteCPU(); + printf("Don't do gpu -> gpu_cutoff -> YCutoff -> DeleteCPU()\n"); + //gpu -> gpu_cutoff -> YCutoff -> DeleteCPU(); gpu -> gpu_cutoff -> cutPrim -> DeleteCPU(); gpu -> gpu_cutoff -> sorted_YCutoffIJ -> DeleteCPU(); @@ -1087,7 +1090,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP //----------------------------------------------- // upload calculated information //----------------------------------------------- -extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense) +extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense, QUICKDouble* E) { #ifdef DEBUG @@ -1102,9 +1105,11 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou gpu -> gpu_calculated -> o = new cuda_buffer_type(gpu->nbasis, gpu->nbasis); gpu -> gpu_calculated -> o -> DeleteGPU(); gpu -> gpu_calculated -> dense = new cuda_buffer_type(dense, gpu->nbasis, gpu->nbasis); + gpu -> gpu_calculated -> coefficient = new cuda_buffer_type(co, gpu->nbasis, gpu->nbasis); gpu -> gpu_calculated -> oULL = new cuda_buffer_type(gpu->nbasis, gpu->nbasis); - - + gpu -> gpu_calculated -> mp2cor = new cuda_buffer_type(1); //to store finally mp2 correction + gpu -> gpu_calculated -> molorbe = new cuda_buffer_type(E, gpu->nbasis); + /* oULL is the unsigned long long int type of O matrix. The reason to do so is because Atomic Operator for CUDA 2.0 is only available for integer. So for double precision type, @@ -1128,13 +1133,19 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou // gpu -> gpu_calculated -> o -> Upload(); gpu -> gpu_calculated -> dense -> Upload(); - gpu -> gpu_calculated -> oULL -> Upload(); - + gpu -> gpu_calculated -> coefficient -> Upload(); + gpu -> gpu_calculated -> oULL -> Upload(); + gpu -> gpu_calculated -> mp2cor -> Upload(); + gpu -> gpu_calculated -> molorbe -> Upload(); + // gpu -> gpu_sim.o = gpu -> gpu_calculated -> o -> _devData; gpu -> gpu_sim.dense = gpu -> gpu_calculated -> dense -> _devData; + gpu -> gpu_sim.coefficient = gpu -> gpu_calculated -> coefficient -> _devData; gpu -> gpu_sim.oULL = gpu -> gpu_calculated -> oULL -> _devData; - - + gpu -> gpu_sim.mp2cor = gpu -> gpu_calculated -> mp2cor -> _devData; + gpu -> gpu_sim.molorbe = gpu -> gpu_calculated -> molorbe -> _devData; + + #ifdef DEBUG cudaEventRecord(end, 0); cudaEventSynchronize(end); @@ -1157,6 +1168,16 @@ extern "C" void gpu_upload_density_matrix_(QUICKDouble* dense) gpu -> gpu_sim.dense = gpu -> gpu_calculated -> dense -> _devData; } +// Added by Chi Jin on 11/07/2020. Do we need this, since we did this in gpu_upload_calculated? +// Probably not. This function is not called anywhere +extern "C" void gpu_upload_coefficient_matrix_(QUICKDouble* coefficient) +{ + gpu -> gpu_calculated -> coefficient = new cuda_buffer_type(coefficient, gpu->nbasis, gpu->nbasis); + gpu -> gpu_calculated -> coefficient -> Upload(); + gpu -> gpu_sim.coefficient = gpu -> gpu_calculated -> coefficient -> _devData; +} + + //----------------------------------------------- // upload basis set information //----------------------------------------------- @@ -1165,7 +1186,7 @@ extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jba int* first_basis_function, int* last_basis_function, int* first_shell_basis_function, int* last_shell_basis_function, \ int* ncenter, int* kstart, int* katom, int* ktype, int* kprim, int* kshell, int* Ksumtype, \ int* Qnumber, int* Qstart, int* Qfinal, int* Qsbasis, int* Qfbasis,\ - QUICKDouble* gccoeff, QUICKDouble* cons, QUICKDouble* gcexpo, int* KLMN) + QUICKDouble* gccoeff, QUICKDouble* cons, QUICKDouble* gcexpo, int* KLMN, int* nfrozencore) { #ifdef DEBUG @@ -1182,17 +1203,20 @@ extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jba gpu -> gpu_basis -> jshell = *jshell; gpu -> gpu_basis -> jbasis = *jbasis; gpu -> gpu_basis -> maxcontract = *maxcontract; - + gpu -> gpu_basis -> nfrozencore = *nfrozencore; + gpu -> nshell = *nshell; gpu -> nprim = *nprim; gpu -> jshell = *jshell; gpu -> jbasis = *jbasis; + gpu -> nfrozencore = *nfrozencore; gpu -> gpu_sim.nshell = *nshell; gpu -> gpu_sim.nprim = *nprim; gpu -> gpu_sim.jshell = *jshell; gpu -> gpu_sim.jbasis = *jbasis; gpu -> gpu_sim.maxcontract = *maxcontract; + gpu -> gpu_sim.nfrozencore = *nfrozencore; gpu -> gpu_basis -> ncontract = new cuda_buffer_type(ncontract, gpu->nbasis);//gpu->nbasis); @@ -1492,11 +1516,13 @@ extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jba gpu -> gpu_basis -> prim_start -> DeleteCPU(); gpu -> gpu_basis -> Qnumber -> DeleteCPU(); - gpu -> gpu_basis -> Qstart -> DeleteCPU(); - gpu -> gpu_basis -> Qfinal -> DeleteCPU(); - gpu -> gpu_basis -> Qsbasis -> DeleteCPU(); - gpu -> gpu_basis -> Qfbasis -> DeleteCPU(); + // for MP2, let them be here for a while + //gpu -> gpu_basis -> Qstart -> DeleteCPU(); + //gpu -> gpu_basis -> Qfinal -> DeleteCPU(); + + //gpu -> gpu_basis -> Qsbasis -> DeleteCPU(); + //gpu -> gpu_basis -> Qfbasis -> DeleteCPU(); gpu -> gpu_basis -> gccoeff -> DeleteCPU(); gpu -> gpu_basis -> cons -> DeleteCPU(); gpu -> gpu_basis -> gcexpo -> DeleteCPU(); @@ -2426,7 +2452,7 @@ extern "C" void gpu_grad_(QUICKDouble* grad) cudaEventDestroy(end); #endif - PRINTDEBUG("COMPLETE RUNNING GRAD") + PRINTDEBUG("COMPLETE RUNNING GRAD"); } @@ -2678,10 +2704,13 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName){ delete gpu->gpu_calculated->o; delete gpu->gpu_calculated->dense; + delete gpu->gpu_calculated->coefficient; delete gpu->gpu_calculated->oULL; - delete gpu->gpu_cutoff->cutMatrix; + delete gpu->gpu_calculated->mp2cor; + delete gpu->gpu_calculated->molorbe; + delete gpu->gpu_cutoff->cutMatrix; delete gpu->gpu_cutoff->sorted_YCutoffIJ; - delete gpu->gpu_cutoff->YCutoff; + delete gpu->gpu_cutoff->YCutoff; delete gpu->gpu_cutoff->cutPrim; @@ -2701,7 +2730,7 @@ extern "C" void gpu_get2e_(QUICKDouble* o) PRINTDEBUG("BEGIN TO RUN KERNEL") get2e(gpu); - + PRINTDEBUG("COMPLETE KERNEL") gpu -> gpu_calculated -> oULL -> Download(); @@ -2759,6 +2788,114 @@ extern "C" void gpu_get2e_(QUICKDouble* o) } +//----------------------------------------------- +// compute mp2 +//----------------------------------------------- + +extern "C" void gpu_calmp2_(QUICKDouble* mp2cor, QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int* ntemp) +{ + PRINTDEBUG("BEGIN TO RUN gpu_calmp2"); + + upload_sim_to_constant_MP2(gpu); + + PRINTDEBUG("BEGIN TO RUN KERNEL") + + get2e_MP2(gpu, ememorysum, nstepmp2, ntemp); + printf("in gpu_calmp2_, get2e_MP2 is done\n"); + + PRINTDEBUG("COMPLETE KERNEL") + + float time = 0; + cudaEvent_t before_download, downloaded, copied; + cudaEventCreate(&before_download); + cudaEventCreate(&downloaded); + cudaEventCreate(&copied); + cudaEventRecord(before_download, 0); + + printf("here to download gpu->gpu_calculated->mp2cor\n"); + gpu->gpu_calculated->mp2cor->Download(); + printf("gpu->gpu_calculated->mp2cor downloaded\n"); + printf("gpu->gpu_calculated->mp2cor->_hostData is %lf\n", *gpu->gpu_calculated->mp2cor->_hostData); + + cudaEventRecord(downloaded, 0); + cudaEventSynchronize(downloaded); + cudaEventElapsedTime(&time, before_download, downloaded); + printf("in gpu_calmp2, total mp2cor download time is %6.3f ms\n", time); + + memcpy(mp2cor, gpu->gpu_calculated->mp2cor->_hostData, sizeof(QUICKDouble)); + + cudaEventRecord(copied, 0); + cudaEventSynchronize(copied); + cudaEventElapsedTime(&time, downloaded, copied); + printf("in gpu_calmp2, total mp2cor copy time is %6.3f ms\n", time); + + cudaEventDestroy(before_download); + cudaEventDestroy(downloaded); + cudaEventDestroy(copied); + +#ifdef DEBUG + cudaEvent_t start,end; + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start, 0); +#endif + +#ifdef DEBUG + cudaEventRecord(end, 0); + cudaEventSynchronize(end); + float time; + cudaEventElapsedTime(&time, start, end); + PRINTUSINGTIME("DOWNLOAD O",time); + cudaEventDestroy(start); + cudaEventDestroy(end); +#endif + + PRINTDEBUG("DELETE TEMP VARIABLES") + + delete gpu->gpu_calculated->o; + delete gpu->gpu_calculated->dense; + delete gpu->gpu_calculated->coefficient; + delete gpu->gpu_calculated->oULL; + delete gpu->gpu_calculated->mp2cor; + delete gpu->gpu_calculated->molorbe; + delete gpu->gpu_cutoff->cutMatrix; + + PRINTDEBUG("COMPLETE RUNNING MP2") +} + +extern "C" void gpu_mp2_wrapper_(QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense, QUICKDouble* E,\ +QUICKDouble* cutmatrix, QUICKDouble* integralCutoff,QUICKDouble* primLimit,QUICKDouble* DMCutoff, QUICKDouble* mp2cor,\ +QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int* ntemp) +{ + float time = 0; + cudaEvent_t before_upload, uploaded, mp2_done; + cudaEventCreate(&before_upload); + cudaEventCreate(&uploaded); + cudaEventCreate(&mp2_done); + cudaEventRecord(before_upload, 0); + + //not changed the signature of this functure for the moment: + QUICKDouble* o = new QUICKDouble(0); + gpu_upload_calculated_(o,co,vec,dense,E); + free(o); + gpu_upload_cutoff_(cutmatrix,integralCutoff,primLimit,DMCutoff); + + cudaEventRecord(uploaded, 0); + cudaEventSynchronize(uploaded); + cudaEventElapsedTime(&time, before_upload, uploaded); + printf("in gpu_mp2_wrapper, total upload time is %6.3f ms\n", time); + + gpu_calmp2_(mp2cor, ememorysum, nstepmp2, ntemp); + + cudaEventRecord(mp2_done, 0); + cudaEventSynchronize(mp2_done); + cudaEventElapsedTime(&time, uploaded, mp2_done); + printf("in gpu_mp2_wrapper, total calmp2 time is %6.3f ms\n", time); + printf("in gpu_mp2_wrapper, finally mp2cor is %lf\n", *mp2cor); + + cudaDeviceSynchronize(); +} + extern "C" void gpu_getxc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICKDouble* belec, QUICKDouble *o, int* nof_functionals, int* functional_id, int* xc_polarization) { PRINTDEBUG("BEGIN TO RUN GETXC") diff --git a/src/cuda/gpu.h b/src/cuda/gpu.h index 124374e84..532ead2e5 100644 --- a/src/cuda/gpu.h +++ b/src/cuda/gpu.h @@ -32,18 +32,19 @@ extern "C" void gpu_upload_atom_and_chg_(int* atom, QUICKDouble* atom_chg); extern "C" void gpu_upload_cutoff_(QUICKDouble* cutMatrix, QUICKDouble* integralCutoff,QUICKDouble* primLimit, QUICKDouble* DMCutoff); extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutPrim); extern "C" void gpu_upload_energy_(QUICKDouble* E); -extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense); +extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense, QUICKDouble* E); extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jbasis, int* maxcontract, \ int* ncontract, int* itype, QUICKDouble* aexp, QUICKDouble* dcoeff,\ int* first_basis_function, int* last_basis_function, int* first_shell_basis_function, int* last_shell_basis_function, \ int* ncenter, int* kstart, int* katom, int* ktype, int* kprim, int* kshell, int* Ksumtype, \ int* Qnumber, int* Qstart, int* Qfinal, int* Qsbasis, int* Qfbasis,\ - QUICKDouble* gccoeff, QUICKDouble* cons, QUICKDouble* gcexpo, int* KLMN); + QUICKDouble* gccoeff, QUICKDouble* cons, QUICKDouble* gcexpo, int* KLMN, int* nfrozencore); extern "C" void gpu_upload_grad_(QUICKDouble* gradCutoff); extern "C" void gpu_cleanup_(); //Following methods weddre added by Madu Manathunga extern "C" void gpu_upload_density_matrix_(QUICKDouble* dense); +extern "C" void gpu_upload_coefficient_matrix_(QUICKDouble* coefficient); extern "C" void gpu_delete_dft_grid_(); //void upload_xc_smem(); void upload_pteval(); @@ -74,7 +75,7 @@ void getxc(_gpu_type gpu, gpu_libxc_info** glinfo, int nof_functionals); void getxc_grad(_gpu_type gpu, gpu_libxc_info** glinfo, int nof_functionals); void prune_grid_sswgrad(); void gpu_delete_sswgrad_vars(); -void get2e_MP2(_gpu_type gpu); +void get2e_MP2(_gpu_type gpu, QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int* ntemp); void getAddInt(_gpu_type gpu, int bufferSize, ERI_entry* aoint_buffer); void getGrad(_gpu_type gpu); // global [get2e_kernel] @@ -164,6 +165,7 @@ __device__ void iclass_grad_spdf7(int I, int J, int K, int L, unsigned int II, u __device__ void iclass_grad_spdf8(int I, int J, int K, int L, unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL, QUICKDouble DNMax); void upload_sim_to_constant(_gpu_type gpu); +void upload_sim_to_constant_MP2(_gpu_type gpu); void upload_sim_to_constant_dft(_gpu_type gpu); void upload_para_to_const(); @@ -512,4 +514,52 @@ __device__ QUICKDouble lyp_e(QUICKDouble pa, QUICKDouble pb, QUICKDouble gax, QU __device__ QUICKDouble becke_e(QUICKDouble density, QUICKDouble densityb, QUICKDouble gax, QUICKDouble gay, QUICKDouble gaz, QUICKDouble gbx, QUICKDouble gby, QUICKDouble gbz); + +//Chi Jin 09/23/2020 +//MP2 +__global__ void get2e_MP2_kernel(); +__global__ void firstQuarterTransKernel(int II,int JJ,int nstepmp2s, int nsteplength, int nstep, int nbasistemp, unsigned long long int* ntempptr_d,\ +QUICKDouble cutoffmp2, QUICKDouble* orbmp2i331); +__global__ void secondQuarterTransKernel(int III, int JJJ, int IIInew,int JJJnew,int nsteplength, int nstep, int nbasistemp, QUICKDouble* orbmp2i331, QUICKDouble* orbmp2j331); +__global__ void thirdQuarterTransKernel(int III, int JJJ, int IIInew,int JJJnew, int nsteplength, int nstep, int nbasistemp, QUICKDouble* orbmp2j331, QUICKDouble* orbmp2k331); +__global__ void forthQuarterTransKernel(int icycle, int i3, int k3, int nstep, QUICKDouble* orbmp2k331, QUICKDouble* orbmp2); +__global__ void finalMP2AccumulationKernel(int i3, int k3, QUICKDouble* orbmp2, QUICKDouble* MP2cor); +__device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL, \ + int nstepmp2s, int nsteplength, int nstep, int nbasistemp, QUICKDouble DNMax, QUICKDouble* orbmp2i331); +void fourQuarterTransHost(_gpu_type gpu, QUICKDouble* ememorysumptr, int* nstepmp2ptr, unsigned long long int* ntempptr); + +__device__ void FmT_MP2(int MaxM, QUICKDouble X, QUICKDouble* YVerticalTemp); +__device__ void vertical_MP2(int I, int J, int K, int L, QUICKDouble* YVerticalTemp, QUICKDouble* store, + QUICKDouble Ptempx, QUICKDouble Ptempy, QUICKDouble Ptempz, \ + QUICKDouble WPtempx,QUICKDouble WPtempy,QUICKDouble WPtempz, \ + QUICKDouble Qtempx, QUICKDouble Qtempy, QUICKDouble Qtempz, \ + QUICKDouble WQtempx,QUICKDouble WQtempy,QUICKDouble WQtempz, \ + QUICKDouble ABCDtemp,QUICKDouble ABtemp, \ + QUICKDouble CDtemp, QUICKDouble ABcom, QUICKDouble CDcom); +__device__ QUICKDouble hrrwhole_MP2(int I, int J, int K, int L, \ + int III, int JJJ, int KKK, int LLL, int IJKLTYPE, QUICKDouble* store, \ + QUICKDouble RAx,QUICKDouble RAy,QUICKDouble RAz, \ + QUICKDouble RBx,QUICKDouble RBy,QUICKDouble RBz, \ + QUICKDouble RCx,QUICKDouble RCy,QUICKDouble RCz, \ + QUICKDouble RDx,QUICKDouble RDy,QUICKDouble RDz); +__device__ int lefthrr_MP2(QUICKDouble RAx, QUICKDouble RAy, QUICKDouble RAz, + QUICKDouble RBx, QUICKDouble RBy, QUICKDouble RBz, + int KLMNAx, int KLMNAy, int KLMNAz, + int KLMNBx, int KLMNBy, int KLMNBz, + int IJTYPE,QUICKDouble* coefAngularL, int* angularL); + +void upload_para_to_const_MP2(); +extern "C" void gpu_mp2_wrapper_(QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense, QUICKDouble* E,\ +QUICKDouble* cutmatrix, QUICKDouble* integralCutoff,QUICKDouble* primLimit,QUICKDouble* DMCutoff, QUICKDouble* mp2cor,\ +QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int* ntemp); + + +/* +#undef STOREDIM +#ifdef int_spd +#define STOREDIM STOREDIM_S +#else +#define STOREDIM STOREDIM_L +#endif +*/ #endif diff --git a/src/cuda/gpu_MP2.cu b/src/cuda/gpu_MP2.cu index 1166247bc..24b4ca31b 100644 --- a/src/cuda/gpu_MP2.cu +++ b/src/cuda/gpu_MP2.cu @@ -12,6 +12,13 @@ #include "gpu.h" #include +#undef STOREDIM +//#ifdef int_spd +#define STOREDIM STOREDIM_S +//#else +//#define STOREDIM STOREDIM_L +//#endif + /* Constant Memory in GPU is fast but quite limited and hard to operate, usually not allocatable and readonly. So we put the following variables into constant memory: @@ -31,18 +38,524 @@ static __constant__ int Sumindex_MP2[10]={0,0,1,4,10,20,35,56,84,120}; */ void upload_sim_to_constant_MP2(_gpu_type gpu){ cudaError_t status; - status = cudaMemcpyToSymbol("devSim_MP2", &gpu->gpu_sim, sizeof(gpu_simulation_type), 0, cudaMemcpyHostToDevice); - PRINTERROR(status, " cudaMemcpyToSymbol, sim copy to constants failed") + //status = cudaMemcpyToSymbol("devSim_MP2", &gpu->gpu_sim, sizeof(gpu_simulation_type), 0, cudaMemcpyHostToDevice); + status = cudaMemcpyToSymbol(devSim_MP2, &gpu->gpu_sim, sizeof(gpu_simulation_type)); + PRINTERROR(status, " cudaMemcpyToSymbol, sim copy to constants failed") +} + + +//this kernel is to parallelize the two inner loops of the first transformation in firstThreeQuartersTransHost + +__global__ void +__launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) firstQuarterTransKernel(int II,int JJ,int nstepmp2s, int nsteplength, int nstep, int nbasistemp, unsigned long long int* ntempptr_d,\ +QUICKDouble cutoffmp2, QUICKDouble* orbmp2i331) +{ + int offside = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; + int jshell = devSim_MP2.jshell; + int myInt = jshell*jshell/totalThreads; + + int nshell = devSim_MP2.nshell; + int nbasis = devSim_MP2.nbasis; + + if(jshell*jshell-myInt*totalThreads>offside) myInt++; + + for(int i=1;i<=myInt;i++) + { + int currentInt = totalThreads*(i-1)+offside; + int KK = currentInt/jshell; + int LL = currentInt%jshell; + + if(KK=KK) + { + QUICKDouble comax = 0.0; + QUICKDouble testCutoff = LOC2(devSim_MP2.YCutoff, II, JJ, nshell, nshell)*LOC2(devSim_MP2.YCutoff, KK, LL, nshell, nshell); + + if(testCutoff>cutoffmp2) + { + int NII1=devSim_MP2.Qstart[II]; + int NII2=devSim_MP2.Qfinal[II]; + int NJJ1=devSim_MP2.Qstart[JJ]; + int NJJ2=devSim_MP2.Qfinal[JJ]; + int NKK1=devSim_MP2.Qstart[KK]; + int NKK2=devSim_MP2.Qfinal[KK]; + int NLL1=devSim_MP2.Qstart[LL]; + int NLL2=devSim_MP2.Qfinal[LL]; + + int NBK1 = LOC2(devSim_MP2.Qsbasis, KK, NKK1, nshell, 4); + int NBK2 = LOC2(devSim_MP2.Qfbasis, KK, NKK2, nshell, 4); + int NBL1 = LOC2(devSim_MP2.Qsbasis, LL, NLL1, nshell, 4); + int NBL2 = LOC2(devSim_MP2.Qfbasis, LL, NLL2, nshell, 4); + + int KK111 = NBK1; + int KK112 = NBK2; + int LL111 = NBL1; + int LL112 = NBL2; + + for(int icycle=1; icycle<=nsteplength; icycle++) + { + int i3 = nstepmp2s + icycle -1; + for(int KKK = KK111;KKK<=KK112;KKK++) + { + for(int LLL=MAX(KKK,LL111);LLL<=LL112;LLL++) + { + comax = MAX(comax, fabs(LOC2(devSim_MP2.coefficient,KKK-1,i3-1,nbasis,nbasis))); + comax = MAX(comax, fabs(LOC2(devSim_MP2.coefficient,LLL-1,i3-1,nbasis,nbasis))); + } + } + } + testCutoff *= comax; + + if(testCutoff>cutoffmp2) + { + // from shellmp2: + QUICKADD(ntempptr_d[0], (unsigned long long int)1); + for(int I=NII1; I<=NII2; I++) + { + for(int J=NJJ1; J<=NJJ2; J++) + { + for(int K=NKK1; K<=NKK2; K++) + { + for(int L=NLL1; L<=NLL2; L++) + { + // is I,J,K,L the same iii,jjj,kkk,lll as in get2e_MP2_kernel()? + int nshell = devSim_MP2.nshell; + QUICKDouble DNMax = MAX(MAX(4.0*LOC2(devSim_MP2.cutMatrix, II, JJ, nshell, nshell), 4.0*LOC2(devSim_MP2.cutMatrix, KK, LL, nshell, nshell)), + MAX(MAX(LOC2(devSim_MP2.cutMatrix, II, LL, nshell, nshell), LOC2(devSim_MP2.cutMatrix, II, KK, nshell, nshell)), + MAX(LOC2(devSim_MP2.cutMatrix, JJ, KK, nshell, nshell), LOC2(devSim_MP2.cutMatrix, JJ, LL, nshell, nshell)))); + + if ((LOC2(devSim_MP2.YCutoff, KK, LL, nshell, nshell) * LOC2(devSim_MP2.YCutoff, II, JJ, nshell, nshell))> devSim_MP2.integralCutoff && \ + (LOC2(devSim_MP2.YCutoff, KK, LL, nshell, nshell) * LOC2(devSim_MP2.YCutoff, II, JJ, nshell, nshell) * DNMax) > devSim_MP2.integralCutoff) + { + // equivalent to classmp2 + iclass_MP2(I,J,K,L,II,JJ,KK,LL,nstepmp2s,nsteplength,nstep, nbasistemp, DNMax,orbmp2i331); + } + } + } + } + } + } + } + } + } } +//this kernel is to parallelize the second transformation +__global__ void +__launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) secondQuarterTransKernel\ +(int III, int JJJ, int IIInew,int JJJnew, int nsteplength, int nstep, int nbasistemp, \ +QUICKDouble* orbmp2i331, QUICKDouble* orbmp2j331) +{ + int offside = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; + + int nbasis = devSim_MP2.nbasis; + int nElec = devSim_MP2.nElec; + int iocc = nElec/2; + int ivir = nbasis - iocc; + QUICKDouble* coefficient = devSim_MP2.coefficient; + + int myInt = nbasis*ivir*nsteplength/totalThreads; + if(nbasis*ivir*nsteplength-myInt*totalThreads>offside) myInt++; + + for(int i=1;i<=myInt;i++) + { + int currentInt = totalThreads*(i-1)+offside; + int icycle = currentInt%nsteplength; + currentInt /= nsteplength; + int j33 = currentInt%ivir; + int LLL = currentInt/ivir; + + if(LLLthreadIdx.x) myInt++; + for(int i=1;i<=myInt;i++) + { + int k33 =blockDim.x*(i-1) +threadIdx.x; + atemps[k33]=LOC2(coefficient, III-1, k33, nbasis, nbasis); + atemp2s[k33]=LOC2(coefficient, JJJ-1, k33, nbasis, nbasis); + } + + } + __syncthreads(); +*/ + + int myInt = ivir*iocc*nsteplength/totalThreads; + if(ivir*iocc*nsteplength-myInt*totalThreads>offside) myInt++; + + for(int i=1;i<=myInt;i++) + { + int currentInt = totalThreads*(i-1)+offside; + int icycle = currentInt%nsteplength; + currentInt /= nsteplength; + int k33 = currentInt%iocc; + int j33 = currentInt/iocc; + + if(j33offside) + myInt++; + + for(int i=1;i<=myInt;i++) + { + int currentInt = totalThreads*(i-1)+offside; + int lll = currentInt%nbasis; + currentInt/=nbasis; + int l3 = currentInt%ivir; + int j3 = currentInt/ivir; + + if(j3jshell; + int nshell = gpu->nshell; + int nbasis = gpu->nbasis; + int nElec = gpu->nElec; + int nfrozencore = gpu->nfrozencore; + int iocc = nElec/2 - nfrozencore; + int ivir = nbasis - nElec/2; + + printf("in fourQuarterTransHost, nfrozencore is %d\n ",nfrozencore); + + QUICKDouble cutoffmp2 = 1.0E-8; + QUICKDouble ttt = 0; + QUICKDouble* YCutoff = gpu->gpu_cutoff->YCutoff->_hostData; + + for(int i=0;igpu_basis->Qstart->_hostData; + int* Qfinal = gpu->gpu_basis->Qfinal->_hostData; + int* Qsbasis = gpu->gpu_basis->Qsbasis->_hostData; + int* Qfbasis = gpu->gpu_basis->Qfbasis->_hostData; + + QUICKDouble ememorysum = iocc*ivir*nbasis*8.0/1024.0/1024.0/1024.0; + QUICKDouble reqmemmax = 1.5; + int nstep = MIN(int(reqmemmax/ememorysum),iocc); + if(nstep<1) + nstep = 1; + printf("nstepsize is %d\n", nstep); + + *ememorysumptr = ememorysum; + //use 6 for up to d orbitals, and 10 for f orbitals: + int nbasistemp = 10; + + printf("in cuda, reqmemmax is %lf, nbasistemp is %d\n", reqmemmax, nbasistemp); + + QUICKDouble* orbmp2i331_d; + cudaMalloc((void **)&orbmp2i331_d, sizeof(QUICKDouble)*nstep*nbasis*nbasistemp*nbasistemp*2); + + QUICKDouble* orbmp2j331_d; + cudaMalloc((void **)&orbmp2j331_d, sizeof(QUICKDouble)*nstep*ivir*nbasistemp*nbasistemp*2); + + printf("memory consumption for 3rd trans is %lfG\n", sizeof(QUICKDouble)*nstep*iocc*ivir*nbasis/1024.0/1024.0/1024.0); + + QUICKDouble* orbmp2k331_d; + cudaMalloc((void **)&orbmp2k331_d,sizeof(QUICKDouble)*nstep*iocc*ivir*nbasis); + cudaMemset(orbmp2k331_d, 0, sizeof(QUICKDouble)*nstep*iocc*ivir*nbasis); + + QUICKDouble* orbmp2_d; + cudaMalloc((void **)&orbmp2_d,sizeof(QUICKDouble)*ivir*ivir); + cudaMemset(orbmp2_d,0,sizeof(QUICKDouble)*ivir*ivir); + + QUICKDouble* MP2cor_d; + cudaMalloc((void **)&MP2cor_d,sizeof(QUICKDouble)); + cudaMemset(MP2cor_d, 0, sizeof(QUICKDouble)); + + QUICKDouble* MP2cor = new QUICKDouble[1]; + MP2cor[0]= 0; + + int nstepmp2 = iocc/nstep; + if(nstep*nstepmp2cutoffmp2/ttt) + { + cudaMemset(orbmp2i331_d,0,sizeof(QUICKDouble)*nstep*nbasis*nbasistemp*nbasistemp*2); + cudaMemset(orbmp2j331_d,0,sizeof(QUICKDouble)*nstep*ivir*nbasistemp*nbasistemp*2); + firstQuarterTransKernel<<blocks, gpu->twoEThreadsPerBlock>>>\ + (II,JJ,nstepmp2s,nsteplength,nstep,nbasistemp, ntempptr_d, cutoffmp2, orbmp2i331_d); + cudaDeviceSynchronize(); + + int NII1=Qstart[II]; + int NII2=Qfinal[II]; + int NJJ1=Qstart[JJ]; + int NJJ2=Qfinal[JJ]; + + int NBI1= LOC2(Qsbasis, II, NII1, nshell, 4); + int NBI2= LOC2(Qfbasis, II, NII2, nshell, 4); + int NBJ1= LOC2(Qsbasis, JJ, NJJ1, nshell, 4); + int NBJ2= LOC2(Qfbasis, JJ, NJJ2, nshell, 4); + + int II111=NBI1; + int II112=NBI2; + int JJ111=NBJ1; + int JJ112=NBJ2; + + for(int III=II111; III<=II112; III++) + { + for(int JJJ=max(III,JJ111); JJJ<=JJ112; JJJ++) + { + int IIInew = III - II111; + int JJJnew = JJJ - JJ111; + + // second quarter transformation + secondQuarterTransKernel<<blocks, gpu->twoEThreadsPerBlock>>>(III,JJJ,IIInew,JJJnew,nsteplength, nstep, nbasistemp,orbmp2i331_d,orbmp2j331_d); + cudaDeviceSynchronize(); + + // third quarter transformation, use devSim_MP2.orbmp2k331 and QUICKADD + thirdQuarterTransKernel<<blocks, gpu->twoEThreadsPerBlock>>>(III,JJJ,IIInew,JJJnew,nsteplength,nstep,nbasistemp,orbmp2j331_d,orbmp2k331_d); + cudaDeviceSynchronize(); + } + } + } + } + } + cudaMemcpy(ntempptr, ntempptr_d, sizeof(unsigned long long int),cudaMemcpyDeviceToHost); + printf("in fourQuarterTransHost, effect integrals is %llu\n\n", *ntempptr); + + // start the forth quarter and the final accumulation + for(int icycle=0;icycleblocks, gpu->twoEThreadsPerBlock>>>(icycle, i3, k3,nstep, orbmp2k331_d, orbmp2_d); + //cudaDeviceSynchronize(); + + //reinitialization is needed is using forthQuarterTransKernelV2 which is further parallelized, + //because no initialization can be done inside the kernel(one entry is accessed by multi threads) + cudaMemset(orbmp2_d,0,sizeof(QUICKDouble)*ivir*ivir); + forthQuarterTransKernelV2<<blocks, gpu->twoEThreadsPerBlock>>>(icycle, i3, k3,nstep, orbmp2k331_d, orbmp2_d); + cudaDeviceSynchronize(); + + finalMP2AccumulationKernel<<blocks, gpu->twoEThreadsPerBlock>>>(i3, k3, orbmp2_d, MP2cor_d); + cudaDeviceSynchronize(); + } + } + + } + cudaMemcpy(MP2cor, MP2cor_d, sizeof(QUICKDouble),cudaMemcpyDeviceToHost); + cudaMemcpy(gpu->gpu_calculated->mp2cor->_devData, MP2cor_d, sizeof(QUICKDouble),cudaMemcpyDeviceToDevice); + + cudaFree(orbmp2i331_d); + cudaFree(orbmp2j331_d); + cudaFree(orbmp2k331_d); + cudaFree(orbmp2_d); + cudaFree(MP2cor_d); + cudaFree(ntempptr_d); + delete[] MP2cor; +} + + +__global__ void +__launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) forthQuarterTransKernel(int icycle, int i3, int k3, int nstep, QUICKDouble* orbmp2k331, QUICKDouble* orbmp2) +{ + QUICKDouble* coefficient = devSim_MP2.coefficient; + int nElec = devSim_MP2.nElec; + int nbasis = devSim_MP2.nbasis; + int iocc = nElec/2; + int ivir = nbasis - iocc; + + int offside = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; + int myInt = ivir*ivir/totalThreads; + + if(ivir*ivir-myInt*totalThreads>offside) + myInt++; + + for(int i=1;i<=myInt;i++) + { + int currentInt = totalThreads*(i-1)+offside; + int j3 = currentInt/ivir; + int l3 = currentInt%ivir; + + if(j3offside) + myInt++; + + for(int i=1;i<=myInt;i++) + { + int currentInt = totalThreads*(i-1)+offside; + int j3 = currentInt/ivir; + int l3 = currentInt%ivir; + + if(j3i3-nfrozencore) + { + QUICKADD(MP2cor[0], 2.0/(molorbe[i3]+molorbe[k3+nfrozencore]-molorbe[j3+nElec/2]-molorbe[l3+nElec/2]) \ + *LOC2(orbmp2, j3, l3, ivir, ivir) \ + *(2.0*LOC2(orbmp2, j3, l3, ivir, ivir) \ + - LOC2(orbmp2, l3, j3, ivir, ivir))); + + } + if(k3==i3-nfrozencore) + { + QUICKADD(MP2cor[0],1.0/(molorbe[i3]+molorbe[k3+nfrozencore]-molorbe[j3+nElec/2]-molorbe[l3+nElec/2]) \ + *LOC2(orbmp2,j3,l3,ivir, ivir) \ + *(2.0*LOC2(orbmp2, j3, l3, ivir, ivir) \ + - LOC2(orbmp2, l3, j3, ivir, ivir))); + } + } + } +} // totTime is the timer for GPU 2e time. Only on under debug mode + #ifdef DEBUG static float totTime; #endif -void get2e_MP2(_gpu_type gpu) +void get2e_MP2(_gpu_type gpu, QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int * ntemp) { #ifdef DEBUG cudaEvent_t start,end; @@ -50,41 +563,57 @@ void get2e_MP2(_gpu_type gpu) cudaEventCreate(&end); cudaEventRecord(start, 0); #endif - printf("BLOCK= %i, THREADSperBLOCK= %i\n", gpu->blocks, gpu->twoEThreadsPerBlock); - get2e_MP2_kernel<<blocks, gpu->twoEThreadsPerBlock>>>(); - + float mp2_time; + cudaEvent_t mp2_start, mp2_trans; + cudaEventCreate(&mp2_start); + cudaEventCreate(&mp2_trans); + cudaEventRecord(mp2_start,0); + fourQuarterTransHost(gpu, ememorysum, nstepmp2, ntemp); + cudaEventRecord(mp2_trans,0); + cudaEventSynchronize(mp2_trans); + cudaEventElapsedTime(&mp2_time,mp2_start,mp2_trans); + printf("in get2e_MP2, total gpu mp2 four quarter transformation time is %6.3f ms\n", mp2_time); + + cudaEventDestroy(mp2_start); + cudaEventDestroy(mp2_trans); + #ifdef DEBUG cudaEventRecord(end, 0); cudaEventSynchronize(end); float time; cudaEventElapsedTime(&time, start, end); totTime+=time; - printf("this cycle:%f ms total time:%f ms\n", time, totTime); + //printf("this cycle:%f ms total time:%f ms\n", time, totTime); cudaEventDestroy(start); cudaEventDestroy(end); #endif - + printf("get2e_MP2 is done\n"); } - +/* __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel() { + unsigned int offside = blockIdx.x*blockDim.x+threadIdx.x; int totalThreads = blockDim.x*gridDim.x; - + //printf("in get2e_MP2_kernel, totalThread is %d, offside is %d\n", totalThreads, offside); + QUICKULL jshell = (QUICKULL) devSim_MP2.sqrQshell; QUICKULL myInt = (QUICKULL) jshell*jshell / totalThreads; + + + if ((jshell*jshell - myInt*totalThreads)> offside) myInt++; //256>offside? - if ((jshell*jshell - myInt*totalThreads)> offside) myInt++; - - for (QUICKULL i = 1; i<=myInt; i++) { - - QUICKULL currentInt = totalThreads * (i-1)+offside; + for (QUICKULL i = 1; i<=myInt; i++) { + + + QUICKULL currentInt = totalThreads * (i-1)+offside; //currentInt=offside QUICKULL a = (QUICKULL) currentInt/jshell; QUICKULL b = (QUICKULL) (currentInt - a*jshell); - +*/ + //printf("in get2e_MP2_kernel/for, offside is %d\n",offside); /* the following code is to implement a better index mode. The original one can be see as: @@ -120,8 +649,7 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel() b = 2*t-k; }*/ - - +/* int II = devSim_MP2.sorted_YCutoffIJ[a].x; int JJ = devSim_MP2.sorted_YCutoffIJ[a].y; int KK = devSim_MP2.sorted_YCutoffIJ[b].x; @@ -131,8 +659,12 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel() int jj = devSim_MP2.sorted_Q[JJ]; int kk = devSim_MP2.sorted_Q[KK]; int ll = devSim_MP2.sorted_Q[LL]; - - if (ii<=kk){ + + //printf("for ii,jj,kk,ll = %d, %d, %d, %d, allocate local memory orbmp2i331\n", ii, jj,kk,ll); + ////QUICKDouble* orbmp2i331 = new QUICKDouble[devSim_MP2.nElec/2*devSim_MP2.nbasis*6*6*2]; + QUICKDouble* orbmp2i331 = new QUICKDouble[1]; + + if(ii<=kk){ int nshell = devSim_MP2.nshell; QUICKDouble DNMax = MAX(MAX(4.0*LOC2(devSim_MP2.cutMatrix, ii, jj, nshell, nshell), 4.0*LOC2(devSim_MP2.cutMatrix, kk, ll, nshell, nshell)), MAX(MAX(LOC2(devSim_MP2.cutMatrix, ii, ll, nshell, nshell), LOC2(devSim_MP2.cutMatrix, ii, kk, nshell, nshell)), @@ -146,12 +678,15 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel() int kkk = devSim_MP2.sorted_Qnumber[KK]; int lll = devSim_MP2.sorted_Qnumber[LL]; - iclass_MP2(iii, jjj, kkk, lll, ii, jj, kk, ll, DNMax); - - } - } + //printf("to call iclass_MP2 in kernel, iii, jjj, kkk, lll are %d, %d, %d, %d\n", iii, jjj, kkk, lll); + iclass_MP2(iii, jjj, kkk, lll, ii, jj, kk, ll, DNMax, orbmp2i331); // Y should be added to orbmp2i331; + } + } + + delete[] orbmp2i331; } } +*/ /* sqr for double precision. there no internal function to do that in fast-math-lib of CUDA @@ -166,9 +701,8 @@ __device__ __forceinline__ QUICKDouble quick_dsqr_MP2(QUICKDouble a) iclass subroutine is to generate 2-electron intergral using HRR and VRR method, which is the most performance algrithem for electron intergral evaluation. See description below for details */ -__device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL, QUICKDouble DNMax) +__device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL, int nstepmp2s, int nsteplength, int nstep, int nbasistemp, QUICKDouble DNMax, QUICKDouble* orbmp2i331) { - /* kAtom A, B, C ,D is the coresponding atom for shell ii, jj, kk, ll and be careful with the index difference between Fortran and C++, @@ -201,7 +735,8 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned int kStartJ = devSim_MP2.kstart[JJ]-1; int kStartK = devSim_MP2.kstart[KK]-1; int kStartL = devSim_MP2.kstart[LL]-1; - + + int nbasis = devSim_MP2.nbasis; /* store saves temp contracted integral as [as|bs] type. the dimension should be allocatable but because @@ -345,8 +880,8 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned int KKK2 = LOC2(devSim_MP2.Qfbasis, KK, K, devSim_MP2.nshell, 4); int LLL1 = LOC2(devSim_MP2.Qsbasis, LL, L, devSim_MP2.nshell, 4); int LLL2 = LOC2(devSim_MP2.Qfbasis, LL, L, devSim_MP2.nshell, 4); - - + + // maxIJKL is the max of I,J,K,L int maxIJKL = (int)MAX(MAX(I,J),MAX(K,L)); @@ -359,131 +894,113 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned hybrid_coeff = 1.0; }else if (devSim_MP2.method == B3LYP){ hybrid_coeff = 0.2; - }else if (devSim_MP2.method == DFT){ + }else if (devSim_MP2.method == BLYP){ hybrid_coeff = 0.0; } + if(IIdevSim_MP2.integralCutoff){ - QUICKDouble DENSEKI = (QUICKDouble) LOC2(devSim_MP2.dense, KKK-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis); - QUICKDouble DENSEKJ = (QUICKDouble) LOC2(devSim_MP2.dense, KKK-1, JJJ-1, devSim_MP2.nbasis, devSim_MP2.nbasis); - QUICKDouble DENSELJ = (QUICKDouble) LOC2(devSim_MP2.dense, LLL-1, JJJ-1, devSim_MP2.nbasis, devSim_MP2.nbasis); - QUICKDouble DENSELI = (QUICKDouble) LOC2(devSim_MP2.dense, LLL-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis); - QUICKDouble DENSELK = (QUICKDouble) LOC2(devSim_MP2.dense, LLL-1, KKK-1, devSim_MP2.nbasis, devSim_MP2.nbasis); - QUICKDouble DENSEJI = (QUICKDouble) LOC2(devSim_MP2.dense, JJJ-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis); - - - // ATOMIC ADD VALUE 1 - QUICKDouble _tmp = 2.0; - if (KKK==LLL) { - _tmp = 1.0; - } - - QUICKDouble val1d = _tmp*DENSELK*Y; - //if (abs(val1d) > devSim_MP2.integralCutoff ) { - QUICKULL val1 = (QUICKULL) (fabs(val1d*OSCALE) + (QUICKDouble)0.5); - if ( val1d < (QUICKDouble)0.0) val1 = 0ull - val1; - QUICKADD(LOC2(devSim_MP2.oULL, JJJ-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1); - // } - - // ATOMIC ADD VALUE 2 - if ((LLL != JJJ) || (III!=KKK)) { - _tmp = 2.0; - if (III==JJJ) { - _tmp = 1.0; - } - - QUICKDouble val2d = _tmp*DENSEJI*Y; - // if (abs(val2d) > devSim_MP2.integralCutoff ) { - QUICKULL val2 = (QUICKULL) (fabs(val2d*OSCALE) + (QUICKDouble)0.5); - if ( val2d < (QUICKDouble)0.0) val2 = 0ull - val2; - QUICKADD(LOC2(devSim_MP2.oULL, LLL-1, KKK-1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2); - // } - } - - - // ATOMIC ADD VALUE 3 - - QUICKDouble val3d = hybrid_coeff*0.5*DENSELJ*Y; - //if (abs(2*val3d) > devSim_MP2.integralCutoff ) { - QUICKULL val3 = (QUICKULL) (fabs(val3d*OSCALE) + (QUICKDouble)0.5); - if (((III == KKK) && (III < JJJ) && (JJJ < LLL))) { - val3 = (QUICKULL) (fabs(2*val3d*OSCALE) + (QUICKDouble)0.5); - } - if ( DENSELJ*Y < (QUICKDouble)0.0) val3 = 0ull - val3; - QUICKADD(LOC2(devSim_MP2.oULL, KKK-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val3); - //} - - // ATOMIC ADD VALUE 4 - if (KKK != LLL) { - QUICKDouble val4d = hybrid_coeff*0.5*DENSEKJ*Y; - // if (abs(val4d) > devSim_MP2.integralCutoff ) { - QUICKULL val4 = (QUICKULL) (fabs(val4d*OSCALE) + (QUICKDouble)0.5); - if ( val4d < (QUICKDouble)0.0) val4 = 0ull - val4; - QUICKADD(LOC2(devSim_MP2.oULL, LLL-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val4); - //} - } - - - - // ATOMIC ADD VALUE 5 - QUICKDouble val5d = hybrid_coeff*0.5*DENSELI*Y; - //if (abs(val5d) > devSim_MP2.integralCutoff ) { - QUICKULL val5 = (QUICKULL) (fabs(val5d*OSCALE) + (QUICKDouble)0.5); - if ( val5d < (QUICKDouble)0.0) val5 = 0ull - val5; - - if ((III != JJJ && III devSim_MP2.integralCutoff ) { - - QUICKULL val6 = (QUICKULL) (fabs(val6d*OSCALE) + (QUICKDouble)0.5); - if ( val6d < (QUICKDouble)0.0) val6 = 0ull - val6; - - QUICKADD(LOC2(devSim_MP2.oULL, MAX(JJJ,LLL)-1, MIN(JJJ,LLL)-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val6); - - // ATOMIC ADD VALUE 6 - 2 - if (JJJ == LLL && III!= KKK) { - QUICKADD(LOC2(devSim_MP2.oULL, LLL-1, JJJ-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val6); - } + + + //first quarter transformation: + if(fabs(Y)>devSim_MP2.integralCutoff) + { + for(int i3mp2=1; i3mp2<=nsteplength;i3mp2++) + { + int i3mp2new = nstepmp2s+i3mp2-1; + QUICKDouble atemp = LOC2(devSim_MP2.coefficient, KKK-1, i3mp2new-1, devSim_MP2.nbasis, devSim_MP2.nbasis)*Y; + QUICKDouble btemp = LOC2(devSim_MP2.coefficient, LLL-1, i3mp2new-1, devSim_MP2.nbasis, devSim_MP2.nbasis)*Y; + + // AO index in its shell + int IIInew = III- devSim_MP2.Ksumtype[II]+1; + int JJJnew = JJJ- devSim_MP2.Ksumtype[JJ]+1; + //printf("Here III is %d, II is %d, devSim_MP2.Ksumtype[II] is %d, IIInew is %d\n",III, II, devSim_MP2.Ksumtype[II], IIInew); + // KKK -> i3mp2, note i3mp2 is from [1, nsteplength], nsteplength is from nstep, MAX(nstep) is nElec/2 + QUICKADD(LOC5(orbmp2i331,i3mp2-1,LLL-1,IIInew-1,JJJnew-1,0, nstep, nbasis, nbasistemp, nbasistemp, 2), atemp); + // KKK -> i3mp2 + QUICKADD(LOC5(orbmp2i331,i3mp2-1,LLL-1,JJJnew-1,IIInew-1,1, nstep, nbasis, nbasistemp, nbasistemp, 2), atemp); + // LLL -> i3mp2 + QUICKADD(LOC5(orbmp2i331,i3mp2-1,KKK-1,IIInew-1,JJJnew-1,0, nstep, nbasis, nbasistemp, nbasistemp, 2), btemp); + // LLL -> i3mp2 + QUICKADD(LOC5(orbmp2i331,i3mp2-1,KKK-1,JJJnew-1,IIInew-1,1, nstep, nbasis, nbasistemp, nbasistemp, 2), btemp); + } + } + + + } + } + } + } + } + else + { + for(int III=III1; III<=III2; III++) + { + if(MAX(III,JJJ1)<=JJJ2) + { + for(int JJJ=MAX(III,JJJ1);JJJ<=JJJ2;JJJ++) + { + for(int KKK=KKK1; KKK<=KKK2; KKK++) + { + if(MAX(KKK,LLL1)<=LLL2) + { + for(int LLL=MAX(KKK,LLL1);LLL<=LLL2;LLL++) + { + QUICKDouble Y = (QUICKDouble) hrrwhole_MP2( I, J, K, L,\ + III, JJJ, KKK, LLL, IJKLTYPE, store, \ + RAx, RAy, RAz, RBx, RBy, RBz, \ + RCx, RCy, RCz, RDx, RDy, RDz); + if(fabs(Y)>devSim_MP2.integralCutoff) + { + for(int i3mp2=1;i3mp2<=nsteplength;i3mp2++) + { + int i3mp2new = nstepmp2s+i3mp2-1; + QUICKDouble atemp = LOC2(devSim_MP2.coefficient, KKK-1, i3mp2new-1, devSim_MP2.nbasis, devSim_MP2.nbasis)*Y; + QUICKDouble btemp = LOC2(devSim_MP2.coefficient, LLL-1, i3mp2new-1, devSim_MP2.nbasis, devSim_MP2.nbasis)*Y; + int IIInew = III - devSim_MP2.Ksumtype[II]+1; + int JJJnew = JJJ - devSim_MP2.Ksumtype[JJ]+1; + QUICKADD(LOC5(orbmp2i331, i3mp2-1, LLL-1, IIInew-1, JJJnew-1, 0, nstep, nbasis, nbasistemp, nbasistemp, 2),atemp); + if(JJJ != III) + { + QUICKADD(LOC5(orbmp2i331, i3mp2-1, LLL-1, JJJnew-1, IIInew-1, 1, nstep, nbasis, nbasistemp, nbasistemp, 2),atemp); + } + if(KKK != LLL) + { + QUICKADD(LOC5(orbmp2i331, i3mp2-1, KKK-1, IIInew-1, JJJnew-1, 0, nstep, nbasis, nbasistemp, nbasistemp, 2),btemp); + if(III != JJJ) + { + QUICKADD(LOC5(orbmp2i331, i3mp2-1, KKK-1, JJJnew-1, IIInew-1, 1, nstep, nbasis, nbasistemp, nbasistemp, 2),btemp); + } + } + + } + } } - //} } } - //} - } } } } + return; } @@ -497,8 +1014,27 @@ __device__ void vertical_MP2(int I, int J, int K, int L, QUICKDouble* YVerticalT QUICKDouble ABCDtemp,QUICKDouble ABtemp, \ QUICKDouble CDtemp, QUICKDouble ABcom, QUICKDouble CDcom) { + //unsigned int offside = blockIdx.x*blockDim.x+threadIdx.x; + + // here is fine + //printf("at the begining of vertical_MP2, I, J, K, L are %d, %d, %d, %d\n",I, J, K, L); + + /* + if(offside==0) + { + printf("Ptempx is %lf, Ptempy is %lf, Ptempz is %lf, WPtempx is %lf, Qtempx is %lf, WQtempx is %lf, ABCDtemp is %lf, CDtemp is %lf, VY(0,0,0) is %lf\n",\ + Ptempx, Ptempy, Ptempz, WPtempx, Qtempx, WQtempx, ABCDtemp, CDtemp, VY(0,0,0)); + } + */ LOC2(store, 0, 0, STOREDIM, STOREDIM) += VY( 0, 0, 0); + + /* + if(offside==0) + { + printf("at the begining of vertical_MP2, store[0] is %lf, this is case %d\n", store[0], (I+J)*10+K+L); + } + */ switch ((I+J)*10+K+L){ case 1: { @@ -1086,7 +1622,8 @@ __device__ void vertical_MP2(int I, int J, int K, int L, QUICKDouble* YVerticalT default: { #ifndef CUDA_SP - if (K+L>=1){ + //printf("fall into default case, I, J, K, L are %d, %d, %d, %d\n",I, J, K, L); + if (K+L>=1){ //SSPS(0, YVerticalTemp, Qtempx, Qtempy, Qtempz, WQtempx, WQtempy, WQtempz); QUICKDouble x_0_1_0 = Qtempx * VY( 0, 0, 0) + WQtempx * VY( 0, 0, 1); QUICKDouble x_0_2_0 = Qtempy * VY( 0, 0, 0) + WQtempy * VY( 0, 0, 1); @@ -5479,30 +6016,38 @@ __device__ QUICKDouble hrrwhole_MP2(int I, int J, int K, int L, \ QUICKDouble RCx,QUICKDouble RCy,QUICKDouble RCz, \ QUICKDouble RDx,QUICKDouble RDy,QUICKDouble RDz) { + //printf("at the beginning of hrrwhole_MP2\n"); QUICKDouble Y; #ifdef CUDA_SP + printf("def CUDA_SP\n") int NAx = LOC2(devSim_MP2.KLMN,0,III-1,3,devSim_MP2.nbasis); int NAy = LOC2(devSim_MP2.KLMN,1,III-1,3,devSim_MP2.nbasis); int NAz = LOC2(devSim_MP2.KLMN,2,III-1,3,devSim_MP2.nbasis); - + //printf("NAx, NAy, NAz are done\n"); + int NBx = LOC2(devSim_MP2.KLMN,0,JJJ-1,3,devSim_MP2.nbasis); int NBy = LOC2(devSim_MP2.KLMN,1,JJJ-1,3,devSim_MP2.nbasis); int NBz = LOC2(devSim_MP2.KLMN,2,JJJ-1,3,devSim_MP2.nbasis); - + //printf("NBx, NBy, NBz are done\n"); + int NCx = LOC2(devSim_MP2.KLMN,0,KKK-1,3,devSim_MP2.nbasis); int NCy = LOC2(devSim_MP2.KLMN,1,KKK-1,3,devSim_MP2.nbasis); int NCz = LOC2(devSim_MP2.KLMN,2,KKK-1,3,devSim_MP2.nbasis); + //printf("NCx, NCy, NCz are done\n"); int NDx = LOC2(devSim_MP2.KLMN,0,LLL-1,3,devSim_MP2.nbasis); int NDy = LOC2(devSim_MP2.KLMN,1,LLL-1,3,devSim_MP2.nbasis); int NDz = LOC2(devSim_MP2.KLMN,2,LLL-1,3,devSim_MP2.nbasis); - + //printf("NDx, NDy, NDz are done\n"); + int MA = LOC3(devTrans_MP2, NAx, NAy, NAz, TRANSDIM, TRANSDIM, TRANSDIM); int MB = LOC3(devTrans_MP2, NBx, NBy, NBz, TRANSDIM, TRANSDIM, TRANSDIM); int MC = LOC3(devTrans_MP2, NCx, NCy, NCz, TRANSDIM, TRANSDIM, TRANSDIM); int MD = LOC3(devTrans_MP2, NDx, NDy, NDz, TRANSDIM, TRANSDIM, TRANSDIM); + //printf("inside hrrwhole_MP2, IJKLTYPE is %d\n", IJKLTYPE); + switch (IJKLTYPE) { case 0: case 10: @@ -5728,11 +6273,10 @@ __device__ QUICKDouble hrrwhole_MP2(int I, int J, int K, int L, \ break; } #else - int angularL[8], angularR[8]; QUICKDouble coefAngularL[8], coefAngularR[8]; - Y = (QUICKDouble) 0.0; - + Y = (QUICKDouble) 0.0; + int numAngularL = lefthrr_MP2(RAx, RAy, RAz, RBx, RBy, RBz, LOC2(devSim_MP2.KLMN,0,III-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,1,III-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,2,III-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,0,JJJ-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,1,JJJ-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,2,JJJ-1,3,devSim_MP2.nbasis), @@ -5741,14 +6285,15 @@ __device__ QUICKDouble hrrwhole_MP2(int I, int J, int K, int L, \ LOC2(devSim_MP2.KLMN,0,KKK-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,1,KKK-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,2,KKK-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,0,LLL-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,1,LLL-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,2,LLL-1,3,devSim_MP2.nbasis), L, coefAngularR, angularR); - + for (int i = 0; i=B?A:B) #define MIN(A,B) (AmaxL is %d\n", gpu->maxL); // Part spd // nvtxRangePushA("SCF 2e"); + //printf("before enter get2e_kernel\n"); QUICK_SAFE_CALL((get2e_kernel<<blocks, gpu->twoEThreadsPerBlock>>>())); - + //printf("gpu->maxL is %d\n", gpu->maxL); + #ifdef CUDA_SPDF + //printf("CUDA_SPDF\n"); if (gpu->maxL >= 3) { // Part f-1 QUICK_SAFE_CALL((get2e_kernel_spdf<<blocks, gpu->twoEThreadsPerBlock>>>())); diff --git a/src/cuda/gpu_get2e_subs.h b/src/cuda/gpu_get2e_subs.h index 95de8ac3d..eceeb5a8d 100644 --- a/src/cuda/gpu_get2e_subs.h +++ b/src/cuda/gpu_get2e_subs.h @@ -108,14 +108,18 @@ __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10() #endif { + //if(threadIdx.x == 0) + //printf("inside get2e_kernel\n"); unsigned int offside = blockIdx.x*blockDim.x+threadIdx.x; int totalThreads = blockDim.x*gridDim.x; - + //printf("offside is %d\n", offside); + //printf("totalThreads is %d\n", totalThreads); + // jshell and jshell2 defines the regions in i+j and k+l axes respectively. // sqrQshell= Qshell x Qshell; where Qshell is the number of sorted shells (see gpu_upload_basis_ in gpu.cu) // for details on sorting. -#ifdef int_spd +#ifdef int_spd //kernel 0, zone 0,1,2,3,4 /* Here we walk through full cutoff matrix. @@ -129,11 +133,12 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10() |_____________| | */ - + //printf("is int_spd"); QUICKULL jshell = (QUICKULL) devSim.sqrQshell; QUICKULL jshell2 = (QUICKULL) devSim.sqrQshell; + //printf("jshell and jshell2 are %llu\n", jshell); -#elif defined int_spdf +#elif defined int_spdf //kernel 1, zone 1,3,4 /* Here we walk through following region of the cutoff matrix. @@ -152,18 +157,18 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10() QUICKULL jshell = (QUICKULL) devSim.sqrQshell; QUICKULL jshell2 = (QUICKULL) devSim.sqrQshell - devSim.fStart; -#elif defined int_spdf2 +#elif defined int_spdf2 //kernel 2, zone 2,3,4 QUICKULL jshell = (QUICKULL) devSim.sqrQshell; QUICKULL jshell2 = (QUICKULL) devSim.sqrQshell - devSim.fStart; -#elif defined int_spdf3 +#elif defined int_spdf3 //kernel 3, zone 3,4 QUICKULL jshell0 = (QUICKULL) devSim.fStart; QUICKULL jshell = (QUICKULL) devSim.sqrQshell - jshell0; QUICKULL jshell2 = (QUICKULL) devSim.sqrQshell - jshell0; -#elif defined int_spdf4 +#elif defined int_spdf4 //kernel 4, zone 4 QUICKULL jshell0 = (QUICKULL) devSim.fStart; QUICKULL jshell00 = (QUICKULL) devSim.ffStart; @@ -236,6 +241,9 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10() QUICKULL a = (QUICKULL) i/jshell; QUICKULL b = (QUICKULL) (i - a*jshell); + //printf("a is %llu", a); + //printf("b is %llu", b); + #elif defined int_spdf @@ -348,7 +356,7 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10() a = 0; b = 0; } - +// a,b are thread ordinates (x,y) #endif #ifdef CUDA_MPIV @@ -361,6 +369,12 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10() int ii = devSim.sorted_Q[II]; int kk = devSim.sorted_Q[KK]; + //printf("II is %d\n",II); + //printf("KK is %d\n",KK); + //printf("ii is %d\n",ii); + //printf("kk is %d\n",kk); + + if (ii<=kk){ diff --git a/src/cuda/gpu_type.h b/src/cuda/gpu_type.h index 62ca4f3c7..7a0b3ed38 100644 --- a/src/cuda/gpu_type.h +++ b/src/cuda/gpu_type.h @@ -31,7 +31,15 @@ struct gpu_calculated_type { int nbasis; // number of basis sets cuda_buffer_type* o; // O matrix cuda_buffer_type* dense; // Density Matrix - cuda_buffer_type* oULL; // Unsigned long long int type O matrix + cuda_buffer_type* coefficient; //Coefficient Matrix + //cuda_buffer_type* Y_Matrix; // Y Matrix + cuda_buffer_type* mp2cor; + //cuda_buffer_type* orbmp2i331; //first quarter of ERI transformation + //cuda_buffer_type* orbmp2j331; //second quarter of ERI transformation + //cuda_buffer_type* orbmp2k331; //third quarter of ERI transformation + //cuda_buffer_type* orbmp2; //forth quarter of ERI transformation + cuda_buffer_type* molorbe; + cuda_buffer_type* oULL; // Unsigned long long int type O matrix cuda_buffer_type* distance; // distance matrix }; @@ -170,6 +178,7 @@ struct gpu_simulation_type { int fStart; int ffStart; int maxL; + int nfrozencore; //New XC implementation int npoints; //Total number of packed grid points @@ -273,7 +282,15 @@ struct gpu_simulation_type { QUICKDouble* o; QUICKULL* oULL; QUICKDouble* dense; - + QUICKDouble* coefficient; + //QUICKDouble* Y_Matrix; + QUICKDouble* mp2cor; + //QUICKDouble* orbmp2i331; + //QUICKDouble* orbmp2j331; + //QUICKDouble* orbmp2k331; + //QUICKDouble* orbmp2; + QUICKDouble* molorbe; + QUICKDouble* distance; QUICKDouble* Xcoeff; QUICKDouble* expoSum; @@ -330,7 +347,7 @@ struct gpu_basis_type { int fStart; int ffStart; - + int nfrozencore; // Gaussian Type function cuda_buffer_type* ncontract; @@ -430,7 +447,8 @@ struct gpu_type { int jshell; int jbasis; int maxL; - + int nfrozencore; + cuda_buffer_type* iattype; cuda_buffer_type* xyz; cuda_buffer_type* chg; diff --git a/src/amber_interface.f90 b/src/deprecated/amber_interface.f90 similarity index 100% rename from src/amber_interface.f90 rename to src/deprecated/amber_interface.f90 diff --git a/src/calmp2divcon.f90 b/src/deprecated/calmp2divcon.f90 similarity index 100% rename from src/calmp2divcon.f90 rename to src/deprecated/calmp2divcon.f90 diff --git a/src/debug_quick.f90 b/src/deprecated/debug_quick.f90 similarity index 100% rename from src/debug_quick.f90 rename to src/deprecated/debug_quick.f90 diff --git a/src/deprecated/dipole.f90 b/src/deprecated/dipole.f90 new file mode 100644 index 000000000..8df356290 --- /dev/null +++ b/src/deprecated/dipole.f90 @@ -0,0 +1,498 @@ +! +! dipole.f90 +! new_quick +! +! Created by Yipu Miao on 3/4/11. +! Copyright 2011 University of Florida. All rights reserved. +! +! written by Ed Brothers. March 18,2002 +! 3456789012345678901234567890123456789012345678901234567890123456789012<= quick_method%iscf-1) then diff --git a/src/scf_operator.f90 b/src/scf_operator.f90 index b5d3c9eef..cec6cabfb 100644 --- a/src/scf_operator.f90 +++ b/src/scf_operator.f90 @@ -84,7 +84,10 @@ subroutine scf_operator(deltaO) call cpu_time(timer_begin%T2e) #if defined CUDA || defined CUDA_MPIV - if (quick_method%bCUDA) then + + print *, "in scf_operator, get CUDA" + + if (quick_method%bCUDA) then if(quick_method%HF)then call gpu_upload_method(0, 1.0d0) @@ -105,6 +108,8 @@ subroutine scf_operator(deltaO) if (quick_method%nodirect) then #ifdef CUDA + print *, "call gpu_addint" + print *, "" call gpu_addint(quick_qm_struct%o, intindex, intFileName) #else #ifndef MPI @@ -120,14 +125,17 @@ subroutine scf_operator(deltaO) ! The next two terms define the two electron part. !----------------------------------------------------------------- #if defined CUDA || defined CUDA_MPIV - if (quick_method%bCUDA) then - call gpu_get2e(quick_qm_struct%o) + if (quick_method%bCUDA) then + !print *, "before gpu_get2e, quick_qm_struct%o is ", quick_qm_struct%o + call gpu_get2e(quick_qm_struct%o) !c function + !print *, "after gpu_get2e, quick_qm_struct%o is ", quick_qm_struct%o else #endif ! Schwartz cutoff is implemented here. (ab|cd)**2<=(ab|ab)*(cd|cd) ! Reference: Strout DL and Scuseria JCP 102(1995),8448. #if defined MPIV && !defined CUDA_MPIV + print *, "defined MPIV and not defined CUDA_MPIV" ! Every nodes will take about jshell/nodes shells integrals such as 1 water, which has ! 4 jshell, and 2 nodes will take 2 jshell respectively. if(bMPI) then @@ -141,9 +149,15 @@ subroutine scf_operator(deltaO) enddo endif #else + !print *, "Here in scf_operator, jshell is", jshell + !print *, "Here in scf_operator, before adding 2e integrals, quick_qm_struct%o is" + !print *, quick_qm_struct%o do II=1,jshell - call get2e(II) + call get2e(II) !!serial get2e and added to quick_qm_struct%o: get2e->shell->iclass enddo + !print *, "Here in scf_operator, after adding 2e integrals, quick_qm_struct%o is" + !print *, quick_qm_struct%o + !print *, " " #endif #if defined CUDA || defined CUDA_MPIV diff --git a/src/shell.f90 b/src/shell.f90 index b44632f55..ad57c758c 100644 --- a/src/shell.f90 +++ b/src/shell.f90 @@ -868,7 +868,7 @@ subroutine iclass(I,J,K,L,NNA,NNC,NNAB,NNCD) do KKK=max(III,KKK1),KKK2 do LLL=max(KKK,LLL1),LLL2 if (III.lt.JJJ .and. III.lt. KKK .and. KKK.lt. LLL) then - call hrrwhole + call hrrwhole !get Y for get2e if (abs(Y).gt.quick_method%maxIntegralCutoff) then A = (III-1)*nbasis+JJJ-1 B = (KKK-1)*nbasis+LLL-1 @@ -895,7 +895,7 @@ subroutine iclass(I,J,K,L,NNA,NNC,NNAB,NNCD) endif endif else if((III.LT.KKK).OR.(JJJ.LE.LLL))then - call hrrwhole + call hrrwhole if (abs(Y).gt.quick_method%maxintegralCutoff) then A = (III-1)*nbasis+JJJ-1 @@ -934,12 +934,13 @@ subroutine iclass(I,J,K,L,NNA,NNC,NNAB,NNCD) else - if(II.lt.JJ.and.II.lt.KK.and.KK.lt.LL)then + !print *, "quick_method%x_hybrid_coeff is ", quick_method%x_hybrid_coeff + if(II.lt.JJ.and.II.lt.KK.and.KK.lt.LL)then !2e integral do III=III1,III2 do JJJ=JJJ1,JJJ2 do KKK=KKK1,KKK2 do LLL=LLL1,LLL2 - call hrrwhole + call hrrwhole !get Y for get2e !write(*,*) Y,III,JJJ,KKK,LLL DENSEKI=quick_qm_struct%dense(KKK,III) DENSEKJ=quick_qm_struct%dense(KKK,JJJ) @@ -1303,6 +1304,7 @@ end subroutine attrashellenergy ! Vertical Recursion by Xiao HE 07/07/07 version +!subroutine shellmp2(nstepmp2s,nsteplength, Y_Matrix) subroutine shellmp2(nstepmp2s,nsteplength) use allmod @@ -1313,6 +1315,9 @@ subroutine shellmp2(nstepmp2s,nsteplength) double precision RA(3),RB(3),RC(3),RD(3) double precision Qtemp(3),WQtemp(3),CDtemp,ABcom,Ptemp(3),WPtemp(3),ABtemp,CDcom,ABCDtemp + ! add Y_Matrix here + !double precision :: Y_Matrix(nbasis*nbasis, nbasis*nbasis) + integer II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2 common /hrrstore/II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2 @@ -1320,13 +1325,20 @@ subroutine shellmp2(nstepmp2s,nsteplength) COMMON /COM1/RA,RB,RC,RD + !print *, "in shellmp2, quick_basis%katom are", quick_basis%katom ! indice of central atoms + !print *, "in shell mp2, II, JJ, KK, LL are", II,JJ,KK,LLL + !print *, "in shellmp2, quick_basis%ktype are", quick_basis%ktype + do M=1,3 RA(M)=xyz(M,quick_basis%katom(II)) RB(M)=xyz(M,quick_basis%katom(JJ)) RC(M)=xyz(M,quick_basis%katom(KK)) RD(M)=xyz(M,quick_basis%katom(LL)) + !print *, "in shellmp2, RA(M), RB(M), RC(M), RD(M), M are", RA(M),RB(M),RC(M),RD(M),M enddo - + + !print *, "in shellmp2, II, JJ, KK, LL are", II, JJ, KK, LL + NII1=quick_basis%Qstart(II) NII2=quick_basis%Qfinal(II) NJJ1=quick_basis%Qstart(JJ) @@ -1430,6 +1442,11 @@ subroutine shellmp2(nstepmp2s,nsteplength) enddo enddo + !print *, "NII1, NII2 is", NII1, NII2 + !print *, "NJJ1, NJJ2 is", NJJ1, NJJ2 + !print *, "NKK1, NKK2 is", NKK1, NKK2 + !print *, "NLL1, NLL2 is", NLL1, NLL2 + !print *, "Sumindex is", Sumindex do I=NII1,NII2 NNA=Sumindex(I-1)+1 @@ -1439,8 +1456,18 @@ subroutine shellmp2(nstepmp2s,nsteplength) NNC=Sumindex(k-1)+1 do L=NLL1,NLL2 NNCD=SumIndex(K+L) - call classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) - ! call class + !! NNA,NNC,NNAB,NNCD not used? + !This prescreening is adopted from the cuda version + DNmaxmp2 = max(4.0d0*cutmatrix(II,JJ), & + 4.0d0*cutmatrix(KK,LL), & + cutmatrix(II,LL), & + cutmatrix(II,KK), & + cutmatrix(JJ,KK), & + cutmatrix(JJ,LL)) + if((Ycutoff(II,JJ)*Ycutoff(KK,LL).gt.quick_method%integralCutoff) .and.& + (Ycutoff(II,JJ)*Ycutoff(KK,LL))*DNmaxmp2.gt.quick_method%integralCutoff)then + call classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) + endif enddo enddo enddo @@ -1744,6 +1771,7 @@ End subroutine classmp2divcon ! Horrizontal recursion and Fock matrix builder by Xiao HE 07/07/07 version +!subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength, Y_Matrix) subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) ! subroutine class use allmod @@ -1756,6 +1784,8 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) double precision FM(0:13) double precision RA(3),RB(3),RC(3),RD(3) double precision X44(129600) + ! add Y_Matrix here, why here the dimension needs to be specified? + ! double precision, dimension(nbasis*nbasis,nbasis*nbasis):: Y_Matrix COMMON /COM1/RA,RB,RC,RD COMMON /COM2/AA,BB,CC,DD,AB,CD,ROU,ABCD @@ -1766,6 +1796,7 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) common /xiaostore/store common /hrrstore/II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2 +! commenting starts here ITT=0 do JJJ=1,quick_basis%kprim(JJ) Nprij=quick_basis%kstart(JJ)+JJJ-1 @@ -1797,7 +1828,7 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) store(MM1,MM2)=Ytemp enddo enddo - +! commenting ends here NBI1=quick_basis%Qsbasis(II,I) NBI2=quick_basis%Qfbasis(II,I) @@ -1834,8 +1865,23 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) NBI1=quick_basis%Qsbasis(II,NII1) NBJ1=quick_basis%Qsbasis(JJ,NJJ1) - II111=quick_basis%ksumtype(II)+NBI1 - JJ111=quick_basis%ksumtype(JJ)+NBJ1 + !II111=quick_basis%ksumtype(II)+NBI1 + !JJ111=quick_basis%ksumtype(JJ)+NBJ1 + II111=quick_basis%ksumtype(II) + JJ111=quick_basis%ksumtype(JJ) + + !print *, "in classmp2, II111, JJ111 are", II111, JJ111 + !print *, "in classmp2, NBI1 and NBJ1 should always be 0 while they are:", NBI1, NBJ1 + !if(NBI1.NE.0 .OR. NBJ1.NE.0) then + ! print *, "got nonezero NBI1 or NBJ1" + !endif + + ! from here inspection + !print *, "in classmp2, before getting Y," + !print *, "III1, III2 are", III1, III2 + !print *, "JJJ1, JJJ2 are", JJJ1, JJJ2 + !print *, "KKK1, KKK2 are", KKK1, KKK2 + !print *, "LLL1, LLL2 are", LLL1, LLL2 if(II.lt.JJ.and.KK.lt.LL)then @@ -1843,24 +1889,65 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) do JJJ=JJJ1,JJJ2 do KKK=KKK1,KKK2 do LLL=LLL1,LLL2 - +!#ifndef CUDA + !print *, "before calling hrrwhole" + !print *, "get Y is ", Y call hrrwhole - if (dabs(Y).gt.quick_method%integralCutoff) then + !print *, "FIRST CALL after calling hrrwhole" + !print "('in serial III,JJJ,KKK,LLL,Y are', i2, i2, i2, i2, f10.6)", III, JJJ, KKK, LLL, Y +!#endif +!#ifdef CUDA +! Y = 0.0 + ! Here we retrieve the Y value calculated in gpu_MP2.cu/hrrwhole_MP2. Two points to note: + ! 1. Cuda code only calculated Y_Matrix(i,j,k,l) where ik, use the sysmetry Y_Matrix(i,j,k,l) = Y_Matrix(k,l,i,j) + ! 2. In Fortran arrays are stored by column in memory. + !if(quick_method%bCUDA) then + ! if((III-1)*nbasis+JJJ > (KKK-1)*nbasis+LLL) then + !print *, "from cuda code corresponding Y_Matrix value is",Y_Matrix((III-1)*nbasis+JJJ, (KKK-1)*nbasis+LLL) + !print *, "" + ! Y = Y_Matrix((III-1)*nbasis+JJJ, (KKK-1)*nbasis+LLL) + ! else + !print *, "from cuda code corresponding Y_Matrix value is",Y_Matrix((KKK-1)*nbasis+LLL, (III-1)*nbasis+JJJ) + !print *, "" + ! Y = Y_Matrix((KKK-1)*nbasis+LLL, (III-1)*nbasis+JJJ) + ! endif + !endif +!#endif + !print *, "" + !print "('in classmp2, III,JJJ,KKK,LLL are', i3, i3, i3, i3, ', II,JJ,KK,LL are', & + ! i3, i3, i3, i3, ', I,J,K,L are', i3, i3, i3, i3)", III,JJJ,KKK,LLL,II,JJ,KK,LL,I,J,K,L + !print "('for orbmp2i331 with i3mp2=5, LLL, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')", LLL, IIInew, JJJnew + !print "('for orbmp2i331 with i3mp2=5, LLL, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')", LLL, JJJnew, IIInew + !print "('for orbmp2i331 with i3mp2=5, KKK, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')", KKK, IIInew, JJJnew + !print "('for orbmp2i331 with i3mp2=5, KKK, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')", KKK, JJJnew, IIInew + + if (dabs(Y).gt.quick_method%integralCutoff) then do i3mp2=1,nsteplength i3mp2new=nstepmp2s+i3mp2-1 atemp=quick_qm_struct%co(KKK,i3mp2new)*Y btemp=quick_qm_struct%co(LLL,i3mp2new)*Y IIInew=III-II111+1 - JJJnew=JJJ-JJ111+1 - - orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)= & - orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)+atemp - orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)= & - orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)+atemp - orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)= & - orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)+btemp - orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)= & - orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)+btemp + JJJnew=JJJ-JJ111+1 + + !print "('for orbmp2i331 with i3mp2=5, LLL, IIInew,JJJnew, 1 are', i3, i3, i3, ' 1')",& + ! LLL, IIInew, JJJnew + !print "('for orbmp2i331 with i3mp2=5, LLL, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')",& + ! LLL, JJJnew, IIInew + !print "('for orbmp2i331 with i3mp2=5, KKK, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')",& + ! KKK, IIInew, JJJnew + !print "('for orbmp2i331 with i3mp2=5, KKK, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')",& + ! KKK, JJJnew, IIInew + + + !print "('for orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1),i3mp2,LLL,IIInew,JJJnew are', i2,i2,i2,i2)",i3mp2,LLL,IIInew,JJJnew + orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)= orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)+atemp + !print "('for orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2),i3mp2,LLL,JJJnew,IIInew are', i2,i2,i2,i2)",i3mp2,LLL,JJJnew,IIInew + orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)= orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)+atemp + !print "('for orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1),i3mp2,KKK,IIInew,JJJnew are',i2,i2,i2,i2)",i3mp2,KKK,IIInew,JJJnew + orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)= orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)+btemp + !print "('for orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2),i3mp2,KKK,JJJnew,IIInew are',i2,i2,i2,i2)",i3mp2,KKK,JJJnew,IIInew + orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)= orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)+btemp enddo endif enddo @@ -1876,8 +1963,30 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) do KKK=KKK1,KKK2 if(max(KKK,LLL1).le.LLL2)then do LLL=max(KKK,LLL1),LLL2 - +!#ifndef CUDA + !print *, "second call:before calling hrrwhole" + !print *, "get Y is ", Y call hrrwhole + !print *, "second call:after calling hrrwhole" + !print "('in serial III,JJJ,KKK,LLL,Y are', i2, i2, i2, i2, f10.6)", III, JJJ, KKK, LLL, Y +!#endif +!#ifdef CUDA +! Y = 0.0 + !if(quick_method%bCUDA) then + ! if((III-1)*nbasis+JJJ > (KKK-1)*nbasis+LLL) then + !print *, "from cuda code corresponding Y_Matrix value is",Y_Matrix((III-1)*nbasis+JJJ, (KKK-1)*nbasis+LLL) + !print *, "" + ! Y=Y_Matrix((III-1)*nbasis+JJJ,(KKK-1)*nbasis+LLL) + ! else + !print *, "from cuda code corresponding Y_Matrix value is",Y_Matrix((KKK-1)*nbasis+LLL, (III-1)*nbasis+JJJ) + !print *, "" + ! Y=Y_Matrix((KKK-1)*nbasis+LLL,(III-1)*nbasis+JJJ) + ! endif + !endif +!#endif + !print *, "" + !print "('else in classmp2, III,JJJ,KKK,LLL are', i3, i3, i3, i3,', II,JJ,KK,LL are', & + ! i3, i3, i3, i3, ', I,J,K,L are', i3, i3, i3, i3)",III,JJJ,KKK,LLL,II,JJ,KK,LL,I,J,K,L if (dabs(Y).gt.quick_method%integralCutoff) then do i3mp2=1,nsteplength i3mp2new=nstepmp2s+i3mp2-1 @@ -1887,18 +1996,22 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength) IIInew=III-II111+1 JJJnew=JJJ-JJ111+1 - orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)= & - orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)+atemp + !print "('for orbmp2i331 with i3mp2=5, LLL, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')",& + ! LLL, IIInew, JJJnew + orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)= orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)+atemp if(JJJ.ne.III)then - orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)= & - orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)+atemp + !print "('for orbmp2i331 with i3mp2=5, LLL, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')",& + !LLL, JJJnew, IIInew + orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)= orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)+atemp endif if(KKK.ne.LLL)then - orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)= & - orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)+btemp + !print "('for orbmp2i331 with i3mp2=5, KKK, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')",& + ! KKK, IIInew, JJJnew + orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)= orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)+btemp if(III.ne.JJJ)then - orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)= & - orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)+btemp + !print "('for orbmp2i331 with i3mp2=5, KKK, JJJnew, IIInew, 1 are', i3, i3, i3, ' 2')",& + ! KKK, JJJnew, IIInew + orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)= orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)+btemp endif endif diff --git a/src/subs/eigval.f90 b/src/subs/deprecated/eigval.f90 similarity index 100% rename from src/subs/eigval.f90 rename to src/subs/deprecated/eigval.f90