diff --git a/src/basis.f90 b/src/basis.f90
index 6fdadd78f..5b9f19358 100644
--- a/src/basis.f90
+++ b/src/basis.f90
@@ -42,6 +42,7 @@ subroutine readbasis(natomxiao,natomstart,natomfinal,nbasisstart,nbasisfinal,ier
       ! * Allocate arrays whose dimensions depend on NATOM (allocateatoms_ecp)
       ! * Read the Effective Core Potentials (ECPs), modify the atomic charges
       !   and the total number of electrons (readecp)
+      print *, "call readbasis"
       if (quick_method%ecp)    call readecp
       call quick_open(ibasisfile,basisfilename,'O','F','W',.true.,ierr)
       CHECK_ERROR(ierr)
@@ -56,6 +57,7 @@ subroutine readbasis(natomxiao,natomstart,natomfinal,nbasisstart,nbasisfinal,ier
       atmbs2=.true.
       icont=0
       quick_method%ffunxiao=.true.
+      nfrozencore = 0
 
       ! parse the file and find the sizes of things to allocate them in memory
       do while (iofile  == 0 )
@@ -227,6 +229,9 @@ subroutine readbasis(natomxiao,natomstart,natomfinal,nbasisstart,nbasisfinal,ier
          nshell = nshell + quick_basis%kshell(quick_molspec%iattype(i))
          nbasis = nbasis + kbasis(quick_molspec%iattype(i))
          nprim = nprim + kcontract(quick_molspec%iattype(i))
+         if(quick_method%frzCore)then
+            nfrozencore = nfrozencore + frozencore(quick_molspec%iattype(i))/2
+         endif
 
          ! MFCC
          if(i.eq.natomfinal)nbasisfinal=nbasis
diff --git a/src/calMP2.f90 b/src/calMP2.f90
index 15cf62cdc..de1ce3767 100644
--- a/src/calMP2.f90
+++ b/src/calMP2.f90
@@ -9,11 +9,41 @@ subroutine calmp2
   use quick_cutoff_module, only: cshell_density_cutoff
   implicit double precision(a-h,o-z)
 
-  double precision cutoffTest,testtmp,testCutoff
-  integer II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2
+  double precision cutoffTest,testtmp,testCutoff,gpuMP2WrapperTimeStart, gpuMP2WrapperTimeEnd
+  double precision :: gpuMp2cor(1),gpuEmemorysum(1)
+  integer II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2,NONZEROCOUNT,gpuNstepmp2(1)
+  integer:: gpuNtemp(1)
   common /hrrstore/II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2
-    integer :: nelec,nelecb
+  integer :: nelec,nelecb
+
+    if(.not. quick_method%frzCore)then
+        print *, "Do not use fc approximation!"
+    endif
+    print *, "nfrozencore is ", nfrozencore
+
+
+#ifdef CUDA
+    if(quick_method%bCUDA) then
+    
+    call PrtAct(ioutfile,"Begin MP2 Calculation")
+
+    call cpu_time(gpuMP2WrapperTimeStart)
+    call gpu_mp2_wrapper(quick_qm_struct%co,quick_qm_struct%vec,quick_qm_struct%dense,quick_qm_struct%E,&
+            cutmatrix,quick_method%integralCutoff,quick_method%primLimit,quick_method%DMCutoff,gpuMp2cor,&
+            gpuEmemorysum, gpuNstepmp2, gpuNtemp)    
+    call cpu_time(gpuMP2WrapperTimeEnd)
+
+    write(ioutfile,'("CURRENT MEMORY USAGE=",E12.6,"M")') gpuEmemorysum(1)
+    write(ioutfile,'("TOTAL STEP          =",I6)') gpuNstepmp2(1)
+    write(ioutfile,'("EFFECT INTEGRALS    =",i8)') gpuNtemp(1)
+
+    quick_qm_struct%EMP2 = gpuMp2cor(1)
+    print *, 'cuda calculated mp2cor is', quick_qm_struct%EMP2
+    timer_cumer%TMP2 = gpuMP2WrapperTimeEnd-gpuMP2WrapperTimeStart 
+    print '("Total GPU MP2 Wrapper Time = ",f6.3," seconds.")', timer_cumer%TMP2
 
+    endif
+#else   
     nelec = quick_molspec%nelec
     nelecb = quick_molspec%nelecb
 
@@ -23,15 +53,24 @@ subroutine calmp2
   quick_qm_struct%EMP2=0.0d0
 
   ! occupied and virtual orbitals number
-  iocc=Nelec/2
+  !change here
+  !iocc=Nelec/2
+  iocc=Nelec/2-nfrozencore
   ivir=Nbasis-Nelec/2
 
   ! calculate memory usage and determine steps
   ememorysum=real(iocc*ivir*nbasis*8.0d0/1024.0d0/1024.0d0/1024.0d0)
 
   ! actually nstep is step length
-  nstep=min(int(1.5d0/ememorysum),Nelec/2)
-
+  ! set max mem req for Q3 to larger values
+  reqmemmax = 1.5d0
+  !change here
+  !nstep=min(int(reqmemmax/ememorysum),Nelec/2-nfrozencore)
+  nstep=min(int(reqmemmax/ememorysum),iocc)
+  if(nstep<1)then
+    nstep=1
+  endif
+   
   ! if with f orbital
   if(quick_method%ffunxiao)then
      nbasistemp=6
@@ -39,18 +78,28 @@ subroutine calmp2
      nbasistemp=10
   endif
 
+  print *, "in calmp2, nstep is", nstep
+  print *, "in calmp2, reqmemmax is", reqmemmax,&
+    "nbasistemp is", nbasistemp
+
   ! Allocate some variables
   allocate(mp2shell(nbasis))
   allocate(orbmp2(ivir,ivir))
   allocate(orbmp2i331(nstep,nbasis,nbasistemp,nbasistemp,2))
   allocate(orbmp2j331(nstep,ivir,nbasistemp,nbasistemp,2))
+  !change here
+  !allocate(orbmp2k331(nstep,iocc,ivir,nbasis))
   allocate(orbmp2k331(nstep,iocc,ivir,nbasis))
 
-  ! with nstep(acutally, it represetns step lenght), we can
+  ! with nstep(acutally, it represetns step lenght), we can 
   ! have no. of steps for mp2 calculation
-  nstepmp2=nelec/2/nstep
+  !change here
+  !nstepmp2=(nelec/2-nfrozencore)/nstep
+  nstepmp2=iocc/nstep
   nstepmp2=nstepmp2+1
-  if(nstep*(nstepmp2-1).eq.nelec/2)then
+  !change here
+  !if(nstep*(nstepmp2-1).eq.(nelec/2-nfrozencore))then
+  if(nstep*(nstepmp2-1).eq.iocc)then
      nstepmp2=nstepmp2-1
   endif
 
@@ -66,17 +115,28 @@ subroutine calmp2
 
   ttt=MAXVAL(Ycutoff) ! Max Value of Ycutoff
 
+  print *, "job step is ", nstepmp2
+  print *, "nstep is ", nstep
+  call flush(6)
+
   do i3new=1,nstepmp2               ! Step counter
+    print *, "i3new is ", i3new
+    call flush(6)
 
      call cpu_time(timer_begin%TMP2)
      ntemp=0    ! integer counter
-     nstepmp2s=(i3new-1)*nstep+1    ! Step start n
-     nstepmp2f=i3new*nstep          ! Step end n
+     nstepmp2s=(i3new-1)*nstep+1+nfrozencore    ! Step start n
+     nstepmp2f=i3new*nstep+nfrozencore          ! Step end n
 
      if(i3new.eq.nstepmp2)nstepmp2f=nelec/2
      nsteplength=nstepmp2f-nstepmp2s+1  ! Step Lengh, from nstepmp2s to nstepmp2f
+     
+     print *, "nstepmp2s is ", nstepmp2s
+     print *, "nstepmp2f is ", nstepmp2f
 
      ! Initial orbmp2k331
+     ! change_here
+     !call initialOrbmp2k331(orbmp2k331,nstep,nbasis,ivir,iocc,nsteplength)
      call initialOrbmp2k331(orbmp2k331,nstep,nbasis,ivir,iocc,nsteplength)
      do II=1,jshell
         do JJ=II,jshell
@@ -116,21 +176,20 @@ subroutine calmp2
                                 enddo
                             enddo
                        enddo
-
+                       
                        testCutoff=testCutoff*comax
                        if(testCutoff.gt.cutoffmp2)then
                           dnmax=comax
                           ntemp=ntemp+1
-                          call shellmp2(nstepmp2s,nsteplength)
-                       endif
-
+                           call shellmp2(nstepmp2s,nsteplength)
+                        endif
+                                                
                     endif
 
                  enddo
               enddo
 
 
-
               NII1=quick_basis%Qstart(II)
               NII2=quick_basis%Qfinal(II)
               NJJ1=quick_basis%Qstart(JJ)
@@ -168,9 +227,10 @@ subroutine calmp2
                     enddo
 
                     do j33=1,ivir
-                       do k33=1,nelec/2
-                          atemp=quick_scratch%hold(k33,III)
-                          atemp2=quick_scratch%hold(k33,JJJ)
+                       !do k33=1+nfrozencore,nelec/2
+                        do k33=1,iocc
+                          atemp=quick_scratch%hold(k33+nfrozencore,III)
+                          atemp2=quick_scratch%hold(k33+nfrozencore,JJJ)
                           do icycle=1,nsteplength
                              orbmp2k331(icycle,k33,j33,JJJ)=orbmp2k331(icycle,k33,j33,JJJ)+ &
                                   orbmp2j331(icycle,j33,IIInew,JJJnew,1)*atemp
@@ -203,7 +263,8 @@ subroutine calmp2
 
      do icycle=1,nsteplength
         i3=nstepmp2s+icycle-1
-        do k3=i3,nelec/2
+        !do k3=i3,nelec/2
+        do k3=i3-nfrozencore, iocc
 
            do J3=1,nbasis-nelec/2
               do L3=1,nbasis-nelec/2
@@ -217,13 +278,15 @@ subroutine calmp2
 
            do J3=1,nbasis-nelec/2
               do L3=1,nbasis-nelec/2
-                 if(k3.gt.i3)then
-                    quick_qm_struct%EMP2=quick_qm_struct%EMP2+2.0d0/(quick_qm_struct%E(i3)+quick_qm_struct%E(k3) &
+                 if(k3.gt.i3-nfrozencore)then
+                    quick_qm_struct%EMP2=quick_qm_struct%EMP2+2.0d0/(quick_qm_struct%E(i3) &
+                        +quick_qm_struct%E(k3+nfrozencore) &
                         -quick_qm_struct%E(j3+nelec/2)-quick_qm_struct%E(l3+nelec/2)) &
                          *orbmp2(j3,l3)*(2.0d0*orbmp2(j3,l3)-orbmp2(l3,j3))
                  endif
-                 if(k3.eq.i3)then
-                    quick_qm_struct%EMP2=quick_qm_struct%EMP2+1.0d0/(quick_qm_struct%E(i3)+quick_qm_struct%E(k3) &
+                 if(k3.eq.i3-nfrozencore)then
+                    quick_qm_struct%EMP2=quick_qm_struct%EMP2+1.0d0/(quick_qm_struct%E(i3) &
+                        +quick_qm_struct%E(k3+nfrozencore) &
                         -quick_qm_struct%E(j3+nelec/2)-quick_qm_struct%E(l3+nelec/2)) &
                          *orbmp2(j3,l3)*(2.0d0*orbmp2(j3,l3)-orbmp2(l3,j3))
                  endif
@@ -233,10 +296,14 @@ subroutine calmp2
         enddo
      enddo
 
+
+     print *, "cleaned up!"
      call cpu_time(timer_end%TMP2)
      timer_cumer%TMP2=timer_end%TMP2-timer_begin%TMP2+timer_cumer%TMP2
 
   enddo
+  write (ioutfile,'("EFFECT INTEGRALS    =",i8)') ntemp
+#endif
 
   write (iOutFile,'("SECOND ORDER ENERGY =",F16.9)') quick_qm_struct%EMP2
   write (iOutFile,'("EMP2                =",F16.9)') quick_qm_struct%Etot+quick_qm_struct%EMP2
@@ -392,7 +459,8 @@ subroutine MPI_calmp2
                        if(testCutoff.gt.cutoffmp2)then
                           dnmax=comax
                           ntemp=ntemp+1
-                          call shellmp2(nstepmp2s,nsteplength)
+                          !call shellmp2(nstepmp2s,nsteplength,Y_Matrix)
+                           call shellmp2(nstepmp2s,nsteplength)
                        endif
 
                     endif
@@ -971,3 +1039,20 @@ subroutine initialOrbmp2ij(orbmp2i331,nstep,nsteplength,nbasis,nbasistemp,nbasis
      enddo
   enddo
 end subroutine initialOrbmp2ij
+
+subroutine print_matrix(matrix,n)
+    integer n, i, j
+    double precision, dimension(n,n),intent(in) :: matrix
+
+    do i=1, n
+        do j=1, n
+            write(*, '(f16.8)', advance='no'), matrix(i,j)
+            !write(*, '(f20.6)'), matrix(i,j)
+        end do
+        write(*,'(" ")')
+    end do
+
+end subroutine print_matrix
+
+
+
diff --git a/src/cuda/Makefile b/src/cuda/Makefile
index 1c25d898c..8d2095e54 100644
--- a/src/cuda/Makefile
+++ b/src/cuda/Makefile
@@ -24,9 +24,9 @@ include $(MAKEIN)
 
 LIBXC_CUDA_FLAGS = -I$(libxcfolder)
 
+CUDACOBJ=$(objfolder)/gpu.o $(objfolder)/gpu_type.o $(objfolder)/gpu_get2e.o $(objfolder)/gpu_MP2.o
 CXXOBJ=$(objfolder)/xc_redistribute.o
 
-CUDACOBJ=$(objfolder)/gpu.o $(objfolder)/gpu_type.o $(objfolder)/gpu_get2e.o 
 
 CUDAXCOBJ=$(objfolder)/gpu_getxc.o
 
diff --git a/src/cuda/gpu.cu b/src/cuda/gpu.cu
index 669dc0f17..87f404294 100644
--- a/src/cuda/gpu.cu
+++ b/src/cuda/gpu.cu
@@ -285,6 +285,8 @@ extern "C" void gpu_setup_(int* natom, int* nbasis, int* nElec, int* imult, int*
     
     upload_para_to_const();
 
+	//for MP2, added here!
+	upload_para_to_const_MP2();
 #ifdef DEBUG
     cudaEventRecord(end, 0);
     cudaEventSynchronize(end);
@@ -1066,7 +1068,8 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP
 
 #endif   
  
-    gpu -> gpu_cutoff -> YCutoff -> DeleteCPU();
+	printf("Don't do gpu -> gpu_cutoff -> YCutoff -> DeleteCPU()\n");
+    //gpu -> gpu_cutoff -> YCutoff -> DeleteCPU();
     gpu -> gpu_cutoff -> cutPrim -> DeleteCPU();
     gpu -> gpu_cutoff -> sorted_YCutoffIJ -> DeleteCPU();
  
@@ -1087,7 +1090,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP
 //-----------------------------------------------
 //  upload calculated information
 //-----------------------------------------------
-extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense)
+extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense, QUICKDouble* E)
 {
     
 #ifdef DEBUG
@@ -1102,9 +1105,11 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou
     gpu -> gpu_calculated -> o        =   new cuda_buffer_type<QUICKDouble>(gpu->nbasis, gpu->nbasis);
     gpu -> gpu_calculated -> o        ->  DeleteGPU();
     gpu -> gpu_calculated -> dense    =   new cuda_buffer_type<QUICKDouble>(dense,  gpu->nbasis, gpu->nbasis);
+	gpu -> gpu_calculated -> coefficient    =   new cuda_buffer_type<QUICKDouble>(co,  gpu->nbasis, gpu->nbasis);
     gpu -> gpu_calculated -> oULL     =   new cuda_buffer_type<QUICKULL>(gpu->nbasis, gpu->nbasis);
-    
-    
+	gpu -> gpu_calculated -> mp2cor = new cuda_buffer_type<QUICKDouble>(1); //to store finally mp2 correction
+	gpu	-> gpu_calculated -> molorbe		= 	new cuda_buffer_type<QUICKDouble>(E, gpu->nbasis);
+	
     /*
      oULL is the unsigned long long int type of O matrix. The reason to do so is because
      Atomic Operator for CUDA 2.0 is only available for integer. So for double precision type,
@@ -1128,13 +1133,19 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou
     
     //    gpu -> gpu_calculated -> o        -> Upload();
     gpu -> gpu_calculated -> dense    -> Upload();
-    gpu -> gpu_calculated -> oULL     -> Upload();
-    
+    gpu -> gpu_calculated -> coefficient -> Upload();
+	gpu -> gpu_calculated -> oULL     -> Upload();
+	gpu -> gpu_calculated -> mp2cor -> Upload();
+	gpu -> gpu_calculated -> molorbe         -> Upload();
+
     //    gpu -> gpu_sim.o                 =  gpu -> gpu_calculated -> o -> _devData;
     gpu -> gpu_sim.dense             =  gpu -> gpu_calculated -> dense -> _devData;
+	gpu -> gpu_sim.coefficient		 =  gpu -> gpu_calculated -> coefficient -> _devData;
     gpu -> gpu_sim.oULL              =  gpu -> gpu_calculated -> oULL -> _devData;
-    
-    
+	gpu -> gpu_sim.mp2cor = gpu -> gpu_calculated -> mp2cor  -> _devData;
+	gpu -> gpu_sim.molorbe               = gpu -> gpu_calculated -> molorbe -> _devData;
+
+
 #ifdef DEBUG
     cudaEventRecord(end, 0);
     cudaEventSynchronize(end);
@@ -1157,6 +1168,16 @@ extern "C" void gpu_upload_density_matrix_(QUICKDouble* dense)
     gpu -> gpu_sim.dense             =  gpu -> gpu_calculated -> dense -> _devData;
 }
 
+// Added by Chi Jin on 11/07/2020. Do we need this, since we did this in gpu_upload_calculated?
+// Probably not. This function is not called anywhere
+extern "C" void gpu_upload_coefficient_matrix_(QUICKDouble* coefficient)
+{
+	gpu -> gpu_calculated -> coefficient = new cuda_buffer_type<QUICKDouble>(coefficient,  gpu->nbasis, gpu->nbasis);
+	gpu -> gpu_calculated -> coefficient -> Upload();
+	gpu -> gpu_sim.coefficient = gpu -> gpu_calculated -> coefficient -> _devData;
+}
+
+
 //-----------------------------------------------
 //  upload basis set information
 //-----------------------------------------------
@@ -1165,7 +1186,7 @@ extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jba
                                   int* first_basis_function, int* last_basis_function, int* first_shell_basis_function, int* last_shell_basis_function, \
                                   int* ncenter,   int* kstart,    int* katom,     int* ktype,     int* kprim,  int* kshell, int* Ksumtype, \
                                   int* Qnumber,   int* Qstart,    int* Qfinal,    int* Qsbasis,   int* Qfbasis,\
-                                  QUICKDouble* gccoeff,           QUICKDouble* cons,      QUICKDouble* gcexpo, int* KLMN)
+                                  QUICKDouble* gccoeff,           QUICKDouble* cons,      QUICKDouble* gcexpo, int* KLMN, int* nfrozencore)
 {
     
 #ifdef DEBUG
@@ -1182,17 +1203,20 @@ extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jba
     gpu -> gpu_basis -> jshell          =   *jshell;
     gpu -> gpu_basis -> jbasis          =   *jbasis;
     gpu -> gpu_basis -> maxcontract     =   *maxcontract;
-    
+	gpu -> gpu_basis -> nfrozencore		=	*nfrozencore;   
+
     gpu -> nshell                       =   *nshell;
     gpu -> nprim                        =   *nprim;
     gpu -> jshell                       =   *jshell;
     gpu -> jbasis                       =   *jbasis;
+	gpu	-> nfrozencore					=	*nfrozencore;
     
     gpu -> gpu_sim.nshell                   =   *nshell;
     gpu -> gpu_sim.nprim                    =   *nprim;
     gpu -> gpu_sim.jshell                   =   *jshell;
     gpu -> gpu_sim.jbasis                   =   *jbasis;
     gpu -> gpu_sim.maxcontract              =   *maxcontract;
+	gpu -> gpu_sim.nfrozencore				= 	*nfrozencore;
     
     
     gpu -> gpu_basis -> ncontract                   =   new cuda_buffer_type<int>(ncontract, gpu->nbasis);//gpu->nbasis);
@@ -1492,11 +1516,13 @@ extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jba
     gpu -> gpu_basis -> prim_start -> DeleteCPU();
     
     gpu -> gpu_basis -> Qnumber -> DeleteCPU();
-    gpu -> gpu_basis -> Qstart -> DeleteCPU();
-    gpu -> gpu_basis -> Qfinal -> DeleteCPU();
     
-    gpu -> gpu_basis -> Qsbasis -> DeleteCPU();
-    gpu -> gpu_basis -> Qfbasis -> DeleteCPU();
+	// for MP2, let them be here for a while
+	//gpu -> gpu_basis -> Qstart -> DeleteCPU();
+    //gpu -> gpu_basis -> Qfinal -> DeleteCPU();
+    
+    //gpu -> gpu_basis -> Qsbasis -> DeleteCPU();
+    //gpu -> gpu_basis -> Qfbasis -> DeleteCPU();
     gpu -> gpu_basis -> gccoeff -> DeleteCPU();
     gpu -> gpu_basis -> cons -> DeleteCPU();
     gpu -> gpu_basis -> gcexpo -> DeleteCPU();
@@ -2426,7 +2452,7 @@ extern "C" void gpu_grad_(QUICKDouble* grad)
     cudaEventDestroy(end);
 #endif
     
-    PRINTDEBUG("COMPLETE RUNNING GRAD")
+    PRINTDEBUG("COMPLETE RUNNING GRAD");
 }
 
 
@@ -2678,10 +2704,13 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName){
     
     delete gpu->gpu_calculated->o;
     delete gpu->gpu_calculated->dense;
+ 	delete gpu->gpu_calculated->coefficient;
     delete gpu->gpu_calculated->oULL;
-    delete gpu->gpu_cutoff->cutMatrix;
+	delete gpu->gpu_calculated->mp2cor;
+	delete gpu->gpu_calculated->molorbe;
+	delete gpu->gpu_cutoff->cutMatrix;
     delete gpu->gpu_cutoff->sorted_YCutoffIJ;
-    delete gpu->gpu_cutoff->YCutoff;
+	delete gpu->gpu_cutoff->YCutoff;
     delete gpu->gpu_cutoff->cutPrim;
     
     
@@ -2701,7 +2730,7 @@ extern "C" void gpu_get2e_(QUICKDouble* o)
     PRINTDEBUG("BEGIN TO RUN KERNEL")
     
     get2e(gpu);
- 
+
     PRINTDEBUG("COMPLETE KERNEL")
     gpu -> gpu_calculated -> oULL -> Download();
 
@@ -2759,6 +2788,114 @@ extern "C" void gpu_get2e_(QUICKDouble* o)
 }
 
 
+//-----------------------------------------------
+//  compute mp2
+//-----------------------------------------------
+
+extern "C" void gpu_calmp2_(QUICKDouble* mp2cor, QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int* ntemp)
+{
+	PRINTDEBUG("BEGIN TO RUN gpu_calmp2");
+	
+	upload_sim_to_constant_MP2(gpu);
+
+	PRINTDEBUG("BEGIN TO RUN KERNEL")
+	
+	get2e_MP2(gpu, ememorysum, nstepmp2, ntemp);
+	printf("in gpu_calmp2_, get2e_MP2 is done\n");
+	
+	PRINTDEBUG("COMPLETE KERNEL")
+	
+	float time = 0;
+	cudaEvent_t before_download, downloaded, copied;
+    cudaEventCreate(&before_download);
+    cudaEventCreate(&downloaded);
+	cudaEventCreate(&copied);
+    cudaEventRecord(before_download, 0);
+
+	printf("here to download gpu->gpu_calculated->mp2cor\n");
+    gpu->gpu_calculated->mp2cor->Download();
+    printf("gpu->gpu_calculated->mp2cor downloaded\n");
+	printf("gpu->gpu_calculated->mp2cor->_hostData is %lf\n", *gpu->gpu_calculated->mp2cor->_hostData);
+
+	cudaEventRecord(downloaded, 0);
+	cudaEventSynchronize(downloaded);
+	cudaEventElapsedTime(&time, before_download, downloaded);
+	printf("in gpu_calmp2, total mp2cor download time is %6.3f ms\n", time);
+
+	memcpy(mp2cor, gpu->gpu_calculated->mp2cor->_hostData, sizeof(QUICKDouble));
+
+	cudaEventRecord(copied, 0);
+	cudaEventSynchronize(copied);
+    cudaEventElapsedTime(&time, downloaded, copied);
+	printf("in gpu_calmp2, total mp2cor copy time is %6.3f ms\n", time);
+
+	cudaEventDestroy(before_download);
+	cudaEventDestroy(downloaded);
+	cudaEventDestroy(copied);
+
+#ifdef DEBUG
+    cudaEvent_t start,end;
+    cudaEventCreate(&start);
+    cudaEventCreate(&end);
+    cudaEventRecord(start, 0);
+#endif
+	
+#ifdef DEBUG
+    cudaEventRecord(end, 0);
+    cudaEventSynchronize(end);
+    float time;
+    cudaEventElapsedTime(&time, start, end);
+    PRINTUSINGTIME("DOWNLOAD O",time);
+    cudaEventDestroy(start);
+    cudaEventDestroy(end);
+#endif
+
+    PRINTDEBUG("DELETE TEMP VARIABLES")
+	
+    delete gpu->gpu_calculated->o;
+    delete gpu->gpu_calculated->dense;
+    delete gpu->gpu_calculated->coefficient;
+	delete gpu->gpu_calculated->oULL;
+ 	delete gpu->gpu_calculated->mp2cor;
+	delete gpu->gpu_calculated->molorbe;
+	delete gpu->gpu_cutoff->cutMatrix;
+
+    PRINTDEBUG("COMPLETE RUNNING MP2")
+}
+
+extern "C" void gpu_mp2_wrapper_(QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense, QUICKDouble* E,\
+QUICKDouble* cutmatrix, QUICKDouble* integralCutoff,QUICKDouble* primLimit,QUICKDouble* DMCutoff, QUICKDouble* mp2cor,\
+QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int* ntemp)
+{
+	float time = 0;
+	cudaEvent_t before_upload, uploaded, mp2_done;
+	cudaEventCreate(&before_upload);
+	cudaEventCreate(&uploaded);
+	cudaEventCreate(&mp2_done);
+	cudaEventRecord(before_upload, 0);
+
+	//not changed the signature of this functure for the moment:
+	QUICKDouble* o = new QUICKDouble(0);
+	gpu_upload_calculated_(o,co,vec,dense,E);
+	free(o);
+	gpu_upload_cutoff_(cutmatrix,integralCutoff,primLimit,DMCutoff);
+	
+	cudaEventRecord(uploaded, 0);
+	cudaEventSynchronize(uploaded);
+	cudaEventElapsedTime(&time, before_upload, uploaded);
+	printf("in gpu_mp2_wrapper, total upload time is %6.3f ms\n", time);
+
+	gpu_calmp2_(mp2cor, ememorysum, nstepmp2, ntemp);
+
+	cudaEventRecord(mp2_done, 0);
+    cudaEventSynchronize(mp2_done);
+    cudaEventElapsedTime(&time, uploaded, mp2_done);
+    printf("in gpu_mp2_wrapper, total calmp2 time is %6.3f ms\n", time);
+	printf("in gpu_mp2_wrapper, finally mp2cor is %lf\n", *mp2cor);	
+
+	cudaDeviceSynchronize();
+}
+
 extern "C" void gpu_getxc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICKDouble* belec, QUICKDouble *o, int* nof_functionals, int* functional_id, int* xc_polarization)
 {
     PRINTDEBUG("BEGIN TO RUN GETXC")
diff --git a/src/cuda/gpu.h b/src/cuda/gpu.h
index 124374e84..532ead2e5 100644
--- a/src/cuda/gpu.h
+++ b/src/cuda/gpu.h
@@ -32,18 +32,19 @@ extern "C" void gpu_upload_atom_and_chg_(int* atom, QUICKDouble* atom_chg);
 extern "C" void gpu_upload_cutoff_(QUICKDouble* cutMatrix, QUICKDouble* integralCutoff,QUICKDouble* primLimit, QUICKDouble* DMCutoff);
 extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutPrim);
 extern "C" void gpu_upload_energy_(QUICKDouble* E);
-extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense);
+extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense, QUICKDouble* E);
 extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jbasis, int* maxcontract, \
                                   int* ncontract, int* itype,     QUICKDouble* aexp,      QUICKDouble* dcoeff,\
                                   int* first_basis_function, int* last_basis_function, int* first_shell_basis_function, int* last_shell_basis_function, \
                                   int* ncenter,   int* kstart,    int* katom,     int* ktype,     int* kprim,  int* kshell, int* Ksumtype, \
                                   int* Qnumber,   int* Qstart,    int* Qfinal,    int* Qsbasis,   int* Qfbasis,\
-                                  QUICKDouble* gccoeff,           QUICKDouble* cons,      QUICKDouble* gcexpo, int* KLMN);
+                                  QUICKDouble* gccoeff,           QUICKDouble* cons,      QUICKDouble* gcexpo, int* KLMN, int* nfrozencore);
 extern "C" void gpu_upload_grad_(QUICKDouble* gradCutoff);
 extern "C" void gpu_cleanup_();
 
 //Following methods weddre added by Madu Manathunga
 extern "C" void gpu_upload_density_matrix_(QUICKDouble* dense);
+extern "C" void gpu_upload_coefficient_matrix_(QUICKDouble* coefficient);
 extern "C" void gpu_delete_dft_grid_();
 //void upload_xc_smem();
 void upload_pteval();
@@ -74,7 +75,7 @@ void getxc(_gpu_type gpu, gpu_libxc_info** glinfo, int nof_functionals);
 void getxc_grad(_gpu_type gpu, gpu_libxc_info** glinfo, int nof_functionals);
 void prune_grid_sswgrad();
 void gpu_delete_sswgrad_vars();
-void get2e_MP2(_gpu_type gpu);
+void get2e_MP2(_gpu_type gpu, QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int* ntemp);
 void getAddInt(_gpu_type gpu, int bufferSize, ERI_entry* aoint_buffer);
 void getGrad(_gpu_type gpu);
 // global [get2e_kernel]
@@ -164,6 +165,7 @@ __device__ void iclass_grad_spdf7(int I, int J, int K, int L, unsigned int II, u
 __device__ void iclass_grad_spdf8(int I, int J, int K, int L, unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL, QUICKDouble DNMax);
 
 void upload_sim_to_constant(_gpu_type gpu);
+void upload_sim_to_constant_MP2(_gpu_type gpu);
 void upload_sim_to_constant_dft(_gpu_type gpu);
 
 void upload_para_to_const();
@@ -512,4 +514,52 @@ __device__ QUICKDouble lyp_e(QUICKDouble pa, QUICKDouble pb, QUICKDouble gax, QU
 __device__ QUICKDouble becke_e(QUICKDouble density, QUICKDouble densityb, QUICKDouble gax, QUICKDouble gay, QUICKDouble gaz,
                                QUICKDouble gbx,     QUICKDouble gby,      QUICKDouble gbz);
 
+
+//Chi Jin 09/23/2020
+//MP2
+__global__ void get2e_MP2_kernel();
+__global__ void firstQuarterTransKernel(int II,int JJ,int nstepmp2s, int nsteplength, int nstep, int nbasistemp, unsigned long long int* ntempptr_d,\
+QUICKDouble cutoffmp2, QUICKDouble* orbmp2i331);
+__global__ void secondQuarterTransKernel(int III, int JJJ, int IIInew,int JJJnew,int nsteplength, int nstep, int nbasistemp, QUICKDouble* orbmp2i331, QUICKDouble* orbmp2j331);
+__global__ void thirdQuarterTransKernel(int III, int JJJ, int IIInew,int JJJnew, int nsteplength, int nstep, int nbasistemp, QUICKDouble* orbmp2j331, QUICKDouble* orbmp2k331);
+__global__ void forthQuarterTransKernel(int icycle, int i3, int k3, int nstep, QUICKDouble* orbmp2k331, QUICKDouble* orbmp2);
+__global__ void finalMP2AccumulationKernel(int i3, int k3, QUICKDouble* orbmp2, QUICKDouble* MP2cor);
+__device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL, \
+	int nstepmp2s, int nsteplength, int nstep, int nbasistemp, QUICKDouble DNMax, QUICKDouble* orbmp2i331);
+void fourQuarterTransHost(_gpu_type gpu, QUICKDouble* ememorysumptr, int* nstepmp2ptr, unsigned long long int* ntempptr);
+
+__device__ void FmT_MP2(int MaxM, QUICKDouble X, QUICKDouble* YVerticalTemp);
+__device__ void vertical_MP2(int I, int J, int K, int L, QUICKDouble* YVerticalTemp, QUICKDouble* store,
+                         QUICKDouble Ptempx, QUICKDouble Ptempy, QUICKDouble Ptempz,  \
+                         QUICKDouble WPtempx,QUICKDouble WPtempy,QUICKDouble WPtempz, \
+                         QUICKDouble Qtempx, QUICKDouble Qtempy, QUICKDouble Qtempz,  \
+                         QUICKDouble WQtempx,QUICKDouble WQtempy,QUICKDouble WQtempz, \
+                         QUICKDouble ABCDtemp,QUICKDouble ABtemp, \
+                         QUICKDouble CDtemp, QUICKDouble ABcom, QUICKDouble CDcom);
+__device__ QUICKDouble hrrwhole_MP2(int I, int J, int K, int L, \
+                                int III, int JJJ, int KKK, int LLL, int IJKLTYPE, QUICKDouble* store, \
+                                QUICKDouble RAx,QUICKDouble RAy,QUICKDouble RAz, \
+                                QUICKDouble RBx,QUICKDouble RBy,QUICKDouble RBz, \
+                                QUICKDouble RCx,QUICKDouble RCy,QUICKDouble RCz, \
+                                QUICKDouble RDx,QUICKDouble RDy,QUICKDouble RDz);
+__device__ int lefthrr_MP2(QUICKDouble RAx, QUICKDouble RAy, QUICKDouble RAz,
+                       QUICKDouble RBx, QUICKDouble RBy, QUICKDouble RBz,
+                       int KLMNAx, int KLMNAy, int KLMNAz,
+                       int KLMNBx, int KLMNBy, int KLMNBz,
+                       int IJTYPE,QUICKDouble* coefAngularL, int* angularL);
+
+void upload_para_to_const_MP2();
+extern "C" void gpu_mp2_wrapper_(QUICKDouble* co, QUICKDouble* vec, QUICKDouble* dense, QUICKDouble* E,\
+QUICKDouble* cutmatrix, QUICKDouble* integralCutoff,QUICKDouble* primLimit,QUICKDouble* DMCutoff, QUICKDouble* mp2cor,\
+QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int* ntemp);
+
+
+/*
+#undef STOREDIM
+#ifdef int_spd
+#define STOREDIM STOREDIM_S
+#else
+#define STOREDIM STOREDIM_L
+#endif
+*/
 #endif
diff --git a/src/cuda/gpu_MP2.cu b/src/cuda/gpu_MP2.cu
index 1166247bc..24b4ca31b 100644
--- a/src/cuda/gpu_MP2.cu
+++ b/src/cuda/gpu_MP2.cu
@@ -12,6 +12,13 @@
 #include "gpu.h"
 #include <cuda.h>
 
+#undef STOREDIM
+//#ifdef int_spd
+#define STOREDIM STOREDIM_S
+//#else
+//#define STOREDIM STOREDIM_L
+//#endif
+
 /*
  Constant Memory in GPU is fast but quite limited and hard to operate, usually not allocatable and 
  readonly. So we put the following variables into constant memory:
@@ -31,18 +38,524 @@ static __constant__ int Sumindex_MP2[10]={0,0,1,4,10,20,35,56,84,120};
  */
 void upload_sim_to_constant_MP2(_gpu_type gpu){
     cudaError_t status;
-    status = cudaMemcpyToSymbol("devSim_MP2", &gpu->gpu_sim, sizeof(gpu_simulation_type), 0, cudaMemcpyHostToDevice);
-    PRINTERROR(status, " cudaMemcpyToSymbol, sim copy to constants failed")
+    //status = cudaMemcpyToSymbol("devSim_MP2", &gpu->gpu_sim, sizeof(gpu_simulation_type), 0, cudaMemcpyHostToDevice);
+    status = cudaMemcpyToSymbol(devSim_MP2, &gpu->gpu_sim, sizeof(gpu_simulation_type));
+	PRINTERROR(status, " cudaMemcpyToSymbol, sim copy to constants failed")
+}
+
+
+//this kernel is to parallelize the two inner loops of the first transformation in firstThreeQuartersTransHost
+
+__global__ void 
+__launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) firstQuarterTransKernel(int II,int JJ,int nstepmp2s, int nsteplength, int nstep, int nbasistemp, unsigned long long int* ntempptr_d,\
+QUICKDouble cutoffmp2, QUICKDouble* orbmp2i331)
+{
+	int offside = blockIdx.x*blockDim.x+threadIdx.x;
+	int totalThreads = blockDim.x*gridDim.x;	
+	int jshell =  devSim_MP2.jshell;
+	int myInt = jshell*jshell/totalThreads;
+
+	int nshell = devSim_MP2.nshell;
+	int nbasis = devSim_MP2.nbasis;
+
+	if(jshell*jshell-myInt*totalThreads>offside) myInt++;
+
+	for(int i=1;i<=myInt;i++)
+	{
+		int currentInt = totalThreads*(i-1)+offside;	
+		int KK = currentInt/jshell;
+		int LL = currentInt%jshell;
+
+		if(KK<jshell && LL<jshell && LL>=KK)
+		{
+			QUICKDouble comax = 0.0;
+			QUICKDouble testCutoff = LOC2(devSim_MP2.YCutoff, II, JJ, nshell, nshell)*LOC2(devSim_MP2.YCutoff, KK, LL, nshell, nshell); 
+			
+			if(testCutoff>cutoffmp2)
+			{
+				int NII1=devSim_MP2.Qstart[II];
+        		int NII2=devSim_MP2.Qfinal[II];
+        		int NJJ1=devSim_MP2.Qstart[JJ];
+        		int NJJ2=devSim_MP2.Qfinal[JJ];
+       			int NKK1=devSim_MP2.Qstart[KK];
+       			int NKK2=devSim_MP2.Qfinal[KK];
+        		int NLL1=devSim_MP2.Qstart[LL];
+        		int NLL2=devSim_MP2.Qfinal[LL];
+	
+				int NBK1 = LOC2(devSim_MP2.Qsbasis, KK, NKK1, nshell, 4);
+				int NBK2 = LOC2(devSim_MP2.Qfbasis, KK, NKK2, nshell, 4);
+            	int NBL1 = LOC2(devSim_MP2.Qsbasis, LL, NLL1, nshell, 4);
+				int NBL2 = LOC2(devSim_MP2.Qfbasis, LL, NLL2, nshell, 4);	    
+	
+				int KK111 = NBK1;
+				int KK112 = NBK2;
+				int LL111 = NBL1;
+				int LL112 = NBL2;		
+
+				for(int icycle=1; icycle<=nsteplength; icycle++)
+				{
+					int i3 = nstepmp2s + icycle -1;
+					for(int KKK = KK111;KKK<=KK112;KKK++)
+					{
+						for(int LLL=MAX(KKK,LL111);LLL<=LL112;LLL++)
+						{
+							comax = MAX(comax, fabs(LOC2(devSim_MP2.coefficient,KKK-1,i3-1,nbasis,nbasis)));
+							comax = MAX(comax, fabs(LOC2(devSim_MP2.coefficient,LLL-1,i3-1,nbasis,nbasis)));				
+						}
+					}
+				}
+				testCutoff *= comax;
+
+				if(testCutoff>cutoffmp2)
+				{
+					// from shellmp2:
+					QUICKADD(ntempptr_d[0], (unsigned long long int)1);
+					for(int I=NII1; I<=NII2; I++)
+        			{
+        				for(int J=NJJ1; J<=NJJ2; J++)
+            			{
+            				for(int K=NKK1; K<=NKK2; K++)
+                			{
+                				for(int L=NLL1; L<=NLL2; L++)
+                    			{
+									// is I,J,K,L the same iii,jjj,kkk,lll as in get2e_MP2_kernel()?
+									int nshell = devSim_MP2.nshell;
+									QUICKDouble DNMax = MAX(MAX(4.0*LOC2(devSim_MP2.cutMatrix, II, JJ, nshell, nshell), 4.0*LOC2(devSim_MP2.cutMatrix, KK, LL, nshell, nshell)),
+                                    	MAX(MAX(LOC2(devSim_MP2.cutMatrix, II, LL, nshell, nshell),     LOC2(devSim_MP2.cutMatrix, II, KK, nshell, nshell)),
+                                        	MAX(LOC2(devSim_MP2.cutMatrix, JJ, KK, nshell, nshell),     LOC2(devSim_MP2.cutMatrix, JJ, LL, nshell, nshell))));
+						
+									if ((LOC2(devSim_MP2.YCutoff, KK, LL, nshell, nshell) * LOC2(devSim_MP2.YCutoff, II, JJ, nshell, nshell))> devSim_MP2.integralCutoff && \
+                						(LOC2(devSim_MP2.YCutoff, KK, LL, nshell, nshell) * LOC2(devSim_MP2.YCutoff, II, JJ, nshell, nshell) * DNMax) > devSim_MP2.integralCutoff) 
+									{
+										// equivalent to classmp2
+										iclass_MP2(I,J,K,L,II,JJ,KK,LL,nstepmp2s,nsteplength,nstep, nbasistemp, DNMax,orbmp2i331);
+									}
+                    			}
+                			}
+           				}
+       				}
+				}
+			}	
+		}
+	}
 }
 
+//this kernel is to parallelize the second transformation
+__global__ void
+__launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) secondQuarterTransKernel\
+(int III, int JJJ, int IIInew,int JJJnew, int nsteplength, int nstep, int nbasistemp, \
+QUICKDouble* orbmp2i331, QUICKDouble* orbmp2j331)
+{
+	int offside = blockIdx.x*blockDim.x+threadIdx.x;
+    int totalThreads = blockDim.x*gridDim.x;
+	
+	int nbasis =  devSim_MP2.nbasis;
+	int nElec = devSim_MP2.nElec;
+	int iocc = nElec/2;
+	int ivir = nbasis - iocc;
+	QUICKDouble* coefficient = devSim_MP2.coefficient;	
+	
+	int myInt = nbasis*ivir*nsteplength/totalThreads;
+	if(nbasis*ivir*nsteplength-myInt*totalThreads>offside) myInt++;	
+
+	for(int i=1;i<=myInt;i++)	
+	{
+		int currentInt = totalThreads*(i-1)+offside;
+		int icycle = currentInt%nsteplength;
+		currentInt /= nsteplength;
+		int j33 = currentInt%ivir;
+		int LLL = currentInt/ivir;
+
+		if(LLL<nbasis && j33<ivir && icycle<nsteplength)
+		{
+			int j33new = j33 + nElec/2;
+			QUICKDouble atemp = LOC2(coefficient, LLL, j33new, nbasis, nbasis);
+			QUICKADD(LOC5(orbmp2j331,icycle,j33,IIInew,JJJnew,0,nstep,ivir,nbasistemp, nbasistemp,2),\
+				LOC5(orbmp2i331,icycle,LLL,IIInew,JJJnew,0,nstep,nbasis,nbasistemp, nbasistemp,2)*atemp);
+
+            if(III!=JJJ)
+			{
+				QUICKADD(LOC5(orbmp2j331,icycle,j33,JJJnew,IIInew,1,nstep,ivir,nbasistemp, nbasistemp,2),\
+					LOC5(orbmp2i331,icycle,LLL,JJJnew,IIInew,1,nstep,nbasis,nbasistemp, nbasistemp,2)*atemp);
+			}
+		}	
+	}
+}
+
+//this kernel is to parallelize the third transformation
+__global__ void
+__launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) thirdQuarterTransKernel(int III, int JJJ, int IIInew,int JJJnew, int nsteplength, int nstep, int nbasistemp, QUICKDouble* orbmp2j331, QUICKDouble* orbmp2k331)
+{
+	int offside = blockIdx.x*blockDim.x+threadIdx.x;
+	int totalThreads = blockDim.x*gridDim.x;
+	
+	int nbasis =  devSim_MP2.nbasis;
+    int nElec = devSim_MP2.nElec;
+	int nfrozencore = devSim_MP2.nfrozencore;
+	int iocc = nElec/2 - nfrozencore;
+	int ivir = nbasis - nElec/2;
+	QUICKDouble* coefficient = devSim_MP2.coefficient;
+	
+/*
+	__shared__ QUICKDouble atemps[2000];
+    __shared__ QUICKDouble atemp2s[2000];
+	if(iocc<=2000)
+	{
+		
+		//for(int k33=0;k33<iocc;k33++)
+		//{
+		//	atemps[k33]=LOC2(coefficient, III-1, k33, nbasis, nbasis);
+		//	atemp2s[k33]=LOC2(coefficient, JJJ-1, k33, nbasis, nbasis);		
+		//}	
+		
+		int myInt = iocc/blockDim.x;
+		if(iocc-myInt*blockDim.x>threadIdx.x) myInt++;	
+		for(int i=1;i<=myInt;i++)	
+		{
+			int k33 =blockDim.x*(i-1) +threadIdx.x;
+			atemps[k33]=LOC2(coefficient, III-1, k33, nbasis, nbasis);
+			atemp2s[k33]=LOC2(coefficient, JJJ-1, k33, nbasis, nbasis);
+		}
+	
+	}
+	__syncthreads();
+*/
+
+	int myInt = ivir*iocc*nsteplength/totalThreads;
+	if(ivir*iocc*nsteplength-myInt*totalThreads>offside) myInt++;
+
+	for(int i=1;i<=myInt;i++)
+	{
+		int currentInt = totalThreads*(i-1)+offside;
+		int icycle = currentInt%nsteplength;
+		currentInt /= nsteplength;
+		int k33 = currentInt%iocc;
+		int j33 = currentInt/iocc;	
+	
+		if(j33<ivir && k33<iocc && icycle<nsteplength)
+		{
+			/*
+			if(iocc<2000)	//use shared mem
+			{
+				LOC4(orbmp2k331,icycle,k33,j33,JJJ-1,nstep, iocc, ivir, nbasis) +=\
+                    LOC5(orbmp2j331,icycle,j33,IIInew,JJJnew,0, nstep, ivir, nbasistemp, nbasistemp, 2)*atemps[k33];
+				if(III!=JJJ)
+				{
+					LOC4(orbmp2k331,icycle,k33,j33,III-1,nstep, iocc, ivir, nbasis) +=\
+                        LOC5(orbmp2j331,icycle,j33,JJJnew,IIInew,1,nstep, ivir, nbasistemp, nbasistemp, 2)*atemp2s[k33];		
+				}
+			}
+			else	//use global mem
+			{*/
+				QUICKDouble atemp = LOC2(coefficient, III-1, k33+nfrozencore, nbasis, nbasis);
+            	QUICKDouble atemp2 = LOC2(coefficient, JJJ-1, k33+nfrozencore, nbasis, nbasis);
+            	LOC4(orbmp2k331,icycle,k33,j33,JJJ-1,nstep, iocc, ivir, nbasis) +=\
+            		LOC5(orbmp2j331,icycle,j33,IIInew,JJJnew,0, nstep, ivir, nbasistemp, nbasistemp, 2)*atemp;
+
+            	if(III!=JJJ)
+				{
+            		LOC4(orbmp2k331,icycle,k33,j33,III-1,nstep, iocc, ivir, nbasis) +=\
+                		LOC5(orbmp2j331,icycle,j33,JJJnew,IIInew,1,nstep, ivir, nbasistemp, nbasistemp, 2)*atemp2;
+            	}
+		//	}
+		}
+		
+	}
+}
+
+__global__ void
+__launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) forthQuarterTransKernelV2(int icycle, int i3, int k3, int nstep, QUICKDouble* orbmp2k331, QUICKDouble* orbmp2)
+{   
+    QUICKDouble* coefficient = devSim_MP2.coefficient;
+    int nElec = devSim_MP2.nElec;
+    int nbasis = devSim_MP2.nbasis;
+	int nfrozencore = devSim_MP2.nfrozencore;	
+    int iocc = nElec/2 - nfrozencore; 
+    int ivir = nbasis - nElec/2;
+    
+    int offside = blockIdx.x*blockDim.x+threadIdx.x;
+    int totalThreads = blockDim.x*gridDim.x;
+    int myInt = ivir*ivir*nbasis/totalThreads;
+    
+    if(ivir*ivir*nbasis-myInt*totalThreads>offside)
+        myInt++;
+    
+    for(int i=1;i<=myInt;i++)
+    {   
+        int currentInt = totalThreads*(i-1)+offside;
+		int lll = currentInt%nbasis;
+		currentInt/=nbasis;
+        int l3 = currentInt%ivir;
+		int j3 = currentInt/ivir;        
+
+        if(j3<ivir && l3<ivir && lll<nbasis)
+        {   
+            int l3new = l3 + nElec/2;
+			QUICKADD(LOC2(orbmp2,l3,j3,ivir,ivir),\
+					LOC4(orbmp2k331,icycle,k3,j3,lll, nstep,iocc,ivir,nbasis)*LOC2(coefficient,lll,l3new, nbasis, nbasis));
+        }
+    }
+}
+
+
+
+
+void fourQuarterTransHost(_gpu_type gpu, QUICKDouble* ememorysumptr, int* nstepmp2ptr, unsigned long long int* ntempptr)
+{
+	int jshell = gpu->jshell;
+	int nshell = gpu->nshell;
+	int nbasis = gpu->nbasis;
+	int nElec = gpu->nElec;
+	int nfrozencore = gpu->nfrozencore;
+	int iocc = nElec/2 - nfrozencore;
+	int ivir = nbasis - nElec/2;
+
+	printf("in fourQuarterTransHost, nfrozencore is %d\n ",nfrozencore);	
+
+	QUICKDouble cutoffmp2 = 1.0E-8;
+	QUICKDouble ttt = 0;
+	QUICKDouble* YCutoff = gpu->gpu_cutoff->YCutoff->_hostData;
+	
+	for(int i=0;i<nshell;i++)
+		for(int j=0;j<nshell;j++)
+			ttt = MAX(ttt,LOC2(YCutoff, i, j, nshell, nshell));
+
+	int* Qstart = gpu->gpu_basis->Qstart->_hostData;
+	int* Qfinal = gpu->gpu_basis->Qfinal->_hostData;
+	int* Qsbasis = gpu->gpu_basis->Qsbasis->_hostData;
+	int* Qfbasis = gpu->gpu_basis->Qfbasis->_hostData;
+	
+	QUICKDouble ememorysum = iocc*ivir*nbasis*8.0/1024.0/1024.0/1024.0;
+	QUICKDouble reqmemmax = 1.5;
+	int nstep = MIN(int(reqmemmax/ememorysum),iocc);
+	if(nstep<1) 
+		nstep = 1;
+	printf("nstepsize is %d\n", nstep);
+
+	*ememorysumptr = ememorysum;
+	//use 6 for up to d orbitals, and 10 for f orbitals:
+	int nbasistemp = 10;
+
+	printf("in cuda, reqmemmax is %lf, nbasistemp is %d\n", reqmemmax, nbasistemp);	
+
+	QUICKDouble* orbmp2i331_d;
+	cudaMalloc((void **)&orbmp2i331_d, sizeof(QUICKDouble)*nstep*nbasis*nbasistemp*nbasistemp*2);	
+
+	QUICKDouble* orbmp2j331_d;
+	cudaMalloc((void **)&orbmp2j331_d, sizeof(QUICKDouble)*nstep*ivir*nbasistemp*nbasistemp*2);
+
+	printf("memory consumption for 3rd trans is %lfG\n", sizeof(QUICKDouble)*nstep*iocc*ivir*nbasis/1024.0/1024.0/1024.0);
+
+	QUICKDouble* orbmp2k331_d;
+	cudaMalloc((void **)&orbmp2k331_d,sizeof(QUICKDouble)*nstep*iocc*ivir*nbasis);
+	cudaMemset(orbmp2k331_d, 0, sizeof(QUICKDouble)*nstep*iocc*ivir*nbasis);	
+
+	QUICKDouble* orbmp2_d;
+	cudaMalloc((void **)&orbmp2_d,sizeof(QUICKDouble)*ivir*ivir);
+	cudaMemset(orbmp2_d,0,sizeof(QUICKDouble)*ivir*ivir);
+
+	QUICKDouble* MP2cor_d;
+	cudaMalloc((void **)&MP2cor_d,sizeof(QUICKDouble));
+    cudaMemset(MP2cor_d, 0, sizeof(QUICKDouble));
+	
+    QUICKDouble* MP2cor = new QUICKDouble[1];
+    MP2cor[0]= 0;
+	
+	int nstepmp2 = iocc/nstep;
+	if(nstep*nstepmp2<iocc)
+		nstepmp2++;
+	*nstepmp2ptr = nstepmp2;
+	printf("TOTAL STEP is %d\n", nstepmp2);	
+
+	unsigned long long int* ntempptr_d;
+	cudaMalloc((void **)&ntempptr_d,sizeof(unsigned long long int));
+	cudaMemset(ntempptr_d, 0, sizeof(unsigned long long int));
+
+	for(int i3new=1;i3new<=nstepmp2;i3new++)
+	{
+		int nstepmp2s = (i3new-1)*nstep+1+nfrozencore;
+		int nstepmp2f = i3new*nstep+nfrozencore;
+		if(i3new==nstepmp2)
+			nstepmp2f=nElec/2;
+		
+		int nsteplength = nstepmp2f-nstepmp2s+1;
+		printf("i3new is %d\n", i3new);
+        printf("nsteplength is %d\n", nsteplength);
+		
+		fflush(stdout);		
+
+		cudaMemset(orbmp2k331_d, 0, sizeof(QUICKDouble)*nstep*iocc*ivir*nbasis);
+		for(int II=0; II<jshell; II++)
+    	{
+        	for(int JJ=II; JJ<jshell; JJ++)
+        	{
+				if(LOC2(YCutoff, II, JJ, nshell, nshell)>cutoffmp2/ttt)
+				{		
+					cudaMemset(orbmp2i331_d,0,sizeof(QUICKDouble)*nstep*nbasis*nbasistemp*nbasistemp*2);
+					cudaMemset(orbmp2j331_d,0,sizeof(QUICKDouble)*nstep*ivir*nbasistemp*nbasistemp*2);
+            		firstQuarterTransKernel<<<gpu->blocks, gpu->twoEThreadsPerBlock>>>\
+						(II,JJ,nstepmp2s,nsteplength,nstep,nbasistemp, ntempptr_d, cutoffmp2, orbmp2i331_d);
+					cudaDeviceSynchronize(); 
+
+					int NII1=Qstart[II];
+            		int NII2=Qfinal[II];
+            		int NJJ1=Qstart[JJ];
+            		int NJJ2=Qfinal[JJ];
+
+            		int NBI1= LOC2(Qsbasis, II, NII1, nshell, 4);
+            		int NBI2= LOC2(Qfbasis, II, NII2, nshell, 4);
+            		int NBJ1= LOC2(Qsbasis, JJ, NJJ1, nshell, 4);
+            		int NBJ2= LOC2(Qfbasis, JJ, NJJ2, nshell, 4);
+
+            		int II111=NBI1;
+            		int II112=NBI2;
+            		int JJ111=NBJ1;
+            		int JJ112=NBJ2;
+
+					for(int III=II111; III<=II112; III++)
+                	{
+                    	for(int JJJ=max(III,JJ111); JJJ<=JJ112; JJJ++)
+                    	{
+                        	int IIInew = III - II111;
+                        	int JJJnew = JJJ - JJ111;
+
+                        	// second quarter transformation
+							secondQuarterTransKernel<<<gpu->blocks, gpu->twoEThreadsPerBlock>>>(III,JJJ,IIInew,JJJnew,nsteplength, nstep, nbasistemp,orbmp2i331_d,orbmp2j331_d);
+							cudaDeviceSynchronize();
+						
+							// third quarter transformation, use devSim_MP2.orbmp2k331 and QUICKADD	
+							thirdQuarterTransKernel<<<gpu->blocks, gpu->twoEThreadsPerBlock>>>(III,JJJ,IIInew,JJJnew,nsteplength,nstep,nbasistemp,orbmp2j331_d,orbmp2k331_d);
+							cudaDeviceSynchronize();
+                    	}
+                	}
+				}
+			}
+		}
+		cudaMemcpy(ntempptr, ntempptr_d, sizeof(unsigned long long int),cudaMemcpyDeviceToHost);	
+		printf("in fourQuarterTransHost, effect integrals is %llu\n\n", *ntempptr);
+
+		// start the forth quarter and the final accumulation	
+		for(int icycle=0;icycle<nsteplength;icycle++) 
+    	{   
+        	int i3 = nstepmp2s+icycle-1;
+			for(int k3=i3-nfrozencore;k3<iocc;k3++) 
+        	{   
+            	//forthQuarterTransKernel<<<gpu->blocks, gpu->twoEThreadsPerBlock>>>(icycle, i3, k3,nstep, orbmp2k331_d, orbmp2_d);
+            	//cudaDeviceSynchronize();
+				
+				//reinitialization is needed is using forthQuarterTransKernelV2 which is further parallelized,
+				//because no initialization can be done inside the kernel(one entry is accessed by multi threads)
+				cudaMemset(orbmp2_d,0,sizeof(QUICKDouble)*ivir*ivir);
+				forthQuarterTransKernelV2<<<gpu->blocks, gpu->twoEThreadsPerBlock>>>(icycle, i3, k3,nstep, orbmp2k331_d, orbmp2_d);
+				cudaDeviceSynchronize();
+
+            	finalMP2AccumulationKernel<<<gpu->blocks, gpu->twoEThreadsPerBlock>>>(i3, k3, orbmp2_d, MP2cor_d);
+				cudaDeviceSynchronize();
+			}
+		}
+	
+	}
+	cudaMemcpy(MP2cor, MP2cor_d, sizeof(QUICKDouble),cudaMemcpyDeviceToHost);
+	cudaMemcpy(gpu->gpu_calculated->mp2cor->_devData, MP2cor_d, sizeof(QUICKDouble),cudaMemcpyDeviceToDevice);
+
+	cudaFree(orbmp2i331_d);
+	cudaFree(orbmp2j331_d);
+	cudaFree(orbmp2k331_d);
+	cudaFree(orbmp2_d);	
+	cudaFree(MP2cor_d);	
+	cudaFree(ntempptr_d);
+	delete[] MP2cor;
+}
+
+
+__global__ void
+__launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) forthQuarterTransKernel(int icycle, int i3, int k3, int nstep, QUICKDouble* orbmp2k331, QUICKDouble* orbmp2)
+{
+	QUICKDouble* coefficient = devSim_MP2.coefficient;
+	int nElec = devSim_MP2.nElec;
+	int nbasis = devSim_MP2.nbasis;
+	int iocc = nElec/2;
+	int ivir = nbasis - iocc;
+
+	int offside = blockIdx.x*blockDim.x+threadIdx.x;
+	int totalThreads = blockDim.x*gridDim.x;
+	int myInt = ivir*ivir/totalThreads;
+
+	if(ivir*ivir-myInt*totalThreads>offside) 
+		myInt++;
+
+	for(int i=1;i<=myInt;i++)
+	{
+		int currentInt = totalThreads*(i-1)+offside;
+		int j3 = currentInt/ivir;
+		int l3 = currentInt%ivir;
+
+		if(j3<ivir && l3<ivir)
+		{
+			LOC2(orbmp2,l3,j3,ivir,ivir)=0.0;
+        	int l3new = l3 + nElec/2;
+			for(int lll=0;lll<nbasis;lll++)
+        	{
+				//should not need atomicadd
+        		LOC2(orbmp2,l3,j3,ivir,ivir) += \
+            		LOC4(orbmp2k331,icycle,k3,j3,lll, nstep,iocc,ivir,nbasis)*LOC2(coefficient,lll,l3new, nbasis, nbasis);
+        	}
+		}
+	}		
+}
+
+__global__ void 
+__launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) finalMP2AccumulationKernel(int i3, int k3, QUICKDouble* orbmp2, QUICKDouble* MP2cor)
+{
+    QUICKDouble* molorbe = devSim_MP2.molorbe;
+    int nElec = devSim_MP2.nElec;
+    int nbasis = devSim_MP2.nbasis;
+	int iocc = nElec/2;
+    int ivir = nbasis - iocc;
+	int nfrozencore = devSim_MP2.nfrozencore;
+
+    int offside = blockIdx.x*blockDim.x+threadIdx.x;
+	int totalThreads = blockDim.x*gridDim.x;
+    int myInt = ivir*ivir/totalThreads; 
+
+	if(ivir*ivir-myInt*totalThreads>offside)
+        myInt++;
+  
+	for(int i=1;i<=myInt;i++)
+	{
+		int currentInt = totalThreads*(i-1)+offside;
+    	int j3 = currentInt/ivir;
+    	int l3 = currentInt%ivir;
+
+    	if(j3<ivir && l3<ivir)
+    	{	
+        	if(k3>i3-nfrozencore)
+        	{
+            	QUICKADD(MP2cor[0], 2.0/(molorbe[i3]+molorbe[k3+nfrozencore]-molorbe[j3+nElec/2]-molorbe[l3+nElec/2]) \
+                                *LOC2(orbmp2, j3, l3, ivir, ivir) \
+                                *(2.0*LOC2(orbmp2, j3, l3, ivir, ivir) \
+                                - LOC2(orbmp2, l3, j3, ivir, ivir)));
+
+        	}
+        	if(k3==i3-nfrozencore)
+       		{
+            	QUICKADD(MP2cor[0],1.0/(molorbe[i3]+molorbe[k3+nfrozencore]-molorbe[j3+nElec/2]-molorbe[l3+nElec/2]) \
+                                *LOC2(orbmp2,j3,l3,ivir, ivir) \
+                                *(2.0*LOC2(orbmp2, j3, l3, ivir, ivir) \
+                                - LOC2(orbmp2, l3, j3, ivir, ivir)));
+        	}
+    	}
+	}
+}
 
 
 // totTime is the timer for GPU 2e time. Only on under debug mode
+
 #ifdef DEBUG
 static float totTime;
 #endif
 
-void get2e_MP2(_gpu_type gpu)
+void get2e_MP2(_gpu_type gpu, QUICKDouble* ememorysum, int* nstepmp2, unsigned long long int * ntemp)
 {
 #ifdef DEBUG
     cudaEvent_t start,end;
@@ -50,41 +563,57 @@ void get2e_MP2(_gpu_type gpu)
     cudaEventCreate(&end);
     cudaEventRecord(start, 0);
 #endif
-    printf("BLOCK= %i, THREADSperBLOCK= %i\n", gpu->blocks, gpu->twoEThreadsPerBlock);
-    get2e_MP2_kernel<<<gpu->blocks, gpu->twoEThreadsPerBlock>>>();
-    
+	float mp2_time;
+	cudaEvent_t mp2_start, mp2_trans;
+	cudaEventCreate(&mp2_start);
+	cudaEventCreate(&mp2_trans);
+	cudaEventRecord(mp2_start,0);
     
+	fourQuarterTransHost(gpu, ememorysum, nstepmp2, ntemp);
+	cudaEventRecord(mp2_trans,0);
+    cudaEventSynchronize(mp2_trans);
+    cudaEventElapsedTime(&mp2_time,mp2_start,mp2_trans);
+    printf("in get2e_MP2, total gpu mp2 four quarter transformation time is %6.3f ms\n", mp2_time);
+
+	cudaEventDestroy(mp2_start);
+	cudaEventDestroy(mp2_trans);
+
 #ifdef DEBUG
     cudaEventRecord(end, 0);
     cudaEventSynchronize(end);
     float time;
     cudaEventElapsedTime(&time, start, end);
     totTime+=time;
-    printf("this cycle:%f ms total time:%f ms\n", time, totTime);
+    //printf("this cycle:%f ms total time:%f ms\n", time, totTime);
     cudaEventDestroy(start);
     cudaEventDestroy(end);
 #endif
-    
+    printf("get2e_MP2 is done\n");
 }
 
-
+/*
 __global__ void 
 __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel()
 {
+
     unsigned int offside = blockIdx.x*blockDim.x+threadIdx.x;
     int totalThreads = blockDim.x*gridDim.x;
-    
+	//printf("in get2e_MP2_kernel, totalThread is %d, offside is %d\n", totalThreads, offside);
+
     QUICKULL jshell   = (QUICKULL) devSim_MP2.sqrQshell;
     QUICKULL myInt    = (QUICKULL) jshell*jshell / totalThreads;
+	
+
+    if ((jshell*jshell - myInt*totalThreads)> offside) myInt++;	//256>offside?
     
-    if ((jshell*jshell - myInt*totalThreads)> offside) myInt++;
-    
-    for (QUICKULL i = 1; i<=myInt; i++) {
-        
-        QUICKULL currentInt = totalThreads * (i-1)+offside;        
+	for (QUICKULL i = 1; i<=myInt; i++) {
+
+  
+        QUICKULL currentInt = totalThreads * (i-1)+offside;     //currentInt=offside   
         QUICKULL a = (QUICKULL) currentInt/jshell;
         QUICKULL b = (QUICKULL) (currentInt - a*jshell);
-        
+*/        
+		//printf("in get2e_MP2_kernel/for, offside is %d\n",offside);
         /* 
          the following code is to implement a better index mode.
          The original one can be see as:
@@ -120,8 +649,7 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel()
          b = 2*t-k;
          }*/
         
-        
-        
+/*        
         int II = devSim_MP2.sorted_YCutoffIJ[a].x;
         int JJ = devSim_MP2.sorted_YCutoffIJ[a].y;
         int KK = devSim_MP2.sorted_YCutoffIJ[b].x;
@@ -131,8 +659,12 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel()
         int jj = devSim_MP2.sorted_Q[JJ];
         int kk = devSim_MP2.sorted_Q[KK];
         int ll = devSim_MP2.sorted_Q[LL];
-        
-        if (ii<=kk){
+
+		//printf("for ii,jj,kk,ll = %d, %d, %d, %d, allocate local memory orbmp2i331\n", ii, jj,kk,ll);
+		////QUICKDouble* orbmp2i331 = new QUICKDouble[devSim_MP2.nElec/2*devSim_MP2.nbasis*6*6*2];
+		QUICKDouble* orbmp2i331 = new QUICKDouble[1];
+
+			if(ii<=kk){
             int nshell = devSim_MP2.nshell;
             QUICKDouble DNMax = MAX(MAX(4.0*LOC2(devSim_MP2.cutMatrix, ii, jj, nshell, nshell), 4.0*LOC2(devSim_MP2.cutMatrix, kk, ll, nshell, nshell)),
                                     MAX(MAX(LOC2(devSim_MP2.cutMatrix, ii, ll, nshell, nshell),     LOC2(devSim_MP2.cutMatrix, ii, kk, nshell, nshell)),
@@ -146,12 +678,15 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel()
                 int kkk = devSim_MP2.sorted_Qnumber[KK];
                 int lll = devSim_MP2.sorted_Qnumber[LL];
                 
-                iclass_MP2(iii, jjj, kkk, lll, ii, jj, kk, ll, DNMax);
-                
-            }
-        }
+				//printf("to call iclass_MP2 in kernel, iii, jjj, kkk, lll are %d, %d, %d, %d\n", iii, jjj, kkk, lll);
+                iclass_MP2(iii, jjj, kkk, lll, ii, jj, kk, ll, DNMax, orbmp2i331); // Y should be added to orbmp2i331;                           				
+			}
+		}
+
+		delete[] orbmp2i331;
     }
 }
+*/
 
 /*
  sqr for double precision. there no internal function to do that in fast-math-lib of CUDA
@@ -166,9 +701,8 @@ __device__ __forceinline__ QUICKDouble quick_dsqr_MP2(QUICKDouble a)
  iclass subroutine is to generate 2-electron intergral using HRR and VRR method, which is the most
  performance algrithem for electron intergral evaluation. See description below for details
  */
-__device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL, QUICKDouble DNMax)
+__device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL, int nstepmp2s, int nsteplength, int nstep, int nbasistemp, QUICKDouble DNMax, QUICKDouble* orbmp2i331)
 {
-    
     /* 
      kAtom A, B, C ,D is the coresponding atom for shell ii, jj, kk, ll
      and be careful with the index difference between Fortran and C++, 
@@ -201,7 +735,8 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned
     int kStartJ = devSim_MP2.kstart[JJ]-1;
     int kStartK = devSim_MP2.kstart[KK]-1;
     int kStartL = devSim_MP2.kstart[LL]-1;
-    
+ 
+	int nbasis = devSim_MP2.nbasis;   
     
     /*
      store saves temp contracted integral as [as|bs] type. the dimension should be allocatable but because
@@ -345,8 +880,8 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned
     int KKK2 = LOC2(devSim_MP2.Qfbasis, KK, K, devSim_MP2.nshell, 4);
     int LLL1 = LOC2(devSim_MP2.Qsbasis, LL, L, devSim_MP2.nshell, 4);
     int LLL2 = LOC2(devSim_MP2.Qfbasis, LL, L, devSim_MP2.nshell, 4);
-    
-    
+
+
     // maxIJKL is the max of I,J,K,L
     int maxIJKL = (int)MAX(MAX(I,J),MAX(K,L));
     
@@ -359,131 +894,113 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned
         hybrid_coeff = 1.0;
     }else if (devSim_MP2.method == B3LYP){
         hybrid_coeff = 0.2;
-    }else if (devSim_MP2.method == DFT){
+    }else if (devSim_MP2.method == BLYP){
         hybrid_coeff = 0.0;
     }
     
+	if(II<JJ && KK<LL){
     for (int III = III1; III <= III2; III++) {
-        for (int JJJ = MAX(III,JJJ1); JJJ <= JJJ2; JJJ++) {
-            for (int KKK = MAX(III,KKK1); KKK <= KKK2; KKK++) {
-                for (int LLL = MAX(KKK,LLL1); LLL <= LLL2; LLL++) {
-                    
+        //for (int JJJ = MAX(III,JJJ1); JJJ <= JJJ2; JJJ++) {
+		for (int JJJ = JJJ1; JJJ <= JJJ2; JJJ++) {
+            //for (int KKK = MAX(III,KKK1); KKK <= KKK2; KKK++) {
+			for (int KKK = KKK1; KKK <= KKK2; KKK++) {
+                //for (int LLL = MAX(KKK,LLL1); LLL <= LLL2; LLL++) {
+                for (int LLL = LLL1; LLL <= LLL2; LLL++) {  
+					/*
                     if (III < KKK || 
                         ((III == JJJ) && (III == LLL)) || 
                         ((III == JJJ) && (III  < LLL)) || 
                         ((JJJ == LLL) && (III  < JJJ)) ||
                         ((III == KKK) && (III  < JJJ)  && (JJJ < LLL))) {
                         
-                        
+					*/
                         QUICKDouble Y = (QUICKDouble) hrrwhole_MP2( I, J, K, L,\
                                                                III, JJJ, KKK, LLL, IJKLTYPE, store, \
                                                                RAx, RAy, RAz, RBx, RBy, RBz, \
                                                                RCx, RCy, RCz, RDx, RDy, RDz);
-                        
-                        //   if(abs(Y)*1e-2>devSim_MP2.integralCutoff){
-                        QUICKDouble DENSEKI = (QUICKDouble) LOC2(devSim_MP2.dense, KKK-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis);
-                        QUICKDouble DENSEKJ = (QUICKDouble) LOC2(devSim_MP2.dense, KKK-1, JJJ-1, devSim_MP2.nbasis, devSim_MP2.nbasis);
-                        QUICKDouble DENSELJ = (QUICKDouble) LOC2(devSim_MP2.dense, LLL-1, JJJ-1, devSim_MP2.nbasis, devSim_MP2.nbasis);
-                        QUICKDouble DENSELI = (QUICKDouble) LOC2(devSim_MP2.dense, LLL-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis);
-                        QUICKDouble DENSELK = (QUICKDouble) LOC2(devSim_MP2.dense, LLL-1, KKK-1, devSim_MP2.nbasis, devSim_MP2.nbasis);
-                        QUICKDouble DENSEJI = (QUICKDouble) LOC2(devSim_MP2.dense, JJJ-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis);
-                        
-                        
-                        // ATOMIC ADD VALUE 1
-                        QUICKDouble _tmp = 2.0;
-                        if (KKK==LLL) {
-                            _tmp = 1.0;
-                        }
-                        
-                        QUICKDouble val1d = _tmp*DENSELK*Y;
-                        //if (abs(val1d) > devSim_MP2.integralCutoff ) {
-                        QUICKULL val1 = (QUICKULL) (fabs(val1d*OSCALE) + (QUICKDouble)0.5);
-                        if ( val1d < (QUICKDouble)0.0) val1 = 0ull - val1;
-                        QUICKADD(LOC2(devSim_MP2.oULL, JJJ-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1);
-                        // }
-                        
-                        // ATOMIC ADD VALUE 2
-                        if ((LLL != JJJ) || (III!=KKK)) {
-                            _tmp = 2.0;
-                            if (III==JJJ) {
-                                _tmp = 1.0;
-                            }
-                            
-                            QUICKDouble val2d = _tmp*DENSEJI*Y;
-                            //   if (abs(val2d) > devSim_MP2.integralCutoff ) {
-                            QUICKULL val2 = (QUICKULL) (fabs(val2d*OSCALE) + (QUICKDouble)0.5);
-                            if ( val2d < (QUICKDouble)0.0) val2 = 0ull - val2;
-                            QUICKADD(LOC2(devSim_MP2.oULL, LLL-1, KKK-1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2);    
-                            //  }
-                        }
-                        
-                        
-                        // ATOMIC ADD VALUE 3
-                        
-                        QUICKDouble val3d = hybrid_coeff*0.5*DENSELJ*Y;
-                        //if (abs(2*val3d) > devSim_MP2.integralCutoff ) {
-                        QUICKULL val3 = (QUICKULL) (fabs(val3d*OSCALE) + (QUICKDouble)0.5);
-                        if (((III == KKK) && (III <  JJJ) && (JJJ < LLL))) {
-                            val3 = (QUICKULL) (fabs(2*val3d*OSCALE) + (QUICKDouble)0.5);
-                        }
-                        if ( DENSELJ*Y < (QUICKDouble)0.0) val3 = 0ull - val3;
-                        QUICKADD(LOC2(devSim_MP2.oULL, KKK-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val3);
-                        //}
-                        
-                        // ATOMIC ADD VALUE 4
-                        if (KKK != LLL) {
-                            QUICKDouble val4d = hybrid_coeff*0.5*DENSEKJ*Y;
-                            // if (abs(val4d) > devSim_MP2.integralCutoff ) {
-                            QUICKULL val4 = (QUICKULL) (fabs(val4d*OSCALE) + (QUICKDouble)0.5);
-                            if ( val4d < (QUICKDouble)0.0) val4 = 0ull - val4;
-                            QUICKADD(LOC2(devSim_MP2.oULL, LLL-1, III-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val4);
-                            //}
-                        }
-                        
-                        
-                        
-                        // ATOMIC ADD VALUE 5
-                        QUICKDouble val5d = hybrid_coeff*0.5*DENSELI*Y;
-                        //if (abs(val5d) > devSim_MP2.integralCutoff ) {
-                        QUICKULL val5 = (QUICKULL) (fabs(val5d*OSCALE) + (QUICKDouble)0.5);
-                        if ( val5d < (QUICKDouble)0.0) val5 = 0ull - val5;
-                        
-                        if ((III != JJJ && III<KKK) || ((III == JJJ) && (III == KKK) && (III < LLL)) || ((III == KKK) && (III <  JJJ) && (JJJ < LLL))) {
-                            QUICKADD(LOC2(devSim_MP2.oULL, MAX(JJJ,KKK)-1, MIN(JJJ,KKK)-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val5);
-                        }
-                        
-                        
-                        // ATOMIC ADD VALUE 5 - 2
-                        if ( III != JJJ && JJJ == KKK) {
-                            QUICKADD(LOC2(devSim_MP2.oULL, JJJ-1, KKK-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val5);
-                        }
-                        //}
-                        
-                        // ATOMIC ADD VALUE 6
-                        if (III != JJJ) {
-                            if (KKK != LLL) {
-                                QUICKDouble val6d = hybrid_coeff*0.5*DENSEKI*Y;
-                                //      if (abs(val6d) > devSim_MP2.integralCutoff ) {
-                                
-                                QUICKULL val6 = (QUICKULL) (fabs(val6d*OSCALE) + (QUICKDouble)0.5);
-                                if ( val6d < (QUICKDouble)0.0) val6 = 0ull - val6;
-                                
-                                QUICKADD(LOC2(devSim_MP2.oULL, MAX(JJJ,LLL)-1, MIN(JJJ,LLL)-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val6);
-                                
-                                // ATOMIC ADD VALUE 6 - 2
-                                if (JJJ == LLL && III!= KKK) {
-                                    QUICKADD(LOC2(devSim_MP2.oULL, LLL-1, JJJ-1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull-val6);                    
-                                }
+ 						
+						
+						//first quarter transformation:
+						if(fabs(Y)>devSim_MP2.integralCutoff)
+						{
+							for(int i3mp2=1; i3mp2<=nsteplength;i3mp2++)
+							{
+								int i3mp2new = nstepmp2s+i3mp2-1;
+								QUICKDouble atemp = LOC2(devSim_MP2.coefficient, KKK-1, i3mp2new-1, devSim_MP2.nbasis, devSim_MP2.nbasis)*Y;
+								QUICKDouble btemp = LOC2(devSim_MP2.coefficient, LLL-1, i3mp2new-1, devSim_MP2.nbasis, devSim_MP2.nbasis)*Y;
+
+								// AO index in its shell
+								int IIInew = III- devSim_MP2.Ksumtype[II]+1;
+								int JJJnew = JJJ- devSim_MP2.Ksumtype[JJ]+1;
+								//printf("Here III is %d, II is %d, devSim_MP2.Ksumtype[II] is %d, IIInew is %d\n",III, II, devSim_MP2.Ksumtype[II], IIInew);
+								// KKK -> i3mp2, note i3mp2 is from [1, nsteplength], nsteplength is from nstep, MAX(nstep) is nElec/2
+								QUICKADD(LOC5(orbmp2i331,i3mp2-1,LLL-1,IIInew-1,JJJnew-1,0, nstep, nbasis, nbasistemp, nbasistemp, 2), atemp);
+								// KKK -> i3mp2
+								QUICKADD(LOC5(orbmp2i331,i3mp2-1,LLL-1,JJJnew-1,IIInew-1,1, nstep, nbasis, nbasistemp, nbasistemp, 2), atemp);
+								// LLL -> i3mp2
+								QUICKADD(LOC5(orbmp2i331,i3mp2-1,KKK-1,IIInew-1,JJJnew-1,0, nstep, nbasis, nbasistemp, nbasistemp, 2), btemp);
+								// LLL -> i3mp2
+								QUICKADD(LOC5(orbmp2i331,i3mp2-1,KKK-1,JJJnew-1,IIInew-1,1, nstep, nbasis, nbasistemp, nbasistemp, 2), btemp);
+							}
+						}
+
+						
+                }
+            }
+        }
+    }
+	} 
+	else
+    {
+        for(int III=III1; III<=III2; III++)
+        {
+            if(MAX(III,JJJ1)<=JJJ2)
+            {
+                for(int JJJ=MAX(III,JJJ1);JJJ<=JJJ2;JJJ++)
+                {
+                    for(int KKK=KKK1; KKK<=KKK2; KKK++)
+                    {
+                        if(MAX(KKK,LLL1)<=LLL2)
+                        {
+                            for(int LLL=MAX(KKK,LLL1);LLL<=LLL2;LLL++)
+                            {
+                                QUICKDouble Y = (QUICKDouble) hrrwhole_MP2( I, J, K, L,\
+                                                               III, JJJ, KKK, LLL, IJKLTYPE, store, \
+                                                               RAx, RAy, RAz, RBx, RBy, RBz, \
+                                                               RCx, RCy, RCz, RDx, RDy, RDz);
+								if(fabs(Y)>devSim_MP2.integralCutoff)
+								{
+									for(int i3mp2=1;i3mp2<=nsteplength;i3mp2++)
+									{
+										int i3mp2new = nstepmp2s+i3mp2-1;
+										QUICKDouble atemp = LOC2(devSim_MP2.coefficient, KKK-1, i3mp2new-1, devSim_MP2.nbasis, devSim_MP2.nbasis)*Y;
+										QUICKDouble btemp = LOC2(devSim_MP2.coefficient, LLL-1, i3mp2new-1, devSim_MP2.nbasis, devSim_MP2.nbasis)*Y;
+										int IIInew = III - devSim_MP2.Ksumtype[II]+1;
+										int JJJnew = JJJ - devSim_MP2.Ksumtype[JJ]+1;
+										QUICKADD(LOC5(orbmp2i331, i3mp2-1, LLL-1, IIInew-1, JJJnew-1, 0, nstep, nbasis, nbasistemp, nbasistemp, 2),atemp);										
+										if(JJJ != III)
+										{
+											QUICKADD(LOC5(orbmp2i331, i3mp2-1, LLL-1, JJJnew-1, IIInew-1, 1, nstep, nbasis, nbasistemp, nbasistemp, 2),atemp);
+										}
+										if(KKK != LLL)
+										{
+											QUICKADD(LOC5(orbmp2i331, i3mp2-1, KKK-1, IIInew-1, JJJnew-1, 0, nstep, nbasis, nbasistemp, nbasistemp, 2),btemp);
+											if(III != JJJ)
+											{
+												QUICKADD(LOC5(orbmp2i331, i3mp2-1, KKK-1, JJJnew-1, IIInew-1, 1, nstep, nbasis, nbasistemp, nbasistemp, 2),btemp);
+											}
+										}
+
+									}
+								}
                             }
-                            //}
                         }
                     }
-                    //} 
-                    
                 }
             }
         }
     }
+
     return;
 }
 
@@ -497,8 +1014,27 @@ __device__ void vertical_MP2(int I, int J, int K, int L, QUICKDouble* YVerticalT
                          QUICKDouble ABCDtemp,QUICKDouble ABtemp, \
                          QUICKDouble CDtemp, QUICKDouble ABcom, QUICKDouble CDcom)
 {
+	//unsigned int offside = blockIdx.x*blockDim.x+threadIdx.x;
+ 
+	// here is fine
+	//printf("at the begining of vertical_MP2, I, J, K, L are %d, %d, %d, %d\n",I, J, K, L);   
+	
+	/*
+	if(offside==0)
+	{	
+		printf("Ptempx is %lf, Ptempy is %lf, Ptempz is %lf, WPtempx is %lf, Qtempx is %lf, WQtempx is %lf, ABCDtemp is %lf, CDtemp is %lf, VY(0,0,0) is %lf\n",\
+				Ptempx, Ptempy, Ptempz, WPtempx, Qtempx, WQtempx, ABCDtemp, CDtemp, VY(0,0,0));
+	}
+	*/
     
     LOC2(store, 0, 0, STOREDIM, STOREDIM) += VY( 0, 0, 0);
+	
+	/*
+	if(offside==0)
+    {
+		printf("at the begining of vertical_MP2, store[0] is %lf, this is case %d\n", store[0], (I+J)*10+K+L);
+	}
+	*/
 	switch ((I+J)*10+K+L){
         case 1:
         {
@@ -1086,7 +1622,8 @@ __device__ void vertical_MP2(int I, int J, int K, int L, QUICKDouble* YVerticalT
         default:
         {
 #ifndef CUDA_SP
-            if (K+L>=1){
+            //printf("fall into default case, I, J, K, L are %d, %d, %d, %d\n",I, J, K, L);
+			if (K+L>=1){
                 //SSPS(0, YVerticalTemp, Qtempx, Qtempy, Qtempz, WQtempx, WQtempy, WQtempz);
                 QUICKDouble x_0_1_0 = Qtempx * VY( 0, 0, 0) + WQtempx * VY( 0, 0, 1);
                 QUICKDouble x_0_2_0 = Qtempy * VY( 0, 0, 0) + WQtempy * VY( 0, 0, 1);
@@ -5479,30 +6016,38 @@ __device__ QUICKDouble hrrwhole_MP2(int I, int J, int K, int L, \
                                 QUICKDouble RCx,QUICKDouble RCy,QUICKDouble RCz, \
                                 QUICKDouble RDx,QUICKDouble RDy,QUICKDouble RDz)
 {
+	//printf("at the beginning of hrrwhole_MP2\n");
     QUICKDouble Y;
 #ifdef CUDA_SP
+	printf("def CUDA_SP\n")
     int NAx = LOC2(devSim_MP2.KLMN,0,III-1,3,devSim_MP2.nbasis);
     int NAy = LOC2(devSim_MP2.KLMN,1,III-1,3,devSim_MP2.nbasis);
     int NAz = LOC2(devSim_MP2.KLMN,2,III-1,3,devSim_MP2.nbasis);
-    
+   	//printf("NAx, NAy, NAz are done\n");
+	
     int NBx = LOC2(devSim_MP2.KLMN,0,JJJ-1,3,devSim_MP2.nbasis);
     int NBy = LOC2(devSim_MP2.KLMN,1,JJJ-1,3,devSim_MP2.nbasis);
     int NBz = LOC2(devSim_MP2.KLMN,2,JJJ-1,3,devSim_MP2.nbasis);
-    
+	//printf("NBx, NBy, NBz are done\n");    
+
     int NCx = LOC2(devSim_MP2.KLMN,0,KKK-1,3,devSim_MP2.nbasis);
     int NCy = LOC2(devSim_MP2.KLMN,1,KKK-1,3,devSim_MP2.nbasis);
     int NCz = LOC2(devSim_MP2.KLMN,2,KKK-1,3,devSim_MP2.nbasis);
+	//printf("NCx, NCy, NCz are done\n");
     
     int NDx = LOC2(devSim_MP2.KLMN,0,LLL-1,3,devSim_MP2.nbasis);
     int NDy = LOC2(devSim_MP2.KLMN,1,LLL-1,3,devSim_MP2.nbasis);
     int NDz = LOC2(devSim_MP2.KLMN,2,LLL-1,3,devSim_MP2.nbasis);
-    
+	//printf("NDx, NDy, NDz are done\n");    
+
     
     int MA = LOC3(devTrans_MP2, NAx, NAy, NAz, TRANSDIM, TRANSDIM, TRANSDIM);
     int MB = LOC3(devTrans_MP2, NBx, NBy, NBz, TRANSDIM, TRANSDIM, TRANSDIM);
     int MC = LOC3(devTrans_MP2, NCx, NCy, NCz, TRANSDIM, TRANSDIM, TRANSDIM);
     int MD = LOC3(devTrans_MP2, NDx, NDy, NDz, TRANSDIM, TRANSDIM, TRANSDIM);
     
+	//printf("inside hrrwhole_MP2, IJKLTYPE is %d\n", IJKLTYPE);
+
     switch (IJKLTYPE) {
         case 0:
         case 10:
@@ -5728,11 +6273,10 @@ __device__ QUICKDouble hrrwhole_MP2(int I, int J, int K, int L, \
             break;
     }
 #else
-    
     int angularL[8], angularR[8];
     QUICKDouble coefAngularL[8], coefAngularR[8];
-    Y = (QUICKDouble) 0.0;
-    
+	Y = (QUICKDouble) 0.0;
+		
     int numAngularL = lefthrr_MP2(RAx, RAy, RAz, RBx, RBy, RBz, 
                               LOC2(devSim_MP2.KLMN,0,III-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,1,III-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,2,III-1,3,devSim_MP2.nbasis),
                               LOC2(devSim_MP2.KLMN,0,JJJ-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,1,JJJ-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,2,JJJ-1,3,devSim_MP2.nbasis),
@@ -5741,14 +6285,15 @@ __device__ QUICKDouble hrrwhole_MP2(int I, int J, int K, int L, \
                               LOC2(devSim_MP2.KLMN,0,KKK-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,1,KKK-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,2,KKK-1,3,devSim_MP2.nbasis),
                               LOC2(devSim_MP2.KLMN,0,LLL-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,1,LLL-1,3,devSim_MP2.nbasis), LOC2(devSim_MP2.KLMN,2,LLL-1,3,devSim_MP2.nbasis),
                               L, coefAngularR, angularR);
-    
+
     for (int i = 0; i<numAngularL; i++) {
         for (int j = 0; j<numAngularR; j++) {
-            Y += coefAngularL[i] * coefAngularR[j] * LOC2(store, angularL[i]-1, angularR[j]-1 , STOREDIM, STOREDIM);
-        }
+			Y += coefAngularL[i] * coefAngularR[j] * LOC2(store, angularL[i]-1, angularR[j]-1 , STOREDIM, STOREDIM);
+		}
     }
     
     Y = Y * devSim_MP2.cons[III-1] * devSim_MP2.cons[JJJ-1] * devSim_MP2.cons[KKK-1] * devSim_MP2.cons[LLL-1];
+
 #endif
     return Y;
 }  
@@ -5761,6 +6306,7 @@ __device__ int lefthrr_MP2(QUICKDouble RAx, QUICKDouble RAy, QUICKDouble RAz,
                        int KLMNBx, int KLMNBy, int KLMNBz,
                        int IJTYPE,QUICKDouble* coefAngularL, int* angularL)
 {           
+
     int numAngularL;
     switch (IJTYPE) {
             
@@ -5769,7 +6315,7 @@ __device__ int lefthrr_MP2(QUICKDouble RAx, QUICKDouble RAy, QUICKDouble RAz,
             numAngularL = 1;
             coefAngularL[0] = 1.0;
             angularL[0] = (int) LOC3(devTrans_MP2, KLMNAx, KLMNAy, KLMNAz, TRANSDIM, TRANSDIM, TRANSDIM);
-            break;
+			break;
         }
         case 1:
         {
diff --git a/src/cuda/gpu_common.h b/src/cuda/gpu_common.h
index a24f80731..b15380240 100644
--- a/src/cuda/gpu_common.h
+++ b/src/cuda/gpu_common.h
@@ -53,6 +53,7 @@ fflush(stdout);\
 #define LOC2(A,i1,i2,d1,d2)  A[i1+(i2)*(d1)]
 #define LOC3(A,i1,i2,i3,d1,d2,d3) A[i3+((i2)+(i1)*(d2))*(d3)]
 #define LOC4(A,i1,i2,i3,i4,d1,d2,d3,d4) A[i4+(i3+((i2)+(i1)*(d2))*(d3))*(d4)]
+#define LOC5(A,i1,i2,i3,i4,i5,d1,d2,d3,d4,d5) A[i5+(i4+(i3+((i2)+(i1)*(d2))*(d3))*(d4))*(d5)]
 
 #define MAX(A,B)    (A>=B?A:B)
 #define MIN(A,B)    (A<B?A:B)
diff --git a/src/cuda/gpu_get2e.cu b/src/cuda/gpu_get2e.cu
index aa76eba6e..7826836da 100644
--- a/src/cuda/gpu_get2e.cu
+++ b/src/cuda/gpu_get2e.cu
@@ -297,12 +297,17 @@ void getAOInt(_gpu_type gpu, QUICKULL intStart, QUICKULL intEnd, cudaStream_t st
 // interface to call Kernel subroutine
 void get2e(_gpu_type gpu)
 {
+	//printf("inside void get2e(_gpu_type gpu)\n");
+	//printf("gpu->maxL is %d\n", gpu->maxL);
     // Part spd
 //    nvtxRangePushA("SCF 2e");
 
+	//printf("before enter get2e_kernel\n");
     QUICK_SAFE_CALL((get2e_kernel<<<gpu->blocks, gpu->twoEThreadsPerBlock>>>()));
- 
+	//printf("gpu->maxL is %d\n", gpu->maxL); 
+
 #ifdef CUDA_SPDF
+	//printf("CUDA_SPDF\n");
     if (gpu->maxL >= 3) {
         // Part f-1
         QUICK_SAFE_CALL((get2e_kernel_spdf<<<gpu->blocks, gpu->twoEThreadsPerBlock>>>()));
diff --git a/src/cuda/gpu_get2e_subs.h b/src/cuda/gpu_get2e_subs.h
index 95de8ac3d..eceeb5a8d 100644
--- a/src/cuda/gpu_get2e_subs.h
+++ b/src/cuda/gpu_get2e_subs.h
@@ -108,14 +108,18 @@ __global__ void
 __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10()
 #endif
 {
+	//if(threadIdx.x == 0)
+	//printf("inside get2e_kernel\n");
     unsigned int offside = blockIdx.x*blockDim.x+threadIdx.x;
     int totalThreads = blockDim.x*gridDim.x;
-    
+	//printf("offside is %d\n", offside);
+	//printf("totalThreads is %d\n", totalThreads); 
+   
     // jshell and jshell2 defines the regions in i+j and k+l axes respectively.    
     // sqrQshell= Qshell x Qshell; where Qshell is the number of sorted shells (see gpu_upload_basis_ in gpu.cu)
     // for details on sorting. 
  
-#ifdef int_spd
+#ifdef int_spd	//kernel 0, zone 0,1,2,3,4
 /*
  Here we walk through full cutoff matrix.
 
@@ -129,11 +133,12 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10()
  |_____________|  |
 
 */
-
+	//printf("is int_spd");
     QUICKULL jshell = (QUICKULL) devSim.sqrQshell;
     QUICKULL jshell2 = (QUICKULL) devSim.sqrQshell;
+	//printf("jshell and jshell2 are %llu\n", jshell);
 
-#elif defined int_spdf
+#elif defined int_spdf	//kernel 1, zone 1,3,4
 
 /*  
  Here we walk through following region of the cutoff matrix.
@@ -152,18 +157,18 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10()
     QUICKULL jshell = (QUICKULL) devSim.sqrQshell;
     QUICKULL jshell2 = (QUICKULL) devSim.sqrQshell - devSim.fStart;
     
-#elif defined int_spdf2
+#elif defined int_spdf2	//kernel 2, zone 2,3,4
 
     QUICKULL jshell = (QUICKULL) devSim.sqrQshell;
     QUICKULL jshell2 = (QUICKULL) devSim.sqrQshell - devSim.fStart;
 
-#elif defined int_spdf3
+#elif defined int_spdf3	//kernel 3, zone 3,4
 
     QUICKULL jshell0 = (QUICKULL) devSim.fStart;
     QUICKULL jshell = (QUICKULL) devSim.sqrQshell - jshell0;
     QUICKULL jshell2 = (QUICKULL) devSim.sqrQshell - jshell0;
     
-#elif defined int_spdf4
+#elif defined int_spdf4	//kernel 4, zone 4
     
     QUICKULL jshell0 = (QUICKULL) devSim.fStart;
     QUICKULL jshell00 = (QUICKULL) devSim.ffStart;
@@ -236,6 +241,9 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10()
         QUICKULL a = (QUICKULL) i/jshell;
         QUICKULL b = (QUICKULL) (i - a*jshell);
 
+		//printf("a is %llu", a);
+		//printf("b is %llu", b);
+
 #elif defined int_spdf
         
         
@@ -348,7 +356,7 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10()
             a = 0;
             b = 0;
         }
-
+// a,b are thread ordinates (x,y)
 #endif
 
 #ifdef CUDA_MPIV
@@ -361,6 +369,12 @@ __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_kernel_spdf10()
         int ii = devSim.sorted_Q[II];
         int kk = devSim.sorted_Q[KK];
         
+		//printf("II is %d\n",II);
+		//printf("KK is %d\n",KK);
+		//printf("ii is %d\n",ii);
+		//printf("kk is %d\n",kk);
+			
+
         if (ii<=kk){
             
             
diff --git a/src/cuda/gpu_type.h b/src/cuda/gpu_type.h
index 62ca4f3c7..7a0b3ed38 100644
--- a/src/cuda/gpu_type.h
+++ b/src/cuda/gpu_type.h
@@ -31,7 +31,15 @@ struct gpu_calculated_type {
     int                             nbasis; // number of basis sets
     cuda_buffer_type<QUICKDouble>*  o;      // O matrix
     cuda_buffer_type<QUICKDouble>*  dense;  // Density Matrix
-    cuda_buffer_type<QUICKULL>*     oULL;   // Unsigned long long int type O matrix
+    cuda_buffer_type<QUICKDouble>*  coefficient; //Coefficient Matrix
+	//cuda_buffer_type<QUICKDouble>*  Y_Matrix;  // Y Matrix
+	cuda_buffer_type<QUICKDouble>*  mp2cor;
+	//cuda_buffer_type<QUICKDouble>*  orbmp2i331; //first quarter of ERI transformation
+	//cuda_buffer_type<QUICKDouble>*  orbmp2j331; //second quarter of ERI transformation
+	//cuda_buffer_type<QUICKDouble>*  orbmp2k331; //third quarter of ERI transformation
+	//cuda_buffer_type<QUICKDouble>*  orbmp2;		//forth quarter of ERI transformation
+	cuda_buffer_type<QUICKDouble>*  molorbe;
+	cuda_buffer_type<QUICKULL>*     oULL;   // Unsigned long long int type O matrix
     cuda_buffer_type<QUICKDouble>*  distance; // distance matrix
 };
 
@@ -170,6 +178,7 @@ struct gpu_simulation_type {
     int                             fStart;
     int                             ffStart;
     int                             maxL;
+	int								nfrozencore;
 
 	//New XC implementation
     int npoints;                                //Total number of packed grid points
@@ -273,7 +282,15 @@ struct gpu_simulation_type {
     QUICKDouble*                    o;
     QUICKULL*                       oULL;
     QUICKDouble*                    dense;
-    
+	QUICKDouble*					coefficient;
+	//QUICKDouble*					Y_Matrix;
+	QUICKDouble*					mp2cor;
+	//QUICKDouble* 					orbmp2i331;
+	//QUICKDouble*                    orbmp2j331;
+	//QUICKDouble*                    orbmp2k331;
+	//QUICKDouble*                    orbmp2;
+	QUICKDouble*                    molorbe;
+
     QUICKDouble*                    distance;
     QUICKDouble*                    Xcoeff;
     QUICKDouble*                    expoSum;
@@ -330,7 +347,7 @@ struct gpu_basis_type {
     
     int                             fStart;
     int                             ffStart;
-    
+    int								nfrozencore;
     // Gaussian Type function
 
     cuda_buffer_type<int>*          ncontract;
@@ -430,7 +447,8 @@ struct gpu_type {
     int                             jshell;
     int                             jbasis;
     int                             maxL;
-    
+	int								nfrozencore;  
+
     cuda_buffer_type<int>*          iattype;
     cuda_buffer_type<QUICKDouble>*  xyz;
     cuda_buffer_type<QUICKDouble>*  chg;
diff --git a/src/amber_interface.f90 b/src/deprecated/amber_interface.f90
similarity index 100%
rename from src/amber_interface.f90
rename to src/deprecated/amber_interface.f90
diff --git a/src/calmp2divcon.f90 b/src/deprecated/calmp2divcon.f90
similarity index 100%
rename from src/calmp2divcon.f90
rename to src/deprecated/calmp2divcon.f90
diff --git a/src/debug_quick.f90 b/src/deprecated/debug_quick.f90
similarity index 100%
rename from src/debug_quick.f90
rename to src/deprecated/debug_quick.f90
diff --git a/src/deprecated/dipole.f90 b/src/deprecated/dipole.f90
new file mode 100644
index 000000000..8df356290
--- /dev/null
+++ b/src/deprecated/dipole.f90
@@ -0,0 +1,498 @@
+!
+!	dipole.f90
+!	new_quick
+!
+!	Created by Yipu Miao on 3/4/11.
+!	Copyright 2011 University of Florida. All rights reserved.
+!
+!   written by Ed Brothers. March 18,2002
+!   3456789012345678901234567890123456789012345678901234567890123456789012<<STOP
+
+    subroutine dipole
+    ! The purpose of this subroutine is to generate the Mulliken and Lowdin
+    ! charges, and then calculate the dipole moment.
+
+    use allmod
+    implicit none
+    double precision xyzdipole(3,natom)
+    double precision HOLDIJ,DENSEJI
+    double precision totweight,cmx,cmy,cmz,weight
+    double precision Sx,Sy,Sz
+    double precision totdip,xdip,ydip,zdip
+    double precision, external :: xmoment
+    integer i,j,k,ibas,jbas,Icon,Jcon
+    
+    
+
+    call prtact(ioutfile,"Begin Charge and Dipole Calculation")
+    call cpu_time(timer_begin%TDip)
+
+
+    ! Part 1. Mulliken and Lowdin charge
+    ! First, calculate P*S.  Note P is symmetric, so we can use the fast
+    ! muliplier.  Also note we are using the total denisity matrix. Store
+    ! this is HOLD.
+    if ( .not. quick_method%unrst) then
+        do I = 1,nbasis
+            do J = 1,nbasis
+                HOLDIJ = 0.0D0
+                do K = 1,nbasis
+                    HOLDIJ = HOLDIJ + quick_qm_struct%dense(K,I)*quick_qm_struct%s(K,J)
+                enddo
+                quick_scratch%hold(I,J) = HOLDIJ
+            enddo
+        enddo
+    else
+        do I = 1,nbasis
+            do J = 1,nbasis
+                quick_scratch%hold2(J,I) = quick_qm_struct%dense(J,I)+quick_qm_struct%denseB(J,I)
+            enddo
+        enddo
+        do I = 1,nbasis
+            do J = 1,nbasis
+                HOLDIJ = 0.0D0
+                do K = 1,nbasis
+                    HOLDIJ = HOLDIJ + quick_scratch%hold2(K,I)*quick_qm_struct%s(K,J)
+                enddo
+                quick_scratch%hold(I,J) = HOLDIJ
+            enddo
+        enddo
+    endif
+
+    ! Mulliken Charge of atom A = core charge A - (Sum over u on A) PS(uu)
+    do I = 1,natom
+        quick_qm_struct%Mulliken(I) = quick_molspec%chg(I)
+        do J = quick_basis%first_basis_function(I),quick_basis%last_basis_function(I)
+            quick_qm_struct%Mulliken(I) = quick_qm_struct%Mulliken(I) - quick_scratch%hold(J,J)
+        enddo
+    enddo
+
+    ! At this point we have the Mulliken charges. Now calculate the Lowdin
+    ! charges:
+    ! Lowdin Charge of atom A = core charge A -
+    ! - (Sum over u on A)[S^(1/2)PS^(1/2)](uu)
+    ! Now remember S^(-1/2) = X.  Thus we have to calculate
+    ! XSPSX = S^(-1/2)SPSS^(-1/2)= S^(1/2)PS^(1/2)
+
+    ! Currently, HOLD contains PS.  Use the fast multiplier to get SPS and place
+    ! it in HOLD2.
+    do I=1,nbasis
+        do J=1,nbasis
+            HOLDIJ = 0.0D0
+            do K=1,nbasis
+                HOLDIJ = HOLDIJ + quick_qm_struct%s(K,I)*quick_scratch%hold(K,J)
+            enddo
+            quick_scratch%hold2(I,J) = HOLDIJ
+        enddo
+    enddo
+
+    ! Now we have two slow multiplication steps to get to XSPSX.
+    do I=1,nbasis
+        do J=1,nbasis
+            HOLDIJ = 0.0D0
+            do K=1,nbasis
+                HOLDIJ = HOLDIJ + quick_qm_struct%x(I,K)*quick_scratch%hold2(K,J)
+            enddo
+            quick_scratch%hold(I,J) = HOLDIJ
+        enddo
+    enddo
+
+    do I=1,nbasis
+        do J=1,nbasis
+            HOLDIJ = 0.0D0
+            do K=1,nbasis
+                HOLDIJ = HOLDIJ + quick_scratch%hold(I,K)*quick_qm_struct%x(K,J)
+            enddo
+            quick_scratch%hold2(I,J) = HOLDIJ
+        enddo
+    enddo
+
+    ! So now HOLD2 contains XSPSX.  Use this to calculate the Lowdin charges.
+    do I=1,natom
+        quick_qm_struct%Lowdin(I) = quick_molspec%chg(I)
+        do J=quick_basis%first_basis_function(I),quick_basis%last_basis_function(I)
+            quick_qm_struct%Lowdin(I) = quick_qm_struct%Lowdin(I) - quick_scratch%hold2(J,J)
+        enddo
+    enddo
+
+    write (ioutfile,'(4x,"ATOMIC CHARGES")')
+    write (ioutfile,'(3x,"ATOM          MULLIKEN            LOWDIN")')
+    do I=1,natom
+        write (ioutfile,'(3x,A2,7x,F12.4,7x,F12.4)') symbol(quick_molspec%iattype(I)), &
+        quick_qm_struct%Mulliken(I),quick_qm_struct%Lowdin(I)
+    enddo
+    write (ioutfile,'(3x,A5,3x,F12.4,7x,F12.4)') "TOTAL", &
+        sum(quick_qm_struct%Mulliken(1:natom)),sum(quick_qm_struct%Lowdin(1:natom))
+
+
+    ! Part 2. Dipole calculation
+    ! First move the molecule's center of mass to the origin.
+    totweight=0.d0
+    cmx = 0.d0
+    cmy = 0.d0
+    cmz = 0.d0
+    do I=1,natom
+        weight = emass(quick_molspec%iattype(I))
+        totweight = weight+totweight
+        cmx = cmx+xyz(1,I)*weight
+        cmy = cmy+xyz(2,I)*weight
+        cmz = cmz+xyz(3,I)*weight
+    enddo
+    cmx = cmx/totweight
+    cmy = cmy/totweight
+    cmz = cmz/totweight
+    do I=1,natom
+        xyzdipole(1,I)=xyz(1,I)-cmx
+        xyzdipole(2,I)=xyz(2,I)-cmy
+        xyzdipole(3,I)=xyz(3,I)-cmz
+    enddo
+
+
+    xdip=0.d0
+    ydip=0.d0
+    zdip=0.d0
+
+    do I=1,natom
+        xdip = xdip+quick_molspec%chg(I)*xyzdipole(1,I)
+        ydip = ydip+quick_molspec%chg(I)*xyzdipole(2,I)
+        zdip = zdip+quick_molspec%chg(I)*xyzdipole(3,I)
+    enddo
+
+    do Ibas=1,nbasis
+        do Jbas=Ibas,nbasis
+            Sx =0.d0
+            Sy =0.d0
+            Sz =0.d0
+            DENSEJI=quick_qm_struct%dense(Jbas,Ibas)
+            if (quick_method%unrst) DENSEJI=DENSEJI+quick_qm_struct%denseB(Jbas,Ibas)
+            if (Jbas /= Ibas) DENSEJI=DENSEJI*2.d0
+            do Icon=1,ncontract(ibas)
+                do Jcon=1,ncontract(jbas)
+                    Sx =Sx + &
+                    dcoeff(Jcon,Jbas)*dcoeff(Icon,Ibas) &
+                    *xmoment(aexp(Jcon,Jbas),aexp(Icon,Ibas), &
+                    itype(1,Jbas),itype(2,Jbas),itype(3,Jbas), &
+                    itype(1,Ibas),itype(2,Ibas),itype(3,Ibas), &
+                    1,0,0, &
+                    xyzdipole(1,quick_basis%ncenter(Jbas)),xyzdipole(2,quick_basis%ncenter(Jbas)), &
+                    xyzdipole(3,quick_basis%ncenter(Jbas)),xyzdipole(1,quick_basis%ncenter(Ibas)), &
+                    xyzdipole(2,quick_basis%ncenter(Ibas)),xyzdipole(3,quick_basis%ncenter(Ibas)), &
+                    0.d0,0.d0,0.d0)
+                    Sy =Sy + &
+                    dcoeff(Jcon,Jbas)*dcoeff(Icon,Ibas) &
+                    *xmoment(aexp(Jcon,Jbas),aexp(Icon,Ibas), &
+                    itype(1,Jbas),itype(2,Jbas),itype(3,Jbas), &
+                    itype(1,Ibas),itype(2,Ibas),itype(3,Ibas), &
+                    0,1,0, &
+                    xyzdipole(1,quick_basis%ncenter(Jbas)),xyzdipole(2,quick_basis%ncenter(Jbas)), &
+                    xyzdipole(3,quick_basis%ncenter(Jbas)),xyzdipole(1,quick_basis%ncenter(Ibas)), &
+                    xyzdipole(2,quick_basis%ncenter(Ibas)),xyzdipole(3,quick_basis%ncenter(Ibas)), &
+                    0.d0,0.d0,0.d0)
+                    Sz =Sz + &
+                    dcoeff(Jcon,Jbas)*dcoeff(Icon,Ibas) &
+                    *xmoment(aexp(Jcon,Jbas),aexp(Icon,Ibas), &
+                    itype(1,Jbas),itype(2,Jbas),itype(3,Jbas), &
+                    itype(1,Ibas),itype(2,Ibas),itype(3,Ibas), &
+                    0,0,1, &
+                    xyzdipole(1,quick_basis%ncenter(Jbas)),xyzdipole(2,quick_basis%ncenter(Jbas)), &
+                    xyzdipole(3,quick_basis%ncenter(Jbas)),xyzdipole(1,quick_basis%ncenter(Ibas)), &
+                    xyzdipole(2,quick_basis%ncenter(Ibas)),xyzdipole(3,quick_basis%ncenter(Ibas)), &
+                    0.d0,0.d0,0.d0)
+
+                enddo
+            enddo
+            xdip = xdip - Sx*DENSEJI
+            ydip = ydip - Sy*DENSEJI
+            zdip = zdip - Sz*DENSEJI
+        enddo
+    enddo
+
+    totdip = ((xdip*xdip+ydip*ydip+zdip*zdip)**.5d0)*2.541765d0
+    write (ioutfile,'(/,4x,"DIPOLE (DEBYE)")')
+    write (ioutfile,'(6x,"X",9x,"Y",9x,"Z",8x,"TOTAL")')
+    write (ioutfile,'(4f10.4)') xdip*2.541765d0,ydip*2.541765d0,zdip*2.541765d0,totdip
+    call cpu_time(timer_end%TDip)
+    call prtact(ioutfile,"End Charge and Dipole Calculation")
+    
+    return
+    end subroutine dipole
+    
+!*******************************************************
+! xmoment   
+!-------------------------------------------------------
+! Ed Brothers. September 25, 2002
+! 3456789012345678901234567890123456789012345678901234567890123456789012<<STOP
+
+    double precision function xmoment(a,b,i,j,k,ii,jj,kk, &
+    iux,iuy,iuz,Ax,Ay,Az, &
+    Bx,By,Bz,Cx,Cy,Cz)
+    use quick_constants_module
+    implicit double precision(a-h,o-z)
+
+    ! Variables needed later:
+    !    pi=3.1415926535897932385
+    g = a+b
+    Px = (a*Ax + b*Bx)/g
+    Py = (a*Ay + b*By)/g
+    Pz = (a*Az + b*Bz)/g
+
+    ! The purpose of this subroutine is to calculate moment around point
+    ! C.  This is used for calculating dipole moments.
+
+    ! The notation is the same used throughout: gtfs with orbital exponents a
+    ! and b on A and B with angular momentums defined by i,j,k (a's x, y
+    ! and z exponents, respectively) and ii,jj,k and kk on B with the core at
+    ! (Cx,Cy,Cz) with charge Z. New to this are the iux, iuy, and iuz terms
+    ! which determine the moment type being calculated.
+
+    ! The this is taken from the recursive relation found in Obara and Saika,
+    ! J. Chem. Phys. 84 (7) 1986, 3963.
+
+    ! apass and bpass are used to avoid swapping the two variables when
+    ! control passes back to the main program.
+
+    apass=a
+    bpass=b
+
+    xmoment = xmomentrecurse(apass,bpass,i,j,k,ii,jj,kk, &
+    iux,iuy,iuz, &
+    Ax,Ay,Az,Bx,By,Bz, &
+    Cx,Cy,Cz,Px,Py,Pz,g)
+
+    return
+    end function xmoment
+
+!*******************************************************
+! xmomentrecurse
+!-------------------------------------------------------
+! Ed Brothers. September 25, 2002.
+! 3456789012345678901234567890123456789012345678901234567890123456789012<<STOP
+
+    double precision recursive function xmomentrecurse(a,b, &
+    i,j,k,ii,jj,kk,iux,iuy,iuz, &
+    Ax,Ay,Az,Bx,By,Bz, &
+    Cx,Cy,Cz,Px,Py,Pz,g) &
+    result(xmomentrec)
+
+    implicit double precision(a-h,o-z)
+    dimension iexponents(9),center(12)
+
+! The this is taken from the recursive relation found in Obara and Saika,
+! J. Chem. Phys. 84 (7) 1986, 3963.
+
+! If i=j=k=ii=jj=kk=iux=iuy=iuz=0, this is simply an overlap integral.
+
+    if (i+j+k+ii+jj+kk+iux+iuy+iuz == 0) then
+       g_table = g**(-1.5)
+        xmomentrec=overlap(a,b,0,0,0,0,0,0, &
+        Ax,Ay,Az,Bx,By,Bz,Px,Py,Pz,g_table)
+
+    ! Otherwise, use the recusion relation from Obara and Saika.  The first
+    ! step is to find the lowest nonzero angular momentum exponent or,
+    ! if all of those are zero, the lowest nonzeo iux, iuy or iuz. This is
+    ! because the more exponents equal zero the fewer terms need to be
+    ! calculated, and each recursive loop reduces the angular momentum
+    ! exponents. This therefore reorders the atoms and sets the exponent
+    ! to be reduced.
+
+    else
+        xmomentrec=0.d0
+        iexponents(1) = i
+        iexponents(2) = j
+        iexponents(3) = k
+        iexponents(4) = ii
+        iexponents(5) = jj
+        iexponents(6) = kk
+        iexponents(7) = iux
+        iexponents(8) = iuy
+        iexponents(9) = iuz
+        center(7) = Cx
+        center(8) = Cy
+        center(9) = Cz
+        center(10)= Px
+        center(11)= Py
+        center(12)= Pz
+        ilownum=300
+        ilowex=300
+        do L=1,6
+            if (iexponents(L) < ilowex .and. iexponents(L) /= 0) then
+                ilowex=iexponents(L)
+                ilownum=L
+            endif
+        enddo
+        if (ilownum <= 3) then
+            center(1)=Ax
+            center(2)=Ay
+            center(3)=Az
+            center(4)=Bx
+            center(5)=By
+            center(6)=Bz
+        else
+            center(4)=Ax
+            center(5)=Ay
+            center(6)=Az
+            center(1)=Bx
+            center(2)=By
+            center(3)=Bz
+            iexponents(4) = i
+            iexponents(5) = j
+            iexponents(6) = k
+            iexponents(1) = ii
+            iexponents(2) = jj
+            iexponents(3) = kk
+            if(ilownum <= 6) ilownum = ilownum - 3
+            temp=b
+            b=a
+            a=temp
+        endif
+
+    ! The reson for the following two variables is explained fully in the
+    ! repulsion code.
+
+        apass=a
+        bpass=b
+
+    ! There are two possibilities:
+    ! 1)  ilownum is equal to 300.  This meant that i=j=k=ii=jj=kk=0.
+    ! If iux,iuy,or iuz was equal to zero, we would not be this far
+    ! in the code. Thus the integral is of the form (s|u|s).
+    ! If this is the case:
+
+        if (ilownum == 300) then
+            do L=7,9
+                if (iexponents(L) < ilowex .and. iexponents(L) /= 0) then
+                    ilowex=iexponents(L)
+                    ilownum=L
+                endif
+            enddo
+            PC = center(ilownum+3)-center(ilownum)
+            iexponents(ilownum)=iexponents(ilownum)-1
+            if (PC /= 0.d0) then
+                xmomentrec=xmomentrec+PC* &
+                xmomentrecurse(apass,bpass,0,0,0,0,0,0, &
+                iexponents(7),iexponents(8),iexponents(9), &
+                center(1),center(2),center(3), &
+                center(4),center(5),center(6), &
+                center(7),center(8),center(9), &
+                center(10),center(11),center(12),g)
+                apass=a
+                bpass=b
+            endif
+            if(iexponents(ilownum) /= 0) then
+                coeff = dble(iexponents(ilownum))/(2.d0*g)
+                iexponents(ilownum)=iexponents(ilownum)-1
+                xmomentrec=xmomentrec+coeff* &
+                xmomentrecurse(apass,bpass,0,0,0,0,0,0, &
+                iexponents(7),iexponents(8),iexponents(9), &
+                center(1),center(2),center(3), &
+                center(4),center(5),center(6), &
+                center(7),center(8),center(9), &
+                center(10),center(11),center(12),g)
+                iexponents(ilownum)=iexponents(ilownum)+1
+                a=apass
+                b=bpass
+            endif
+
+
+        ! Otherwise:
+        ! 2)  The ilow number is referring to an angular momentum orbital
+        ! exponent.
+
+        else
+
+        ! The first step is lowering the orbital exponent by one.
+
+            iexponents(ilownum) = iexponents(ilownum)-1
+
+        ! At this point, calculate the first term of the recursion
+        ! relation.
+
+            PA = center(9+ilownum)-center(ilownum)
+            if (PA /= 0) then
+                xmomentrec=xmomentrec+PA* &
+                xmomentrecurse(apass,bpass, &
+                iexponents(1),iexponents(2),iexponents(3), &
+                iexponents(4),iexponents(5),iexponents(6), &
+                iexponents(7),iexponents(8),iexponents(9), &
+                center(1),center(2),center(3), &
+                center(4),center(5),center(6), &
+                center(7),center(8),center(9), &
+                center(10),center(11),center(12),g)
+                apass=a
+                bpass=b
+            endif
+
+        ! The next term only arises if the angual momentum of the dimension
+        ! of A that has already been lowered is not zero.  In other words, if a
+        ! (px|ux|px) was passed to this subroutine, we are now considering
+        ! (s|ux|px), and the following term does not arise, as the x expoent
+        ! on A is zero.
+
+            if (iexponents(ilownum) /= 0) then
+                coeff = dble(iexponents(ilownum))/(2.d0*g)
+                iexponents(ilownum) = iexponents(ilownum)-1
+                xmomentrec=xmomentrec+coeff* &
+                xmomentrecurse(apass,bpass, &
+                iexponents(1),iexponents(2),iexponents(3), &
+                iexponents(4),iexponents(5),iexponents(6), &
+                iexponents(7),iexponents(8),iexponents(9), &
+                center(1),center(2),center(3), &
+                center(4),center(5),center(6), &
+                center(7),center(8),center(9), &
+                center(10),center(11),center(12),g)
+                iexponents(ilownum) = iexponents(ilownum)+1
+                apass=a
+                bpass=b
+            endif
+
+        ! The next two terms only arise is the angual momentum of the dimension
+        ! of A that has already been lowered is not zero in B.  If a
+        ! (px|u1x|px) was passed to this subroutine, we are now considering
+        ! (s|u1x|px), and the following term does arise, as the x exponent on
+        ! B is 1.
+
+            if (iexponents(ilownum+3) /= 0) then
+                coeff = dble(iexponents(ilownum+3))/(2.d0*g)
+                iexponents(ilownum+3) = iexponents(ilownum+3)-1
+                xmomentrec=xmomentrec+coeff* &
+                xmomentrecurse(apass,bpass, &
+                iexponents(1),iexponents(2),iexponents(3), &
+                iexponents(4),iexponents(5),iexponents(6), &
+                iexponents(7),iexponents(8),iexponents(9), &
+                center(1),center(2),center(3), &
+                center(4),center(5),center(6), &
+                center(7),center(8),center(9), &
+                center(10),center(11),center(12),g)
+                iexponents(ilownum+3) = iexponents(ilownum+3)+1
+                apass=a
+                bpass=b
+            endif
+
+        ! The next two terms only arises if the exponent of the operator
+        ! corresponding to the dimension of A that has already been lowered
+        ! is not zero.  If a (px|u1x|px) was passed to this subroutine,
+        ! we are now considering (s|u1x|px), and the following term
+        ! does arise. If we had started with (px|u1y|px), it would now be
+        ! (s|u1y|px), and it would not arise.
+
+            if (iexponents(ilownum+6) /= 0) then
+                coeff = dble(iexponents(ilownum+6))/(2.d0*g)
+                iexponents(ilownum+6) = iexponents(ilownum+6)-1
+                xmomentrec=xmomentrec+coeff* &
+                xmomentrecurse(apass,bpass, &
+                iexponents(1),iexponents(2),iexponents(3), &
+                iexponents(4),iexponents(5),iexponents(6), &
+                iexponents(7),iexponents(8),iexponents(9), &
+                center(1),center(2),center(3), &
+                center(4),center(5),center(6), &
+                center(7),center(8),center(9), &
+                center(10),center(11),center(12),g)
+                iexponents(ilownum+6) = iexponents(ilownum+6)+1
+                apass=a
+                bpass=b
+            endif
+        endif
+    endif
+    return
+    end 
+
diff --git a/src/electdii.f90 b/src/deprecated/electdii.f90
similarity index 100%
rename from src/electdii.f90
rename to src/deprecated/electdii.f90
diff --git a/src/mpi_fake.f90 b/src/deprecated/mpi_fake.f90
similarity index 100%
rename from src/mpi_fake.f90
rename to src/deprecated/mpi_fake.f90
diff --git a/src/main.f90 b/src/main.f90
index e064d653f..1f6646256 100644
--- a/src/main.f90
+++ b/src/main.f90
@@ -204,7 +204,7 @@ program quick
       quick_basis%ncenter, quick_basis%kstart, quick_basis%katom, &
       quick_basis%ktype, quick_basis%kprim, quick_basis%kshell,quick_basis%Ksumtype, &
       quick_basis%Qnumber, quick_basis%Qstart, quick_basis%Qfinal, quick_basis%Qsbasis, quick_basis%Qfbasis, &
-      quick_basis%gccoeff, quick_basis%cons, quick_basis%gcexpo, quick_basis%KLMN)
+      quick_basis%gccoeff, quick_basis%cons, quick_basis%gcexpo, quick_basis%KLMN, nfrozencore)
  
       call gpu_upload_cutoff_matrix(Ycutoff, cutPrim)
     endif
diff --git a/src/modules/quick_basis_module.f90 b/src/modules/quick_basis_module.f90
index 9d64457da..6dca2280a 100644
--- a/src/modules/quick_basis_module.f90
+++ b/src/modules/quick_basis_module.f90
@@ -41,6 +41,7 @@ module quick_basis_module
 
         ! total shell number
         integer, pointer :: nshell
+        integer, pointer :: nfrozencore
 
         ! total primitive guassian function number
         integer, pointer :: nprim
@@ -126,7 +127,7 @@ module quick_basis_module
    double precision, dimension(:,:), allocatable :: aexp,dcoeff                         ! to be sub
 
 
-   integer,target :: nshell,nprim,jshell,jbasis
+   integer,target :: nshell,nprim,jshell,jbasis,nfrozencore
    integer,target :: nbasis
    integer :: maxcontract
 
diff --git a/src/modules/quick_constants_module.f90 b/src/modules/quick_constants_module.f90
index 62e90820a..d11742680 100644
--- a/src/modules/quick_constants_module.f90
+++ b/src/modules/quick_constants_module.f90
@@ -110,6 +110,17 @@ module quick_constants_module
              2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 4, 3, 2, 1, &
        2, 1, 2, 3, 4, 5/)
 
+    integer, dimension(1:92) :: FROZENCORE = &
+     (/0,                                                                   0,  &
+       0,   0,                                          2,  2,  2,  2,  2,  2,  &
+       2,   2,                                          10, 10, 10, 10, 10, 10, &
+       10,  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 18, 18, 18, 18, 18, 18, &  
+       18,  18, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 36, 36, 36, 36, 36, 36, &
+       36,  36, &
+        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, &
+                46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 68, 68, 68, 68, 68, 68, &
+       68,  68, 68, 68, 68, 68/)
+
     double precision, dimension(0:83) :: EMASS
 
     data EMASS &
diff --git a/src/modules/quick_method_module.f90 b/src/modules/quick_method_module.f90
index 4b0b4f34b..167e99ec6 100644
--- a/src/modules/quick_method_module.f90
+++ b/src/modules/quick_method_module.f90
@@ -39,6 +39,7 @@ module quick_method_module
         logical :: writePMat = .false. ! flag to write density matrix
         logical :: readSAD = .true.    ! flag to read SAD guess
         logical :: writeSAD = .false.  ! flag to write SAD guess
+        logical :: frzCore = .false.   ! frozen core approximation
         logical :: diisSCF =  .false.  ! DIIS SCF
         logical :: prtGap =  .false.   ! flag to print HOMO-LUMO gap
         logical :: opt =  .false.      ! optimization
@@ -359,6 +360,7 @@ subroutine print_quick_method(self,io,ierr)
             if (self%readSAD)   write(io,'(" READ SAD GUESS FROM FILE")')
             if (self%writeSAD)   write(io,'(" WRITE SAD GUESS TO FILE")')
     
+            if (self%frzCore)   write(io, '(" FROZEN CORE APPROXIMATION")')
             if (self%zmat)      write(io,'(" Z-MATRIX CONSTRUCTION")')
             if (self%dipole)    write(io,'(" DIPOLE")')
             if (self%ecp)       write(io,'(" ECP BASIS SET")')
@@ -540,6 +542,7 @@ subroutine read_quick_method(self,keywd,ierr)
                self%writeSAD = .true.
                self%readSAD = .false. ! switch off reading of SAD guess which was set to true by default
             end if
+            if (index(keyWD,'FROZENC').ne.0)  self%frzCore=.true. ! “FROZENCORE” clashes with “CORE”
             if (index(keyWD,'ZMAKE').ne.0)      self%zmat=.true.
             if (index(keyWD,'DIPOLE').ne.0)     self%dipole=.true.
             if (index(keyWD,'WRITE').ne.0)      self%writePMat=.true.
@@ -680,6 +683,7 @@ subroutine init_quick_method(self,ierr)
             self%readDMX =  .false.  ! flag to read density matrix
             self%readSAD =  .true.   ! flag to read sad guess
             self%writeSAD = .false.  ! flag to write sad guess
+            self%frzCore = .false.   ! frozen core approximation
             self%diisSCF =  .false.  ! DIIS SCF
             self%prtGap =  .false.   ! flag to print HOMO-LUMO gap
             self%opt =  .false.      ! optimization
diff --git a/src/scf.f90 b/src/scf.f90
index 154525d6e..a4c3bf7df 100644
--- a/src/scf.f90
+++ b/src/scf.f90
@@ -183,8 +183,10 @@ subroutine electdiis(jscf,ierr)
    deltaO = .false.
    idiis = 0
    ! Now Begin DIIS
+   !print *, "before diis, quick_qm_struct%o is ", quick_qm_struct%o
    do while (.not.diisdone)
-
+    !print *, "in this iteration, quick_qm_struct%o is ", quick_qm_struct%o
+    print *, "in this iteration, idiis is ", idiis 
 
       call cpu_time(timer_begin%TSCF)
       !--------------------------------------------
@@ -211,6 +213,8 @@ subroutine electdiis(jscf,ierr)
       if(jscf.ge.quick_method%ncyc) deltaO = .true.
 
       if (quick_method%debug)  call debug_SCF(jscf)
+    
+      !print *, "in this iteration, before call scf_operator quick_qm_struct%o is ", quick_qm_struct%o
 
       if (quick_method%SEDFT) then
          call sedftoperator ! Semi-emperical DFT Operator
@@ -218,6 +222,10 @@ subroutine electdiis(jscf,ierr)
          call scf_operator(deltaO)
       endif
 
+      !if (idiis.eq.1) then
+      !  print *, "in this iteration, after call scf_operator quick_qm_struct%o is", quick_qm_struct%o
+      !endif
+
       if (quick_method%debug)  call debug_SCF(jscf)
 
       ! Terminate Operator timer
@@ -481,6 +489,10 @@ subroutine electdiis(jscf,ierr)
          ! First you have to transpose this into an orthogonal basis, which
          ! is accomplished by calculating Transpose[X] . O . X.
          !-----------------------------------------------
+        !if (idiis.eq.1) then        
+        !    print *, "before Diagonalize the operator matrix to form, quick_qm_struct%o is",quick_qm_struct%o
+        !    print *, ""
+        !endif
 #if defined(CUDA) || defined(CUDA_MPIV)
 
         call cpu_time(timer_begin%TDiag)
@@ -489,15 +501,38 @@ subroutine electdiis(jscf,ierr)
               quick_qm_struct%vec, quick_qm_struct%co, &
               V2, nbasis)
          call cpu_time(timer_end%TDiag)
+    
+         !if (idiis.eq.1) then
+         !   print *, "in cuda after cuda_diag, quick_scratch%hold is",quick_scratch%hold
+         !   print *, ""
+         ! endif
 
          call cublas_DGEMM ('n', 'n', nbasis, nbasis, nbasis, 1.0d0, quick_qm_struct%x, &
                nbasis, quick_scratch%hold, nbasis, 0.0d0, quick_qm_struct%o,nbasis)
+        
+        !if (idiis.eq.1) then
+        !    print *, "in cuda after DGEMM ,quick_qm_struct%o is",quick_qm_struct%o
+        !    print *, "" 
+        !endif
+
 #else
          call DGEMM ('n', 'n', nbasis, nbasis, nbasis, 1.0d0, quick_qm_struct%o, &
                nbasis, quick_qm_struct%x, nbasis, 0.0d0, quick_scratch%hold,nbasis)
 
+         !if (idiis.eq.1) then
+         !   print *, "in serial after first DGEMM ,quick_scratch%hold is",quick_scratch%hold
+         !   print *, "" 
+        !endif
+
          call DGEMM ('n', 'n', nbasis, nbasis, nbasis, 1.0d0, quick_qm_struct%x, &
                nbasis, quick_scratch%hold, nbasis, 0.0d0, quick_qm_struct%o,nbasis)
+        
+        !if (idiis.eq.1) then
+         !   print *, "in serial after second DGEMM ,quick_scratch%hold
+         !   is",quick_scratch%hold
+         !   print *, "" 
+        !endif
+
 
          ! Now diagonalize the operator matrix.
          call cpu_time(timer_begin%TDiag)
@@ -511,11 +546,15 @@ subroutine electdiis(jscf,ierr)
          call cpu_time(timer_end%TDiag)
 
 #endif
+        !if (idiis.eq.1) then
+        !    print *, "after  Diagonalize the operator matrix to form, quick_qm_struct%o is",quick_qm_struct%o
+        !    print *, ""
+        !endif
 
          ! Calculate C = XC' and form a new density matrix.
          ! The C' is from the above diagonalization.  Also, save the previous
          ! Density matrix to check for convergence.
-         !        call DMatMul(nbasis,X,VEC,CO)    ! C=XC'
+         !        call DMatMul(nbasis,X,VEC,CO)    ! C=XC'(step 9)
 
 #if defined(CUDA) || defined(CUDA_MPIV)
 
@@ -611,7 +650,8 @@ subroutine electdiis(jscf,ierr)
             if (quick_method%prtgap) write (ioutfile,'(" HOMO-LUMO GAP (EV) =",11x,F12.6)') &
                   (quick_qm_struct%E((quick_molspec%nelec/2)+1) - quick_qm_struct%E(quick_molspec%nelec/2))*AU_TO_EV
             diisdone=.true.
-
+            
+            !print *, "diis is done, quick_qm_struct%o is", quick_qm_struct%o 
 
          endif
          if(jscf >= quick_method%iscf-1) then
diff --git a/src/scf_operator.f90 b/src/scf_operator.f90
index b5d3c9eef..cec6cabfb 100644
--- a/src/scf_operator.f90
+++ b/src/scf_operator.f90
@@ -84,7 +84,10 @@ subroutine scf_operator(deltaO)
    call cpu_time(timer_begin%T2e)
 
 #if defined CUDA || defined CUDA_MPIV
-   if (quick_method%bCUDA) then
+
+  print *, "in scf_operator, get CUDA"
+
+  if (quick_method%bCUDA) then
 
       if(quick_method%HF)then      
          call gpu_upload_method(0, 1.0d0)
@@ -105,6 +108,8 @@ subroutine scf_operator(deltaO)
 
    if (quick_method%nodirect) then
 #ifdef CUDA
+      print *, "call gpu_addint"
+      print *, ""
       call gpu_addint(quick_qm_struct%o, intindex, intFileName)
 #else
 #ifndef MPI
@@ -120,14 +125,17 @@ subroutine scf_operator(deltaO)
 ! The next two terms define the two electron part.
 !-----------------------------------------------------------------
 #if defined CUDA || defined CUDA_MPIV
-      if (quick_method%bCUDA) then          
-         call gpu_get2e(quick_qm_struct%o)  
+      if (quick_method%bCUDA) then  
+      !print *, "before gpu_get2e, quick_qm_struct%o is ", quick_qm_struct%o        
+         call gpu_get2e(quick_qm_struct%o)  !c function
+      !print *, "after gpu_get2e, quick_qm_struct%o is ", quick_qm_struct%o
       else                                  
 #endif
 !  Schwartz cutoff is implemented here. (ab|cd)**2<=(ab|ab)*(cd|cd)
 !  Reference: Strout DL and Scuseria JCP 102(1995),8448.
 
 #if defined MPIV && !defined CUDA_MPIV 
+   print *, "defined MPIV and not defined CUDA_MPIV"
 !  Every nodes will take about jshell/nodes shells integrals such as 1 water, which has 
 !  4 jshell, and 2 nodes will take 2 jshell respectively.
    if(bMPI) then
@@ -141,9 +149,15 @@ subroutine scf_operator(deltaO)
       enddo
    endif        
 #else
+    !print *, "Here in scf_operator, jshell is", jshell
+    !print *, "Here in scf_operator, before adding 2e integrals, quick_qm_struct%o is"
+    !print *, quick_qm_struct%o
       do II=1,jshell
-         call get2e(II)
+         call get2e(II) !!serial get2e and added to quick_qm_struct%o: get2e->shell->iclass
       enddo
+    !print *, "Here in scf_operator, after adding 2e integrals, quick_qm_struct%o is"
+    !print *, quick_qm_struct%o
+    !print *, " "
 #endif
 
 #if defined CUDA || defined CUDA_MPIV 
diff --git a/src/shell.f90 b/src/shell.f90
index b44632f55..ad57c758c 100644
--- a/src/shell.f90
+++ b/src/shell.f90
@@ -868,7 +868,7 @@ subroutine iclass(I,J,K,L,NNA,NNC,NNAB,NNCD)
             do KKK=max(III,KKK1),KKK2
                do LLL=max(KKK,LLL1),LLL2
                   if (III.lt.JJJ .and. III.lt. KKK .and. KKK.lt. LLL) then
-                     call hrrwhole
+                     call hrrwhole  !get Y for get2e
                      if (abs(Y).gt.quick_method%maxIntegralCutoff) then
                         A = (III-1)*nbasis+JJJ-1
                         B = (KKK-1)*nbasis+LLL-1
@@ -895,7 +895,7 @@ subroutine iclass(I,J,K,L,NNA,NNC,NNAB,NNCD)
                         endif
                      endif
                   else if((III.LT.KKK).OR.(JJJ.LE.LLL))then
-                     call hrrwhole
+                     call hrrwhole  
                      if (abs(Y).gt.quick_method%maxintegralCutoff) then
 
                         A = (III-1)*nbasis+JJJ-1
@@ -934,12 +934,13 @@ subroutine iclass(I,J,K,L,NNA,NNC,NNAB,NNCD)
    else
 
 
-      if(II.lt.JJ.and.II.lt.KK.and.KK.lt.LL)then
+      !print *, "quick_method%x_hybrid_coeff is ", quick_method%x_hybrid_coeff
+      if(II.lt.JJ.and.II.lt.KK.and.KK.lt.LL)then    !2e integral
          do III=III1,III2
             do JJJ=JJJ1,JJJ2
                do KKK=KKK1,KKK2
                   do LLL=LLL1,LLL2
-                     call hrrwhole
+                     call hrrwhole !get Y for get2e
                      !write(*,*) Y,III,JJJ,KKK,LLL
                      DENSEKI=quick_qm_struct%dense(KKK,III)
                      DENSEKJ=quick_qm_struct%dense(KKK,JJJ)
@@ -1303,6 +1304,7 @@ end subroutine attrashellenergy
 
 
 ! Vertical Recursion by Xiao HE 07/07/07 version
+!subroutine shellmp2(nstepmp2s,nsteplength, Y_Matrix)
 subroutine shellmp2(nstepmp2s,nsteplength)
    use allmod
 
@@ -1313,6 +1315,9 @@ subroutine shellmp2(nstepmp2s,nsteplength)
    double precision RA(3),RB(3),RC(3),RD(3)
 
    double precision Qtemp(3),WQtemp(3),CDtemp,ABcom,Ptemp(3),WPtemp(3),ABtemp,CDcom,ABCDtemp
+   ! add Y_Matrix here
+   !double precision :: Y_Matrix(nbasis*nbasis, nbasis*nbasis) 
+
    integer II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2
    common /hrrstore/II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2
 
@@ -1320,13 +1325,20 @@ subroutine shellmp2(nstepmp2s,nsteplength)
 
    COMMON /COM1/RA,RB,RC,RD
 
+   !print *, "in shellmp2, quick_basis%katom are", quick_basis%katom ! indice of central atoms
+    !print *, "in shell mp2, II, JJ, KK, LL are", II,JJ,KK,LLL
+    !print *, "in shellmp2, quick_basis%ktype are", quick_basis%ktype
+
    do M=1,3
       RA(M)=xyz(M,quick_basis%katom(II))
       RB(M)=xyz(M,quick_basis%katom(JJ))
       RC(M)=xyz(M,quick_basis%katom(KK))
       RD(M)=xyz(M,quick_basis%katom(LL))
+      !print *, "in shellmp2, RA(M), RB(M), RC(M), RD(M), M are", RA(M),RB(M),RC(M),RD(M),M
    enddo
-
+    
+   !print *, "in shellmp2, II, JJ, KK, LL are", II, JJ, KK, LL
+   
    NII1=quick_basis%Qstart(II)
    NII2=quick_basis%Qfinal(II)
    NJJ1=quick_basis%Qstart(JJ)
@@ -1430,6 +1442,11 @@ subroutine shellmp2(nstepmp2s,nsteplength)
       enddo
    enddo
 
+   !print *, "NII1, NII2 is", NII1, NII2
+   !print *, "NJJ1, NJJ2 is", NJJ1, NJJ2
+   !print *, "NKK1, NKK2 is", NKK1, NKK2
+   !print *, "NLL1, NLL2 is", NLL1, NLL2
+   !print *, "Sumindex is", Sumindex
 
    do I=NII1,NII2
       NNA=Sumindex(I-1)+1
@@ -1439,8 +1456,18 @@ subroutine shellmp2(nstepmp2s,nsteplength)
             NNC=Sumindex(k-1)+1
             do L=NLL1,NLL2
                NNCD=SumIndex(K+L)
-               call classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
-               !                   call class
+                !! NNA,NNC,NNAB,NNCD not used?
+                !This prescreening is adopted from the cuda version
+                DNmaxmp2 = max(4.0d0*cutmatrix(II,JJ), &
+                            4.0d0*cutmatrix(KK,LL), &
+                            cutmatrix(II,LL), &
+                            cutmatrix(II,KK), &
+                            cutmatrix(JJ,KK), &
+                            cutmatrix(JJ,LL))
+                if((Ycutoff(II,JJ)*Ycutoff(KK,LL).gt.quick_method%integralCutoff) .and.&
+                    (Ycutoff(II,JJ)*Ycutoff(KK,LL))*DNmaxmp2.gt.quick_method%integralCutoff)then
+                    call classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
+                endif
             enddo
          enddo
       enddo
@@ -1744,6 +1771,7 @@ End subroutine classmp2divcon
 
 
 ! Horrizontal recursion and Fock matrix builder by Xiao HE 07/07/07 version
+!subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength, Y_Matrix)
 subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
    ! subroutine class
    use allmod
@@ -1756,6 +1784,8 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
    double precision FM(0:13)
    double precision RA(3),RB(3),RC(3),RD(3)
    double precision X44(129600)
+   ! add Y_Matrix here, why here the dimension needs to be specified?
+   ! double precision, dimension(nbasis*nbasis,nbasis*nbasis):: Y_Matrix
 
    COMMON /COM1/RA,RB,RC,RD
    COMMON /COM2/AA,BB,CC,DD,AB,CD,ROU,ABCD
@@ -1766,6 +1796,7 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
    common /xiaostore/store
    common /hrrstore/II,JJ,KK,LL,NBI1,NBI2,NBJ1,NBJ2,NBK1,NBK2,NBL1,NBL2
 
+! commenting starts here
    ITT=0
    do JJJ=1,quick_basis%kprim(JJ)
       Nprij=quick_basis%kstart(JJ)+JJJ-1
@@ -1797,7 +1828,7 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
          store(MM1,MM2)=Ytemp
       enddo
    enddo
-
+! commenting ends here
 
    NBI1=quick_basis%Qsbasis(II,I)
    NBI2=quick_basis%Qfbasis(II,I)
@@ -1834,8 +1865,23 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
    NBI1=quick_basis%Qsbasis(II,NII1)
    NBJ1=quick_basis%Qsbasis(JJ,NJJ1)
 
-   II111=quick_basis%ksumtype(II)+NBI1
-   JJ111=quick_basis%ksumtype(JJ)+NBJ1
+   !II111=quick_basis%ksumtype(II)+NBI1
+   !JJ111=quick_basis%ksumtype(JJ)+NBJ1
+    II111=quick_basis%ksumtype(II)
+    JJ111=quick_basis%ksumtype(JJ)
+   
+   !print *, "in classmp2, II111, JJ111 are", II111, JJ111
+   !print *, "in classmp2, NBI1 and NBJ1 should always be 0 while they are:", NBI1, NBJ1 
+   !if(NBI1.NE.0 .OR. NBJ1.NE.0) then
+   !     print *, "got nonezero NBI1 or NBJ1"
+   !endif
+
+   ! from here inspection
+   !print *, "in classmp2, before getting Y,"
+   !print *, "III1, III2 are", III1, III2
+   !print *, "JJJ1, JJJ2 are", JJJ1, JJJ2
+   !print *, "KKK1, KKK2 are", KKK1, KKK2
+   !print *, "LLL1, LLL2 are", LLL1, LLL2
 
    if(II.lt.JJ.and.KK.lt.LL)then
 
@@ -1843,24 +1889,65 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
          do JJJ=JJJ1,JJJ2
             do KKK=KKK1,KKK2
                do LLL=LLL1,LLL2
-
+!#ifndef CUDA                
+                  !print *, "before calling hrrwhole"
+                  !print *, "get Y is ", Y
                   call hrrwhole
-                  if (dabs(Y).gt.quick_method%integralCutoff) then
+                  !print *, "FIRST CALL after calling hrrwhole"
+                  !print "('in serial III,JJJ,KKK,LLL,Y are', i2, i2, i2, i2, f10.6)", III, JJJ, KKK, LLL, Y
+!#endif
+!#ifdef CUDA
+!                  Y = 0.0
+                  ! Here we retrieve the Y value calculated in gpu_MP2.cu/hrrwhole_MP2. Two points to note:
+                  ! 1. Cuda code only calculated Y_Matrix(i,j,k,l) where i<k. For
+                  ! the case i>k, use the sysmetry Y_Matrix(i,j,k,l) = Y_Matrix(k,l,i,j)
+                  ! 2. In Fortran arrays are stored by column in memory.
+                  !if(quick_method%bCUDA) then
+                  !  if((III-1)*nbasis+JJJ > (KKK-1)*nbasis+LLL) then
+                        !print *, "from cuda code corresponding Y_Matrix value is",Y_Matrix((III-1)*nbasis+JJJ, (KKK-1)*nbasis+LLL)
+                        !print *, ""
+                  !      Y = Y_Matrix((III-1)*nbasis+JJJ, (KKK-1)*nbasis+LLL)  
+                  !  else
+                        !print *, "from cuda code corresponding Y_Matrix value is",Y_Matrix((KKK-1)*nbasis+LLL, (III-1)*nbasis+JJJ)
+                        !print *, ""
+                  !      Y = Y_Matrix((KKK-1)*nbasis+LLL, (III-1)*nbasis+JJJ)
+                  !  endif
+                  !endif
+!#endif
+                  !print *, ""
+                  !print "('in classmp2, III,JJJ,KKK,LLL are', i3, i3, i3, i3, ', II,JJ,KK,LL are', &
+                  !          i3, i3, i3, i3, ', I,J,K,L are', i3, i3, i3, i3)", III,JJJ,KKK,LLL,II,JJ,KK,LL,I,J,K,L
+                  !print "('for orbmp2i331 with i3mp2=5, LLL, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')", LLL, IIInew, JJJnew
+                  !print "('for orbmp2i331 with i3mp2=5, LLL, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')", LLL, JJJnew, IIInew                  
+                  !print "('for orbmp2i331 with i3mp2=5, KKK, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')", KKK, IIInew, JJJnew
+                  !print "('for orbmp2i331 with i3mp2=5, KKK, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')", KKK, JJJnew, IIInew 
+
+                    if (dabs(Y).gt.quick_method%integralCutoff) then
                      do i3mp2=1,nsteplength
                         i3mp2new=nstepmp2s+i3mp2-1
                         atemp=quick_qm_struct%co(KKK,i3mp2new)*Y
                         btemp=quick_qm_struct%co(LLL,i3mp2new)*Y
                         IIInew=III-II111+1
-                        JJJnew=JJJ-JJ111+1
-
-                        orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)= &
-                              orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)+atemp
-                        orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)= &
-                              orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)+atemp
-                        orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)= &
-                              orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)+btemp
-                        orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)= &
-                              orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)+btemp
+                        JJJnew=JJJ-JJ111+1 
+
+                        !print "('for orbmp2i331 with i3mp2=5, LLL, IIInew,JJJnew, 1 are', i3, i3, i3, ' 1')",&
+                        !     LLL, IIInew, JJJnew
+                        !print "('for orbmp2i331 with i3mp2=5, LLL, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')",& 
+                        !    LLL, JJJnew, IIInew
+                        !print "('for orbmp2i331 with i3mp2=5, KKK, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')",& 
+                        !    KKK, IIInew, JJJnew
+                        !print "('for orbmp2i331 with i3mp2=5, KKK, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')",&
+                        !     KKK, JJJnew, IIInew
+                       
+
+                        !print "('for orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1),i3mp2,LLL,IIInew,JJJnew are', i2,i2,i2,i2)",i3mp2,LLL,IIInew,JJJnew                    
+                        orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)= orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)+atemp
+                        !print "('for orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2),i3mp2,LLL,JJJnew,IIInew are', i2,i2,i2,i2)",i3mp2,LLL,JJJnew,IIInew
+                        orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)= orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)+atemp
+                        !print "('for orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1),i3mp2,KKK,IIInew,JJJnew are',i2,i2,i2,i2)",i3mp2,KKK,IIInew,JJJnew
+                        orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)= orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)+btemp
+                        !print "('for orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2),i3mp2,KKK,JJJnew,IIInew are',i2,i2,i2,i2)",i3mp2,KKK,JJJnew,IIInew
+                        orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)= orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)+btemp
                      enddo
                   endif
                enddo
@@ -1876,8 +1963,30 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
                do KKK=KKK1,KKK2
                   if(max(KKK,LLL1).le.LLL2)then
                      do LLL=max(KKK,LLL1),LLL2
-
+!#ifndef CUDA
+                        !print *, "second call:before calling hrrwhole"
+                        !print *, "get Y is ", Y
                         call hrrwhole
+                        !print *, "second call:after calling hrrwhole"
+                        !print "('in serial III,JJJ,KKK,LLL,Y are', i2, i2, i2, i2, f10.6)", III, JJJ, KKK, LLL, Y
+!#endif
+!#ifdef CUDA
+!                        Y = 0.0
+                        !if(quick_method%bCUDA) then
+                        !    if((III-1)*nbasis+JJJ > (KKK-1)*nbasis+LLL) then
+                                !print *, "from cuda code corresponding Y_Matrix value is",Y_Matrix((III-1)*nbasis+JJJ, (KKK-1)*nbasis+LLL)
+                                !print *, ""                        
+                        !        Y=Y_Matrix((III-1)*nbasis+JJJ,(KKK-1)*nbasis+LLL)
+                        !    else
+                                !print *, "from cuda code corresponding Y_Matrix value is",Y_Matrix((KKK-1)*nbasis+LLL, (III-1)*nbasis+JJJ)
+                                !print *, ""
+                        !        Y=Y_Matrix((KKK-1)*nbasis+LLL,(III-1)*nbasis+JJJ)
+                        !    endif
+                        !endif
+!#endif    
+                        !print *, ""
+                        !print "('else in classmp2, III,JJJ,KKK,LLL are', i3, i3, i3, i3,', II,JJ,KK,LL are', &
+                        !    i3, i3, i3, i3, ', I,J,K,L are', i3, i3, i3, i3)",III,JJJ,KKK,LLL,II,JJ,KK,LL,I,J,K,L
                         if (dabs(Y).gt.quick_method%integralCutoff) then
                            do i3mp2=1,nsteplength
                               i3mp2new=nstepmp2s+i3mp2-1
@@ -1887,18 +1996,22 @@ subroutine classmp2(I,J,K,L,NNA,NNC,NNAB,NNCD,nstepmp2s,nsteplength)
                               IIInew=III-II111+1
                               JJJnew=JJJ-JJ111+1
 
-                              orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)= &
-                                    orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)+atemp
+                              !print "('for orbmp2i331 with i3mp2=5, LLL, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')",&
+                              !   LLL, IIInew, JJJnew
+                              orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)= orbmp2i331(i3mp2,LLL,IIInew,JJJnew,1)+atemp
                               if(JJJ.ne.III)then
-                                 orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)= &
-                                       orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)+atemp
+                                !print "('for orbmp2i331 with i3mp2=5, LLL, JJJnew, IIInew, 2 are', i3, i3, i3, ' 2')",&
+                                 !LLL, JJJnew, IIInew 
+                                orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)= orbmp2i331(i3mp2,LLL,JJJnew,IIInew,2)+atemp
                               endif
                               if(KKK.ne.LLL)then
-                                 orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)= &
-                                       orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)+btemp
+                                 !print "('for orbmp2i331 with i3mp2=5, KKK, IIInew, JJJnew, 1 are', i3, i3, i3, ' 1')",&
+                                  !   KKK, IIInew, JJJnew
+                                 orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)= orbmp2i331(i3mp2,KKK,IIInew,JJJnew,1)+btemp
                                  if(III.ne.JJJ)then
-                                    orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)= &
-                                          orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)+btemp
+                                    !print "('for orbmp2i331 with i3mp2=5, KKK, JJJnew, IIInew, 1 are', i3, i3, i3, ' 2')",&
+                                     !    KKK, JJJnew, IIInew
+                                    orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)= orbmp2i331(i3mp2,KKK,JJJnew,IIInew,2)+btemp
                                  endif
                               endif
 
diff --git a/src/subs/eigval.f90 b/src/subs/deprecated/eigval.f90
similarity index 100%
rename from src/subs/eigval.f90
rename to src/subs/deprecated/eigval.f90