diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e0f63b54..3ec8ee57 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,6 +18,7 @@ add_library(progress
   prg_chebyshev_mod.F90
   prg_densitymatrix_mod.F90
   prg_dos_mod.F90
+  prg_ewald_mod.F90
   prg_extras_mod.F90
   prg_genz_mod.F90
   prg_graph_mod.F90
@@ -46,6 +47,7 @@ add_library(progress
   prg_subgraphloop_mod.F90
   prg_system_mod.F90
   prg_timer_mod.F90
+  prg_xlbokernel_mod.F90
   prg_xlbo_mod.F90)
   #prg_xlkernel_mod.F90)
 
@@ -94,6 +96,7 @@ install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/prg_chebyshev_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_densitymatrix_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_dos_mod.mod
+  ${CMAKE_CURRENT_BINARY_DIR}/prg_ewald_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_extras_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_genz_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_graph_mod.mod
@@ -122,6 +125,7 @@ install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/prg_system_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_timer_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_xlbo_mod.mod
+  ${CMAKE_CURRENT_BINARY_DIR}/prg_xlbokernel_mod.mod
   #  ${CMAKE_CURRENT_BINARY_DIR}/prg_xlkernel_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/slaterkosterforce_latte_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/tbparams_latte_mod.mod
diff --git a/src/prg_ewald_mod.F90 b/src/prg_ewald_mod.F90
new file mode 100644
index 00000000..5dbb24ea
--- /dev/null
+++ b/src/prg_ewald_mod.F90
@@ -0,0 +1,1222 @@
+! Ewald sum routines for kernel calculation
+module prg_ewald_mod
+
+  use bml
+  use prg_timer_mod
+  use prg_parallel_mod
+
+  implicit none
+
+  private  !Everything is private by default
+
+  integer, parameter :: dp = kind(1.0d0)
+
+  public :: Ewald_Real_Space_Single
+  public :: Ewald_Real_Space_Single_latte
+  public :: Ewald_Real_Space
+  public :: Ewald_Real_Space_latte
+  public :: Ewald_Real_Space_Test
+  public :: Ewald_Real_Space_Matrix_latte
+  public :: Ewald_k_space_latte_single
+  public :: Ewald_k_space_latte
+  public :: Ewald_k_space_Test
+  public :: Ewald_k_space_Matrix_latte
+
+contains
+
+  !> Find Coulomb potential on site I from single charge at site J
+  subroutine  Ewald_Real_Space_Single_latte(COULOMBV,I,RXYZ,Box,Nr_elem, &
+       DELTAQ,J,U,Element_Pointer,Nr_atoms,COULACC,HDIM,Max_Nr_Neigh)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Nr_elem, HDIM, Max_Nr_Neigh, I, J, Element_Pointer(Nr_atoms)
+    real(PREC), intent(in)    :: COULACC, DELTAQ(Nr_atoms)
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(PREC), intent(in)    :: U(Nr_elem)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+    integer                   :: K, ccnt,l,m,n
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    COULOMBV = ZERO
+
+    TI = TFACT*U(Element_Pointer(I))
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RXYZ(1,I)
+    Ra(2) = RXYZ(2,I)
+    Ra(3) = RXYZ(3,I)
+
+    do k = -1,1
+      do m = -1,1
+        do l = -1,1
+
+          Rb(1) = RXYZ(1,J)+k*box(1,1)
+          Rb(2) = RXYZ(2,J)+m*box(2,2)
+          Rb(3) = RXYZ(3,J)+l*box(3,3)
+          Rab = Rb-Ra  ! OBS b - a !!!
+          dR = norm2(Rab)
+          MAGR = dR
+          MAGR2 = dR*dR
+
+          if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+            TJ = TFACT*U(Element_Pointer(J))
+            DC = Rab/dR
+
+            Z = abs(CALPHA*MAGR)
+            NUMREP_ERFC = erfc(Z)
+
+            CA = NUMREP_ERFC/MAGR
+            COULOMBV = COULOMBV + DELTAQ(J)*CA
+            ccnt = ccnt + 1
+            CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+            EXPTI = exp(-TI*MAGR )
+
+            if (Element_Pointer(I).eq.Element_Pointer(J)) then
+              COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+              ccnt = ccnt + 1
+            else
+              TJ2 = TJ*TJ
+              TJ3 = TJ2*TJ
+              TJ4 = TJ2*TJ2
+              TJ6 = TJ4*TJ2
+              EXPTJ = exp( -TJ*MAGR )
+              TI2MTJ2 = TI2 - TJ2
+              TJ2MTI2 = -TI2MTJ2
+              SA = TI
+              SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+              SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+              SD = TJ
+              SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+              SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+              COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+            endif
+          endif
+        enddo
+      enddo
+    enddo
+
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_Single_latte
+
+  subroutine  Ewald_Real_Space_Single(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+       DELTAQ,J,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,HDIM,Max_Nr_Neigh)
+
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, J
+    real(PREC), intent(in)    :: COULACC, TIMERATIO,DELTAQ(Nr_atoms)
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    real(PREC), intent(in)    :: U(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    character(10), intent(in) :: Element_Type(Nr_atoms)
+    real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+    integer                   :: K, ccnt,l,m,n
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    FCOUL = ZERO
+    COULOMBV = ZERO
+
+    TI = TFACT*U(I)
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RX(I)
+    Ra(2) = RY(I)
+    Ra(3) = RZ(I)
+
+    do k = -1,1
+      do m = -1,1
+        do l = -1,1
+
+          Rb(1) = RX(J)+k*Lbox(1)
+          Rb(2) = RY(J)+m*Lbox(2)
+          Rb(3) = RZ(J)+l*Lbox(3)
+          Rab = Rb-Ra  ! OBS b - a !!!
+          dR = norm2(Rab)
+          MAGR = dR
+          MAGR2 = dR*dR
+
+          if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+            TJ = TFACT*U(J)
+            DC = Rab/dR
+
+            Z = abs(CALPHA*MAGR)
+            NUMREP_ERFC = erfc(Z)
+
+            CA = NUMREP_ERFC/MAGR
+            COULOMBV = COULOMBV + DELTAQ(J)*CA
+            ccnt = ccnt + 1
+            CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+            FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+            EXPTI = exp(-TI*MAGR )
+
+            if (Element_Type(I).eq.Element_Type(J)) then
+              COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+              ccnt = ccnt + 1
+              FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+                   + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+            else
+              TJ2 = TJ*TJ
+              TJ3 = TJ2*TJ
+              TJ4 = TJ2*TJ2
+              TJ6 = TJ4*TJ2
+              EXPTJ = exp( -TJ*MAGR )
+              TI2MTJ2 = TI2 - TJ2
+              TJ2MTI2 = -TI2MTJ2
+              SA = TI
+              SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+              SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+              SD = TJ
+              SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+              SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+              COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+              FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+                   + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+            endif
+
+            FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+            FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+            FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+          endif
+        enddo
+      enddo
+    enddo
+
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_Single
+
+  subroutine  Ewald_Real_Space_Matrix_latte(E,RXYZ,Box,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, Nr_Elem
+    real(PREC), intent(in)    :: COULACC
+    real(PREC), intent(out)   :: E(Nr_atoms,Nr_atoms)
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(PREC), intent(in)    :: U(Nr_elem)
+    real(PREC)                :: COULCUT, COULCUT2
+    integer, intent(in)       :: Element_Pointer(Nr_atoms)
+    integer,    intent(in)    :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: I,J,K, ccnt, newj, PBCI,PBCJ,PBCK
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    E = 0.0
+
+    !$OMP PARALLEL DO DEFAULT(PRIVATE) SHARED(E,U,Element_Pointer,RXYZ,totnebcoul,nebcoul,coulcut,calpha)
+       
+    do I = 1,Nr_atoms
+
+    TI = TFACT*U(Element_Pointer(I))
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RXYZ(1,I)
+    Ra(2) = RXYZ(2,I)
+    Ra(3) = RXYZ(3,I)
+
+    do newj = 1,totnebcoul(I)
+      J = NEBCOUL(1, NEWJ, I)
+      PBCI = NEBCOUL(2, NEWJ, I)
+      PBCJ = NEBCOUL(3, NEWJ, I)
+      PBCK = NEBCOUL(4, NEWJ, I)
+      Rb(1) = RXYZ(1,J) + REAL(PBCI)*BOX(1,1) + REAL(PBCJ)*BOX(2,1) + &
+           REAL(PBCK)*BOX(3,1)
+
+      Rb(2) = RXYZ(2,J) + REAL(PBCI)*BOX(1,2) + REAL(PBCJ)*BOX(2,2) + &
+           REAL(PBCK)*BOX(3,2)
+
+      Rb(3) = RXYZ(3,J) + REAL(PBCI)*BOX(1,3) + REAL(PBCJ)*BOX(2,3) + &
+           REAL(PBCK)*BOX(3,3)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(Element_Pointer(J))
+        DC = Rab/dR
+
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        E(I,J) = E(I,J) + CA
+        ccnt = ccnt + 1
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Pointer(I).eq.Element_Pointer(J)) then
+          E(I,J) = E(I,J) - EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          E(I,J) = E(I,J) - ((EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+        endif
+
+      endif
+    enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    E = KECONST*E
+
+  end subroutine Ewald_Real_Space_Matrix_latte
+
+  subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
+       DELTAQ,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, Nr_Elem
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+    real(PREC), intent(in)    :: U(Nr_elem)
+    real(PREC)                :: COULCUT, COULCUT2
+    integer, intent(in)       :: Element_Pointer(Nr_atoms)
+    integer,    intent(in)    :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+    real(PREC), intent(out)   :: COULOMBV
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: J,K, ccnt, newj, PBCI,PBCJ,PBCK
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    COULOMBV = ZERO
+
+    TI = TFACT*U(Element_Pointer(I))
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RXYZ(1,I)
+    Ra(2) = RXYZ(2,I)
+    Ra(3) = RXYZ(3,I)
+
+    do newj = 1,totnebcoul(I)
+      J = NEBCOUL(1, NEWJ, I)
+      PBCI = NEBCOUL(2, NEWJ, I)
+      PBCJ = NEBCOUL(3, NEWJ, I)
+      PBCK = NEBCOUL(4, NEWJ, I)
+      Rb(1) = RXYZ(1,J) + REAL(PBCI)*BOX(1,1) + REAL(PBCJ)*BOX(2,1) + &
+           REAL(PBCK)*BOX(3,1)
+
+      Rb(2) = RXYZ(2,J) + REAL(PBCI)*BOX(1,2) + REAL(PBCJ)*BOX(2,2) + &
+           REAL(PBCK)*BOX(3,2)
+
+      Rb(3) = RXYZ(3,J) + REAL(PBCI)*BOX(1,3) + REAL(PBCJ)*BOX(2,3) + &
+           REAL(PBCK)*BOX(3,3)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(Element_Pointer(J))
+        DC = Rab/dR
+
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        COULOMBV = COULOMBV + DELTAQ(J)*CA
+        ccnt = ccnt + 1
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Pointer(I).eq.Element_Pointer(J)) then
+          COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+        endif
+
+      endif
+    enddo
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_latte
+
+  subroutine  Ewald_Real_Space_Test(COULOMBV,I,RX,RY,RZ,LBox, &
+       DELTAQ,U,Element_Type,Nr_atoms,COULACC,nnRx,nnRy,nnRz,nrnnlist,nnType,Max_Nr_Neigh)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh, I
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+    real(PREC), intent(in)    :: U(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    character(10), intent(in) :: Element_Type(Nr_atoms)
+    integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(out)   :: COULOMBV
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: J,K, ccnt, nnI
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    COULOMBV = ZERO
+
+    TI = TFACT*U(I)
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RX(I)
+    Ra(2) = RY(I)
+    Ra(3) = RZ(I)
+
+    !    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(nnI,J,Rb,Rab,dR,MAGR,MAGR2,TJ,DC,Z,NUMREP_ERFC,CA) &
+    !    !$OMP REDUCTION(+:COULOMBV)
+    do nnI = 1,nrnnlist(I)
+      Rb(1) = nnRx(I,nnI)
+      Rb(2) = nnRy(I,nnI)
+      Rb(3) = nnRz(I,nnI)
+      J = nnType(I,nnI)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(J)
+        DC = Rab/dR
+
+        ! Not Using Numerical Recipes ERFC
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        COULOMBV = COULOMBV + DELTAQ(J)*CA
+        ccnt = ccnt + 1
+        !TEST(ccnt) = DELTAQ(J)*CA
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Type(I).eq.Element_Type(J)) then
+          COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+          !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+        endif
+
+      endif
+    enddo
+    !    !$OMP END PARALLEL DO
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_Test
+
+  subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+       DELTAQ,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I
+    real(PREC), intent(in)    :: COULACC, TIMERATIO
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+    real(PREC), intent(in)    :: U(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    character(10), intent(in) :: Element_Type(Nr_atoms)
+    integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: J,K, ccnt, nnI
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    FCOUL = ZERO
+    COULOMBV = ZERO
+
+    TI = TFACT*U(I)
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RX(I)
+    Ra(2) = RY(I)
+    Ra(3) = RZ(I)
+
+    do nnI = 1,nrnnlist(I)
+      Rb(1) = nnRx(I,nnI)
+      Rb(2) = nnRy(I,nnI)
+      Rb(3) = nnRz(I,nnI)
+      J = nnType(I,nnI)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(J)
+        DC = Rab/dR
+
+        ! Not Using Numerical Recipes ERFC
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        COULOMBV = COULOMBV + DELTAQ(J)*CA
+        ccnt = ccnt + 1
+        !TEST(ccnt) = DELTAQ(J)*CA
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Type(I).eq.Element_Type(J)) then
+          COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+          !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+               + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+          FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+               + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+        endif
+
+        FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+        FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+        FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+      endif
+    enddo
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space
+  
+  subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
+
+    implicit none
+
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+    real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+    integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/Box(1,1)
+    RECIPVECS(2,2) = TWO*pi/Box(2,2)
+    RECIPVECS(3,3) = TWO*pi/Box(3,3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    COULOMBV = ZERO
+    SINLIST = ZERO
+    COSLIST = ZERO
+
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+      L12 = L*RECIPVECS(1,2)
+      L13 = L*RECIPVECS(1,3)
+
+      do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M21 = L11 + M*RECIPVECS(2,1)
+        M22 = L12 + M*RECIPVECS(2,2)
+        M23 = L13 + M*RECIPVECS(2,3)
+
+        do N = NMIN,NMAX
+          K(1) = M21 + N*RECIPVECS(3,1)
+          K(2) = M22 + N*RECIPVECS(3,2)
+          K(3) = M23 + N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+          if (K2.le.KCUTOFF2) then
+            PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+            PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+            COSSUM = 0.D0
+            SINSUM = 0.D0
+
+            ! Doing the sin and cos sums
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
+            !$OMP REDUCTION(+:COSSUM) &
+            !$OMP REDUCTION(+:SINSUM)
+            do I = 1,Nr_atoms
+              DOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+              ! We re-use these in the next loop...
+              SINLIST(I) = sin(DOT)
+              COSLIST(I) = cos(DOT)
+              COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+              SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+            enddo
+            !$OMP END PARALLEL DO
+            COSSUM2 = COSSUM*COSSUM
+            SINSUM2 = SINSUM*SINSUM
+
+            ! Add up energy and force contributions
+
+            KEPREF = KECONST*PREFACTOR
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I)
+            do I = 1,Nr_atoms
+              COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+            enddo
+            !$OMP END PARALLEL DO
+
+            KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+          endif
+        enddo
+      enddo
+    enddo
+
+    ! Point self energy
+    CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+  end subroutine Ewald_k_Space_latte
+
+  subroutine Ewald_k_Space_Matrix_latte(E,RXYZ,Box,Nr_atoms,COULACC,Max_Nr_Neigh,nebcoul,totnebcoul)
+
+    implicit none
+
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh,nebcoul(4,Max_Nr_Neigh,Nr_atoms),totnebcoul(Nr_atoms)
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: E(Nr_atoms,Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+    real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+    integer                   :: I,J,L,M,N, newj, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/Box(1,1)
+    RECIPVECS(2,2) = TWO*pi/Box(2,2)
+    RECIPVECS(3,3) = TWO*pi/Box(3,3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    !COULOMBV = ZERO
+    SINLIST = ZERO
+    COSLIST = ZERO
+
+    E = 0.0
+
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+      L12 = L*RECIPVECS(1,2)
+      L13 = L*RECIPVECS(1,3)
+
+      do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M21 = L11 + M*RECIPVECS(2,1)
+        M22 = L12 + M*RECIPVECS(2,2)
+        M23 = L13 + M*RECIPVECS(2,3)
+
+        do N = NMIN,NMAX
+          K(1) = M21 + N*RECIPVECS(3,1)
+          K(2) = M22 + N*RECIPVECS(3,2)
+          K(3) = M23 + N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+          if (K2.le.KCUTOFF2) then
+            PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+            PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+
+            ! Doing the sin and cos sums
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) 
+            do I = 1,Nr_atoms
+              DOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+              ! We re-use these in the next loop...
+              SINLIST(I) = sin(DOT)
+              COSLIST(I) = cos(DOT)
+           !   COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+           !   SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+            enddo
+            !$OMP END PARALLEL DO
+
+            ! Add up energy and force contributions
+
+            KEPREF = KECONST*PREFACTOR
+            CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J)
+            do I = 1,Nr_atoms
+               do newj = 1,totnebcoul(I)
+                 J = NEBCOUL(1, NEWJ, I)
+                 E(I,J) = E(I,J) + KEPREF*(COSLIST(I)*COSLIST(J)+SINLIST(I)*SINLIST(J))
+          !    COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+               enddo
+               E(I,I) = E(I,I) - CORRFACT
+            enddo
+            !$OMP END PARALLEL DO
+
+            !KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+          endif
+        enddo
+      enddo
+    enddo
+
+    ! Point self energy
+    !CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    !COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+  end subroutine Ewald_k_Space_Matrix_latte
+
+  subroutine Ewald_k_Space_latte_single(COULOMBV,J,RXYZ,Box,DELTAQ,Nr_atoms,COULACC)
+
+    implicit none
+
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, J
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2
+    real(PREC)                :: CORRFACT,FOURCALPHA2
+    real(PREC)                :: RECIPVECS(3,3)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: IDOT, JDOT, COSJDOT, SINJDOT, KEPREF
+
+    integer                   :: I,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/Box(1,1)
+    RECIPVECS(2,2) = TWO*pi/Box(2,2)
+    RECIPVECS(3,3) = TWO*pi/Box(3,3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    COULOMBV = ZERO
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,IDOT,JDOT,COSJDOT,SINJDOT,L,M,N,MMIN,MMAX,NMIN,NMAX,L11,M22,K,K2,PREFACTOR,KEPREF)
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+
+      do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M22 = M*RECIPVECS(2,2)
+
+        do N = NMIN,NMAX
+          K(1) = L11
+          K(2) = M22
+          K(3) = N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+
+          PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+          KEPREF = KECONST*PREFACTOR
+          JDOT = K(1)*RXYZ(1,J) + K(2)*RXYZ(2,J) + K(3)*RXYZ(3,J)
+          SINJDOT = sin(JDOT)
+          COSJDOT = cos(JDOT)
+          do I = 1,Nr_atoms
+            IDOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+            COULOMBV(I) = COULOMBV(I) + KEPREF*DELTAQ(J)*(COSJDOT*cos(IDOT)+SINJDOT*sin(IDOT))
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    ! Point self energy
+    CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+  end subroutine Ewald_k_Space_latte_single
+
+  subroutine Ewald_k_Space_Test(COULOMBV,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
+    !
+    implicit none
+    !
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+    real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+    integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+    !
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/LBox(1)
+    RECIPVECS(2,2) = TWO*pi/LBox(2)
+    RECIPVECS(3,3) = TWO*pi/LBox(3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    COULOMBV = ZERO
+    SINLIST = ZERO
+    COSLIST = ZERO
+
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+      L12 = L*RECIPVECS(1,2)
+      L13 = L*RECIPVECS(1,3)
+
+      do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M21 = L11 + M*RECIPVECS(2,1)
+        M22 = L12 + M*RECIPVECS(2,2)
+        M23 = L13 + M*RECIPVECS(2,3)
+
+        do N = NMIN,NMAX
+          K(1) = M21 + N*RECIPVECS(3,1)
+          K(2) = M22 + N*RECIPVECS(3,2)
+          K(3) = M23 + N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+          if (K2.le.KCUTOFF2) then
+            PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+            PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+            COSSUM = 0.D0
+            SINSUM = 0.D0
+
+            ! Doing the sin and cos sums
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
+            !$OMP REDUCTION(+:COSSUM) &
+            !$OMP REDUCTION(+:SINSUM)
+            do I = 1,Nr_atoms
+              DOT = K(1)*RX(I) + K(2)*RY(I) + K(3)*RZ(I)
+              ! We re-use these in the next loop...
+              SINLIST(I) = sin(DOT)
+              COSLIST(I) = cos(DOT)
+              COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+              SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+            enddo
+            !$OMP END PARALLEL DO
+            COSSUM2 = COSSUM*COSSUM
+            SINSUM2 = SINSUM*SINSUM
+
+            ! Add up energy and force contributions
+            !
+            KEPREF = KECONST*PREFACTOR
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I)
+            do I = 1,Nr_atoms
+              COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+            enddo
+            !$OMP END PARALLEL DO
+
+            KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+          endif
+        enddo
+      enddo
+    enddo
+
+    ! Point self energy
+    CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+  end subroutine Ewald_k_Space_Test
+
+end module prg_ewald_mod
diff --git a/src/prg_implicit_fermi_mod.F90 b/src/prg_implicit_fermi_mod.F90
index 5c30947c..633ea1c8 100644
--- a/src/prg_implicit_fermi_mod.F90
+++ b/src/prg_implicit_fermi_mod.F90
@@ -5,11 +5,13 @@
 !!
 module prg_implicit_fermi_mod
 
+  use omp_lib
   use bml
   use prg_normalize_mod
   use prg_densitymatrix_mod
   use prg_timer_mod
   use prg_parallel_mod
+  use prg_ewald_mod
 
   implicit none
 
@@ -18,13 +20,173 @@ module prg_implicit_fermi_mod
   integer, parameter :: dp = kind(1.0d0)
 
   public :: prg_implicit_fermi
+  public :: prg_implicit_fermi_save_inverse
   public :: prg_implicit_fermi_zero
   public :: prg_test_density_matrix
   public :: prg_implicit_fermi_response
+  public :: prg_implicit_fermi_first_order_response
   public :: prg_finite_diff
 
 contains
 
+  !> Recursive Implicit Fermi Dirac for finite temperature.
+  !! \param Inv_bml Inverses generated by algorithm.
+  !! \param h_bml Input Hamiltonian matrix.
+  !! \param p_bml Output density matrix.
+  !! \param nsteps Number of recursion steps.
+  !! \param nocc Number of occupied states.
+  !! \param mu Shifted chemical potential
+  !! \param beta Input inverse temperature.
+  !! \param occErrLimit Occupation error limit.
+  !! \param threshold Threshold for multiplication.
+  !! \param tol Tolerance for linear system solver.
+  !! \param SCF_IT The current SCF iteration.
+  !! \param occiter Counts the total nr of DM calculations during MD.
+  !! See \cite{niklasson2003}
+  subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc, &
+       mu, beta, occErrLimit, threshold, tol,SCF_IT, occiter)
+
+    implicit none
+
+    type(bml_matrix_t), intent(in) :: h_bml
+    type(bml_matrix_t), intent(inout) :: p_bml, Inv_bml(nsteps)
+    integer, intent(in) :: nsteps, SCF_IT
+    real(dp), intent(in) :: nocc, threshold
+    real(dp), intent(in) :: tol
+    real(dp), intent(in) :: occErrLimit, beta
+    real(dp), intent(inout) :: mu
+    integer, intent(inout) :: occiter
+
+    type(bml_matrix_t) :: w_bml, y_bml, d_bml, aux_bml, p2_bml, I_bml, ai_bml
+    real(dp) :: trdPdmu, trP0, occErr, alpha, newerr
+    real(dp) :: cnst, ofactor, mustep, preverr
+    real(dp), allocatable :: trace(:), gbnd(:)
+    character(20) :: bml_type
+    integer :: N, M, i, iter, muadj, prev, maxiter
+
+    bml_type = bml_get_type(h_bml)
+    N = bml_get_N(h_bml)
+    M = bml_get_M(h_bml)
+
+    allocate(trace(2))
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, p2_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, w_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
+    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
+    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml)
+
+    occErr = 10.0_dp
+    newerr = 1000_dp
+    preverr = 1000_dp
+    alpha = 1.0_dp
+    prev = 1
+    iter = 0
+    maxiter = 30
+    cnst = beta/(1.0_dp*2**(nsteps+2))
+
+    if (SCF_IT .eq. 1) then
+      alpha = 4.0_dp
+      ! Normalization
+      ! P0 = 0.5*I - cnst*(H0-mu0*I)
+      call bml_copy(h_bml, p_bml)
+      call prg_normalize_implicit_fermi(p_bml, cnst, mu)
+      ! Generate good starting guess for (2*(P2-P)+1)^-1 using conjugate gradient
+      call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
+      ! Y = 2*(P2-P) + II
+      call bml_copy(p2_bml, y_bml)
+      call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
+      call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
+      call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
+      do i = 1, nsteps
+        call bml_copy(I_bml, Inv_bml(i))
+      enddo
+    end if
+
+    do while ((occErr .gt. occErrLimit .or. muadj .eq. 1) .and. iter < maxiter)
+      iter = iter + 1
+      muadj = 0
+      write(*,*) 'mu =', mu
+      ! Normalization
+      ! P0 = 0.5*I - cnst*(H0-mu0*I)
+      call bml_copy(h_bml, p_bml)
+      call prg_normalize_implicit_fermi(p_bml, cnst, mu)
+
+      do i = 1, nsteps
+        call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
+        ! Y = 2*(P2-P) + I
+        call bml_copy(p2_bml, y_bml)
+        call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
+        call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
+        ! Find inverse ai = (2*(P2-P)+I)^-1
+        !call prg_conjgrad(y_bml, Inv_bml(i), I_bml, w_bml, d_bml, aux_bml, tol, threshold)
+        !call bml_copy(Inv_bml(i),ai_bml)
+        if (iter .eq. 1) then
+          call prg_conjgrad(y_bml, Inv_bml(i), I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
+        endif
+        call prg_newtonschulz(y_bml, Inv_bml(i), d_bml, w_bml, aux_bml, I_bml, tol, threshold)
+        call bml_multiply(Inv_bml(i), p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
+        !call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation
+      enddo
+
+      trP0 = bml_trace(p_bml)
+      trdPdmu = beta*(trP0 - bml_sum_squares(p_bml)) ! sum p(i,j)**2
+      occErr = abs(trP0 - nocc)
+      write(*,*) 'occerr =', occErr
+
+      ! If occupation error is too large, do bisection method
+      if (occerr > 1.0_dp) then
+        if (nocc-trP0 < 0.0_dp .and. prev .eq. -1) then
+          prev = -1
+        else if (nocc-trP0 > 0.0_dp .and. prev .eq. 1) then
+          prev = 1
+        else if (nocc-trP0 > 0.0_dp .and. prev .eq. -1) then
+          prev = 1
+          alpha = alpha/2
+        else
+          prev = -1
+          alpha = alpha/2
+        endif
+        mu = mu + prev*alpha
+        muadj = 1
+
+        ! Otherwise do Newton
+      else if (occErr .gt. occErrLimit) then
+        mustep = (nocc -trP0)/trdPdmu
+        mu = mu + mustep
+        muadj = 1
+        preverr = occErr
+      end if
+    enddo
+
+    if (iter .ge. maxiter) then
+      write(*,*) 'Could not converge chemical potential in prg_impplicit_fermi_save_inverse'
+    end if
+    ! Adjusting the occupation sometimes causes the perturbation calculation to not converge.
+    ! For now we recompute the DM one extra time if mu was adjusted.
+    !if (muadj .eq. 1) then
+    ! Adjust occupation
+    ! call bml_copy(p_bml, d_bml)
+    ! call bml_scale_add_identity(d_bml, -1.0_dp, 1.0_dp, threshold)
+    ! call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
+    ! ofactor = ((nocc - trP0)/trdPdmu) * beta
+    ! call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
+    !end if
+    occiter = occiter + iter
+    call bml_scale(2.0_dp,p_bml)
+    deallocate(trace)
+
+    call bml_deallocate(p2_bml)
+    call bml_deallocate(w_bml)
+    call bml_deallocate(d_bml)
+    call bml_deallocate(y_bml)
+    call bml_deallocate(aux_bml)
+    call bml_deallocate(ai_bml)
+    call bml_deallocate(I_bml)
+
+  end subroutine prg_implicit_fermi_save_inverse
+
   !> Recursive Implicit Fermi Dirac for finite temperature.
   !! \param h_bml Input Hamiltonian matrix.
   !! \param p_bml Output density matrix.
@@ -68,7 +230,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, w_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
-    if (k .gt. 2) then
+    if (k .ge. 2) then
       call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux1_bml)
       call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux2_bml)
     endif
@@ -106,7 +268,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
           else
             call prg_setup_linsys(p_bml, y_bml, p2_bml, d_bml, w_bml, aux1_bml, aux2_bml, k, threshold)
           end if
-          call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, w_bml, tol, threshold)
+          call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, aux1_bml, w_bml, tol, threshold)
         enddo
       else
         write(*,*) "Doing NS"
@@ -123,9 +285,9 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
             call prg_setup_linsys(p_bml, y_bml, p2_bml, d_bml, w_bml, aux1_bml, aux2_bml, k, threshold)
           end if
           if (i .eq. 1) then
-            call prg_conjgrad(y_bml, ai_bml, I_bml, d_bml, w_bml, 0.9_dp, threshold)
+            call prg_conjgrad(y_bml, ai_bml, I_bml, aux1_bml, d_bml, w_bml, 0.9_dp, threshold)
           end if
-          call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, tol, threshold)
+          call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux1_bml, I_bml, tol, threshold)
           call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
         enddo
 
@@ -134,8 +296,10 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
       trP0 = trdPdmu
       trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
       trdPdmu = beta * trdPdmu
-      mu = mu + (nocc - trP0)/trdPdmu
       occErr = abs(trP0 - nocc)
+      if (occErr .gt. occErrLimit) then
+        mu = mu + (nocc - trP0)/trdPdmu
+      end if
       write(*,*) "mu =", mu
     enddo
 
@@ -146,7 +310,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
 
     call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
     ofactor = ((nocc - trP0)/trdPdmu) * beta
-    call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
+    !call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
     !call bml_print_matrix("P adjusted occupation",p_bml,0,10,0,10)
 
     deallocate(trace)
@@ -155,7 +319,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
     call bml_deallocate(w_bml)
     call bml_deallocate(d_bml)
     call bml_deallocate(y_bml)
-    if (k .gt. 2) then
+    if (k .ge. 2) then
       call bml_deallocate(aux1_bml)
       call bml_deallocate(aux2_bml)
     endif
@@ -185,7 +349,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
     real(dp), intent(in) :: mu, threshold
     real(dp), intent(inout), optional :: tol
 
-    type(bml_matrix_t) :: w_bml, y_bml, d_bml, p2_bml, aux1_bml, aux2_bml, I_bml, ai_bml
+    type(bml_matrix_t) :: w_bml, y_bml, c_bml, d_bml, p2_bml, aux1_bml, aux2_bml, I_bml, ai_bml
     real(dp) :: cnst
     real(dp), allocatable :: trace(:), gbnd(:)
     character(20) :: bml_type
@@ -201,6 +365,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, w_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, c_bml)
     if (method .eq. 1) then
       call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
       call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml)
@@ -221,7 +386,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
         call bml_copy(p2_bml, y_bml)
         call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
-        call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, w_bml, tol, threshold)
+        call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, w_bml, c_bml, tol, threshold)
       enddo
     else
       write(*,*) "Doing NS"
@@ -232,9 +397,9 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
         call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
         if (i .eq. 1) then
-          call prg_conjgrad(y_bml, ai_bml, I_bml, d_bml, w_bml, 0.9_dp, threshold)
+          call prg_conjgrad(y_bml, ai_bml, I_bml, c_bml, d_bml, w_bml, 0.9_dp, threshold)
         end if
-        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, tol, threshold)
+        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, c_bml, I_bml, tol, threshold)
         call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
       enddo
     endif
@@ -246,6 +411,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
     call bml_deallocate(w_bml)
     call bml_deallocate(d_bml)
     call bml_deallocate(y_bml)
+    call bml_deallocate(c_bml)
     if (method .eq. 1) then
       call bml_deallocate(ai_bml)
       call bml_deallocate(I_bml)
@@ -253,6 +419,111 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
 
   end subroutine prg_implicit_fermi_zero
 
+
+  !> Calculate first order density matrix response to perturbations using Implicit Fermi Dirac.
+  !! \param H0_bml Input Hamiltonian matrix.
+  !! \param H1_bml Input First order perturbation of H0.
+  !! \param P0_bml Output density matrix.
+  !! \param P1_bml Output First order density matrix response.
+  !! \param nsteps Number of recursion steps.
+  !! \param mu0 Shifted chemical potential.
+  !! \param beta Input inverse temperature.
+  !! \param nocc Number of occupied states.
+  !! \param threshold Threshold for matrix algebra.
+  !! See \cite{niklasson2015}
+  subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bml, &
+       Inv_bml, nsteps, mu0, beta, nocc, threshold)
+
+    implicit none
+
+    type(bml_matrix_t), intent(in) :: H0_bml, H1_bml, Inv_bml(nsteps)
+    type(bml_matrix_t), intent(inout) :: P0_bml,P1_bml
+    real(dp), intent(in) :: mu0, threshold
+    real(dp)  :: mu1
+    real(dp), intent(in) :: beta, nocc
+    integer, intent(in) :: nsteps
+    type(bml_matrix_t) :: B0_bml, B_bml, C_bml, C0_bml
+    character(20) :: bml_type
+    real(dp) :: p1_trace, dPdmu_trace, p1B_trace, mu1B, cnst
+    integer :: N, M, i, j, k
+
+    bml_type = bml_get_type(H0_bml)
+    N = bml_get_N(H0_bml)
+    M = bml_get_M(H0_bml)
+
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, B0_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, B_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C0_bml)
+
+    cnst = beta/(2**(2+nsteps))
+
+    ! P0 = 0.5*II - cnst*(H0-mu0*II)
+    call bml_copy(H0_bml, P0_bml)
+    call prg_normalize_implicit_fermi(P0_bml, cnst, mu0)
+
+    ! P1 =  - cnst*H1
+    call bml_copy(H1_bml, P1_bml)
+    call bml_scale(-1.0_dp*cnst, P1_bml)
+    do i = 1, nsteps
+
+      ! Calculate coefficient matrices
+      ! C0 = P0^2
+      call bml_multiply(P0_bml, P0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
+      ! C = P0*P1+P1*P0, B = 2(P1 - C)
+      call bml_multiply(P0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+      call bml_multiply(P1_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+      call bml_copy(P1_bml, B_bml)
+      call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
+      ! Get next P0
+      call bml_multiply(Inv_bml(i), C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
+      ! Get next P1
+      ! C = P0*P1+P1*P0 + 2(P1 -P0*P1-P1*P0)*P0(i+1)
+      call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+      call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
+    enddo
+
+    !    do i = 1, nsteps-1
+    ! D = A^-1*P0
+    !      call bml_multiply(Inv_bml(i), B0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
+    !      call bml_multiply(C0_bml, B0_bml, B_bml, 1.0_dp, 0.0_dp, threshold)
+    ! B0 = A^-1*P0^2
+    !      call bml_copy(B_bml,B0_bml)
+    ! B = I + D -P0*D
+    !      call bml_add(B_bml, C0_bml, -1.0_dp, 1.0_dp, threshold)
+    !      call bml_scale_add_identity(B_bml, 1.0_dp, 1.0_dp, threshold)
+    ! P1 = 2D*P1(I+D-P0*D)
+    !      call bml_multiply(C0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+    !      call bml_multiply(C_bml, B_bml, P1_bml, 2.0_dp, 0.0_dp, threshold)
+    !    enddo
+    !      call bml_multiply(B0_bml, P1_bml, C_bml, 2.0_dp, 0.0_dp, threshold)
+    !      call bml_copy(P1_bml, B_bml)
+    !      call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
+    ! Get next P1
+    !      call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+    !      call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
+
+
+    ! dPdmu = beta*P0(I-P0)
+    !call bml_copy(P0_bml, B_bml)
+    !call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
+    !call bml_multiply(P0_bml, B_bml, C_bml, beta, 0.0_dp, threshold)
+    !dPdmu_trace = bml_trace(C_bml)
+    !p1_trace = bml_trace(P1_bml)
+    !mu1 =  - p1_trace/dPdmu_trace
+    if (abs(dPdmu_trace) > 1e-8) then
+      !call bml_add(P1_bml,C_bml,1.0_dp,mu1,threshold)
+    endif
+
+    call bml_deallocate(B_bml)
+    call bml_deallocate(B0_bml)
+    call bml_deallocate(C_bml)
+    call bml_deallocate(C0_bml)
+
+
+  end subroutine prg_implicit_fermi_first_order_response
+
+
   !> Calculate density matrix response to perturbations using Implicit Fermi Dirac.
   !! \param H0_bml Input Hamiltonian matrix.
   !! \param H1_bml, H2_bml, H3_bml Input First to third order perturbations of H0.
@@ -279,7 +550,7 @@ subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P
     real(dp), allocatable, intent(inout) :: mu(:)
     real(dp), intent(in) :: beta, occ_tol, lin_tol, nocc
     integer, intent(in) :: nsteps
-    type(bml_matrix_t) :: I_bml, tmp1_bml, tmp2_bml, C0_bml, T_bml, Ti_bml
+    type(bml_matrix_t) :: I_bml, tmp1_bml, tmp2_bml, tmp3_bml, C0_bml, T_bml, Ti_bml
     type(bml_matrix_t), allocatable :: B_bml(:), P_bml(:), C_bml(:), H_bml(:)
     real(dp), allocatable :: p_trace(:), trace(:)
     character(20) :: bml_type
@@ -305,6 +576,7 @@ subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P
     end do
 
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, tmp1_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, tmp3_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, tmp2_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C0_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, T_bml)
@@ -371,10 +643,10 @@ subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P
         call bml_scale_add_identity(T_bml, 2.0_dp, 1.0_dp, threshold)
         ! Find T-inverse
         if (i .eq. 1) then
-          call prg_conjgrad(T_bml, Ti_bml, I_bml, tmp1_bml, tmp2_bml, 0.01_dp, threshold)
+          call prg_conjgrad(T_bml, Ti_bml, I_bml, tmp1_bml, tmp2_bml, tmp3_bml,0.01_dp, threshold)
           call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
         end if
-        call prg_newtonschulz(T_bml, Ti_bml, tmp1_bml, tmp2_bml, lin_tol, threshold)
+        call prg_newtonschulz(T_bml, Ti_bml, tmp1_bml, tmp2_bml, tmp3_bml, I_bml, lin_tol, threshold)
         ! Get next P0
         call bml_multiply(Ti_bml, C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
         ! Get next P1
@@ -608,31 +880,38 @@ end subroutine prg_setup_linsys
   !! \param tol Convergence criterion (Frobenius norm of residual matrix)
   !! \param threshold Threshold for matrix algebra
 
-  subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, tol, threshold)
+  subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, threshold)
 
     implicit none
 
-    type(bml_matrix_t), intent(inout) :: ai_bml, r_bml, tmp_bml
-    type(bml_matrix_t), intent(in) :: a_bml
+    type(bml_matrix_t), intent(inout) :: ai_bml, r_bml, tmp_bml, d_bml
+    type(bml_matrix_t), intent(in) :: a_bml, I_bml
     real(dp), intent(in) :: threshold, tol
-    real(dp) :: norm
-    integer :: i
+    real(dp) :: err,prev_err,scaled_tol
+    integer :: i,N,N2
 
-    norm = 1.0
+    N = bml_get_N(a_bml)
+    err = 100000.0
     i = 0
-    do while(norm > tol)
+    do while(err > tol)
+      !write(*,*) 'iter = ', i
+      !write(*,*) 'ns error =', err
       call bml_copy(ai_bml, tmp_bml)
       call bml_multiply(a_bml, ai_bml, r_bml, 1.0_dp, 0.0_dp, threshold)
       call bml_scale_add_identity(r_bml, -1.0_dp, 1.0_dp, threshold)
-      norm = bml_fnorm(r_bml)
-      !   write(*,*) "norm = ", norm
-      if (norm < tol) then
-        exit
-      end if
-      call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
+      prev_err = err
+      err = bml_fnorm(r_bml)
+      !write(*,*) "err = ", err
+      !write(*,*) "prev_err = ", prev_err
+      if (10*prev_err < err) then
+        write(*,*) 'NS did not converge, calling conjugate gradient'
+        call prg_conjgrad(a_bml, ai_bml, I_bml, r_bml, tmp_bml, d_bml, 0.00001_dp, threshold)
+      else
+        call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
+      endif
       i = i + 1
     enddo
-    ! write(*,*) "Number of NS iterations:", i
+    !write(*,*) "Number of NS iterations:", i
   end subroutine prg_newtonschulz
 
   ! Preconditioned CG, preconditioner inverse diagonal of A
@@ -717,47 +996,47 @@ end subroutine prg_pcg
   !! \param w_bml Auxillary matrix
   !! \param cg_tol Convergence condition (OBS squared Frobenius norm of residual matrix)
   !! \param threshold Threshold for matrix algebra
-  subroutine prg_conjgrad(A_bml, p_bml, p2_bml, d_bml, w_bml, cg_tol, threshold)
+  subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, threshold)
 
     implicit none
 
-    type(bml_matrix_t), intent(in) :: A_bml
-    type(bml_matrix_t), intent(inout) :: p_bml, p2_bml, d_bml, w_bml
+    type(bml_matrix_t), intent(in) :: A_bml, p2_bml
+    type(bml_matrix_t), intent(inout) :: p_bml, tmp_bml, d_bml, w_bml
     real(dp), intent(in) :: cg_tol, threshold
 
     real(dp) :: alpha, beta
     integer :: k
     real(dp) :: r_norm_old, r_norm_new
 
-    call bml_multiply(A_bml, p_bml, p2_bml, -1.0_dp, 1.0_dp, threshold)
-    r_norm_new = bml_sum_squares(p2_bml)
+    call bml_copy(p2_bml,tmp_bml)
+    call bml_multiply(A_bml, p_bml, tmp_bml, -1.0_dp, 1.0_dp, threshold)
+    r_norm_new = bml_sum_squares(tmp_bml)
     k = 0
 
     do while (r_norm_new .gt. cg_tol)
 
-      !   write(*,*) r_norm_new
+      !  write(*,*) r_norm_new
       k = k + 1
       if (k .eq. 1) then
-        write(*,*) r_norm_new
-        call bml_copy(p2_bml, d_bml)
+        call bml_copy(tmp_bml, d_bml)
       else
         beta = r_norm_new/r_norm_old
-        call bml_add(d_bml, p2_bml, beta, 1.0_dp, threshold)
+        call bml_add(d_bml, tmp_bml, beta, 1.0_dp, threshold)
       endif
 
       call bml_multiply(A_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
       alpha = r_norm_new/bml_trace_mult(d_bml, w_bml)
 
       call bml_add(p_bml, d_bml, 1.0_dp, alpha, threshold)
-      call bml_add(p2_bml, w_bml, 1.0_dp, -alpha, threshold)
+      call bml_add(tmp_bml, w_bml, 1.0_dp, -alpha, threshold)
       r_norm_old = r_norm_new
-      r_norm_new = bml_sum_squares(p2_bml)
+      r_norm_new = bml_sum_squares(tmp_bml)
       if (k .gt. 50) then
         write(*,*) "Conjugate gradient is not converging"
         stop
       endif
     enddo
-    write(*,*) "Number of CG-iterations:", k
+    !write(*,*) "Number of CG-iterations:", k
 
   end subroutine prg_conjgrad
 
@@ -872,8 +1151,10 @@ subroutine prg_test_density_matrix(ham_bml, p_bml, beta, mu, nocc, osteps, occEr
       trP0 = trdPdmu
       trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
       trdPdmu = beta * trdPdmu
-      mu = mu + (nocc - trP0)/trdPdmu
       occErr = abs(trP0 - nocc)
+      if (occErr .gt. occErrLimit) then
+        mu = mu + (nocc - trP0)/trdPdmu
+      end if
       !write(*,*) "mu = ", mu
     enddo
 
@@ -884,7 +1165,7 @@ subroutine prg_test_density_matrix(ham_bml, p_bml, beta, mu, nocc, osteps, occEr
 
     call bml_multiply(p_bml, aux_bml, aux1_bml, 1.0_dp, 0.0_dp, threshold)
     ofactor = ((nocc - trP0)/trdPdmu) * beta
-    call bml_add(p_bml, aux1_bml, 1.0_dp, ofactor, threshold)
+    !call bml_add(p_bml, aux1_bml, 1.0_dp, ofactor, threshold)
     !call bml_print_matrix("Diagonalization - Adjusted occupation",p_bml,0,10,0,10)
 
     call bml_deallocate(eigenvectors_bml)
diff --git a/src/prg_syrotation_mod.F90 b/src/prg_syrotation_mod.F90
index 44d7f91e..16f28b7e 100644
--- a/src/prg_syrotation_mod.F90
+++ b/src/prg_syrotation_mod.F90
@@ -190,7 +190,7 @@ subroutine prg_rotate(rot,r,verbose)
     v2(2)=rot%v2(2)
     v2(3)=rot%v2(3)
 
-    vQ(1)=rot%vQ(1)	      !Rotation center
+    vQ(1)=rot%vQ(1)           !Rotation center
     vQ(2)=rot%vQ(2)
     vQ(3)=rot%vQ(3)
 
@@ -232,7 +232,7 @@ subroutine prg_rotate(rot,r,verbose)
       v2=pq2-vQ
     endif
 
-    vtr(1)=0.0_dp	    !Translation
+    vtr(1)=0.0_dp           !Translation
     vtr(2)=0.0_dp
     vtr(3)=0.0_dp
 
diff --git a/src/prg_xlbokernel_mod.F90 b/src/prg_xlbokernel_mod.F90
new file mode 100644
index 00000000..3a25c8ce
--- /dev/null
+++ b/src/prg_xlbokernel_mod.F90
@@ -0,0 +1,867 @@
+!> Pre-conditioned O(N) calculation of the kernel for XL-BOMD.
+!! \ingroup PROGRESS
+!! \brief Here are subroutines implementing Niklasson's scheme for
+!!  low-rank, Krylov subspace approximation of the kernel.
+module prg_xlbokernel_mod
+
+  use omp_lib
+  use bml
+  use prg_normalize_mod
+  use prg_densitymatrix_mod
+  use prg_timer_mod
+  use prg_parallel_mod
+  use prg_ewald_mod
+  use prg_implicit_fermi_mod
+
+  implicit none
+
+  private  !Everything is private by default
+
+  integer, parameter :: dp = kind(1.0d0)
+
+  public :: prg_kernel_multirank
+  public :: prg_kernel_multirank_latte
+  public :: prg_kernel_matrix_multirank
+  public :: prg_full_kernel
+  public :: prg_full_kernel_latte
+
+contains
+
+  subroutine Invert(A,AI,N)
+
+    implicit none
+    integer, parameter                          :: PREC = 8
+    integer, intent(in)                         :: N
+    real(PREC),                 intent(in)      :: A(N,N)
+    real(PREC), intent(out)                     :: AI(N,N)
+    real(PREC)                                  :: WORK(N+N*N)!, C(N,N)
+    integer                                     :: LDA, LWORK, M, INFO, IPIV(N)
+    integer                                     :: I,J,K
+
+    external DGETRF
+    external DGETRI
+
+    AI = A
+    LDA = N
+    M = N
+    LWORK = N+N*N
+
+    call DGETRF(M, N, AI, LDA, IPIV, INFO)
+    call DGETRI(N, AI, N, IPIV, WORK, LWORK, INFO)
+
+  end subroutine Invert
+
+  !> Compute low rank approximation of (K0*J)^(-1)*K0*(q[n]-n)(for LATTE)
+  !! \param KRes The low rank approximation
+  !! \param KK0_bml The pre-conditioner K0.
+  !! \param Res The residual q[n]-n
+  !! \param FelTol Relative error tolerance for approximation
+  !! \param L Number of vectors used.
+  !! \param LMAX Maximum nr of vectors to use.
+  !! \param NUMRANK Nr of vectors to use.
+  !! \param HO_bml, Orthogonalized Hamiltonian matrix.
+  !! \param mu The chemical potiential.
+  !! \param beta Scaled inverse temperature.
+  !! \param RXYZ Nuclear coordinates.
+  !! \param Box Box dimensions.
+  !! \param Hubbard_U Hubbard U list.
+  !! \param Element_Pointer List to keep track of elements.
+  !! \param Nr_atoms The number of atoms.
+  !! \param HDIM Hamiltonian matrix dimension.
+  !! \param Max_Nr_Neigh Max neighbours for Ewald.
+  !! \param Coulomb_acc Coulomb accuracy.
+  !! \param nebcoul Neighbour lists.
+  !! \param totnebcoul Number of neighbours list.
+  !! \param Hinxlist List to keep track of atomic positions in the Hamiltonian.
+  !! \param S_bml The S matrix.
+  !! \param Z_bml, The Z matrix.
+  !! \param Nocc Occupation.
+  !! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+  !! \param DO_bml, D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+  !! \param m_rec Number of recursion steps.
+  !! \param threshold Threshold value for matrix truncation.
+  !! \param Nr_elem Number of elements in Hubbard list.
+  subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_bml,mu,beta,RXYZ,Box,Hubbard_U,Element_Pointer, &
+       Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,nebcoul,totnebcoul,Hinxlist, &
+       S_bml,Z_bml,Nocc,Inv_bml,H1_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
+
+    !! Res = q[n] - n
+    !! KK0 is preconditioner
+    !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+    implicit none
+    integer,    intent(in)      :: Nr_atoms, HDIM, Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
+    real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu, Nocc
+    real(dp), intent(in)      :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(dp), intent(in)      :: Res(Nr_atoms)
+    integer,    intent(in)      :: Hinxlist(HDIM),Element_Pointer(Nr_atoms)
+    real(dp), intent(in)      :: Hubbard_U(Nr_elem)
+    type(bml_matrix_t), intent(inout)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+    real(dp)                  :: K0Res(Nr_atoms)
+    type(bml_matrix_t),intent(inout) :: H1_bml, DO_bml,D1_bml
+    real(dp), intent(in)      :: threshold
+    integer,    intent(in)      :: LMAX
+    character(20)             :: bml_type
+    integer,    intent(in)      :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(dp)                  :: dq_v(Nr_atoms)
+    real(dp), intent(inout)   :: KRes(Nr_atoms)
+    integer                     :: I,J,K,It,N,MN
+    integer, intent(out)        :: L
+    real(dp)                  :: Fel, proj_tmp, start, finish
+    real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+    real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+    real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml, X_bml, Y_bml
+
+    call timer_prg_init()
+    bml_type = bml_get_type(HO_bml)
+    N = bml_get_N(HO_bml)
+    MN = bml_get_M(HO_bml)
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+    call bml_zero_matrix(bml_type,bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,X_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,Y_bml)
+
+    call bml_transpose(Z_bml,ZT_bml)
+
+    ! K0Res = KK0*Res temporary for matrix-vector multiplication
+    call bml_set_row(Res_bml,1,Res,1.0_dp*1e-10)
+    call bml_transpose(KK0_bml,KK0T_bml)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,1.0_dp*1e-10)
+    call bml_get_row(K0Res_bml,1,row_NA)
+    K0Res = row_NA
+    dr = K0Res
+
+    write(*,*) 'resnorm =', norm2(Res), 'kresnorm =', norm2(dr)
+    I = 0
+    Fel = 1.D0
+    do while ((Fel > FelTol).AND.(I < (LMAX)).AND.(I < NUMRANK))   !! Fel = "Error" in Swedish
+      I = I + 1
+      !write(*,*) 'dr =', norm2(dr)
+      vi(:,I) = dr/norm2(dr)
+      do J = 1,I-1
+        vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+      enddo
+      vi(:,I) = vi(:,I)/norm2(vi(:,I))
+      v(:) = vi(:,I)  ! v_i
+
+      ! Compute H1 = H(v)
+      dq_v = v
+      call prg_timer_start(1)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I)
+      do J = 1,Nr_atoms
+        call Ewald_Real_Space_latte(Coulomb_Pot_Real_I,J,RXYZ,Box, &
+             dq_v,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_elem)
+        Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
+
+      call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+      call prg_timer_stop(1,1)
+
+
+      call bml_deallocate(H1_bml)
+      call bml_zero_matrix(bml_type,bml_element_real,dp,HDIM,MN,H1_bml)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+      do It = 1,Nr_atoms-1
+        do K = Hinxlist(It)+1,Hinxlist(It+1)
+          row1(K) = Hubbard_U(Element_Pointer(It))*dq_v(It) + Coulomb_Pot_dq_v(It)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+      enddo
+
+      ! H1 = 1/2(S*H1+H1*S)
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(X_bml,H1_bml)
+
+      ! H1 = Z^T H1 Z
+      call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+      ! Compute D1 = F_FD(HO_bml + eps*H1_bml)/eps at eps = 0
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec, mu, beta, real(nocc,dp), threshold)
+
+      ! D1 = Z D1 Z^T
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      ! Compute dq/dv
+      call bml_multiply(D1_bml,S_bml,X_bml, 1.0_dp,0.0_dp,threshold)
+      call bml_get_diagonal(X_bml,row1)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+      do It = 1, Nr_atoms-1
+        dq_dv(It) = 0
+        do K = Hinxlist(It)+1,Hinxlist(It+1)
+          dq_dv(It) = dq_dv(It) + row1(K)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+      dq_dv(Nr_atoms) = 0
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K)
+      enddo
+
+      dr = dq_dv - v
+      ! fi = K0(dq_dv - v)
+      call bml_set_row(Res_bml,1,dr,1.0_dp*1e-10)
+      call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,1.0_dp*1e-10)
+      call bml_get_row(K0Res_bml,1,row_NA)
+      dr = row_NA
+      fi(:,I) = dr
+
+      L = I
+      allocate(O(L,L), M(L,L))
+      do K = 1,L
+        do J = 1,L
+          O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+        enddo
+      enddo
+      call Invert(O,M,L)                        ! M = O^(-1)
+      IdentRes = 0.D0*K0Res
+      KRes = 0.D0
+      do K = 1,L
+        do J = 1,L
+          proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+          IdentRes = IdentRes + proj_tmp*fi(:,K)
+          KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n)
+        enddo
+      enddo
+      Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+      write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+      deallocate(O, M)
+
+    enddo
+
+    deallocate(row1);deallocate(row2);deallocate(row_NA)
+    call bml_deallocate(KK0T_bml)
+    call bml_deallocate(K0Res_bml)
+    call bml_deallocate(Res_bml)
+    call bml_deallocate(ZT_bml)
+    call bml_deallocate(X_bml)
+    call bml_deallocate(Y_bml)
+    call prg_timer_shutdown()
+
+  end subroutine prg_kernel_multirank_latte
+
+  ! Above routine but for development code
+  subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+       Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+       S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+    !! Res = q[n] - n
+    !! KK0 is preconditioner
+    !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+    implicit none
+    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+    real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+    real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    real(dp), intent(in)      :: Res(Nr_atoms)
+    integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+    real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+    type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+    real(dp)                  :: K0Res(Nr_atoms)
+    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    real(dp), intent(in)      :: threshold
+    integer,    intent(in)      :: LMAX
+    character(10), intent(in)   :: Element_Type(Nr_atoms)
+    integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+    real(dp)                  :: dq_v(Nr_atoms)
+    real(dp), intent(inout)   :: KRes(Nr_atoms)
+    integer                     :: I,J,K,It
+    integer, intent(out)        :: L
+    real(dp)                  :: Fel, proj_tmp
+    real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+    real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+    real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml
+
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+    call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+    call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+    call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+
+    ! K0Res = KK0*Res temporary for matrix-vector multiplication
+    call bml_transpose(KK0_bml,KK0T_bml)
+    call bml_set_row(Res_bml,1,Res,ONE*1e-14)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,ONE*1e-14)
+    call bml_get_row(K0Res_bml,1,row_NA)
+    K0Res = row_NA
+    dr = K0Res
+
+    I = 0
+    Fel = 1.D0
+    do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+      I = I + 1
+      vi(:,I) = dr/norm2(dr)
+      do J = 1,I-1
+        vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+      enddo
+      vi(:,I) = vi(:,I)/norm2(vi(:,I))
+      v(:) = vi(:,I)  ! v_i
+
+      dq_v = v
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+      do J = 1,Nr_atoms
+        call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+             dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+      ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+      row1 = 0.0_dp
+      do J = 1,HDIM
+        call bml_set_row(H1_bml,J,row1,threshold)
+      enddo
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+      do J = 1,Nr_atoms
+        do K = H_INDEX_START(J),H_INDEX_END(J)
+          row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(D1_bml,H1_bml)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec, mu, beta, real(nocc,dp), threshold)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+      do It = 1, Nr_atoms
+        dq_dv(It) = 0
+        do K = H_INDEX_START(It), H_INDEX_END(It)
+          call bml_get_row(S_bml,K,row1)
+          call bml_get_row(D1_bml,K,row2)
+          dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      dr = dq_dv - v
+      ! fi = K0(dq_dv - v)
+      call bml_set_row(Res_bml,1,dr,threshold)
+      call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+      call bml_get_row(K0Res_bml,1,row_NA)
+      dr = row_NA
+      fi(:,I) = dr
+
+      L = I
+      allocate(O(L,L), M(L,L))
+      do K = 1,L
+        do J = 1,L
+          O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+        enddo
+      enddo
+      call Invert(O,M,L)                        ! M = O^(-1)
+      IdentRes = 0.D0*K0Res
+      KRes = 0.D0
+      do K = 1,L
+        do J = 1,L
+          proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+          IdentRes = IdentRes + proj_tmp*fi(:,K)
+          KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n)
+        enddo
+      enddo
+      Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+      write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+      deallocate(O, M)
+
+    enddo
+
+    deallocate(row1);deallocate(row2);deallocate(row_NA)
+    call bml_deallocate(KK0T_bml)
+    call bml_deallocate(K0Res_bml)
+    call bml_deallocate(Res_bml)
+
+  end subroutine prg_kernel_multirank
+
+  !> Compute full inverse Jacobian of q[n]-n (for LATTE)
+  !! \param KK The inverse Jacobian.
+  !! \param DO_bml Orthogonalized density matrix.
+  !! \param mu0 The chemical potiential.
+  !! \param RXYZ Nuclear coordinates.
+  !! \param Box Box dimensions.
+  !! \param Hubbard_U Hubbard U list.
+  !! \param Element_Pointer List to keep track of elements.
+  !! \param Nr_atoms The number of atoms.
+  !! \param HDIM Hamiltonian matrix dimension.
+  !! \param Max_Nr_Neigh Max neighbours for Ewald.
+  !! \param Coulomb_acc Coulomb accuracy
+  !! \param Hinxlist List to keep track of atomic positions in the Hamiltonian.
+  !! \param S_bml The S matrix.
+  !! \param Z_bml, The Z matrix.
+  !! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+  !! \param HO_bml, Orthogonalized Hamiltonian matrix.
+  !! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+  !! \param Nocc Occupation.
+  !! \param m_rec Number of recursion steps.
+  !! \param threshold Threshold value for matrix truncation.
+  !! \param beta Scaled inverse temperature.
+  !! \param Nr_elem Number of elements in Hubbard list.
+  subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
+       Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, &
+       Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml, &
+       Nocc,m_rec,threshold,beta,Nr_elem,nebcoul,totnebcoul)
+
+    use bml
+
+    implicit none
+    integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+    integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Max_Nr_Neigh,m_rec
+    real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+    real(PREC), intent(in)         :: Coulomb_acc, threshold,beta,Nocc
+    real(PREC)                     :: v(Nr_atoms)
+    real(PREC), intent(in)         :: RXYZ(3,Nr_atoms),Box(3,3)
+    integer,    intent(in)         :: Hinxlist(HDIM)
+    real(PREC), intent(in)         :: Hubbard_U(Nr_elem)
+    type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+    type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml
+    real(PREC), intent(inout)      :: mu0
+    integer,    intent(in)         :: Element_Pointer(Nr_atoms),nebcoul(4,Max_Nr_Neigh,Nr_atoms),totnebcoul(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(PREC)                     :: dq_v(Nr_atoms)
+    real(PREC)                     :: dq_dv(Nr_atoms), err,tol,start,ewaldk,ewaldr,ewaldkacc,ewaldracc,response,respacc
+    real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+    integer                        :: I,J,K, ITER, mm,It,N,MN
+    real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:),E(:,:),EReal(:,:),EKspace(:,:)
+    type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml, X_bml, Y_bml
+    character(20)                  :: bml_type
+
+    bml_type = bml_get_type(HO_bml)
+    N = bml_get_N(HO_bml)
+    MN = bml_get_M(HO_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,X_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,Y_bml)
+    call bml_transpose(Z_bml,ZT_bml)
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+    allocate(EReal(Nr_atoms,Nr_atoms)); allocate(EKspace(Nr_atoms,Nr_atoms))
+    allocate(E(Nr_atoms,Nr_atoms))
+    Coulomb_Pot_dq_v = ZERO
+    Coulomb_Pot_k = ZERO
+    dq_v = ZERO
+    JJ = ZERO
+    KK = ZERO
+
+    call Ewald_Real_Space_Matrix_latte(EReal,RXYZ,Box,Hubbard_U,Element_Pointer, & 
+            Nr_atoms,Coulomb_acc,Nebcoul,Totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
+    call Ewald_k_Space_Matrix_latte(EKspace,RXYZ,Box,Nr_atoms,Coulomb_acc,Max_Nr_Neigh,Nebcoul,Totnebcoul)
+    E = EReal + EKspace
+    write(*,*) 'Beginning response calculations'
+    ewaldracc = 0.0; ewaldkacc = 0.0; respacc = 0.0;
+    do J = 1,Nr_atoms
+      write(*,*) 'J =',J
+      call cpu_time(start)
+      dq_v(J) = ONE
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+      do I = 1,Nr_atoms
+        !call Ewald_Real_Space_Single_latte(Coulomb_Pot_Real_I,I,RXYZ,Box,Nr_elem, &
+        !     dq_v,J,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,HDIM,Max_Nr_Neigh)
+        !Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
+      call cpu_time(ewaldr)
+      ewaldracc = ewaldracc + ewaldr-start
+      !call Ewald_k_Space_latte_single(Coulomb_Pot_k,J,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc)
+      !Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+      call cpu_time(ewaldk)
+      ewaldkacc = ewaldkacc + ewaldk-ewaldr
+      Coulomb_Pot_dq_v = matmul(E,dq_v)
+
+      call bml_deallocate(H1_bml)
+      call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+      do I = 1,Nr_atoms-1
+        do K = Hinxlist(I)+1,Hinxlist(I+1)
+          row1(K) = Hubbard_U(Element_Pointer(I))*dq_v(I) + Coulomb_Pot_dq_v(I)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+      enddo
+
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(X_bml,H1_bml)
+
+      call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,Nocc,threshold)
+      call cpu_time(response)
+      respacc = respacc + response-ewaldk
+
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      call bml_multiply(D1_bml,S_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_get_diagonal(X_bml,row1)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+      do It = 1, Nr_atoms-1
+        dq_dv(It) = 0
+        do K = Hinxlist(It)+1,Hinxlist(It+1)
+          dq_dv(It) = dq_dv(It) + row1(K)
+        enddo
+        JJ(It,J) = dq_dv(It)
+      enddo
+      !$OMP END PARALLEL DO
+      dq_dv(Nr_atoms) = 0
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K)
+      enddo
+      JJ(Nr_atoms,J) = dq_dv(Nr_atoms)
+      dq_v = ZERO
+    enddo
+
+    do I = 1,Nr_atoms
+      JJ(I,I) = JJ(I,I) - ONE
+    enddo
+
+    write(*,*) 'ewaldracc =', ewaldracc
+    write(*,*) 'ewaldkacc =', ewaldkacc
+    write(*,*) 'implcit response time =', respacc
+    call Invert(JJ,KK,Nr_atoms)
+
+    deallocate(row1); deallocate(row2); deallocate(JJ)
+    deallocate(E); deallocate(EReal); deallocate(EKspace)
+    call bml_deallocate(ZT_bml)
+    call bml_deallocate(X_bml)
+    call bml_deallocate(Y_bml)
+
+  end subroutine prg_full_kernel_latte
+
+  !> Compute full inverse Jacobian of q[n]-n (for development code)
+  !! \param KK The inverse Jacobian.
+  !! \param DO_bml Orthogonalized density matrix.
+  !! \param mu0 The chemical potiential.
+  !! \param RX,RY,RZ Nuclear coordinates.
+  !! \param Lbox Box dimensions.
+  !! \param Hubbard_U Hubbard U list.
+  !! \param Element_Type List to keep track of elements.
+  !! \param Nr_atoms The number of atoms.
+  !! \param HDIM Hamiltonian matrix dimension.
+  !! \param Max_Nr_Neigh Max neighbours for Ewald.
+  !! \param Coulomb_acc Coulomb accuracy
+  !! \param TIMERATIO Parameter for Ewald
+  !! \param nnRx,nnRy,nnRz Neighbour lists.
+  !! \param nrnnlist Number of neighbours list.
+  !! \param nnType Refers to original order of atoms.
+  !! \param H_INDEX_START, H_INDEX_END Lists to keep track of atomic positions in the Hamiltonian.
+  !! \param S_bml The S matrix.
+  !! \param Z_bml, The Z matrix.
+  !! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+  !! \param HO_bml, Orthogonalized Hamiltonian matrix.
+  !! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+  !! \param Nocc Occupation.
+  !! \param Znuc List of nuclear charges.
+  !! \param m_rec Number of recursion steps.
+  !! \param threshold Threshold value for matrix truncation.
+  !! \param beta Scaled inverse temperature.
+  !! \param diagonal Auxillary vector.
+
+  subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+       Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz, &
+       nrnnlist,nnType,H_INDEX_START,H_INDEX_END,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+       Nocc,Znuc,m_rec,threshold,beta,diagonal)
+
+    use bml
+
+    implicit none
+    integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+    integer,    intent(in)         :: Nr_atoms, HDIM, Nocc, Max_Nr_Neigh,m_rec
+    real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+    real(PREC), intent(in)         :: Coulomb_acc, TIMERATIO,threshold,beta
+    real(PREC)                     :: v(Nr_atoms)
+    real(PREC), intent(in)         :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    integer,    intent(in)         :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+    real(PREC), intent(in)         :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+    type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+    type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml
+    real(PREC), intent(inout)      :: mu0, diagonal(HDIM)
+    character(10), intent(in)      :: Element_Type(Nr_atoms)
+    integer,    intent(in)         :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)         :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)         :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(PREC)                     :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+    real(PREC)                     :: dq_v(Nr_atoms)
+    real(PREC)                     :: dq_dv(Nr_atoms)
+    real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+    integer                        :: I,J,K, ITER, mm,It
+    real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)
+
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+    Coulomb_Pot_dq_v = ZERO
+    Coulomb_Pot_k = ZERO
+    dq_v = ZERO
+    JJ = ZERO
+    KK = ZERO
+
+    do J = 1,Nr_atoms
+      dq_v(J) = ONE
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+      do I = 1,Nr_atoms
+        call Ewald_Real_Space_Single(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,I,RX,RY,RZ,LBox, &
+             dq_v,J,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
+      !  call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
+      !       TIMERATIO,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+      diagonal = 0.0_dp
+      do I = 1,HDIM
+        call bml_set_row(H1_bml,I,diagonal,threshold)
+      enddo
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+      do I = 1,Nr_atoms
+        do K = H_INDEX_START(I),H_INDEX_END(I)
+          diagonal(K) = Hubbard_U(I)*dq_v(I) + Coulomb_Pot_dq_v(I)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      call bml_set_diagonal(H1_bml,diagonal,threshold)
+      call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(D1_bml,H1_bml)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,real(nocc,PREC),threshold)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      do It = 1, Nr_atoms
+        dq_dv(It) = 0
+        do K = H_INDEX_START(It), H_INDEX_END(It)
+          call bml_get_row(S_bml,K,row1)
+          call bml_get_row(D1_bml,K,row2)
+          dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+        enddo
+        JJ(It,J) = dq_dv(It)
+      enddo
+      dq_v = ZERO
+    enddo
+
+    do I = 1,Nr_atoms
+      JJ(I,I) = JJ(I,I) - ONE
+    enddo
+    call Invert(JJ,KK,Nr_atoms)
+    deallocate(row1); deallocate(row2); deallocate(JJ)
+
+  end subroutine prg_full_kernel
+
+
+  ! Compute the low-rank kernel matrix. (For development code)
+  subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+       Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+       S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+    !! Res = q[n] - n
+    !! KK0 is preconditioner
+    !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+    implicit none
+    integer,    intent(in)      :: Nr_atoms, HDIM, Max_Nr_Neigh,m_rec
+    real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu,Nocc
+    real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    real(dp), intent(in)      :: Res(Nr_atoms)
+    integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+    real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+    type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec)
+    real(dp)                  :: K0Res(Nr_atoms)
+    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    type(bml_matrix_t),intent(inout) :: KK0_bml
+    real(dp), intent(in)      :: threshold
+    integer,    intent(in)      :: LMAX
+    character(10), intent(in)   :: Element_Type(Nr_atoms)
+    integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+    real(dp)                  :: dq_v(Nr_atoms)
+    real(dp), intent(out)     :: KRes(Nr_atoms)
+    integer                     :: I,J,K,It,col,row
+    integer, intent(out)        :: L
+    real(dp)                  :: Fel, proj_tmp,elem
+    real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+    real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+    real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:)
+
+    allocate(row1(HDIM));allocate(row2(HDIM));
+
+    dr = Res
+    I = 0
+    Fel = 1.D0
+    do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+      I = I + 1
+      vi(:,I) = dr/norm2(dr)
+      do J = 1,I-1
+        vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+      enddo
+      vi(:,I) = vi(:,I)/norm2(vi(:,I))
+      v(:) = vi(:,I)  ! v_i
+
+      dq_v = v
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+      do J = 1,Nr_atoms
+        call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+             dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+      ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+      row1 = 0.0_dp
+      do J = 1,HDIM
+        call bml_set_row(H1_bml,J,row1,threshold)
+      enddo
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+      do J = 1,Nr_atoms
+        do K = H_INDEX_START(J),H_INDEX_END(J)
+          row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(D1_bml,H1_bml)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec, mu, beta, Nocc, threshold)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+      do It = 1, Nr_atoms
+        dq_dv(It) = 0
+        do K = H_INDEX_START(It), H_INDEX_END(It)
+          call bml_get_row(S_bml,K,row1)
+          call bml_get_row(D1_bml,K,row2)
+          dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      dr = dq_dv - v
+      fi(:,I) = dr
+
+      L = I
+      allocate(O(L,L), M(L,L))
+      do K = 1,L
+        do J = 1,L
+          O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+        enddo
+      enddo
+      call Invert(O,M,L)                        ! M = O^(-1)
+      IdentRes = 0.D0*Res
+      KRes = 0.D0
+      do K = 1,L
+        do J = 1,L
+          proj_tmp = M(K,J)*dot_product(fi(:,J),Res)
+          IdentRes = IdentRes + proj_tmp*fi(:,K)
+          KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (J)^(-1)*(q[n]-n)
+        enddo
+      enddo
+      Fel = norm2(IdentRes-Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+      write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+
+      ! Does not work, need to normalize matrix
+      if ((Fel > FelTol).AND.(I < (LMAX))) then
+        deallocate(O, M)
+      else
+        do row = 1,Nr_atoms
+          do col = 1,Nr_atoms
+            elem = 0.0
+            do K = 1,L
+              do J = 1,L
+                elem = elem + M(J,K)*vi(row,J)*fi(col,K)
+              enddo
+            enddo
+            if (abs(elem) > threshold) then
+              call bml_set_element(KK0_bml,row,col,elem)
+            endif
+          enddo
+        enddo
+        deallocate(O,M)
+      endif
+    enddo
+    deallocate(row1); deallocate(row2);
+
+  end subroutine prg_kernel_matrix_multirank
+
+end module prg_xlbokernel_mod
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index cbc484e7..f2d8d5df 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -56,6 +56,7 @@ progress_test(prg_density_T)
 progress_test(prg_density_T_fermi)
 progress_test(prg_sp2_basic)
 progress_test(prg_implicit_fermi)
+progress_test(prg_implicit_fermi_save_inverse)
 progress_test(prg_sp2_alg1_dense)
 progress_test(prg_sp2_alg2_dense)
 progress_test(prg_sp2_alg1_ellpack)
diff --git a/tests/src/main.F90 b/tests/src/main.F90
index cd2fe3d2..9e437799 100644
--- a/tests/src/main.F90
+++ b/tests/src/main.F90
@@ -31,7 +31,8 @@ program main
 
   implicit none
 
-  integer :: norb, mdim, verbose, i
+  integer :: norb, mdim, verbose
+  type(bml_matrix_t) :: inv_bml(10)
   type(bml_matrix_t) :: ham_bml
   type(bml_matrix_t) :: rho_bml, rho1_bml
   type(bml_matrix_t) :: rho_ortho_bml
@@ -59,7 +60,7 @@ program main
   real(dp) :: mineval, maxeval, occerrlimit
   real(dp), allocatable :: gbnd(:)
   integer :: minsp2iter, icount, nodesPerPart, occsteps
-  integer :: norecs
+  integer :: norecs,occiter,i
   integer :: maxsp2iter, npts, sp2all_timer, sp2all_timer_init
   integer, allocatable :: pp(:), signlist(:)
   real(dp), allocatable :: vv(:)
@@ -189,6 +190,8 @@ program main
 
     mu = 0.2_dp
     beta = 4.0_dp !nocc,osteps,occerrlimit
+    norecs = 10
+    bml_type = 'ellpack'
 
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
@@ -196,7 +199,7 @@ program main
     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
-    call prg_implicit_fermi(ham_bml, rho1_bml, 10, 2, 10.0_dp, mu, beta, 0, 1, 1.0_dp, threshold, 10e-8_dp)
+    call prg_implicit_fermi(ham_bml, rho1_bml, norecs, 2, 10.0_dp, mu, beta, 0, 1, 1.0_dp, threshold, 1e-6_dp)
 
     mu = 0.2_dp
 
@@ -205,11 +208,44 @@ program main
     call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)
 
     error_calc = bml_fnorm(rho1_bml)
+    write(*,*) error_calc
+
     if(error_calc.gt.0.1_dp)then
       write(*,*) "Error in Implicit Fermi expansion ","Error = ",error_calc
       error stop
     endif
 
+  case("prg_implicit_fermi_save_inverse")
+
+    mu = 0.2_dp
+    beta = 4.0_dp !nocc,osteps,occerrlimit
+    norecs = 10
+    nocc = 10.0_dp
+
+    do i = 1,norecs
+      call bml_identity_matrix(bml_type,bml_element_real,dp,norb,norb,inv_bml(i))
+    enddo
+
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
+
+    mu = 0.2_dp
+    call prg_implicit_fermi_save_inverse(inv_bml,ham_bml,rho_bml,norecs,nocc,mu,beta,1e-4_dp, threshold, 1e-5_dp, 1,occiter)
+    call prg_test_density_matrix(ham_bml,rho1_bml,beta,mu,nocc,1,1e-4_dp,threshold)
+    write(*,*) mu
+    call bml_scale(0.5_dp,rho_bml)
+    call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)
+
+    error_calc = bml_fnorm(rho1_bml)
+    if(error_calc.gt.0.1_dp)then
+      write(*,*) "Error in Implicit Fermi expansion save inverse","Error = ",error_calc
+      error stop
+    endif
+
   case("prg_sp2_basic") !Sp2 original version
 
     call prg_timer_start(loop_timer)
@@ -979,11 +1015,13 @@ program main
       error stop
     endif
 
+
     call prg_timer_stop(loop_timer)
 
   case("prg_buildzsparse")  ! Building inverse overlap factor matrix (Lowdin method)
 
     write(*,*) "Testing buildzsparse from prg_genz_mod"
+
     error_tol = 1.0d-2
     bml_type = "ellpack"
 
@@ -1005,7 +1043,6 @@ program main
     call prg_buildZsparse(over_bml,aux_bml,1,mdim,bml_type,zk1_bml,zk2_bml,zk3_bml&
          &,zk4_bml,zk5_bml,zk6_bml,4,4,3,threshold,threshold,.true.,1)
 
-
     call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,zmat_bml,0.0_dp)
 
     error_calc = bml_fnorm(aux_bml)
@@ -1095,6 +1132,8 @@ program main
     call bml_add_deprecated(-1.0_dp,rho_bml,1.0_dp,rho1_bml,0.0_dp)
     error_calc = bml_fnorm(rho_bml)
     write(*,*)error_calc
+
+
     if(error_calc.gt.error_tol)then
       write(*,*) "Error is too high", error_calc
       error stop
@@ -1136,7 +1175,6 @@ program main
     endif
 
 
-
   case default
 
     write(*,*)"ERROR: unknown test ",test