Skip to content

Commit

Permalink
use stack memory
Browse files Browse the repository at this point in the history
  • Loading branch information
WeiqunZhang committed Sep 12, 2014
1 parent 4f84407 commit ea878b0
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 1 deletion.
10 changes: 9 additions & 1 deletion GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ NDEBUG := t
COMP := GNU
# COMP := Intel

K_USE_AUTOMATIC := t

###
ifdef MPI
# Set USE_MPI_WRAPPERS := t to use mpif90 or mpiifot,
# Set USE_MPI_WRAPPERS := t to use mpif90 or mpiifort,
# otherwise you need to specify mpi_include_dir, mpi_lib_dir, and mpi_libraries.
USE_MPI_WRAPPERS := t
ifndef USE_MPI_WRAPPERS
Expand Down Expand Up @@ -76,6 +78,12 @@ libraries =
fpp_flags =
fld_flags =

ifdef K_USE_AUTOMATIC
F90PPFLAGS += -DK_USE_AUTOMATIC
else
F90PPFLAGS =
endif

ifeq ($(wildcard comps/$(COMP).mak),)
$(error "comps/$(COMP).mak does not exist")
else
Expand Down
10 changes: 10 additions & 0 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ specified in the GNUmakefile:
* MIC :=
Set this if compiling for Intel Xeon Phi.

* K_USE_AUTOMATIC := t
This determines whether some arrays in kernels.F90 will be automatic
or allocatable. Automatic arrays are usually on the stack.
When OpenMP is used, allocating memory on the stack is usually
faster than on the heap. However, one must make sure there is
adequate space on the stack; otherwise a segmentation fault might
occur. Note that the size of the stack for threads can be adjusted
by the OMP_STACKSIZE environment variable and the shell might also
impose a limit on stack size.

* MKVERBOSE := t
This determines the verbosity of the building process.

Expand Down
51 changes: 51 additions & 0 deletions src/kernels.f90 → src/kernels.F90
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,18 @@ subroutine hypterm_3d (lo,hi,dx,cons,clo,chi,q,qlo,qhi,rhs,rlo,rhi)

integer :: i,j,k,n
double precision :: dxinv(3)

#ifdef K_USE_AUTOMATIC
double precision :: tmpx(lo(1)-4:hi(1)+4)
double precision :: tmpy(lo(1) :hi(1) ,lo(2)-4:hi(2)+4)
double precision :: tmpz(lo(1) :hi(1) ,lo(2) :hi(2) ,lo(3)-4:hi(3)+4)
#else
double precision, allocatable :: tmpx(:), tmpy(:,:),tmpz(:,:,:)

allocate(tmpx(lo(1)-4:hi(1)+4))
allocate(tmpy(lo(1) :hi(1) ,lo(2)-4:hi(2)+4))
allocate(tmpz(lo(1) :hi(1) ,lo(2) :hi(2) ,lo(3)-4:hi(3)+4))
#endif

do i=1,3
dxinv(i) = 1.0d0 / dx(i)
Expand Down Expand Up @@ -317,7 +324,9 @@ subroutine hypterm_3d (lo,hi,dx,cons,clo,chi,q,qlo,qhi,rhs,rlo,rhi)
enddo
enddo

#ifndef K_USE_AUTOMATIC
deallocate(tmpx,tmpy,tmpz)
#endif

end subroutine hypterm_3d

Expand All @@ -335,9 +344,14 @@ subroutine narrow_diffterm_3d (lo,hi,dx,q,qlo,qhi,rhs_g,glo,ghi,mu,xi,lam,dxy)

integer :: i, dlo(3), dhi(3)
double precision :: dxinv(3), dx2inv(3)

#ifdef K_USE_AUTOMATIC
double precision :: rhs(lo(1):hi(1),lo(2):hi(2),lo(3):hi(3),ncons)
#else
double precision, allocatable :: rhs(:,:,:,:)

allocate(rhs(lo(1):hi(1),lo(2):hi(2),lo(3):hi(3),ncons))
#endif

do i = 1,3
dxinv(i) = 1.0d0 / dx(i)
Expand All @@ -357,7 +371,9 @@ subroutine narrow_diffterm_3d (lo,hi,dx,q,qlo,qhi,rhs_g,glo,ghi,mu,xi,lam,dxy)
rhs_g(lo(1):hi(1),lo(2):hi(2),lo(3):hi(3),:) &
+ rhs(lo(1):hi(1),lo(2):hi(2),lo(3):hi(3),:)

#ifndef K_USE_AUTOMATIC
deallocate(rhs)
#endif

end subroutine narrow_diffterm_3d

Expand All @@ -375,6 +391,21 @@ subroutine diffterm_1(q,qlo,qhi,rhs,rlo,rhi,mu,xi, lo,hi,dlo,dhi,dxinv)
!
integer :: i,j,k
double precision, dimension(lo(1):hi(1)) :: tauxx,tauyy,tauzz,divu
#ifdef K_USE_AUTOMATIC
double precision :: ux ( lo(1): hi(1),dlo(2):dhi(2),dlo(3):dhi(3))
double precision :: vx ( lo(1): hi(1),dlo(2):dhi(2),dlo(3):dhi(3))
double precision :: wx ( lo(1): hi(1),dlo(2):dhi(2),dlo(3):dhi(3))
double precision :: uy (dlo(1):dhi(1), lo(2): hi(2),dlo(3):dhi(3))
double precision :: vy (dlo(1):dhi(1), lo(2): hi(2),dlo(3):dhi(3))
double precision :: wy (dlo(1):dhi(1), lo(2): hi(2),dlo(3):dhi(3))
double precision :: uz (dlo(1):dhi(1),dlo(2):dhi(2), lo(3): hi(3))
double precision :: vz (dlo(1):dhi(1),dlo(2):dhi(2), lo(3): hi(3))
double precision :: wz (dlo(1):dhi(1),dlo(2):dhi(2), lo(3): hi(3))
double precision :: vsm (dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3))
double precision :: tmpx(dlo(1):dhi(1))
double precision :: tmpy( lo(1): hi(1),dlo(2):dhi(2))
double precision :: tmpz( lo(1): hi(1), lo(2): hi(2),dlo(3):dhi(3))
#else
double precision, allocatable, dimension(:,:,:) :: ux,uy,uz,vx,vy,vz,wx,wy,wz
double precision, allocatable :: tmpx(:), tmpy(:,:),tmpz(:,:,:)
double precision, allocatable, dimension(:,:,:) :: vsm
Expand All @@ -396,6 +427,7 @@ subroutine diffterm_1(q,qlo,qhi,rhs,rlo,rhi,mu,xi, lo,hi,dlo,dhi,dxinv)
allocate(tmpx(dlo(1):dhi(1)))
allocate(tmpy( lo(1): hi(1),dlo(2):dhi(2)))
allocate(tmpz( lo(1): hi(1), lo(2): hi(2),dlo(3):dhi(3)))
#endif

do k=dlo(3),dhi(3)
do j=dlo(2),dhi(2)
Expand Down Expand Up @@ -675,9 +707,11 @@ subroutine diffterm_1(q,qlo,qhi,rhs,rlo,rhi,mu,xi, lo,hi,dlo,dhi,dxinv)
end do
end do

#ifndef K_USE_AUTOMATIC
deallocate(tmpx,tmpy,tmpz)
deallocate(vsm)
deallocate(ux,uy,uz,vx,vy,vz,wx,wy,wz)
#endif

end subroutine diffterm_1

Expand All @@ -699,6 +733,20 @@ subroutine diffterm_2(q,qlo,qhi,rhs,rlo,rhi,mu,xi,lam,dxy,lo,hi,dlo,dhi,dxinv,dx
double precision :: mmtmp(8,lo(1):hi(1)+1)
double precision :: ene_c

#ifdef K_USE_AUTOMATIC
double precision :: vsp(dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3))
double precision :: dpy(dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3),nspecies)
double precision :: dxe(dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3),nspecies)
double precision :: dpe(dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3))

double precision :: Hg(lo(1):hi(1)+1,lo(2):hi(2)+1,lo(3):hi(3)+1,2:ncons)

double precision :: M8p (8,lo(1):hi(1)+1,lo(2):hi(2)+1,lo(3):hi(3)+1)

double precision :: sumdrY(lo(1):hi(1),lo(2):hi(2),lo(3):hi(3))
double precision :: sumrYv(lo(1):hi(1),lo(2):hi(2),lo(3):hi(3))
double precision :: gradp (lo(1):hi(1),lo(2):hi(2),lo(3):hi(3))
#else
double precision, allocatable, dimension(:,:,:) :: vsp, dpe
double precision, allocatable, dimension(:,:,:,:) :: Hg, dpy, dxe
! dxy: diffusion coefficient of X in equation for Y
Expand All @@ -722,6 +770,7 @@ subroutine diffterm_2(q,qlo,qhi,rhs,rlo,rhi,mu,xi,lam,dxy,lo,hi,dlo,dhi,dxinv,dx
allocate(sumdrY(lo(1):hi(1),lo(2):hi(2),lo(3):hi(3)))
allocate(sumrYv(lo(1):hi(1),lo(2):hi(2),lo(3):hi(3)))
allocate(gradp (lo(1):hi(1),lo(2):hi(2),lo(3):hi(3)))
#endif

do k=dlo(3),dhi(3)
do j=dlo(2),dhi(2)
Expand Down Expand Up @@ -2115,8 +2164,10 @@ subroutine diffterm_2(q,qlo,qhi,rhs,rlo,rhi,mu,xi,lam,dxy,lo,hi,dlo,dhi,dxinv,dx
end do
end do

#ifndef K_USE_AUTOMATIC
deallocate(Hg,dpy,dxe,dpe,vsp,M8p)
deallocate(sumdrY,sumryv,gradp)
#endif

end subroutine diffterm_2

Expand Down

0 comments on commit ea878b0

Please sign in to comment.