Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions configs/spock_intel_RTX5090.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# NTU-spock
CUDA_PATH /work1/koarakawaii/NVIDIA_HPC_SDK/nvidia_hpc_sdk/Linux_x86_64/25.9/compilers
FFTW2_PATH /software/fftw/2.1.5-intel-2023.1.0-openmpi-4.1.5-ucx_mt
FFTW3_PATH /software/fftw/3.3.10-intel-2023.1.0-openmpi-4.1.5-ucx_mt
MPI_PATH /software/openmpi/4.1.5-ucx_mt-intel-2023.1.0
HDF5_PATH /software/hdf5/1.10.6-intel-2023.1.0
GRACKLE_PATH
GSL_PATH /software/gsl/2.6-intel-2023.1.0
LIBYT_PATH
CUFFTDX_PATH /software/cuFFTDx/22.11

# compilers
CXX icpc
CXX_MPI mpicxx

# flags
CXXFLAG -g
CXXFLAG -O2
CXXFLAG -fp-model precise
#CXXFLAG -std=c++11
#CXXFLAG -gxx-name=YOUR_G++
CXXFLAG -w1
CXXFLAG -Wno-unknown-pragmas -diag-disable 3180 -diag-disable 10441

OPENMPFLAG -fopenmp

LIBFLAG -limf -L/work1/koarakawaii/NVIDIA_HPC_SDK/nvidia_hpc_sdk/Linux_x86_64/25.9/cuda/13.0/targets/x86_64-linux/lib

NVCCFLAG_COM -O3
#NVCCFLAG_COM -use_fast_math
NVCCFLAG_FLU -Xptxas -dlcm=ca -prec-div=false -ftz=true
NVCCFLAG_POT -Xptxas -dlcm=ca

# for debugging
#CXXFLAG -fstack-protector-all
#CXXFLAG -fstack-protector-strong # somehow it can capture issues not detected by -fstack-protector-all
#LIBFLAG -lssp

# gpu
GPU_COMPUTE_CAPABILITY 1200 # RTX 5090 (compute capability 12.0)
24 changes: 22 additions & 2 deletions include/CUFLU.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,12 @@
# else
# define FLU_BLOCK_SIZE_X 512 // not optimized yet
# endif
# elif ( GPU_ARCH == BLACKWELL )
# ifdef FLOAT8
# define FLU_BLOCK_SIZE_X 256
# else
# define FLU_BLOCK_SIZE_X 512 // not optimized yet
# endif
# else
# define FLU_BLOCK_SIZE_X NULL_INT
# ifdef GPU
Expand Down Expand Up @@ -420,6 +426,12 @@
# else
# define FLU_BLOCK_SIZE_X 512 // not optimized yet
# endif
# elif ( GPU_ARCH == BLACKWELL )
# ifdef FLOAT8
# define FLU_BLOCK_SIZE_X 256
# else
# define FLU_BLOCK_SIZE_X 512 // not optimized yet
# endif
# else
# define FLU_BLOCK_SIZE_X NULL_INT
# ifdef GPU
Expand Down Expand Up @@ -502,6 +514,13 @@
# define FLU_BLOCK_SIZE_Y 32 // not optimized yet
# endif

# elif ( GPU_ARCH == BLACKWELL )
# ifdef FLOAT8
# define FLU_BLOCK_SIZE_Y 16 // not optimized yet
# else
# define FLU_BLOCK_SIZE_Y 32 // not optimized yet
# endif

# else
# define FLU_BLOCK_SIZE_Y NULL_INT
# ifdef GPU
Expand Down Expand Up @@ -577,7 +596,8 @@ using complex_type = typename FFT::value_type;

// use shuffle reduction on KEPLER and later GPUs
# if ( GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER )
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || \
GPU_ARCH == BLACKWELL )
# define DT_FLU_USE_SHUFFLE
# endif

Expand All @@ -593,7 +613,7 @@ using complex_type = typename FFT::value_type;
// for information on warp size
#ifdef __CUDACC__
#if ( GPU_ARCH == FERMI || GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER )
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || GPU_ARCH == BLACKWELL )
// CUPOT.h will define WARP_SIZE as well
# ifndef WARP_SIZE
# define WARP_SIZE 32
Expand Down
14 changes: 11 additions & 3 deletions include/CUPOT.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@
# else
# define POT_BLOCK_SIZE_Z 4 // not optimized yet
# endif
# elif ( GPU_ARCH == BLACKWELL )
# ifdef FLOAT8
# define POT_BLOCK_SIZE_Z 2 // not optimized yet
# else
# define POT_BLOCK_SIZE_Z 4 // not optimized yet
# endif
# else
# define POT_BLOCK_SIZE_Z NULL_INT
# ifdef GPU
Expand All @@ -128,7 +134,8 @@
// --> strictly speaking, the shuffle functions do NOT work for double precision; however, experiments
// show that residual_sum += (float)residual, where residual_sum is double, gives acceptable accuracy
# if ( GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER )
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || \
GPU_ARCH == BLACKWELL )
# define SOR_USE_SHUFFLE
# endif

Expand Down Expand Up @@ -185,7 +192,8 @@

// use shuffle reduction on KEPLER and later GPUs
#if ( GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER )
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || \
GPU_ARCH == BLACKWELL )
# define DT_GRA_USE_SHUFFLE
#endif

Expand All @@ -195,7 +203,7 @@
// for information on warp size
#ifdef __CUDACC__
#if ( GPU_ARCH == FERMI || GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER )
GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || GPU_ARCH == BLACKWELL )
// CUFLU.h will define WARP_SIZE as well
# ifndef WARP_SIZE
# define WARP_SIZE 32
Expand Down
21 changes: 12 additions & 9 deletions include/Macro.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,29 @@
#define AMPERE 7
#define ADA_LOVELACE 8
#define HOPPER 9
#define BLACKWELL 10

#ifdef GPU
#if ( GPU_COMPUTE_CAPABILITY >= 200 && GPU_COMPUTE_CAPABILITY < 300 )
#if ( GPU_COMPUTE_CAPABILITY >= 200 && GPU_COMPUTE_CAPABILITY < 300 )
# define GPU_ARCH FERMI
#elif ( GPU_COMPUTE_CAPABILITY >= 300 && GPU_COMPUTE_CAPABILITY < 500 )
#elif ( GPU_COMPUTE_CAPABILITY >= 300 && GPU_COMPUTE_CAPABILITY < 500 )
# define GPU_ARCH KEPLER
#elif ( GPU_COMPUTE_CAPABILITY >= 500 && GPU_COMPUTE_CAPABILITY < 600 )
#elif ( GPU_COMPUTE_CAPABILITY >= 500 && GPU_COMPUTE_CAPABILITY < 600 )
# define GPU_ARCH MAXWELL
#elif ( GPU_COMPUTE_CAPABILITY >= 600 && GPU_COMPUTE_CAPABILITY < 700 )
#elif ( GPU_COMPUTE_CAPABILITY >= 600 && GPU_COMPUTE_CAPABILITY < 700 )
# define GPU_ARCH PASCAL
#elif ( GPU_COMPUTE_CAPABILITY >= 700 && GPU_COMPUTE_CAPABILITY < 750 )
#elif ( GPU_COMPUTE_CAPABILITY >= 700 && GPU_COMPUTE_CAPABILITY < 750 )
# define GPU_ARCH VOLTA
#elif ( GPU_COMPUTE_CAPABILITY >= 750 && GPU_COMPUTE_CAPABILITY < 800 )
#elif ( GPU_COMPUTE_CAPABILITY >= 750 && GPU_COMPUTE_CAPABILITY < 800 )
# define GPU_ARCH TURING
#elif ( GPU_COMPUTE_CAPABILITY >= 800 && GPU_COMPUTE_CAPABILITY < 890 )
#elif ( GPU_COMPUTE_CAPABILITY >= 800 && GPU_COMPUTE_CAPABILITY < 890 )
# define GPU_ARCH AMPERE
#elif ( GPU_COMPUTE_CAPABILITY >= 890 && GPU_COMPUTE_CAPABILITY < 900 )
#elif ( GPU_COMPUTE_CAPABILITY >= 890 && GPU_COMPUTE_CAPABILITY < 900 )
# define GPU_ARCH ADA_LOVELACE
#elif ( GPU_COMPUTE_CAPABILITY >= 900 && GPU_COMPUTE_CAPABILITY < 1000 )
#elif ( GPU_COMPUTE_CAPABILITY >= 900 && GPU_COMPUTE_CAPABILITY < 1000 )
# define GPU_ARCH HOPPER
#elif ( GPU_COMPUTE_CAPABILITY >= 1000 && GPU_COMPUTE_CAPABILITY < 1210 )
# define GPU_ARCH BLACKWELL
#else
# error : ERROR : Unknown GPU_COMPUTE_CAPABILITY !!
#endif // GPU_COMPUTE_CAPABILITY
Expand Down
2 changes: 2 additions & 0 deletions src/Auxiliary/Aux_TakeNote.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,8 @@ void Aux_TakeNote()
fprintf( Note, "GPU_ARCH ADA_LOVELACE\n" );
# elif ( GPU_ARCH == HOPPER )
fprintf( Note, "GPU_ARCH HOPPER\n" );
# elif ( GPU_ARCH == BLACKWELL )
fprintf( Note, "GPU_ARCH BLACKWELL\n" );
# else
fprintf( Note, "GPU_ARCH UNKNOWN\n" );
# endif
Expand Down
6 changes: 5 additions & 1 deletion src/GPU_API/CUAPI_DiagnoseDevice.cu
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ void CUAPI_DiagnoseDevice()
else if ( DeviceProp.major == 8 && DeviceProp.minor == 6 ) NCorePerMP = 128;
else if ( DeviceProp.major == 8 && DeviceProp.minor == 9 ) NCorePerMP = 128;
else if ( DeviceProp.major == 9 ) NCorePerMP = 128;
else if ( DeviceProp.major == 12 && DeviceProp.minor == 0 ) NCorePerMP = 128;
else
fprintf( stderr, "WARNING : unable to determine the number of cores per multiprocessor for version %d.%d ...\n",
DeviceProp.major, DeviceProp.minor );
Expand Down Expand Up @@ -86,6 +87,9 @@ void CUAPI_DiagnoseDevice()

Aux_GetCPUInfo( FileName );

int clockRate; // in unit of kHz
CUDA_CHECK_ERROR( cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, GetDeviceID) );

fprintf( Note, "\n" );
fprintf( Note, "GPU Info :\n" );
fprintf( Note, "Number of GPUs : %d\n" , DeviceCount );
Expand All @@ -95,7 +99,7 @@ void CUAPI_DiagnoseDevice()
fprintf( Note, "CUDA Runtime Version : %d.%d\n" , RuntimeVersion/1000, RuntimeVersion%100 );
fprintf( Note, "CUDA Major Revision Number : %d\n" , DeviceProp.major );
fprintf( Note, "CUDA Minor Revision Number : %d\n" , DeviceProp.minor );
fprintf( Note, "Clock Rate : %f GHz\n" , DeviceProp.clockRate/1.0e6 );
fprintf( Note, "Clock Rate : %f GHz\n" , clockRate/1.0e6 );
fprintf( Note, "Global Memory Size : %ld MB\n" , (long)DeviceProp.totalGlobalMem/1024/1024 );
fprintf( Note, "Constant Memory Size : %ld KB\n" , (long)DeviceProp.totalConstMem/1024 );
fprintf( Note, "Shared Memory Size per Block : %ld KB\n" , (long)DeviceProp.sharedMemPerBlock/1024 );
Expand Down
5 changes: 3 additions & 2 deletions src/GPU_API/CUAPI_SetDevice.cu
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ void CUAPI_SetDevice( const int Mode )
// set the device ID
void **d_TempPtr = NULL;
int SetDeviceID, GetDeviceID = 999;
int computeMode;
cudaDeviceProp DeviceProp;

switch ( Mode )
Expand All @@ -77,9 +78,9 @@ void CUAPI_SetDevice( const int Mode )

// make sure that the "exclusive" compute mode is adopted
CUDA_CHECK_ERROR( cudaGetDevice( &GetDeviceID ) );
CUDA_CHECK_ERROR( cudaGetDeviceProperties( &DeviceProp, GetDeviceID ) );
CUDA_CHECK_ERROR( cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, GetDeviceID) );

if ( DeviceProp.computeMode != cudaComputeModeExclusive )
if ( computeMode != cudaComputeModeExclusive )
{
Aux_Message( stderr, "WARNING : \"exclusive\" compute mode is NOT enabled for \"%s\" at Rank %2d",
"OPT__GPUID_SELECT == -2", MPI_Rank );
Expand Down
19 changes: 18 additions & 1 deletion src/GPU_API/CUAPI_SetMemSize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr
// (1) GPU_NSTREAM
if ( GPU_NStream <= 0 )
{
if ( DeviceProp.deviceOverlap )
int gpuOverlap;
CUDA_CHECK_ERROR( cudaDeviceGetAttribute(&gpuOverlap, cudaDevAttrGpuOverlap, GetDeviceID) );

if ( gpuOverlap )
{
# if ( MODEL == HYDRO )
# if ( GPU_ARCH == FERMI )
Expand All @@ -57,6 +60,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr
GPU_NStream = 4;
# elif ( GPU_ARCH == HOPPER )
GPU_NStream = 4;
# elif ( GPU_ARCH == BLACKWELL )
GPU_NStream = 4;
# else
# error : UNKNOWN GPU_ARCH !!
# endif
Expand All @@ -80,6 +85,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr
GPU_NStream = 4;
# elif ( GPU_ARCH == HOPPER )
GPU_NStream = 4;
# elif ( GPU_ARCH == BLACKWELL )
GPU_NStream = 4;
# else
# error : ERROR : UNKNOWN GPU_ARCH !!
# endif
Expand Down Expand Up @@ -118,6 +125,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr
Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == HOPPER )
Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == BLACKWELL )
Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# else
# error : UNKNOWN GPU_ARCH !!
# endif
Expand All @@ -141,6 +150,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr
Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == HOPPER )
Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == BLACKWELL )
Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# else
# error : UNKNOWN GPU_ARCH !!
# endif
Expand Down Expand Up @@ -173,6 +184,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr
Pot_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == HOPPER )
Pot_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == BLACKWELL )
Pot_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# else
# error : UNKNOWN GPU_ARCH !!
# endif
Expand Down Expand Up @@ -203,6 +216,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr
Che_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == HOPPER )
Che_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == BLACKWELL )
Che_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# else
# error : UNKNOWN GPU_ARCH !!
# endif
Expand Down Expand Up @@ -232,6 +247,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr
Src_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == HOPPER )
Src_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# elif ( GPU_ARCH == BLACKWELL )
Src_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount;
# else
# error : UNKNOWN GPU_ARCH !!
# endif
Expand Down
2 changes: 1 addition & 1 deletion src/configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,7 @@ def set_gpu( gpus, flags, args ):
gpu_opts["MAXRREGCOUNT_FLU"] = "--maxrregcount=128"
else:
gpu_opts["MAXRREGCOUNT_FLU"] = "--maxrregcount=70"
elif 500 <= compute_capability and compute_capability <= 900:
elif 500 <= compute_capability and compute_capability <= 1210:
if args["double"]:
gpu_opts["MAXRREGCOUNT_FLU"] = "--maxrregcount=192"
else:
Expand Down