diff --git a/configs/spock_intel_RTX5090.config b/configs/spock_intel_RTX5090.config new file mode 100644 index 000000000..964cbef46 --- /dev/null +++ b/configs/spock_intel_RTX5090.config @@ -0,0 +1,40 @@ +# NTU-spock +CUDA_PATH /work1/koarakawaii/NVIDIA_HPC_SDK/nvidia_hpc_sdk/Linux_x86_64/25.9/compilers +FFTW2_PATH /software/fftw/2.1.5-intel-2023.1.0-openmpi-4.1.5-ucx_mt +FFTW3_PATH /software/fftw/3.3.10-intel-2023.1.0-openmpi-4.1.5-ucx_mt +MPI_PATH /software/openmpi/4.1.5-ucx_mt-intel-2023.1.0 +HDF5_PATH /software/hdf5/1.10.6-intel-2023.1.0 +GRACKLE_PATH +GSL_PATH /software/gsl/2.6-intel-2023.1.0 +LIBYT_PATH +CUFFTDX_PATH /software/cuFFTDx/22.11 + +# compilers +CXX icpc +CXX_MPI mpicxx + +# flags +CXXFLAG -g +CXXFLAG -O2 +CXXFLAG -fp-model precise +#CXXFLAG -std=c++11 +#CXXFLAG -gxx-name=YOUR_G++ +CXXFLAG -w1 +CXXFLAG -Wno-unknown-pragmas -diag-disable 3180 -diag-disable 10441 + +OPENMPFLAG -fopenmp + +LIBFLAG -limf -L/work1/koarakawaii/NVIDIA_HPC_SDK/nvidia_hpc_sdk/Linux_x86_64/25.9/cuda/13.0/targets/x86_64-linux/lib + +NVCCFLAG_COM -O3 +#NVCCFLAG_COM -use_fast_math +NVCCFLAG_FLU -Xptxas -dlcm=ca -prec-div=false -ftz=true +NVCCFLAG_POT -Xptxas -dlcm=ca + +# for debugging +#CXXFLAG -fstack-protector-all +#CXXFLAG -fstack-protector-strong # somehow it can capture issues not detected by -fstack-protector-all +#LIBFLAG -lssp + +# gpu +GPU_COMPUTE_CAPABILITY 1200 # 5090 diff --git a/include/CUFLU.h b/include/CUFLU.h index 501af9b1f..70d445c8e 100644 --- a/include/CUFLU.h +++ b/include/CUFLU.h @@ -355,6 +355,12 @@ # else # define FLU_BLOCK_SIZE_X 512 // not optimized yet # endif +# elif ( GPU_ARCH == BLACKWELL ) +# ifdef FLOAT8 +# define FLU_BLOCK_SIZE_X 256 +# else +# define FLU_BLOCK_SIZE_X 512 // not optimized yet +# endif # else # define FLU_BLOCK_SIZE_X NULL_INT # ifdef GPU @@ -420,6 +426,12 @@ # else # define FLU_BLOCK_SIZE_X 512 // not optimized yet # endif +# elif ( GPU_ARCH == BLACKWELL ) +# ifdef FLOAT8 +# define FLU_BLOCK_SIZE_X 256 +# else +# define 
FLU_BLOCK_SIZE_X 512 // not optimized yet +# endif # else # define FLU_BLOCK_SIZE_X NULL_INT # ifdef GPU @@ -502,6 +514,13 @@ # define FLU_BLOCK_SIZE_Y 32 // not optimized yet # endif +# elif ( GPU_ARCH == BLACKWELL ) +# ifdef FLOAT8 +# define FLU_BLOCK_SIZE_Y 16 // not optimized yet +# else +# define FLU_BLOCK_SIZE_Y 32 // not optimized yet +# endif + # else # define FLU_BLOCK_SIZE_Y NULL_INT # ifdef GPU @@ -577,7 +596,8 @@ using complex_type = typename FFT::value_type; // use shuffle reduction in the KEPLER and later GPUs # if ( GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \ - GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER ) + GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || \ + GPU_ARCH == BLACKWELL ) # define DT_FLU_USE_SHUFFLE # endif @@ -593,7 +613,7 @@ using complex_type = typename FFT::value_type; // for information on warp size #ifdef __CUDACC__ #if ( GPU_ARCH == FERMI || GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \ - GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER ) + GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || GPU_ARCH == BLACKWELL ) // CUPOT.h will define WARP_SIZE as well # ifndef WARP_SIZE # define WARP_SIZE 32 diff --git a/include/CUPOT.h b/include/CUPOT.h index 8843f5a05..1fb5ee810 100644 --- a/include/CUPOT.h +++ b/include/CUPOT.h @@ -107,6 +107,12 @@ # else # define POT_BLOCK_SIZE_Z 4 // not optimized yet # endif +# elif ( GPU_ARCH == BLACKWELL ) +# ifdef FLOAT8 +# define POT_BLOCK_SIZE_Z 2 // not optimized yet +# else +# define POT_BLOCK_SIZE_Z 4 // not optimized yet +# endif # else # define POT_BLOCK_SIZE_Z NULL_INT # ifdef GPU @@ -128,7 +134,8 @@ // --> although strictly speaking the shuffle functions do NOT work for double precision, but experiments // show that residual_sum += 
(float)residual, where residual_sum is double, gives acceptable accuracy # if ( GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \ - GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER ) + GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || \ + GPU_ARCH == BLACKWELL ) # define SOR_USE_SHUFFLE # endif @@ -185,7 +192,8 @@ // use shuffle reduction in the KEPLER and later GPUs #if ( GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \ - GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER ) + GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || \ + GPU_ARCH == BLACKWELL ) # define DT_GRA_USE_SHUFFLE #endif @@ -195,7 +203,7 @@ // for information on warp size #ifdef __CUDACC__ #if ( GPU_ARCH == FERMI || GPU_ARCH == KEPLER || GPU_ARCH == MAXWELL || GPU_ARCH == PASCAL || GPU_ARCH == VOLTA || \ - GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER ) + GPU_ARCH == TURING || GPU_ARCH == AMPERE || GPU_ARCH == ADA_LOVELACE || GPU_ARCH == HOPPER || GPU_ARCH == BLACKWELL ) // CUFLU.h will define WARP_SIZE as well # ifndef WARP_SIZE # define WARP_SIZE 32 diff --git a/include/Macro.h b/include/Macro.h index 98e42ef8e..a38dc3547 100644 --- a/include/Macro.h +++ b/include/Macro.h @@ -32,26 +32,29 @@ #define AMPERE 7 #define ADA_LOVELACE 8 #define HOPPER 9 +#define BLACKWELL 10 #ifdef GPU -#if ( GPU_COMPUTE_CAPABILITY >= 200 && GPU_COMPUTE_CAPABILITY < 300 ) +#if ( GPU_COMPUTE_CAPABILITY >= 200 && GPU_COMPUTE_CAPABILITY < 300 ) # define GPU_ARCH FERMI -#elif ( GPU_COMPUTE_CAPABILITY >= 300 && GPU_COMPUTE_CAPABILITY < 500 ) +#elif ( GPU_COMPUTE_CAPABILITY >= 300 && GPU_COMPUTE_CAPABILITY < 500 ) # define GPU_ARCH KEPLER -#elif ( GPU_COMPUTE_CAPABILITY >= 500 && GPU_COMPUTE_CAPABILITY < 600 ) +#elif ( 
GPU_COMPUTE_CAPABILITY >= 500 && GPU_COMPUTE_CAPABILITY < 600 ) # define GPU_ARCH MAXWELL -#elif ( GPU_COMPUTE_CAPABILITY >= 600 && GPU_COMPUTE_CAPABILITY < 700 ) +#elif ( GPU_COMPUTE_CAPABILITY >= 600 && GPU_COMPUTE_CAPABILITY < 700 ) # define GPU_ARCH PASCAL -#elif ( GPU_COMPUTE_CAPABILITY >= 700 && GPU_COMPUTE_CAPABILITY < 750 ) +#elif ( GPU_COMPUTE_CAPABILITY >= 700 && GPU_COMPUTE_CAPABILITY < 750 ) # define GPU_ARCH VOLTA -#elif ( GPU_COMPUTE_CAPABILITY >= 750 && GPU_COMPUTE_CAPABILITY < 800 ) +#elif ( GPU_COMPUTE_CAPABILITY >= 750 && GPU_COMPUTE_CAPABILITY < 800 ) # define GPU_ARCH TURING -#elif ( GPU_COMPUTE_CAPABILITY >= 800 && GPU_COMPUTE_CAPABILITY < 890 ) +#elif ( GPU_COMPUTE_CAPABILITY >= 800 && GPU_COMPUTE_CAPABILITY < 890 ) # define GPU_ARCH AMPERE -#elif ( GPU_COMPUTE_CAPABILITY >= 890 && GPU_COMPUTE_CAPABILITY < 900 ) +#elif ( GPU_COMPUTE_CAPABILITY >= 890 && GPU_COMPUTE_CAPABILITY < 900 ) # define GPU_ARCH ADA_LOVELACE -#elif ( GPU_COMPUTE_CAPABILITY >= 900 && GPU_COMPUTE_CAPABILITY < 1000 ) +#elif ( GPU_COMPUTE_CAPABILITY >= 900 && GPU_COMPUTE_CAPABILITY < 1000 ) # define GPU_ARCH HOPPER +#elif ( GPU_COMPUTE_CAPABILITY >= 1000 && GPU_COMPUTE_CAPABILITY <= 1210 ) +# define GPU_ARCH BLACKWELL #else # error : ERROR : Unknown GPU_COMPUTE_CAPABILITY !! 
#endif // GPU_COMPUTE_CAPABILITY diff --git a/src/Auxiliary/Aux_TakeNote.cpp b/src/Auxiliary/Aux_TakeNote.cpp index ef833f01f..7d2132d7e 100644 --- a/src/Auxiliary/Aux_TakeNote.cpp +++ b/src/Auxiliary/Aux_TakeNote.cpp @@ -417,6 +417,8 @@ void Aux_TakeNote() fprintf( Note, "GPU_ARCH ADA_LOVELACE\n" ); # elif ( GPU_ARCH == HOPPER ) fprintf( Note, "GPU_ARCH HOPPER\n" ); +# elif ( GPU_ARCH == BLACKWELL ) + fprintf( Note, "GPU_ARCH BLACKWELL\n" ); # else fprintf( Note, "GPU_ARCH UNKNOWN\n" ); # endif diff --git a/src/GPU_API/CUAPI_DiagnoseDevice.cu b/src/GPU_API/CUAPI_DiagnoseDevice.cu index 424f23700..2d0c84b13 100644 --- a/src/GPU_API/CUAPI_DiagnoseDevice.cu +++ b/src/GPU_API/CUAPI_DiagnoseDevice.cu @@ -53,6 +53,7 @@ void CUAPI_DiagnoseDevice() else if ( DeviceProp.major == 8 && DeviceProp.minor == 6 ) NCorePerMP = 128; else if ( DeviceProp.major == 8 && DeviceProp.minor == 9 ) NCorePerMP = 128; else if ( DeviceProp.major == 9 ) NCorePerMP = 128; + else if ( DeviceProp.major == 10 || DeviceProp.major == 12 ) NCorePerMP = 128; else fprintf( stderr, "WARNING : unable to determine the number of cores per multiprocessor for version %d.%d ...\n", DeviceProp.major, DeviceProp.minor ); @@ -86,6 +87,9 @@ void CUAPI_DiagnoseDevice() Aux_GetCPUInfo( FileName ); + int clockRate; // in unit of kHz + CUDA_CHECK_ERROR( cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, GetDeviceID) ); + fprintf( Note, "\n" ); fprintf( Note, "GPU Info :\n" ); fprintf( Note, "Number of GPUs : %d\n" , DeviceCount ); @@ -95,7 +99,7 @@ void CUAPI_DiagnoseDevice() fprintf( Note, "CUDA Runtime Version : %d.%d\n" , RuntimeVersion/1000, RuntimeVersion%100 ); fprintf( Note, "CUDA Major Revision Number : %d\n" , DeviceProp.major ); fprintf( Note, "CUDA Minor Revision Number : %d\n" , DeviceProp.minor ); - fprintf( Note, "Clock Rate : %f GHz\n" , DeviceProp.clockRate/1.0e6 ); + fprintf( Note, "Clock Rate : %f GHz\n" , clockRate/1.0e6 ); fprintf( Note, "Global Memory Size : %ld MB\n" , 
(long)DeviceProp.totalGlobalMem/1024/1024 ); fprintf( Note, "Constant Memory Size : %ld KB\n" , (long)DeviceProp.totalConstMem/1024 ); fprintf( Note, "Shared Memory Size per Block : %ld KB\n" , (long)DeviceProp.sharedMemPerBlock/1024 ); diff --git a/src/GPU_API/CUAPI_SetDevice.cu b/src/GPU_API/CUAPI_SetDevice.cu index a67c7a43c..8f8ef8a18 100644 --- a/src/GPU_API/CUAPI_SetDevice.cu +++ b/src/GPU_API/CUAPI_SetDevice.cu @@ -53,6 +53,7 @@ void CUAPI_SetDevice( const int Mode ) // set the device ID void **d_TempPtr = NULL; int SetDeviceID, GetDeviceID = 999; + int computeMode; cudaDeviceProp DeviceProp; switch ( Mode ) @@ -77,9 +78,9 @@ void CUAPI_SetDevice( const int Mode ) // make sure that the "exclusive" compute mode is adopted CUDA_CHECK_ERROR( cudaGetDevice( &GetDeviceID ) ); - CUDA_CHECK_ERROR( cudaGetDeviceProperties( &DeviceProp, GetDeviceID ) ); + CUDA_CHECK_ERROR( cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, GetDeviceID) ); - if ( DeviceProp.computeMode != cudaComputeModeExclusive ) + if ( computeMode != cudaComputeModeExclusive ) { Aux_Message( stderr, "WARNING : \"exclusive\" compute mode is NOT enabled for \"%s\" at Rank %2d", "OPT__GPUID_SELECT == -2", MPI_Rank ); diff --git a/src/GPU_API/CUAPI_SetMemSize.cu b/src/GPU_API/CUAPI_SetMemSize.cu index b4e4d293e..3bfdc4379 100644 --- a/src/GPU_API/CUAPI_SetMemSize.cu +++ b/src/GPU_API/CUAPI_SetMemSize.cu @@ -36,7 +36,10 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr // (1) GPU_NSTREAM if ( GPU_NStream <= 0 ) { - if ( DeviceProp.deviceOverlap ) + int gpuOverlap; + CUDA_CHECK_ERROR( cudaDeviceGetAttribute(&gpuOverlap, cudaDevAttrGpuOverlap, GetDeviceID) ); + + if ( gpuOverlap ) { # if ( MODEL == HYDRO ) # if ( GPU_ARCH == FERMI ) @@ -57,6 +60,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr GPU_NStream = 4; # elif ( GPU_ARCH == HOPPER ) GPU_NStream = 4; +# elif ( GPU_ARCH == BLACKWELL ) + GPU_NStream = 4; # else # error : 
UNKNOWN GPU_ARCH !! # endif @@ -80,6 +85,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr GPU_NStream = 4; # elif ( GPU_ARCH == HOPPER ) GPU_NStream = 4; +# elif ( GPU_ARCH == BLACKWELL ) + GPU_NStream = 4; # else # error : ERROR : UNKNOWN GPU_ARCH !! # endif @@ -118,6 +125,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # elif ( GPU_ARCH == HOPPER ) Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; +# elif ( GPU_ARCH == BLACKWELL ) + Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # else # error : UNKNOWN GPU_ARCH !! # endif @@ -141,6 +150,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # elif ( GPU_ARCH == HOPPER ) Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; +# elif ( GPU_ARCH == BLACKWELL ) + Flu_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # else # error : UNKNOWN GPU_ARCH !! # endif @@ -173,6 +184,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr Pot_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # elif ( GPU_ARCH == HOPPER ) Pot_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; +# elif ( GPU_ARCH == BLACKWELL ) + Pot_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # else # error : UNKNOWN GPU_ARCH !! # endif @@ -203,6 +216,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr Che_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # elif ( GPU_ARCH == HOPPER ) Che_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; +# elif ( GPU_ARCH == BLACKWELL ) + Che_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # else # error : UNKNOWN GPU_ARCH !! 
# endif @@ -232,6 +247,8 @@ void CUAPI_SetMemSize( int &GPU_NStream, int &Flu_GPU_NPGroup, int &Pot_GPU_NPGr Src_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # elif ( GPU_ARCH == HOPPER ) Src_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; +# elif ( GPU_ARCH == BLACKWELL ) + Src_GPU_NPGroup = 1*GPU_NStream*DeviceProp.multiProcessorCount; # else # error : UNKNOWN GPU_ARCH !! # endif diff --git a/src/configure.py b/src/configure.py index 6d91253cc..b921106e1 100755 --- a/src/configure.py +++ b/src/configure.py @@ -957,7 +957,7 @@ def set_gpu( gpus, flags, args ): gpu_opts["MAXRREGCOUNT_FLU"] = "--maxrregcount=128" else: gpu_opts["MAXRREGCOUNT_FLU"] = "--maxrregcount=70" - elif 500 <= compute_capability and compute_capability <= 900: + elif 500 <= compute_capability and compute_capability <= 1210: if args["double"]: gpu_opts["MAXRREGCOUNT_FLU"] = "--maxrregcount=192" else: