diff --git a/CMakeLists.txt b/CMakeLists.txt index 57cd29f8..489ae821 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,12 @@ endif() if( GAUXC_ENABLE_HIP ) enable_language( HIP ) set( GAUXC_HAS_HIP TRUE CACHE BOOL "GauXC has HIP and will build HIP bindings" FORCE ) + if(NOT DEFINED ROCM_PATH) + message(FATAL_ERROR "ROCM_PATH must be set") + endif() + if( NOT DEFINED CMAKE_HIP_ARCHITECTURES ) + message( FATAL_ERROR "CMAKE_HIP_ARCHITECTURES must be set" ) + endif() endif() # Decided if we're compiling device bindings diff --git a/README.md b/README.md index 35fa5f97..b8d34695 100644 --- a/README.md +++ b/README.md @@ -221,8 +221,10 @@ target_link_libraries( my_target PUBLIC gauxc::gauxc ) | `GAUXC_ENABLE_MPI` | Enable MPI Bindings | `ON` | | `GAUXC_ENABLE_OPENMP` | Enable OpenMP Bindings | `ON` | | `CMAKE_CUDA_ARCHITECTURES` | CUDA architechtures (e.g. 70 for Volta, 80 for Ampere) | -- | +| `CMAKE_HIP_ARCHITECTURES` | HIP architechtures (e.g. gfx90a for MI250X) | -- | | `BLAS_LIBRARIES` | Full BLAS linker. | -- | | `MAGMA_ROOT_DIR` | Install prefix for MAGMA. | -- | +| `ROCM_PATH` | Install prefix for ROCM. | -- | diff --git a/cmake/gauxc-config.cmake.in b/cmake/gauxc-config.cmake.in index 426b7f10..648192f0 100644 --- a/cmake/gauxc-config.cmake.in +++ b/cmake/gauxc-config.cmake.in @@ -51,6 +51,20 @@ if( GAUXC_HAS_CUDA ) endif() endif() +if( GAUXC_HAS_HIP ) + enable_language( HIP ) + set (CMAKE_HIP_ARCHITECTURES @CMAKE_HIP_ARCHITECTURES@) + set (ROCM_PATH @ROCM_PATH@) + + list (PREPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/hipblas) + set(GPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "AMD GPU targets to compile for") + + find_package( hip REQUIRED ) + find_package( hipblas REQUIRED ) + + list(REMOVE_AT CMAKE_PREFIX_PATH 0 1 2) +endif + if( GAUXC_HAS_MPI ) find_dependency( MPI ) endif() diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake index cd3969d8..9a2c0757 100644 --- a/cmake/gauxc-dep-versions.cmake +++ b/cmake/gauxc-dep-versions.cmake @@ -10,8 +10,8 @@ set( GAUXC_CUB_REVISION 1.10.0 ) set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git ) set( GAUXC_CUTLASS_REVISION v2.10.0 ) -set( GAUXC_EXCHCXX_REPOSITORY https://github.com/wavefunction91/ExchCXX.git ) -set( GAUXC_EXCHCXX_REVISION 21a4700a826ec0beae1311a1d59677393bcb168f ) +set( GAUXC_EXCHCXX_REPOSITORY https://github.com/ryanstocks00/ExchCXX.git ) +set( GAUXC_EXCHCXX_REVISION 8a0004609afc710bdad4367867026a9daa0a758b) set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git ) set( GAUXC_GAU2GRID_REVISION v2.0.6 ) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7e51e9fa..6dc291fc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -224,3 +224,19 @@ install( FILES # Install Custom Find Modules include( ${linalg-cmake-modules_SOURCE_DIR}/LinAlgModulesMacros.cmake ) install_linalg_modules( INSTALL_CONFIGDIR ) + +# This allows specifying a lower compiler optimization level for NVHPC which fails to compile with the -O3 flag whilst leaving the remaining flags unchanged +if (DEFINED GAUXC_OBARA_SAIKA_COMPILE_OPTIMIZATION_OPTIONS) + get_target_property(default_compile_options gauxc COMPILE_OPTIONS) + get_target_property(gauxc_sources gauxc SOURCES) + set_target_properties(gauxc PROPERTIES COMPILE_OPTIONS "") + set_source_files_properties(${gauxc_sources} PROPERTIES COMPILE_OPTIONS "${default_compile_options}") + + file(GLOB OB_HOST_SRC_FILES ${CMAKE_CURRENT_LIST_DIR}/xc_integrator/local_work_driver/host/obara_saika/src/*.cxx) + set(adjusted_compile_options ${default_compile_options}) + foreach (flag "[\\/\\-]O3" "[\\/\\-]Ofast" "[\\/\\-]fast") + string(REGEX REPLACE ${flag} ${GAUXC_OBARA_SAIKA_COMPILE_OPTIMIZATION_OPTIONS} adjusted_compile_options "${adjusted_compile_options}") + endforeach() + message("-- Setting Obara-Saika COMPILE_OPTIONS to: ${adjusted_compile_options}") + set_source_files_properties(${OB_HOST_SRC_FILES} PROPERTIES COMPILE_OPTIONS "${adjusted_compile_options}") +endif() diff --git a/src/exceptions/hipblas_exception.hpp b/src/exceptions/hipblas_exception.hpp index 9bb39011..186d639e 100644 --- a/src/exceptions/hipblas_exception.hpp +++ b/src/exceptions/hipblas_exception.hpp @@ -14,7 +14,7 @@ #ifdef GAUXC_HAS_HIP #include "hip/hip_runtime.h" -#include +#include namespace GauXC { diff --git a/src/runtime_environment/device/hip/CMakeLists.txt b/src/runtime_environment/device/hip/CMakeLists.txt index df97901b..fef39095 100644 --- a/src/runtime_environment/device/hip/CMakeLists.txt +++ b/src/runtime_environment/device/hip/CMakeLists.txt @@ -6,8 +6,13 @@ # See LICENSE.txt for details # +list (PREPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/hipblas) +set(GPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "AMD GPU targets to compile for") + find_package( hip REQUIRED ) find_package( hipblas REQUIRED ) +list(REMOVE_AT CMAKE_PREFIX_PATH 0 1 2) + target_sources( gauxc PRIVATE hip_backend.cxx ) target_link_libraries( gauxc PUBLIC hip::host roc::hipblas ) diff --git a/src/runtime_environment/device_specific/hip_device_constants.hpp b/src/runtime_environment/device_specific/hip_device_constants.hpp index 3a79fdf3..07adbd38 100644 --- a/src/runtime_environment/device_specific/hip_device_constants.hpp +++ b/src/runtime_environment/device_specific/hip_device_constants.hpp @@ -11,8 +11,12 @@ namespace GauXC { namespace hip { +#ifdef __HIP_PLATFORM_NVIDIA__ +static constexpr uint32_t warp_size = 32; +#else static constexpr uint32_t warp_size = 64; -static constexpr uint32_t max_threads_per_thread_block = 1024; +#endif +static constexpr uint32_t max_threads_per_thread_block = 512; static constexpr uint32_t max_warps_per_thread_block = max_threads_per_thread_block / warp_size; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu index 6aea5225..71e8c387 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu @@ -457,7 +457,7 @@ void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, { dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), - std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), + std::min(uint64_t(MGGA_KERNEL_SM_BLOCK), util::div_ceil( npts_max, MGGA_KERNEL_SM_BLOCK )), ntasks ); if(do_lapl) eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); @@ -614,17 +614,23 @@ __global__ void eval_vvar_kern( size_t ntasks, const auto* den_basis_prod_device = task.zmat; - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - register double den_reg = 0.; - if( tid_x < nbf and tid_y < npts ) { + int start_y = blockIdx.y * blockDim.y + threadIdx.y; + + for (int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + tid_x < nbf; + tid_x += blockDim.x * gridDim.x ) { + + for (int tid_y = start_y; + tid_y < npts; + tid_y += blockDim.y * gridDim.y ) { - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; + const double* bf_col = basis_eval_device + tid_x*npts; + const double* db_col = den_basis_prod_device + tid_x*npts; - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; + den_reg += bf_col[ tid_y ] * db_col[ tid_y ]; + } } @@ -634,8 +640,8 @@ __global__ void eval_vvar_kern( size_t ntasks, den_reg = cuda::warp_reduce_sum( den_reg ); - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); + if( threadIdx.x == 0 and start_y < npts ) { + atomicAdd( den_eval_device + start_y, den_reg ); } diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp index 5a5e78a5..9c2b4859 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp @@ -211,6 +211,76 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3_deriv1( } +template +GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_4( + int32_t npts, + const T bf, + const T x, + const T y, + const T z, + T* __restrict__ eval +) { + + eval[npts * 0] = sqrt_35*bf*x*y*(x*x - y*y)/2; + eval[npts * 1] = sqrt_70*bf*y*z*(3*x*x - y*y)/4; + eval[npts * 2] = sqrt_5*bf*x*y*(-x*x - y*y + 6*z*z)/2; + eval[npts * 3] = sqrt_10*bf*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + eval[npts * 4] = bf*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval[npts * 5] = sqrt_10*bf*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + eval[npts * 6] = sqrt_5*bf*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + eval[npts * 7] = sqrt_70*bf*x*z*(x*x - 3*y*y)/4; + eval[npts * 8] = sqrt_35*bf*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + +} + +template +GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_4_deriv1( + const int32_t npts, + const T bf, + const T bf_x, + const T bf_y, + const T bf_z, + const T x, + const T y, + const T z, + T* __restrict__ eval_x, + T* __restrict__ eval_y, + T* __restrict__ eval_z +) { + + eval_x[npts * 0] = sqrt_35*y*(bf*(3*x*x - y*y) + bf_x*x*(x*x - y*y))/2; + eval_x[npts * 1] = sqrt_70*y*z*(6*bf*x + bf_x*(3*x*x - y*y))/4; + eval_x[npts * 2] = sqrt_5*y*(-bf*(3*x*x + y*y - 6*z*z) - bf_x*x*(x*x + y*y - 6*z*z))/2; + eval_x[npts * 3] = sqrt_10*y*z*(-6*bf*x - bf_x*(3*x*x + 3*y*y - 4*z*z))/4; + eval_x[npts * 4] = 3*bf*x*(x*x + y*y - 4*z*z)/2 + bf_x*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval_x[npts * 5] = sqrt_10*z*(-bf*(9*x*x + 3*y*y - 4*z*z) - bf_x*x*(3*x*x + 3*y*y - 4*z*z))/4; + eval_x[npts * 6] = sqrt_5*(-bf*x*(x*x - 3*z*z) - bf_x*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z)/4); + eval_x[npts * 7] = sqrt_70*z*(3*bf*(x*x - y*y) + bf_x*x*(x*x - 3*y*y))/4; + eval_x[npts * 8] = sqrt_35*(4*bf*x*(x*x - 3*y*y) + bf_x*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + eval_y[npts * 0] = sqrt_35*x*(-bf*(-x*x + 3*y*y) + bf_y*y*(x*x - y*y))/2; + eval_y[npts * 1] = sqrt_70*z*(-3*bf*(-x*x + y*y) + bf_y*y*(3*x*x - y*y))/4; + eval_y[npts * 2] = sqrt_5*x*(-bf*(x*x + 3*y*y - 6*z*z) - bf_y*y*(x*x + y*y - 6*z*z))/2; + eval_y[npts * 3] = sqrt_10*z*(-bf*(3*x*x + 9*y*y - 4*z*z) - bf_y*y*(3*x*x + 3*y*y - 4*z*z))/4; + eval_y[npts * 4] = 3*bf*y*(x*x + y*y - 4*z*z)/2 + bf_y*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval_y[npts * 5] = sqrt_10*x*z*(-6*bf*y - bf_y*(3*x*x + 3*y*y - 4*z*z))/4; + eval_y[npts * 6] = sqrt_5*(bf*y*(y*y - 3*z*z) - bf_y*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z)/4); + eval_y[npts * 7] = sqrt_70*x*z*(-6*bf*y + bf_y*(x*x - 3*y*y))/4; + eval_y[npts * 8] = sqrt_35*(-4*bf*y*(3*x*x - y*y) + bf_y*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + eval_z[npts * 0] = sqrt_35*bf_z*x*y*(x*x - y*y)/2; + eval_z[npts * 1] = sqrt_70*y*(bf + bf_z*z)*(3*x*x - y*y)/4; + eval_z[npts * 2] = sqrt_5*x*y*(12*bf*z - bf_z*(x*x + y*y - 6*z*z))/2; + eval_z[npts * 3] = sqrt_10*y*(3*bf*(-x*x - y*y + 4*z*z) - bf_z*z*(3*x*x + 3*y*y - 4*z*z))/4; + eval_z[npts * 4] = -2*bf*z*(3*x*x + 3*y*y - 2*z*z) + bf_z*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval_z[npts * 5] = sqrt_10*x*(3*bf*(-x*x - y*y + 4*z*z) - bf_z*z*(3*x*x + 3*y*y - 4*z*z))/4; + eval_z[npts * 6] = sqrt_5*(12*bf*z*(x*x - y*y) - bf_z*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + eval_z[npts * 7] = sqrt_70*x*(bf + bf_z*z)*(x*x - 3*y*y)/4; + eval_z[npts * 8] = sqrt_35*bf_z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + +} + + template GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular( @@ -239,8 +309,14 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular( collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); + } else if( l == 4 ) { + + collocation_spherical_unnorm_angular_4( npts, bf, x, y, z, eval ); + } else { + assert( false && "L < L_MAX" ); + } } // collocation_spherical_unnorm_angular @@ -284,6 +360,11 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_deriv1( collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); collocation_spherical_unnorm_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else if( l == 4 ) { + + collocation_spherical_unnorm_angular_4( npts, bf, x, y, z, eval ); + collocation_spherical_unnorm_angular_4_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else { assert( false && "L < L_MAX" ); } diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp index c7405df7..30d702a5 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp @@ -9,9 +9,12 @@ namespace GauXC { - constexpr double sqrt_15 = 3.872983346207417; constexpr double sqrt_3 = 1.7320508075688772; - constexpr double sqrt_6 = 2.449489742783178; + constexpr double sqrt_5 = 2.23606797749979; + constexpr double sqrt_15 = 3.872983346207417; constexpr double sqrt_10 = 3.1622776601683795; + constexpr double sqrt_6 = 2.449489742783178; + constexpr double sqrt_35 = 5.916079783099616; + constexpr double sqrt_70 = 8.366600265340756; } // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp index 7ba02d5a..e23086cd 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp @@ -16,31 +16,45 @@ namespace hip { template __device__ T warp_reduce_sum( T val ) { +#ifdef __HIP_PLATFORM_NVIDIA__ + for(int i=(warp_sz/2); i>=1; i/=2) + val += __shfl_xor_sync(0xffffffff, val, i, warp_sz); + + return val; +#else using warp_reducer = hipcub::WarpReduce; - static __shared__ typename warp_reducer::TempStorage + static __shared__ typename warp_reducer::TempStorage temp_storage[hip::max_warps_per_thread_block]; - int tid = + int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; int warp_lane = tid / warp_size; return warp_reducer( temp_storage[warp_lane] ).Sum( val ); +#endif } template __device__ T warp_reduce_prod( T val ) { +#ifdef __HIP_PLATFORM_NVIDIA__ + for(int i=(warp_sz/2); i>=1; i/=2) + val *= __shfl_xor_sync(0xffffffff, val, i, warp_sz); + + return val; +#else using warp_reducer = hipcub::WarpReduce; - static __shared__ typename warp_reducer::TempStorage + static __shared__ typename warp_reducer::TempStorage temp_storage[hip::max_warps_per_thread_block]; - int tid = + int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; int warp_lane = tid / warp_size; return warp_reducer( temp_storage[warp_lane] ).Reduce( val, [](const T& a, const T& b){ return a * b; } ); +#endif } diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip index d4f6eda9..ce2c06e4 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip @@ -122,7 +122,12 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, int cont = (iCenter < natoms); // We will continue iterating until all of the threads have cont set to 0 + +#ifdef __HIP_PLATFORM_NVIDIA__ + while (__any_sync(__activemask(), cont)) { +#else while (__any(cont)) { +#endif if (cont) { double2 rj[weight_unroll/2]; double2 rab_val[weight_unroll/2]; @@ -131,8 +136,16 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, #pragma unroll for (int k = 0; k < weight_unroll/2; k++) { - rj[k] = *((double2*)(local_dist_scratch + jCenter) + k); - rab_val[k] = *((double2*)(local_rab + jCenter) + k); + double* addr = (double*)((double2*)(local_dist_scratch + jCenter) + k); + rj[k].x = addr[0]; + rj[k].y = addr[1]; + double* addr2 = (double*)((double2*)(local_rab + jCenter) + k); + rab_val[k].x = addr2[0]; + rab_val[k].y = addr2[1]; + // These caused a memory access violation when lddist is not a + // multiple of 2 as then there can be an unaligned access + // rj[k] = *((double2*)(local_dist_scratch + jCenter) + k); + // rab_val[k] = *((double2*)(local_rab + jCenter) + k); } #pragma unroll @@ -177,7 +190,6 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, sum += parent_weight; weights[ipt] *= parent_weight / sum; } - #endif } diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index 8e93d33f..976d4d80 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -13,8 +13,14 @@ namespace GauXC { +#define GGA_KERNEL_SM_WARPS 16 -__global__ void eval_uvars_lda_kernel( size_t ntasks, +__global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + // eval_vvars populated uvar storage already in the case of LDA+RKS + return; +} + +__global__ void eval_uvars_lda_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -23,68 +29,419 @@ __global__ void eval_uvars_lda_kernel( size_t ntasks, auto& task = tasks_device[ batch_idx ]; const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - auto* den_eval_device = task.den; + auto* den_pos_eval_device = task.den_s; + auto* den_neg_eval_device = task.den_z; - const auto* basis_eval_device = task.bf; - const auto* den_basis_prod_device = task.zmat; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - double den_reg = 0.; + if( tid < npts ) { + const auto ps = den_pos_eval_device[ tid ]; + const auto pz = den_neg_eval_device[ tid ]; + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); - if( tid_x < nbf and tid_y < npts ) { + } +} - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; +__global__ void eval_uvars_lda_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + + auto* den_z_eval_device = task.den_s; + auto* den_s_eval_device = task.den_z; + auto* den_y_eval_device = task.den_y; + auto* den_x_eval_device = task.den_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + const double dtolsq = 1e-24; // TODO: make variable + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + + if( tid < npts ) { + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. / 3.; + } + + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); } +} - // Warp blocks are stored col major - den_reg = 2 * hip::warp_reduce_sum( den_reg ); +__global__ void eval_uvars_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + auto* gamma_eval_device = task.gamma; - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if( tid < npts ) { + const double dx = dden_sx_eval_device[ tid ]; + const double dy = dden_sy_eval_device[ tid ]; + const double dz = dden_sz_eval_device[ tid ]; + + gamma_eval_device[ tid ] = dx*dx + dy*dy + dz*dz; + + } + +} + +__global__ void eval_uvars_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + auto* den_pos_eval_device = task.den_s; + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + + auto* den_neg_eval_device = task.den_z; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = task.gamma_mm; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const double ps = den_pos_eval_device[ tid ]; + const double pz = den_neg_eval_device[ tid ]; + const double dndx = den_pos_x_eval_device[ tid ]; + const double dndy = den_pos_y_eval_device[ tid ]; + const double dndz = den_pos_z_eval_device[ tid ]; + const double dMzdx = den_neg_x_eval_device[ tid ]; + const double dMzdy = den_neg_y_eval_device[ tid ]; + const double dMzdz = den_neg_z_eval_device[ tid ]; + + // (del n).(del n) + const auto dn_sq = dndx*dndx + dndy*dndy + dndz*dndz; + // (del Mz).(del Mz) + const auto dMz_sq = dMzdx*dMzdx + dMzdy*dMzdy + dMzdz*dMzdz; + // (del n).(del Mz) + const auto dn_dMz = dndx*dMzdx + dndy*dMzdy + dndz*dMzdz; + + gamma_pp_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) + 0.5*dn_dMz; + gamma_pm_eval_device[ tid ] = 0.25*(dn_sq - dMz_sq); + gamma_mm_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) - 0.5*dn_dMz; + + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); } +} + +__global__ void eval_uvars_gga_gks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + auto* den_s_eval_device = task.den_s; + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + + auto* den_z_eval_device = task.den_z; + const auto* dden_zx_eval_device = task.dden_zx; + const auto* dden_zy_eval_device = task.dden_zy; + const auto* dden_zz_eval_device = task.dden_zz; + + const auto* den_y_eval_device = task.den_y; + const auto* dden_yx_eval_device = task.dden_yx; + const auto* dden_yy_eval_device = task.dden_yy; + const auto* dden_yz_eval_device = task.dden_yz; + + const auto* den_x_eval_device = task.den_x; + const auto* dden_xx_eval_device = task.dden_xx; + const auto* dden_xy_eval_device = task.dden_xy; + const auto* dden_xz_eval_device = task.dden_xz; + + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = task.gamma_mm; + + auto* H_z_eval_device = task.H_z; + auto* H_y_eval_device = task.H_y; + auto* H_x_eval_device = task.H_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + + const double dtolsq = 1e-24; // TODO: make variable + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const double dndz = dden_sz_eval_device[ tid ]; + const double dndy = dden_sy_eval_device[ tid ]; + const double dndx = dden_sx_eval_device[ tid ]; + + const double dMzdz = dden_zz_eval_device[ tid ]; + const double dMzdy = dden_zy_eval_device[ tid ]; + const double dMzdx = dden_zx_eval_device[ tid ]; + + const double dMydz = dden_yz_eval_device[ tid ]; + const double dMydy = dden_yy_eval_device[ tid ]; + const double dMydx = dden_yx_eval_device[ tid ]; + + const double dMxdz = dden_xz_eval_device[ tid ]; + const double dMxdy = dden_xy_eval_device[ tid ]; + const double dMxdx = dden_xx_eval_device[ tid ]; + + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + + const auto dels_dot_dels = dndx * dndx + dndy * dndy + dndz * dndz; + const auto delz_dot_delz = dMzdx * dMzdx + dMzdy * dMzdy + dMzdz * dMzdz; + const auto delx_dot_delx = dMxdx * dMxdx + dMxdy * dMxdy + dMxdz * dMxdz; + const auto dely_dot_dely = dMydx * dMydx + dMydy * dMydy + dMydz * dMydz; + + const auto dels_dot_delz = dndx * dMzdx + dndy * dMzdy + dndz * dMzdz; + const auto dels_dot_delx = dndx * dMxdx + dndy * dMxdy + dndz * dMxdz; + const auto dels_dot_dely = dndx * dMydx + dndy * dMydy + dndz * dMydz; + + const auto sum = delz_dot_delz + delx_dot_delx + dely_dot_dely; + const auto s_sum = + dels_dot_delz * pz + dels_dot_delx * px + dels_dot_dely * py; + + const auto inv_sqsum2 = + rsqrt(dels_dot_delz * dels_dot_delz + dels_dot_delx * dels_dot_delx + + dels_dot_dely * dels_dot_dely); + const auto sqsum2 = 1./inv_sqsum2; + + double sign = 1.; + if( signbit(s_sum)) + sign = -1.; + + + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + H_z_eval_device[ tid ] = sign * dels_dot_delz * inv_sqsum2; + H_y_eval_device[ tid ] = sign * dels_dot_dely * inv_sqsum2; + H_x_eval_device[ tid ] = sign * dels_dot_delx * inv_sqsum2; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. / 3.; + + H_z_eval_device[ tid ] = sign / 3.; + H_y_eval_device[ tid ] = sign / 3.; + H_x_eval_device[ tid ] = sign / 3.; + } + + gamma_pp_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) + 0.5*sign*sqsum2; + gamma_pm_eval_device[ tid ] = 0.25*(dels_dot_dels - sum); + gamma_mm_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) - 0.5*sign*sqsum2; + + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); + + } } +template +__global__ void eval_uvars_mgga_rks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { -void eval_uvvars_lda( size_t ntasks, int32_t nbf_max, int32_t npts_max, - XCDeviceTask* device_tasks, device_queue queue ) { + constexpr auto warp_size = hip::warp_size; + //constexpr auto max_warps_per_thread_block = hip::max_warps_per_thread_block; - hipStream_t stream = queue.queue_as(); + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; - dim3 threads(hip::warp_size, hip::max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( nbf_max , threads.x ), - util::div_ceil( npts_max , threads.y ), - ntasks ); + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + auto* tau_eval_device = task.tau; + decltype(tau_eval_device) lapl_eval_device = nullptr; + if constexpr (need_lapl) { + lapl_eval_device = task.denlapl; + } + + //const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + decltype(dbasis_x_eval_device) basis_lapl_eval_device = nullptr; + if constexpr (need_lapl) { + basis_lapl_eval_device = task.d2bflapl; + } + + //const auto* den_basis_prod_device = task.zmat; + const auto* den_basis_dx_prod_device = task.xmat_x; + const auto* den_basis_dy_prod_device = task.xmat_y; + const auto* den_basis_dz_prod_device = task.xmat_z; + decltype(den_basis_dx_prod_device) den_basis_prod_device = nullptr; + if constexpr (need_lapl) { + den_basis_prod_device = task.zmat; + } + + int ipt = blockIdx.x * blockDim.x + threadIdx.x; + if (ipt < npts) { + double tau_reg = 0; + double lapl_reg = 0; + + for( int ibf = 0; ibf < nbf; ibf++ ) { + + const double* db_x_col = den_basis_dx_prod_device + ibf*npts; + const double* db_y_col = den_basis_dy_prod_device + ibf*npts; + const double* db_z_col = den_basis_dz_prod_device + ibf*npts; + + const double* bf_x_col = dbasis_x_eval_device + ibf*npts; + const double* bf_y_col = dbasis_y_eval_device + ibf*npts; + const double* bf_z_col = dbasis_z_eval_device + ibf*npts; + + + tau_reg += bf_x_col[ipt] * db_x_col[ipt]; + tau_reg += bf_y_col[ipt] * db_y_col[ipt]; + tau_reg += bf_z_col[ipt] * db_z_col[ipt]; + + + if constexpr (need_lapl) { + const double* db_col = den_basis_prod_device + ibf*npts; + const double* bf_l_col = basis_lapl_eval_device + ibf*npts; + lapl_reg += bf_l_col[ipt] * db_col[ipt]; + } + } - hipLaunchKernelGGL(eval_uvars_lda_kernel, dim3(blocks), dim3(threads), 0, - stream, ntasks, device_tasks ); + tau_reg = 0.5 * tau_reg; + lapl_reg = 2. * lapl_reg + 4. * tau_reg; + + tau_eval_device[ipt] = tau_reg; + lapl_eval_device[ipt] = lapl_reg; + } } +#define EVAL_UVARS_KERNEL(xc_approx) \ + hipStream_t stream = queue.queue_as(); \ + dim3 blocks( util::div_ceil( npts_max, threads.x ), \ + 1, \ + ntasks ); \ + switch ( ks_scheme ) { \ + case RKS: \ + eval_uvars_##xc_approx##_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case UKS: \ + eval_uvars_##xc_approx##_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case GKS: \ + eval_uvars_##xc_approx##_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate UV vars" ); \ + } + +void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( hip::max_warps_per_thread_block * hip::warp_size, 1, 1 ); + EVAL_UVARS_KERNEL(lda); +} +void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( GGA_KERNEL_SM_WARPS * hip::warp_size, 1, 1 ); + EVAL_UVARS_KERNEL(gga); +} +void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, + int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks, + device_queue queue ) { + // TODO: This interface should be unified with the lda/gga interfaces + hipStream_t stream = queue.queue_as(); + + // U Variables + { + dim3 threads(hip::max_warps_per_thread_block, 1,1 ); + dim3 blocks = dim3( util::div_ceil( npts_max, threads.x ), + 1, + ntasks ); + if(do_lapl) + eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else + eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } + + // V variables (GAMMA) + dim3 threads( hip::max_threads_per_thread_block ); + dim3 blocks( util::div_ceil( npts_total, threads.x ), + 1, + ntasks ); + eval_uvars_gga_rks_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); +} + + -__global__ void eval_uvars_gga_kernel( size_t ntasks, +template +__global__ void eval_vvar_grad_kern( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -95,10 +452,35 @@ __global__ void eval_uvars_gga_kernel( size_t ntasks, const auto npts = task.npts; const auto nbf = task.bfn_screening.nbe; - auto* den_eval_device = task.den; - auto* den_x_eval_device = task.ddenx; - auto* den_y_eval_device = task.ddeny; - auto* den_z_eval_device = task.ddenz; + double* den_eval_device = nullptr; + double* den_x_eval_device = nullptr; + double* den_y_eval_device = nullptr; + double* den_z_eval_device = nullptr; + + if constexpr (den_select == DEN_S) { + den_eval_device = task.den_s; + den_x_eval_device = task.dden_sx; + den_y_eval_device = task.dden_sy; + den_z_eval_device = task.dden_sz; + } + if constexpr (den_select == DEN_Z) { + den_eval_device = task.den_z; + den_x_eval_device = task.dden_zx; + den_y_eval_device = task.dden_zy; + den_z_eval_device = task.dden_zz; + } + if constexpr (den_select == DEN_Y) { + den_eval_device = task.den_y; + den_x_eval_device = task.dden_yx; + den_y_eval_device = task.dden_yy; + den_z_eval_device = task.dden_yz; + } + if constexpr (den_select == DEN_X) { + den_eval_device = task.den_x; + den_x_eval_device = task.dden_xx; + den_y_eval_device = task.dden_xy; + den_z_eval_device = task.dden_xz; + } const auto* basis_eval_device = task.bf; const auto* dbasis_x_eval_device = task.dbfx; @@ -106,19 +488,21 @@ __global__ void eval_uvars_gga_kernel( size_t ntasks, const auto* dbasis_z_eval_device = task.dbfz; const auto* den_basis_prod_device = task.zmat; - - // We always launch enough blocks to cover npts, so blocks aren't doing multiple results + double den_reg = 0.; double dx_reg = 0.; double dy_reg = 0.; double dz_reg = 0.; + int ipt = blockIdx.x * blockDim.x + threadIdx.x; + + if (ipt < npts) { + // Have each thread accumulate its own reduction result into a register. // There's no real _need_ for LDS because the reductions are small and // therefore can be done without sharing. for( int ibf = 0; ibf < nbf; ibf++ ) { - for( int ipt = blockIdx.x * blockDim.x + threadIdx.x; ipt < npts; ipt += blockDim.x * gridDim.x ) { const double* bf_col = basis_eval_device + ibf*npts; const double* bf_x_col = dbasis_x_eval_device + ibf*npts; @@ -126,76 +510,114 @@ __global__ void eval_uvars_gga_kernel( size_t ntasks, const double* bf_z_col = dbasis_z_eval_device + ibf*npts; const double* db_col = den_basis_prod_device + ibf*npts; - den_reg += 2 * bf_col[ ipt ] * db_col[ ipt ]; - dx_reg += 4 * bf_x_col[ ipt ] * db_col[ ipt ]; - dy_reg += 4 * bf_y_col[ ipt ] * db_col[ ipt ]; - dz_reg += 4 * bf_z_col[ ipt ] * db_col[ ipt ]; - } + den_reg += bf_col[ ipt ] * db_col[ ipt ]; + dx_reg += 2 * bf_x_col[ ipt ] * db_col[ ipt ]; + dy_reg += 2 * bf_y_col[ ipt ] * db_col[ ipt ]; + dz_reg += 2 * bf_z_col[ ipt ] * db_col[ ipt ]; } - for( int ipt = blockIdx.x * blockDim.x + threadIdx.x; ipt < npts; ipt += blockDim.x * gridDim.x ) { den_eval_device [ipt] = den_reg; den_x_eval_device [ipt] = dx_reg ; den_y_eval_device [ipt] = dy_reg ; den_z_eval_device [ipt] = dz_reg ; } - } -__global__ void eval_vvars_gga_kernel( - size_t npts, - const double* den_x_eval_device, - const double* den_y_eval_device, - const double* den_z_eval_device, - double* gamma_eval_device -) { - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - if( tid < npts ) { +template +__global__ void eval_vvar_kern( size_t ntasks, + XCDeviceTask* tasks_device ) { - const double dx = den_x_eval_device[ tid ]; - const double dy = den_y_eval_device[ tid ]; - const double dz = den_z_eval_device[ tid ]; + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; - gamma_eval_device[tid] = dx*dx + dy*dy + dz*dz; + auto& task = tasks_device[ batch_idx ]; - } + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; -} + double* den_eval_device = nullptr; + // use the "U" variable (+/- for UKS) even though at this point the density (S/Z) is stored + if constexpr (den_select == DEN_S) den_eval_device = task.den_s; + if constexpr (den_select == DEN_Z) den_eval_device = task.den_z; + if constexpr (den_select == DEN_Y) den_eval_device = task.den_y; + if constexpr (den_select == DEN_X) den_eval_device = task.den_x; + const auto* basis_eval_device = task.bf; + const auto* den_basis_prod_device = task.zmat; + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; -void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max, - int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, - const double* deny, const double* denz, double* gamma, device_queue queue ) { + double den_reg = 0.; - hipStream_t stream = queue.queue_as(); + if( tid_x < nbf and tid_y < npts ) { - // U Variables - { - dim3 threads(hip::max_threads_per_thread_block, 1, 1); - dim3 blocks( util::div_ceil( npts_max , threads.x ), - 1, - ntasks ); + const double* bf_col = basis_eval_device + tid_x*npts; + const double* db_col = den_basis_prod_device + tid_x*npts; + + den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; - hipLaunchKernelGGL(eval_uvars_gga_kernel, dim3(blocks), dim3(threads), 0, - stream, ntasks, device_tasks ); } - // V Variables - dim3 threads( hip::max_threads_per_thread_block ); - dim3 blocks( util::div_ceil( npts_total, threads.x ) ); - hipLaunchKernelGGL(eval_vvars_gga_kernel, blocks, threads, 0, stream, - npts_total, denx, deny, denz, gamma); + // Warp blocks are stored col major + constexpr auto warp_size = hip::warp_size; + //constexpr auto max_warps_per_thread_block = hip::max_warps_per_thread_block; + den_reg = hip::warp_reduce_sum( den_reg ); + + + if( threadIdx.x == 0 and tid_y < npts ) { + atomicAdd( den_eval_device + tid_y, den_reg ); + } + } +void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + + hipStream_t stream = queue.queue_as(); + dim3 threads; + dim3 blocks; + if( do_grad ) { + threads = dim3(hip::max_warps_per_thread_block, 1, 1); + blocks = dim3( util::div_ceil( npts_max, threads.x ), + 1, + ntasks ); + } else { + threads = dim3( hip::warp_size, hip::max_warps_per_thread_block, 1 ); + blocks = dim3( util::div_ceil( nbf_max, threads.x ), + util::div_ceil( npts_max, threads.y ), + ntasks ); + } + switch( den_select ) { + case DEN_S: + if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + break; + case DEN_Z: + if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + break; + case DEN_Y: + if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + break; + case DEN_X: + if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvar called with improper density selected" ); + } + +} diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip index d188e9e6..ac1fcd57 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip @@ -14,7 +14,7 @@ namespace GauXC { -__global__ void zmat_lda_vxc_kernel( size_t ntasks, +__global__ void zmat_lda_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -46,34 +46,93 @@ __global__ void zmat_lda_vxc_kernel( size_t ntasks, -void zmat_lda_vxc( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCDeviceTask* tasks_device, - device_queue queue ) { - hipStream_t stream = queue.queue_as() ; - dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - hipLaunchKernelGGL(zmat_lda_vxc_kernel, dim3(blocks), dim3(threads), 0, stream , ntasks, tasks_device ); + + + + + +template +__global__ void zmat_lda_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + + + const auto* basis_eval_device = task.bf; + + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + const double factp = 0.5 * vrho_pos_device[tid_x]; + const double factm = 0.5 * vrho_neg_device[tid_x]; + double sign = 1.0; + if constexpr ( den_selector == DEN_Z ) sign = -1.0; + + z_matrix_device[ ibfoff ] = 0.5*(factp * basis_eval_device[ ibfoff ] + sign * factm * basis_eval_device[ ibfoff ]); + } } +template +__global__ void zmat_lda_vxc_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + double* K_device; + if constexpr ( den_selector == DEN_Z ) K_device = task.K_z; + if constexpr ( den_selector == DEN_Y ) K_device = task.K_y; + if constexpr ( den_selector == DEN_X ) K_device = task.K_x; + + const auto* basis_eval_device = task.bf; + auto* z_matrix_device = task.zmat; + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + if( tid_x < npts and tid_y < nbf ) { + const size_t ibfoff = tid_y * npts + tid_x; + const double factp = 0.5 * vrho_pos_device[tid_x]; + const double factm = 0.5 * vrho_neg_device[tid_x]; + + if constexpr ( den_selector == DEN_S ) { + z_matrix_device[ ibfoff ] = 0.5*(factp * basis_eval_device[ ibfoff ] + factm * basis_eval_device[ ibfoff ]); + } + else { + const double factk = 0.5 * (factp - factm); + z_matrix_device[ ibfoff ] = K_device[ ibfoff ] * factk * basis_eval_device[ ibfoff ]; + } + } +} @@ -83,7 +142,7 @@ void zmat_lda_vxc( size_t ntasks, -__global__ void zmat_gga_vxc_kernel( size_t ntasks, +__global__ void zmat_gga_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -94,9 +153,9 @@ __global__ void zmat_gga_vxc_kernel( size_t ntasks, const auto nbf = task.bfn_screening.nbe; const auto* vrho_device = task.vrho; const auto* vgamma_device = task.vgamma; - const auto* den_x_eval_device = task.ddenx; - const auto* den_y_eval_device = task.ddeny; - const auto* den_z_eval_device = task.ddenz; + const auto* den_x_eval_device = task.dden_sx; + const auto* den_y_eval_device = task.dden_sy; + const auto* den_z_eval_device = task.dden_sz; const auto* basis_eval_device = task.bf; const auto* dbasis_x_eval_device = task.dbfx; @@ -124,11 +183,328 @@ __global__ void zmat_gga_vxc_kernel( size_t ntasks, } } + + + + + + + +template +__global__ void zmat_gga_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + const double* vgamma_pp_device = task.vgamma_pp; + const double* vgamma_pm_device = task.vgamma_pm; + const double* vgamma_mm_device = task.vgamma_mm; + + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double factp = 0.25 * vrho_pos_device[tid_x]; + const double factm = 0.25 * vrho_neg_device[tid_x]; + + const auto gga_fact_pp = vgamma_pp_device[tid_x]; + const auto gga_fact_pm = vgamma_pm_device[tid_x]; + const auto gga_fact_mm = vgamma_mm_device[tid_x]; + + const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm); + const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm); + const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm); + + double sign = 1.0; + + double x_fact, y_fact, z_fact; + + if constexpr ( den_selector == DEN_S ) { + x_fact = gga_fact_1 * den_pos_x_eval_device[ tid_x ] + gga_fact_2 * den_neg_x_eval_device[ tid_x ]; + y_fact = gga_fact_1 * den_pos_y_eval_device[ tid_x ] + gga_fact_2 * den_neg_y_eval_device[ tid_x ]; + z_fact = gga_fact_1 * den_pos_z_eval_device[ tid_x ] + gga_fact_2 * den_neg_z_eval_device[ tid_x ]; + + + } + if constexpr ( den_selector == DEN_Z ) { + sign = -1.0; + x_fact = gga_fact_3 * den_neg_x_eval_device[ tid_x ] + gga_fact_2 * den_pos_x_eval_device[ tid_x ]; + y_fact = gga_fact_3 * den_neg_y_eval_device[ tid_x ] + gga_fact_2 * den_pos_y_eval_device[ tid_x ]; + z_fact = gga_fact_3 * den_neg_z_eval_device[ tid_x ] + gga_fact_2 * den_pos_z_eval_device[ tid_x ]; + + } + + z_matrix_device[ ibfoff ] = x_fact * dbasis_x_eval_device[ ibfoff ] + + y_fact * dbasis_y_eval_device[ ibfoff ] + + z_fact * dbasis_z_eval_device[ ibfoff ] + + (factp + sign * factm) * basis_eval_device[ ibfoff ]; + } +} + + + + +template +__global__ void zmat_gga_vxc_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + const double* vgamma_pp_device = task.vgamma_pp; + const double* vgamma_pm_device = task.vgamma_pm; + const double* vgamma_mm_device = task.vgamma_mm; + + + // for non-DEN_S + double* K_device; + double* H_device; + if constexpr ( den_selector == DEN_Z ) { K_device = task.K_z; H_device = task.H_z; } + if constexpr ( den_selector == DEN_Y ) { K_device = task.K_y; H_device = task.H_y; } + if constexpr ( den_selector == DEN_X ) { K_device = task.K_x; H_device = task.H_x; } + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + const auto* dden_zx_eval_device = task.dden_zx; + const auto* dden_zy_eval_device = task.dden_zy; + const auto* dden_zz_eval_device = task.dden_zz; + const auto* dden_yx_eval_device = task.dden_yx; + const auto* dden_yy_eval_device = task.dden_yy; + const auto* dden_yz_eval_device = task.dden_yz; + const auto* dden_xx_eval_device = task.dden_xx; + const auto* dden_xy_eval_device = task.dden_xy; + const auto* dden_xz_eval_device = task.dden_xz; + + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double fact_p = 0.5*vrho_pos_device[tid_x]; + const double fact_m = 0.5*vrho_neg_device[tid_x]; + + const auto gga_fact_pp = vgamma_pp_device[tid_x]; + const auto gga_fact_pm = vgamma_pm_device[tid_x]; + const auto gga_fact_mm = vgamma_mm_device[tid_x]; + + const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm); + const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm); + const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm); + + double s_fact, x_fact, y_fact, z_fact; + + if constexpr ( den_selector == DEN_S ) { + const double* Hz_device = task.H_z; + const double* Hy_device = task.H_y; + const double* Hx_device = task.H_x; + + s_fact = 0.5 * (fact_p + fact_m); + + x_fact = gga_fact_1 * dden_sx_eval_device[ tid_x ] + + gga_fact_2 * (Hz_device[ tid_x ] * dden_zx_eval_device[ tid_x ] + + Hy_device[ tid_x ] * dden_yx_eval_device[ tid_x ] + + Hx_device[ tid_x ] * dden_xx_eval_device[ tid_x ] ); + y_fact = gga_fact_1 * dden_sy_eval_device[ tid_x ] + + gga_fact_2 * (Hz_device[ tid_x ] * dden_zy_eval_device[ tid_x ] + + Hy_device[ tid_x ] * dden_yy_eval_device[ tid_x ] + + Hx_device[ tid_x ] * dden_xy_eval_device[ tid_x ] ); + z_fact = gga_fact_1 * dden_sz_eval_device[ tid_x ] + + gga_fact_2 * (Hz_device[ tid_x ] * dden_zz_eval_device[ tid_x ] + + Hy_device[ tid_x ] * dden_yz_eval_device[ tid_x ] + + Hx_device[ tid_x ] * dden_xz_eval_device[ tid_x ] ); + } + + if constexpr ( den_selector == DEN_Z ) { + s_fact = K_device[ tid_x ] * 0.5 * (fact_p - fact_m); + x_fact = gga_fact_3 * dden_zx_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ]; + y_fact = gga_fact_3 * dden_zy_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ]; + z_fact = gga_fact_3 * dden_zz_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ]; + } + + if constexpr ( den_selector == DEN_Y ) { + s_fact = K_device[ tid_x ] * 0.5 * (fact_p - fact_m); + x_fact = gga_fact_3 * dden_yx_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ]; + y_fact = gga_fact_3 * dden_yy_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ]; + z_fact = gga_fact_3 * dden_yz_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ]; + } + + if constexpr ( den_selector == DEN_X ) { + s_fact = K_device[ tid_x ] * 0.5 * (fact_p - fact_m); + x_fact = gga_fact_3 * dden_xx_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ]; + y_fact = gga_fact_3 * dden_xy_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ]; + z_fact = gga_fact_3 * dden_xz_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ]; + } + + z_matrix_device[ ibfoff ] = x_fact * dbasis_x_eval_device[ ibfoff ] + + y_fact * dbasis_y_eval_device[ ibfoff ] + + z_fact * dbasis_z_eval_device[ ibfoff ] + + s_fact * basis_eval_device[ ibfoff ]; + + } +} + +template +__global__ void zmat_mgga_vxc_rks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const auto* vrho_device = task.vrho; + const auto* vgamma_device = task.vgamma; + const double* vlapl_device = need_lapl ? task.vlapl : nullptr; + const auto* den_x_eval_device = task.dden_sx; + const auto* den_y_eval_device = task.dden_sy; + const auto* den_z_eval_device = task.dden_sz; + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + const double* d2basis_lapl_eval_device = + need_lapl ? task.d2bflapl : nullptr; + + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + const double fact_1 = 0.5 * vrho_device[tid_x] ; + const double fact_2 = 2.0 * vgamma_device[tid_x]; + + const double dx = den_x_eval_device[ tid_x ] * dbasis_x_eval_device[ ibfoff ]; + const double dy = den_y_eval_device[ tid_x ] * dbasis_y_eval_device[ ibfoff ]; + const double dz = den_z_eval_device[ tid_x ] * dbasis_z_eval_device[ ibfoff ]; + + double val = + fact_1 * basis_eval_device[ ibfoff ] + fact_2 * ( dx + dy + dz ); + + if constexpr (need_lapl) { + val += vlapl_device[tid_x] * d2basis_lapl_eval_device[ibfoff]; + } + + z_matrix_device[ ibfoff ] = val; + } +} + + + +#define ZMAT_VXC_KERN(xc_approx) \ + hipStream_t stream = queue.queue_as(); \ + dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1); \ + dim3 blocks( util::div_ceil( max_npts, threads.x ), \ + util::div_ceil( max_nbf, threads.y ), \ + ntasks ); \ + switch( scheme ) { \ + case RKS: \ + zmat_##xc_approx##_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + break; \ + case UKS: \ + if ( sel == DEN_S ) zmat_##xc_approx##_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Z ) zmat_##xc_approx##_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid density" ); \ + break; \ + case GKS: \ + if ( sel == DEN_S ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Z ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Y ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_X ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid density" ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid KS scheme" ); \ + } + + + +void zmat_lda_vxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + integrator_ks_scheme scheme, + density_id sel, + device_queue queue ) { +ZMAT_VXC_KERN(lda) +} + + + void zmat_gga_vxc( size_t ntasks, int32_t max_nbf, int32_t max_npts, XCDeviceTask* tasks_device, + integrator_ks_scheme scheme, + density_id sel, device_queue queue ) { +ZMAT_VXC_KERN(gga) +} + + + +void zmat_mgga_vxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + device_queue queue ) { hipStream_t stream = queue.queue_as() ; @@ -138,13 +514,135 @@ void zmat_gga_vxc( size_t ntasks, util::div_ceil( max_nbf, threads.y ), ntasks ); - hipLaunchKernelGGL(zmat_gga_vxc_kernel, dim3(blocks), dim3(threads), 0, stream , ntasks, tasks_device ); + if(do_lapl) + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); } - + + + + + + + + + + + + +template +__global__ void mmat_mgga_vxc_rks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const auto* vtau_device = task.vtau; + const double* vlapl_device = need_lapl ? task.vlapl : nullptr; + + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* mmat_x = task.xmat_x; + auto* mmat_y = task.xmat_y; + auto* mmat_z = task.xmat_z; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + const double fact_1 = 0.25 * vtau_device[tid_x] + + (need_lapl ? vlapl_device[tid_x] : 0.0); + + mmat_x[ ibfoff ] = fact_1 * dbasis_x_eval_device[ ibfoff ]; + mmat_y[ ibfoff ] = fact_1 * dbasis_y_eval_device[ ibfoff ]; + mmat_z[ ibfoff ] = fact_1 * dbasis_z_eval_device[ ibfoff ]; + } +} + +//__global__ void print_zmat_stats( size_t ntasks, +// XCDeviceTask* tasks_device) { +// +// for(size_t iT = 0; iT < ntasks; ++iT) { +// auto& task = tasks_device[iT]; +// const auto npts = task.npts; +// const auto nbf = task.bfn_screening.nbe; +// +// const auto* zmat = task.zmat; +// const auto* bmat = task.bf; +// const auto* blmat = task.d2bflapl; +// +// double znrm = 0.0, bnrm = 0.0, blnrm = 0.0; +// for(auto j = 0; j < npts*nbf; ++j) { +// znrm += zmat[j] * zmat[j]; +// bnrm += bmat[j] * bmat[j]; +// blnrm += blmat[j] * blmat[j]; +// } +// +// const auto* eps = task.eps; +// const auto* vgamma = task.vgamma; +// const auto* vtau = task.vtau; +// const auto* vlapl = task.vlapl; +// const auto* vrho = task.vrho; +// const auto* gamma = task.gamma; +// const auto* tau = task.tau; +// const auto* lapl = task.denlapl; +// const auto* rho = task.den; +// double enrm = 0.0, gnrm = 0.0, tnrm = 0.0, rnrm = 0.0, lnrm = 0.0; +// double vgnrm = 0.0, vtnrm = 0.0, vrnrm = 0.0, vlnrm = 0.0; +// for(auto j = 0; j < npts; ++j) { +// enrm += eps[j] * eps[j]; +// vrnrm += vrho[j] * vrho[j]; +// vgnrm += vgamma[j] * vgamma[j]; +// vtnrm += vtau[j] * vtau[j]; +// vlnrm += vlapl[j] * vlapl[j]; +// +// rnrm += rho[j] * rho[j]; +// gnrm += gamma[j] * gamma[j]; +// tnrm += tau[j] * tau[j]; +// lnrm += lapl[j] * lapl[j]; +// } +// +// printf("ITASK = %lu B = %.6e BL = %.6e R = %.6e G = %.6e T = %.6e L = %.6e E = %.6e VR = %.6e VG = %6e VT = %.6e VL = %.6e Z = %.6e \n", +// iT, bnrm, blnrm, rnrm, gnrm, tnrm, lnrm, enrm, vrnrm, vgnrm, vtnrm, vlnrm, znrm); +// } +// +//} + +void mmat_mgga_vxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + device_queue queue ) { + + hipStream_t stream = queue.queue_as() ; + + + dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1); + dim3 blocks( util::div_ceil( max_npts, threads.x ), + util::div_ceil( max_nbf, threads.y ), + ntasks ); + + if(do_lapl) + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + + //print_zmat_stats<<<1,1,0,stream>>>(ntasks,tasks_device); +} + } diff --git a/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx b/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx index da8544ce..96ccb3b0 100644 --- a/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx +++ b/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx @@ -27,4 +27,14 @@ void eval_kern_exc_vxc_gga( const functional_type& func, size_t npts, } +void eval_kern_exc_vxc_mgga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, const double* tau, const double* lapl, + double* eps, double* vrho, double* vgamma, double* vtau, double* vlapl, + device_queue queue ) { + + hipStream_t stream = queue.queue_as(); + func.eval_exc_vxc_device( npts, rho, gamma, lapl, tau, eps, vrho, vgamma, vlapl, vtau, stream ); + +} + } diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx index 5ffa8443..d714927c 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx @@ -405,9 +405,9 @@ void AoSScheme1Base::eval_collocation_hessian( XCDeviceData* _data ) { eval_collocation_shell_to_task_hessian( max_l, data->l_batched_shell_to_task.data(), aos_stack.device_tasks, data->device_backend_->queue() ); -#endif data->device_backend_->check_error("collocation hess" __FILE__ ": " + std::to_string(__LINE__)); +#endif } void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) { @@ -425,9 +425,9 @@ void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) { eval_collocation_shell_to_task_laplacian( max_l, data->l_batched_shell_to_task.data(), aos_stack.device_tasks, data->device_backend_->queue() ); -#endif data->device_backend_->check_error("collocation lapl" __FILE__ ": " + std::to_string(__LINE__)); +#endif } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt index b103b4d4..c37f0ebe 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt @@ -29,6 +29,7 @@ set( GAUXC_OBARA_SAIKA_HOST_SRC src/obara_saika_integrals.cxx src/chebyshev_boys_computation.cxx ) + target_sources( gauxc PRIVATE ${GAUXC_OBARA_SAIKA_HOST_SRC} ) target_include_directories( gauxc PUBLIC $