wavefunction91 · wavefunction91 · Sep 8, 2025 · Dec 27, 2024 · Jan 10, 2025 · Jan 29, 2025
diff --git a/include/gauxc/enums.hpp b/include/gauxc/enums.hpp
@@ -39,6 +39,7 @@ enum class AtomicGridSizeDefault {
  *  molecular integration
  */
 enum class XCWeightAlg {
+  NOTPARTITIONED, ///< Not partitioned
   Becke, ///< The original Becke weighting scheme
   SSF,   ///< The Stratmann-Scuseria-Frisch weighting scheme
   LKO    ///< The Lauqua-Kuessman-Ochsenfeld weighting scheme

diff --git a/include/gauxc/load_balancer.hpp b/include/gauxc/load_balancer.hpp
@@ -15,6 +15,7 @@
 #include <gauxc/xc_task.hpp>
 #include <gauxc/util/timer.hpp>
 #include <gauxc/runtime_environment.hpp>
+#include <gauxc/enums.hpp>
 
 namespace GauXC {
 
@@ -27,6 +28,8 @@ namespace detail {
 struct LoadBalancerState {
   bool modified_weights_are_stored = false; 
     ///< Whether the load balancer currently stores partitioned weights
+  XCWeightAlg weight_alg = XCWeightAlg::NOTPARTITIONED; 
+    ///< Weight partitioning scheme used by this LoadBalancer
 };
 
 

diff --git a/include/gauxc/xc_integrator.hpp b/include/gauxc/xc_integrator.hpp
@@ -70,8 +70,8 @@ class XCIntegrator {
   exc_vxc_type_gks  eval_exc_vxc ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&,
                                    const IntegratorSettingsXC& = IntegratorSettingsXC{});
 
-  exc_grad_type eval_exc_grad( const MatrixType& );
-  exc_grad_type eval_exc_grad( const MatrixType&, const MatrixType& );
+  exc_grad_type eval_exc_grad( const MatrixType&, const IntegratorSettingsXC& = IntegratorSettingsXC{} );
+  exc_grad_type eval_exc_grad( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& = IntegratorSettingsXC{} );
 
   exx_type      eval_exx     ( const MatrixType&, 
                                const IntegratorSettingsEXX& = IntegratorSettingsEXX{} );

diff --git a/include/gauxc/xc_integrator/impl.hpp b/include/gauxc/xc_integrator/impl.hpp
@@ -76,16 +76,16 @@ typename XCIntegrator<MatrixType>::exc_vxc_type_gks
 
 template <typename MatrixType>
 typename XCIntegrator<MatrixType>::exc_grad_type
-  XCIntegrator<MatrixType>::eval_exc_grad( const MatrixType& P ) {
+  XCIntegrator<MatrixType>::eval_exc_grad( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) {
   if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED();
-  return pimpl_->eval_exc_grad(P);
+  return pimpl_->eval_exc_grad(P, ks_settings);
 };
 
 template <typename MatrixType>
 typename XCIntegrator<MatrixType>::exc_grad_type
-  XCIntegrator<MatrixType>::eval_exc_grad( const MatrixType& Ps, const MatrixType& Pz ) {
+  XCIntegrator<MatrixType>::eval_exc_grad( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) {
   if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED();
-  return pimpl_->eval_exc_grad(Ps, Pz);
+  return pimpl_->eval_exc_grad(Ps, Pz, ks_settings);
 };
 
 template <typename MatrixType>

diff --git a/include/gauxc/xc_integrator/replicated/impl.hpp b/include/gauxc/xc_integrator/replicated/impl.hpp
@@ -159,27 +159,27 @@ typename ReplicatedXCIntegrator<MatrixType>::exc_vxc_type_gks
 
 template <typename MatrixType>
 typename ReplicatedXCIntegrator<MatrixType>::exc_grad_type 
-  ReplicatedXCIntegrator<MatrixType>::eval_exc_grad_( const MatrixType& P ) {
+  ReplicatedXCIntegrator<MatrixType>::eval_exc_grad_( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) {
 
   if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED();
 
   std::vector<value_type> EXC_GRAD( 3*pimpl_->load_balancer().molecule().natoms() );
   pimpl_->eval_exc_grad( P.rows(), P.cols(), P.data(), P.rows(),
-                         EXC_GRAD.data() );
+                         EXC_GRAD.data(), ks_settings );
 
   return EXC_GRAD;
 
 }
 
 template <typename MatrixType>
 typename ReplicatedXCIntegrator<MatrixType>::exc_grad_type 
-  ReplicatedXCIntegrator<MatrixType>::eval_exc_grad_( const MatrixType& Ps, const MatrixType& Pz ) {
+  ReplicatedXCIntegrator<MatrixType>::eval_exc_grad_( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) {
 
   if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED();
 
   std::vector<value_type> EXC_GRAD( 3*pimpl_->load_balancer().molecule().natoms() );
   pimpl_->eval_exc_grad( Ps.rows(), Ps.cols(), Ps.data(), Ps.rows(), Pz.data(), Pz.rows(),
-                         EXC_GRAD.data() );
+                         EXC_GRAD.data(), ks_settings );
 
   return EXC_GRAD;
 

diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp
@@ -75,9 +75,9 @@ class ReplicatedXCIntegratorImpl {
                               value_type* EXC, const IntegratorSettingsXC& ks_settings ) = 0;
 
   virtual void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, 
-                               value_type* EXC_GRAD ) = 0;
+                               value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) = 0;
   virtual void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldps, 
-                               const value_type* Pz, int64_t lpdz, value_type* EXC_GRAD ) = 0;
+                               const value_type* Pz, int64_t lpdz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) = 0;
   virtual void eval_exx_( int64_t m, int64_t n, const value_type* P,
                           int64_t ldp, value_type* K, int64_t ldk,
                           const IntegratorSettingsEXX& settings ) = 0;
@@ -150,9 +150,9 @@ class ReplicatedXCIntegratorImpl {
 
 
   void eval_exc_grad( int64_t m, int64_t n, const value_type* P, int64_t ldp, 
-                      value_type* EXC_GRAD );
+                      value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings );
   void eval_exc_grad( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, 
-                      const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD );
+                      const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings );
 
   void eval_exx( int64_t m, int64_t n, const value_type* P,
                  int64_t ldp, value_type* K, int64_t ldk,

diff --git a/include/gauxc/xc_integrator/replicated_xc_integrator.hpp b/include/gauxc/xc_integrator/replicated_xc_integrator.hpp
@@ -50,8 +50,8 @@ class ReplicatedXCIntegrator : public XCIntegratorImpl<MatrixType> {
   exc_vxc_type_rks  eval_exc_vxc_ ( const MatrixType&, const IntegratorSettingsXC& ) override;
   exc_vxc_type_uks  eval_exc_vxc_ ( const MatrixType&, const MatrixType&, const IntegratorSettingsXC&) override;
   exc_vxc_type_gks  eval_exc_vxc_ ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, const IntegratorSettingsXC& ) override;
-  exc_grad_type eval_exc_grad_( const MatrixType& ) override;
-  exc_grad_type eval_exc_grad_( const MatrixType&, const MatrixType& ) override;
+  exc_grad_type eval_exc_grad_( const MatrixType&, const IntegratorSettingsXC& ) override;
+  exc_grad_type eval_exc_grad_( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& ) override;
   exx_type      eval_exx_     ( const MatrixType&, const IntegratorSettingsEXX& ) override;
   fxc_contraction_type_rks  eval_fxc_contraction_ ( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& ) override;
   fxc_contraction_type_uks  eval_fxc_contraction_ ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, const IntegratorSettingsXC&) override;

diff --git a/include/gauxc/xc_integrator/xc_integrator_impl.hpp b/include/gauxc/xc_integrator/xc_integrator_impl.hpp
@@ -42,8 +42,8 @@ class XCIntegratorImpl {
   virtual exc_vxc_type_uks  eval_exc_vxc_ ( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) = 0;
   virtual exc_vxc_type_gks  eval_exc_vxc_ ( const MatrixType& Ps, const MatrixType& Pz, const MatrixType& Py, const MatrixType& Px, 
                                             const IntegratorSettingsXC& ks_settings ) = 0;
-  virtual exc_grad_type eval_exc_grad_( const MatrixType& P ) = 0;
-  virtual exc_grad_type eval_exc_grad_( const MatrixType& Ps, const MatrixType& Pz ) = 0;
+  virtual exc_grad_type eval_exc_grad_( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) = 0;
+  virtual exc_grad_type eval_exc_grad_( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) = 0;
   virtual exx_type      eval_exx_     ( const MatrixType&     P, 
                                         const IntegratorSettingsEXX& settings ) = 0;
   virtual fxc_contraction_type_rks  eval_fxc_contraction_ ( const MatrixType& P,
@@ -125,17 +125,17 @@ class XCIntegratorImpl {
    *  @param[in] P The alpha density matrix
    *  @returns EXC gradient
    */
-  exc_grad_type eval_exc_grad( const MatrixType& P ) {
-    return eval_exc_grad_(P);
+  exc_grad_type eval_exc_grad( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) {
+    return eval_exc_grad_(P, ks_settings);
   }
 
   /** Integrate EXC gradient for UKS
    *
    *  @param[in] P The alpha density matrix
    *  @returns EXC gradient
    */
-  exc_grad_type eval_exc_grad( const MatrixType& Ps, const MatrixType& Pz ) {
-    return eval_exc_grad_(Ps, Pz);
+  exc_grad_type eval_exc_grad( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) {
+    return eval_exc_grad_(Ps, Pz, ks_settings);
   }
 
   /** Integrate Exact Exchange for RHF

diff --git a/include/gauxc/xc_integrator_settings.hpp b/include/gauxc/xc_integrator_settings.hpp
@@ -21,4 +21,8 @@ struct IntegratorSettingsKS : public IntegratorSettingsXC {
   double gks_dtol = 1e-12;
 };
 
+struct IntegratorSettingsEXC_GRAD : public IntegratorSettingsKS {
+  bool include_weight_derivatives= true; // whether to include grid weight contribution and employ translational invariance, or just use Hellmann-Feynman gradient
+};
+
 }
diff --git a/src/molecular_weights/device/device_molecular_weights.cxx b/src/molecular_weights/device/device_molecular_weights.cxx
@@ -37,7 +37,7 @@ void DeviceMolecularWeights::modify_weights( LoadBalancer& lb ) const {
   auto task_comparator = []( const XCTask& a, const XCTask& b ) {
     return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe);
   };
-  std::sort(task_begin, task_end, task_comparator );
+  std::stable_sort(task_begin, task_end, task_comparator );
 
   const auto& mol  = lb.molecule();
   const auto natoms = mol.natoms();
@@ -79,6 +79,7 @@ void DeviceMolecularWeights::modify_weights( LoadBalancer& lb ) const {
   rt.device_backend()->master_queue_synchronize();
 
   lb.state().modified_weights_are_stored = true;
+  lb.state().weight_alg = this->settings_.weight_alg;
 
 }
 

diff --git a/src/molecular_weights/host/host_molecular_weights.cxx b/src/molecular_weights/host/host_molecular_weights.cxx
@@ -25,7 +25,7 @@ void HostMolecularWeights::modify_weights( LoadBalancer& lb ) const {
   auto task_comparator = []( const XCTask& a, const XCTask& b ) {
     return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe);
   };
-  std::sort( tasks.begin(), tasks.end(), task_comparator );
+  std::stable_sort( tasks.begin(), tasks.end(), task_comparator );
 
   // Modify the weights
   const auto& mol  = lb.molecule();
@@ -34,6 +34,7 @@ void HostMolecularWeights::modify_weights( LoadBalancer& lb ) const {
     tasks.begin(), tasks.end() );
 
   lb.state().modified_weights_are_stored = true;
+  lb.state().weight_alg = this->settings_.weight_alg;
 }
 
 }
diff --git a/src/xc_integrator/local_work_driver/common/integrator_constants.hpp b/src/xc_integrator/local_work_driver/common/integrator_constants.hpp
@@ -13,7 +13,7 @@ namespace integrator {
 template <typename F = double>
 constexpr F magic_ssf_factor = 0.64;
 
-constexpr double ssf_weight_tol = 1e-10;
+constexpr double ssf_weight_tol = 1e-13;
 
 }
 }
diff --git a/src/xc_integrator/local_work_driver/device/common/device_blas.hpp b/src/xc_integrator/local_work_driver/device/common/device_blas.hpp
@@ -20,6 +20,10 @@ enum class DeviceBlasUplo : unsigned char {
   Lower
 };
 
+template <typename T>
+void increment( device_blas_handle generic_handle, const T* X, T* Y, int N );
+
+
 template <typename T>
 void dot( device_blas_handle handle,
           int            N,

diff --git a/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp b/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp
@@ -15,11 +15,11 @@
 namespace GauXC {
 
 void increment_exc_grad_lda( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task,
-  XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue );
+  XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue );
 void increment_exc_grad_gga( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task,
-  XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue );
+  XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue );
 void increment_exc_grad_mgga( integrator_ks_scheme ks_scheme, size_t nshell, bool need_lapl, ShellToTaskDevice* shell_to_task,
-  XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue );
+  XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue );
 
 }
 
diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx
@@ -8,6 +8,7 @@
 #include "cuda_aos_scheme1.hpp"
 #include "device/cuda/cuda_backend.hpp"
 #include "cuda_aos_scheme1_weights.hpp"
+#include "device/common/device_blas.hpp"
 
 namespace GauXC {
 
@@ -37,6 +38,41 @@ void CudaAoSScheme1<Base>::partition_weights( XCDeviceData* _data ) {
     scheme1_stack.dist_nearest_device, base_stack.weights_device, *device_backend->master_stream );
 }
 
+template <typename Base> 
+void CudaAoSScheme1<Base>::eval_weight_1st_deriv_contracted( XCDeviceData* _data, XCWeightAlg alg ) {
+  if( alg != XCWeightAlg::SSF ) {
+    GAUXC_GENERIC_EXCEPTION("Weight Algorithm NYI for CUDA AoS Scheme1");
+  }
+  auto* data = dynamic_cast<Data*>(_data);
+  if( !data ) GAUXC_BAD_LWD_DATA_CAST();
+
+  auto device_backend = dynamic_cast<CUDABackend*>(data->device_backend_);
+  if( !device_backend ) GAUXC_BAD_BACKEND_CAST();
+
+  // make w times f vector
+  const bool is_UKS = data->allocated_terms.ks_scheme == UKS;
+  const bool is_GKS = data->allocated_terms.ks_scheme == GKS;
+  const bool is_pol  = is_UKS or is_GKS;
+  auto base_stack    = data->base_stack;
+  if( is_pol )
+    increment( data->device_backend_->master_blas_handle(), base_stack.den_z_eval_device, 
+    base_stack.den_s_eval_device, data->total_npts_task_batch ); 
+
+  hadamard_product(data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, base_stack.den_s_eval_device, 1, 
+    base_stack.eps_eval_device, 1); 
+
+
+  // Compute distances from grid to atomic centers
+  const auto ldatoms = data->get_ldatoms();
+  auto static_stack  = data->static_stack;
+  auto scheme1_stack = data->scheme1_stack;
+  cuda_aos_scheme1_weight_1st_deriv_wrapper( data->total_npts_task_batch, data->global_dims.natoms,
+    base_stack.points_x_device, base_stack.points_y_device, base_stack.points_z_device,
+    static_stack.rab_device, ldatoms, static_stack.coords_device, 
+    scheme1_stack.dist_scratch_device, ldatoms, scheme1_stack.iparent_device, 
+    scheme1_stack.dist_nearest_device, base_stack.eps_eval_device, static_stack.exc_grad_device, *device_backend->master_stream );
+}
+
 
 template struct CudaAoSScheme1<AoSScheme1Base>;
 #ifdef GAUXC_HAS_MAGMA

diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp
@@ -33,6 +33,7 @@ struct CudaAoSScheme1 : public Base {
 
   // API Overrides
   void partition_weights( XCDeviceData* ) override final;
+  void eval_weight_1st_deriv_contracted( XCDeviceData*, XCWeightAlg ) override final;
 
   std::unique_ptr<XCDeviceData> create_device_data(const DeviceRuntimeEnvironment&) override final;
 

diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu
@@ -53,7 +53,28 @@ void cuda_aos_scheme1_weights_wrapper( int32_t npts, int32_t natoms,
     iparent, dist_nearest, weights, stream );
 #endif
 
+}
+
+
+void cuda_aos_scheme1_weight_1st_deriv_wrapper(
+  int32_t npts, int32_t natoms,
+  const double* points_x, const double* points_y, const double* points_z,
+  const double* RAB, int32_t ldRAB, const double* coords, 
+  double* dist, int32_t lddist, const int32_t* iparent,
+  const double* dist_nearest, const double* w_times_f,
+  double* exc_grad_w, cudaStream_t stream ){
+
+  // Compute distances from grid to atomic centers
+  compute_grid_to_center_dist( npts, natoms, coords, points_x, points_y, points_z, 
+   dist, lddist, stream );
+
+  eval_weight_1st_deriv_contracted_ssf_1d( npts, natoms, RAB, ldRAB, coords, points_x, points_y, points_z, dist, lddist,
+    iparent, dist_nearest, w_times_f, exc_grad_w, stream );
 
 }
 
+
+
+
+
 }
diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp
@@ -13,4 +13,12 @@ void cuda_aos_scheme1_weights_wrapper( int32_t npts, int32_t natoms,
   double* dist, int32_t lddist, const int32_t* iparent,
   const double* dist_nearest, double* weights, cudaStream_t stream );
 
+void cuda_aos_scheme1_weight_1st_deriv_wrapper(
+  int32_t npts, int32_t natoms,
+  const double* points_x, const double* points_y, const double* points_z,
+  const double* RAB, int32_t ldRAB, const double* coords, 
+  double* dist, int32_t lddist, const int32_t* iparent,
+  const double* dist_nearest, const double* w_times_f,
+  double* exc_grad_w, cudaStream_t stream );
+
 }
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu
@@ -45,6 +45,24 @@ void increment( const T* X, T* Y, cudaStream_t stream ) {
   increment_kernel<<<1,1,0,stream>>>(X,Y);
 }
 
+template <typename T>
+__global__ void increment_vec_kernel( const T* X, T* Y, int N ) {
+  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if( tid < N ) Y[tid] += X[tid];
+}
+
+template <typename T>
+void increment( device_blas_handle generic_handle, const T* X, T* Y, int N) {
+  const int threads = cuda::warp_size * cuda::max_warps_per_thread_block;
+  const int blocks = util::div_ceil( N, threads );
+  cublasHandle_t handle = generic_handle.blas_handle_as<util::cublas_handle>();
+  auto stream = util::get_stream(handle);
+  increment_vec_kernel<<<blocks, threads, 0, stream>>>(X,Y,N);
+}
+
+template
+  void increment( device_blas_handle generic_handle, const double* X, double* Y, int N );
+
 template <>
 void dot( device_blas_handle generic_handle,
           int            N,