It WORKS!

plumed · Feb 26, 2025 · b8a3855 · b8a3855
1 parent 04e034b
commit b8a3855
Show file tree

Hide file tree

Showing 2 changed files with 100 additions and 41 deletions.
diff --git a/src/colvar/MultiColvarTemplate.h b/src/colvar/MultiColvarTemplate.h
@@ -71,6 +71,7 @@ class MultiColvarTemplate : public ActionWithVector {
   void calculate() override;
   void applyNonZeroRankForces( std::vector<double>& outforces ) override ;
   static void performTask( unsigned task_index, const MultiColvarInput& actiondata, ParallelActionsInput& input, ParallelActionsOutput& output );
+  template <bool doVirial=true>
   static void gatherForces( unsigned task_index, MultiColvarInput actiondata, ParallelActionsInput input, ForceInput fdata, ForceOutput forces);
   static void gatherThreads( ForceOutput forces );
 };
@@ -227,12 +228,13 @@ void MultiColvarTemplate<T>::performTask( unsigned task_index,
     input.inputdata[pos_start+1]=fpos[1];
     input.inputdata[pos_start+2]=fpos[2];
   }
-
-  std::size_t mass_start = pos_start + 3*input.nindices_per_task;
-  std::size_t charge_start = mass_start + input.nindices_per_task;
+  printf("-> %u :  %p\n",task_index,output.derivatives.data() );
+  const size_t mass_start = pos_start + 3*input.nindices_per_task;
+  const size_t charge_start = mass_start + input.nindices_per_task;
+  const size_t local_ndev = 3*input.nindices_per_task+9;
 
   ColvarOutput cvout = ColvarOutput( output.values,
-                                     3*input.nindices_per_task+9,
+                                     local_ndev,
                                      output.derivatives.data() );
   T::calculateCV( ColvarInput(actiondata.mode,
                               input.nindices_per_task,
@@ -244,6 +246,7 @@ void MultiColvarTemplate<T>::performTask( unsigned task_index,
 }
 
 template <class T>
+template <bool doVirial>
 void MultiColvarTemplate<T>::gatherForces( unsigned task_index,
     const MultiColvarInput /*actiondata*/,
     const ParallelActionsInput input,
@@ -264,15 +267,17 @@ void MultiColvarTemplate<T>::gatherForces( unsigned task_index,
     //   forces.thread_safe[n] += ff*fdata.deriv[i][m];
     //   ++m;
     // }
-    forces.thread_safe[0] += ff*fdata.deriv[i][m];
-    forces.thread_safe[1] += ff*fdata.deriv[i][m+1];
-    forces.thread_safe[2] += ff*fdata.deriv[i][m+2];
-    forces.thread_safe[3] += ff*fdata.deriv[i][m+3];
-    forces.thread_safe[4] += ff*fdata.deriv[i][m+4];
-    forces.thread_safe[5] += ff*fdata.deriv[i][m+5];
-    forces.thread_safe[6] += ff*fdata.deriv[i][m+6];
-    forces.thread_safe[7] += ff*fdata.deriv[i][m+7];
-    forces.thread_safe[8] += ff*fdata.deriv[i][m+8];
+    if constexpr (doVirial) {
+      forces.thread_safe[0] += ff*fdata.deriv[i][m];
+      forces.thread_safe[1] += ff*fdata.deriv[i][m+1];
+      forces.thread_safe[2] += ff*fdata.deriv[i][m+2];
+      forces.thread_safe[3] += ff*fdata.deriv[i][m+3];
+      forces.thread_safe[4] += ff*fdata.deriv[i][m+4];
+      forces.thread_safe[5] += ff*fdata.deriv[i][m+5];
+      forces.thread_safe[6] += ff*fdata.deriv[i][m+6];
+      forces.thread_safe[7] += ff*fdata.deriv[i][m+7];
+      forces.thread_safe[8] += ff*fdata.deriv[i][m+8];
+    }
   }
 }
 

diff --git a/src/core/ParallelTaskManager.h b/src/core/ParallelTaskManager.h
@@ -33,6 +33,8 @@
 #include "tools/View.h"
 #include "tools/View2D.h"
 
+#include "tools/ColvarOutput.h"
+
 #include<iostream>
 #define vdbg(...) std::cerr << __LINE__ << ":" << #__VA_ARGS__ << " " << (__VA_ARGS__) << '\n'
 namespace PLMD {
@@ -100,7 +102,10 @@ class ForceInput {
 public:
   View<double> force;
   View2D<double> deriv;
-  ForceInput( std::size_t nc, double* f, std::size_t nd, double* d ) : ncomp(nc), force(f,nc), deriv(d,nc,nd) {}
+  ForceInput( std::size_t nc, double* f, std::size_t nd, double* d ) :
+    ncomp(nc),
+    force(f,nc),
+    deriv(d,nc,nd) {}
 };
 
 //There is no need to pass this as reference:
@@ -198,7 +203,8 @@ ParallelTaskManager<T, D>::ParallelTaskManager(ActionWithVector* av):
 }
 
 template <class T, class D>
-void ParallelTaskManager<T, D>::setNumberOfIndicesAndDerivativesPerTask( const std::size_t nind, const std::size_t nder ) {
+void ParallelTaskManager<T, D>::setNumberOfIndicesAndDerivativesPerTask( const std::size_t nind,
+    const std::size_t nder ) {
   plumed_massert( action->getNumberOfComponents()>0, "there should be some components wen you setup the index list" );
   std::size_t valuesize=(action->getConstPntrToComponent(0))->getNumberOfStoredValues();
   for(unsigned i=1; i<action->getNumberOfComponents(); ++i) {
@@ -346,16 +352,17 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
     auto omp_forces_data = omp_forces[0].data();
     const auto omp_forces_size = omp_forces[0].size();
     //on the cpu we only need the declaration, see the create statement below
-    const auto nderivs =  myinput.ncomponents*nderivatives_per_task;
-    const auto ndev_per_task = nderivatives_per_task;
-
+    const auto ndev_per_task = myinput.ncomponents*nderivatives_per_task;
+    const auto nderivPerComponent = nderivatives_per_task;
+    const auto nderivs =  ndev_per_task*nactive_tasks;
+
     //To future me/you:
     // I need to allocate this on the host to create a bigger temporay data array
     // on the device
     // by trying with double* x=nullptr, you will get a failure
     // another solution is acc_malloc and then device_ptr in the pragma
     // (but you have to remember the acc_free)
-    std::vector<double> derivative(1);
+    std::vector<double> derivative(nderivs,0.0);
     double * derivatives = derivative.data();
 
     ParallelActionsInput input = myinput;
@@ -364,42 +371,82 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
     auto actiondata_acc = fromToDataHelper(t_actiondata);
 
 
-#pragma acc parallel loop reduction(+:omp_forces_data[0:omp_forces_size]) \
+    vdbg(nactive_tasks);
+    vdbg(nderivatives_per_task);
+    vdbg(ndev_per_task);
+    vdbg(myinput.ncomponents);
+#pragma acc data  \
                             present(input,t_actiondata) \
                              copyin(nactive_tasks, \
                                     nderivs, \
                                     ndev_per_task, \
+                                    nderivPerComponent ,\
                                     partialTaskList_data[0:nactive_tasks], \
                                     value_stash_data[0:value_stash_size]) \
                                copy(omp_forces_data[0:omp_forces_size], \
                                     forcesForApply_data[0:forcesForApply_size]) \
-                             create(derivatives[0:nderivs]) \
+                             copyout(derivatives[0:nderivs]) \
                             default(none)
-    for(unsigned i=0; i<nactive_tasks; ++i) {
-      //This may be changed to a shared array
-      std::vector<double> valstmp( input.ncomponents );
-      std::size_t task_index = partialTaskList_data[i];
-      // std::vector<double> derivative( nderivs );
-      ParallelActionsOutput myout( input.ncomponents,
-                                   valstmp.data(),
-                                   nderivs,
-                                   derivatives);
-      // Calculate the stuff in the loop for this action
-      T::performTask( task_index, t_actiondata, input, myout );
+    {
+#pragma acc parallel loop reduction(+:omp_forces_data[0:omp_forces_size])
+      for(unsigned i=0; i<nactive_tasks; ++i) {
+        //This may be changed to a shared array
+        std::vector<double> valstmp( input.ncomponents );
+        std::size_t task_index = partialTaskList_data[i];
+        // std::vector<double> derivative( nderivs );
+        ParallelActionsOutput myout( input.ncomponents,
+                                     valstmp.data(),
+                                     nderivPerComponent,
+                                     derivatives+ndev_per_task*i
+                                   );
+
+        // Calculate the stuff in the loop for this action
+        T::performTask( task_index, t_actiondata, input, myout );
+        // Gather the forces from the values
+        T::gatherForces<true>( task_index, t_actiondata, input,
+                               ForceInput( input.ncomponents,
+                                           value_stash_data+input.ncomponents*task_index,
+                                           nderivPerComponent,
+                                           derivatives+ndev_per_task*i),
+                               ForceOutput { omp_forces_data,omp_forces_size, forcesForApply_data,forcesForApply_size }
+                             );
 
-      // Gather the forces from the values
-      T::gatherForces( task_index, t_actiondata, input,
-                       ForceInput( input.ncomponents,
-                                   value_stash_data+input.ncomponents*task_index,
-                                   ndev_per_task,
-                                   derivatives),
-                       ForceOutput { omp_forces_data,omp_forces_size, forcesForApply_data,forcesForApply_size }
-                     );
 
 
+      }
 
+// #pragma acc kernels
+//       for(unsigned n=0; n<9; ++n) {
+//         double t=0.0;
+//         unsigned m =  3*input.nindices_per_task+ n;
+// // #pragma acc parallel loop reduction(+:t)
+// //the reduction on t is implicit
+//         for(unsigned i=0; i<nactive_tasks; ++i) {
+//           std::size_t task_index = partialTaskList_data[i];
+//           View2D<double> deriv{derivatives,input.ncomponents,ndev_per_task};
+//           View<double> force{value_stash_data+input.ncomponents*task_index,input.ncomponents};
+//           // std::size_t base = 3*task_index*input.nindices_per_task;
+//           for(unsigned i=0; i<input.ncomponents; ++i) {
+
+//             t += force[i]*deriv[i][m];
+
+//           }
+//           omp_forces_data[n]= t;
+//         }
+
+//       }
     }
     T::gatherThreads({ omp_forces_data,omp_forces_size, forcesForApply_data,forcesForApply_size });
+    for (const auto x:omp_forces[0]) {
+      vdbg (x);
+    }
+    for (int i=0; i<nderivs; ++i) {
+      // if (i%ndev_per_task<(ndev_per_task-9) ) continue;
+      std::cerr <<"\t" <<i <<":"<< derivative [i] << "\n";
+    }
+
+    vdbg(sizeof(PLMD::colvar::ColvarOutput));
+    vdbg(2 * sizeof(void*));
 #else
     plumed_merror("cannot use USEGPU flag if PLUMED has not been compiled with openACC");
 #endif
@@ -413,7 +460,9 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
     unsigned nt=OpenMP::getNumThreads();
     if( nt*stride*10>nactive_tasks ) nt=nactive_tasks/stride/10;
     if( nt==0 ) nt=1;
-
+    vdbg(nactive_tasks);
+    vdbg(nderivatives_per_task);
+    vdbg(myinput.ncomponents);
     #pragma omp parallel num_threads(nt)
     {
       const unsigned t=OpenMP::getThreadNum();
@@ -439,7 +488,12 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
                                      derivatives.data()),
                          forces
                        );
+        for (int nd=0; nd<derivatives.size(); ++nd) {
+          // if (i%ndev_per_task<(ndev_per_task-9) ) continue;
+          std::cerr <<"\t" <<i <<"-"<<nd<<":"<< derivatives [nd] << "\n";
+        }
       }
+
       #pragma omp critical
       T::gatherThreads( forces );
     }