Skip to content

Commit

Permalink
If we ignore the virial, it looks like we are here
Browse files Browse the repository at this point in the history
  • Loading branch information
Iximiel committed Feb 21, 2025
1 parent ab63e35 commit 35b742a
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 26 deletions.
9 changes: 5 additions & 4 deletions src/colvar/MultiColvarTemplate.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ class MultiColvarTemplate : public ActionWithVector {
/// The number of atoms per task
unsigned natoms_per_task;
public:
static constexpr size_t virialSize = 9;
static void registerKeywords(Keywords&);
explicit MultiColvarTemplate(const ActionOptions&);
unsigned getNumberOfDerivatives() override ;
Expand Down Expand Up @@ -219,8 +220,8 @@ MultiColvarTemplate<T>::MultiColvarTemplate(const ActionOptions&ao):
// Sets up the index list in the task manager
taskmanager.setNumberOfIndicesAndDerivativesPerTask(
natoms_per_task,
3*natoms_per_task + 9 );
taskmanager.setNumberOfThreadedForces( 9 );
3*natoms_per_task + virialSize );
taskmanager.setNumberOfThreadedForces( virialSize );
taskmanager.setActionInput( MultiColvarInput{ usepbc, mode } );
}

Expand Down Expand Up @@ -345,7 +346,7 @@ void MultiColvarTemplate<T>::gatherForces( unsigned task_index,
forces.thread_unsafe[base + m] += ff*fdata.deriv[i][m]; ++m;
}
if constexpr(doVirial) {
for(unsigned n=0; n<9 /*forces.reducedSize*/; ++n) {
for(unsigned n=0; n<virialSize; ++n) {
forces.thread_safe[n] += ff*fdata.deriv[i][m];
++m;
}
Expand Down Expand Up @@ -379,7 +380,7 @@ void MultiColvarTemplate<T>::gatherThreads( const ForceOutput forces ) {

//copies the forces that has been reduced in the place where they will be used
unsigned k=0;
for(unsigned n=forces.other_size-forces.reducedSize; n<forces.other_size; ++n) {
for(unsigned n=forces.other_size-virialSize; n<forces.other_size; ++n) {
forces.thread_unsafe[n] += forces.thread_safe[k];
++k;
}
Expand Down
71 changes: 49 additions & 22 deletions src/core/ParallelTaskManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,10 @@ void ParallelTaskManager<T, D>::runAllTasks() {
std::vector<double> derivatives(devSize);
std::size_t task_number = partialTaskList_data[i];
std::size_t val_pos = task_number*input.ncomponents;
ParallelActionsOutput myout( input.ncomponents, value_stash_data+val_pos, devSize,derivatives.data() );
// myout.values[0]=task_number;
ParallelActionsOutput myout( input.ncomponents,
value_stash_data+val_pos,
devSize,
derivatives.data() );
// Calculate the stuff in the loop for this action
T::performTask( task_number, t_actiondata, input, myout );
}
Expand Down Expand Up @@ -389,7 +391,7 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
}
}

if( useacc &&false ) {
if( useacc ) {
#ifdef __PLUMED_HAS_OPENACC
omp_forces[0].assign( omp_forces[0].size(), 0.0 );
auto value_stash_data = value_stash.data();
Expand All @@ -405,8 +407,14 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
const auto ndev_per_task = nderivatives_per_task;
const auto ncomponents = myinput.ncomponents;

// std::vector<double> derivatives( nderivs );
double * derivatives = nullptr;// derivatives.data();
//To future me/you:
// We need to allocate this on the host so that a larger temporary data array
// can be created on the device.
// If you try plain `double* x = nullptr;`, you will get a runtime failure.
// An alternative is acc_malloc plus deviceptr() in the pragma
// (but then you must remember the matching acc_free).
std::vector<double> derivative(1);
double * derivatives = derivative.data();

ParallelActionsInput input = myinput;
auto myinput_acc = fromToDataHelper(input);
Expand All @@ -426,31 +434,50 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
#pragma acc parallel loop
for(unsigned i=0; i<nactive_tasks; ++i) {
//I am starting to wonder if it could be smarter to set up only the per-task derivatives here (or to use some raw memory)
std::vector<double> fake_valstmp( input.ncomponents );
std::vector<double> valstmp( input.ncomponents );
std::size_t task_number = partialTaskList_data[i];
std::size_t val_pos = task_number*input.ncomponents;
ParallelActionsOutput myout( input.ncomponents, fake_valstmp.data(), nderivs,derivatives );
// std::vector<double> derivative( nderivs );
ParallelActionsOutput myout( input.ncomponents,
valstmp.data(),
nderivs,
derivatives);
// Calculate the stuff in the loop for this action
T::performTask( task_number, t_actiondata, input, myout );

// Gather the forces from the values
// T::gatherForces<false>( task_number, t_actiondata, input,
// ForceInput( input.ncomponents,
// value_stash_data+input.ncomponents*task_number,
// ndev_per_task, derivatives),
// ForceOutput { omp_forces_data,of_size, forcesForApply_data,ffa_size } );
T::gatherForces<false>( task_number, t_actiondata, input,
ForceInput( input.ncomponents,
value_stash_data+input.ncomponents*task_number,
ndev_per_task, derivatives),
ForceOutput { omp_forces_data,of_size, forcesForApply_data,ffa_size } );

}
// #pragma acc parallel loop
// for(unsigned i=0; i<of_size; ++i) {
// T::gatherBoxForces( i, t_actiondata, input,
// ForceInput( input.ncomponents,
// value_stash_data+input.ncomponents*task_number,
// ndev_per_task, derivatives.data()),
// ForceOutput { omp_forces_data,of_size, forcesForApply_data,ffa_size } );
// }
// #pragma acc parallel loop
// for(unsigned n=0; n<of_size; ++n) {
// View2D<double> deriv (derivatives,input.ncomponents,ndev_per_task);
// #pragma acc loop seq
// for (unsigned task=0; task < nactive_tasks; ++task) {
// std::size_t task_number = partialTaskList_data[task];
// // std::size_t base = 3*task_number*input.nindices_per_task;
// const unsigned m = 3*input.nindices_per_task;
// for(unsigned compID=0; compID<input.ncomponents; ++compID) {
// const double ff = value_stash_data[input.ncomponents*task_number,compID];
// omp_forces_data[n] += ff*
// deriv[compID][m+n];
// }
// }
// }


// T::gatherBoxForces( i, t_actiondata, input,
// ForceInput( input.ncomponents,
// value_stash_data+input.ncomponents*task_number,
// ndev_per_task, derivatives.data()),
// ForceOutput { omp_forces_data,of_size, forcesForApply_data,ffa_size } );

}
T::gatherThreads( { omp_forces[0], forcesForApply});
//T::gatherThreads( { omp_forces[0], forcesForApply});
#else
plumed_merror("cannot use USEGPU flag if PLUMED has not been compiled with openACC");
#endif
Expand Down Expand Up @@ -496,5 +523,5 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply

}

}
} // namespace PLMD
#endif

0 comments on commit 35b742a

Please sign in to comment.