Skip to content

Commit

Permalink
It WORKS!
Browse files Browse the repository at this point in the history
  • Loading branch information
Iximiel committed Feb 26, 2025
1 parent 04e034b commit b8a3855
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 41 deletions.
31 changes: 18 additions & 13 deletions src/colvar/MultiColvarTemplate.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class MultiColvarTemplate : public ActionWithVector {
void calculate() override;
void applyNonZeroRankForces( std::vector<double>& outforces ) override ;
static void performTask( unsigned task_index, const MultiColvarInput& actiondata, ParallelActionsInput& input, ParallelActionsOutput& output );
template <bool doVirial=true>
static void gatherForces( unsigned task_index, MultiColvarInput actiondata, ParallelActionsInput input, ForceInput fdata, ForceOutput forces);
static void gatherThreads( ForceOutput forces );
};
Expand Down Expand Up @@ -227,12 +228,13 @@ void MultiColvarTemplate<T>::performTask( unsigned task_index,
input.inputdata[pos_start+1]=fpos[1];
input.inputdata[pos_start+2]=fpos[2];
}

std::size_t mass_start = pos_start + 3*input.nindices_per_task;
std::size_t charge_start = mass_start + input.nindices_per_task;
printf("-> %u : %p\n",task_index,output.derivatives.data() );
const size_t mass_start = pos_start + 3*input.nindices_per_task;
const size_t charge_start = mass_start + input.nindices_per_task;
const size_t local_ndev = 3*input.nindices_per_task+9;

ColvarOutput cvout = ColvarOutput( output.values,
3*input.nindices_per_task+9,
local_ndev,
output.derivatives.data() );
T::calculateCV( ColvarInput(actiondata.mode,
input.nindices_per_task,
Expand All @@ -244,6 +246,7 @@ void MultiColvarTemplate<T>::performTask( unsigned task_index,
}

template <class T>
template <bool doVirial>
void MultiColvarTemplate<T>::gatherForces( unsigned task_index,
const MultiColvarInput /*actiondata*/,
const ParallelActionsInput input,
Expand All @@ -264,15 +267,17 @@ void MultiColvarTemplate<T>::gatherForces( unsigned task_index,
// forces.thread_safe[n] += ff*fdata.deriv[i][m];
// ++m;
// }
forces.thread_safe[0] += ff*fdata.deriv[i][m];
forces.thread_safe[1] += ff*fdata.deriv[i][m+1];
forces.thread_safe[2] += ff*fdata.deriv[i][m+2];
forces.thread_safe[3] += ff*fdata.deriv[i][m+3];
forces.thread_safe[4] += ff*fdata.deriv[i][m+4];
forces.thread_safe[5] += ff*fdata.deriv[i][m+5];
forces.thread_safe[6] += ff*fdata.deriv[i][m+6];
forces.thread_safe[7] += ff*fdata.deriv[i][m+7];
forces.thread_safe[8] += ff*fdata.deriv[i][m+8];
if constexpr (doVirial) {
forces.thread_safe[0] += ff*fdata.deriv[i][m];
forces.thread_safe[1] += ff*fdata.deriv[i][m+1];
forces.thread_safe[2] += ff*fdata.deriv[i][m+2];
forces.thread_safe[3] += ff*fdata.deriv[i][m+3];
forces.thread_safe[4] += ff*fdata.deriv[i][m+4];
forces.thread_safe[5] += ff*fdata.deriv[i][m+5];
forces.thread_safe[6] += ff*fdata.deriv[i][m+6];
forces.thread_safe[7] += ff*fdata.deriv[i][m+7];
forces.thread_safe[8] += ff*fdata.deriv[i][m+8];
}
}
}

Expand Down
110 changes: 82 additions & 28 deletions src/core/ParallelTaskManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
#include "tools/View.h"
#include "tools/View2D.h"

#include "tools/ColvarOutput.h"

#include<iostream>
#define vdbg(...) std::cerr << __LINE__ << ":" << #__VA_ARGS__ << " " << (__VA_ARGS__) << '\n'
namespace PLMD {
Expand Down Expand Up @@ -100,7 +102,10 @@ class ForceInput {
public:
View<double> force;
View2D<double> deriv;
ForceInput( std::size_t nc, double* f, std::size_t nd, double* d ) : ncomp(nc), force(f,nc), deriv(d,nc,nd) {}
ForceInput( std::size_t nc, double* f, std::size_t nd, double* d ) :
ncomp(nc),
force(f,nc),
deriv(d,nc,nd) {}
};

//There is no need to pass this as reference:
Expand Down Expand Up @@ -198,7 +203,8 @@ ParallelTaskManager<T, D>::ParallelTaskManager(ActionWithVector* av):
}

template <class T, class D>
void ParallelTaskManager<T, D>::setNumberOfIndicesAndDerivativesPerTask( const std::size_t nind, const std::size_t nder ) {
void ParallelTaskManager<T, D>::setNumberOfIndicesAndDerivativesPerTask( const std::size_t nind,
const std::size_t nder ) {
plumed_massert( action->getNumberOfComponents()>0, "there should be some components wen you setup the index list" );
std::size_t valuesize=(action->getConstPntrToComponent(0))->getNumberOfStoredValues();
for(unsigned i=1; i<action->getNumberOfComponents(); ++i) {
Expand Down Expand Up @@ -346,16 +352,17 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
auto omp_forces_data = omp_forces[0].data();
const auto omp_forces_size = omp_forces[0].size();
//on the cpu we only need the declaration, see the create statement below
const auto nderivs = myinput.ncomponents*nderivatives_per_task;
const auto ndev_per_task = nderivatives_per_task;

const auto ndev_per_task = myinput.ncomponents*nderivatives_per_task;
const auto nderivPerComponent = nderivatives_per_task;
const auto nderivs = ndev_per_task*nactive_tasks;

//To future me/you:
// I need to allocate this on the host to create a bigger temporay data array
// on the device
// by trying with double* x=nullptr, you will get a failure
// another solution is acc_malloc and then device_ptr in the pragma
// (but you have to remember the acc_free)
std::vector<double> derivative(1);
std::vector<double> derivative(nderivs,0.0);
double * derivatives = derivative.data();

ParallelActionsInput input = myinput;
Expand All @@ -364,42 +371,82 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
auto actiondata_acc = fromToDataHelper(t_actiondata);


#pragma acc parallel loop reduction(+:omp_forces_data[0:omp_forces_size]) \
vdbg(nactive_tasks);
vdbg(nderivatives_per_task);
vdbg(ndev_per_task);
vdbg(myinput.ncomponents);
#pragma acc data \
present(input,t_actiondata) \
copyin(nactive_tasks, \
nderivs, \
ndev_per_task, \
nderivPerComponent ,\
partialTaskList_data[0:nactive_tasks], \
value_stash_data[0:value_stash_size]) \
copy(omp_forces_data[0:omp_forces_size], \
forcesForApply_data[0:forcesForApply_size]) \
create(derivatives[0:nderivs]) \
copyout(derivatives[0:nderivs]) \
default(none)
for(unsigned i=0; i<nactive_tasks; ++i) {
//This may be changed to a shared array
std::vector<double> valstmp( input.ncomponents );
std::size_t task_index = partialTaskList_data[i];
// std::vector<double> derivative( nderivs );
ParallelActionsOutput myout( input.ncomponents,
valstmp.data(),
nderivs,
derivatives);
// Calculate the stuff in the loop for this action
T::performTask( task_index, t_actiondata, input, myout );
{
#pragma acc parallel loop reduction(+:omp_forces_data[0:omp_forces_size])
for(unsigned i=0; i<nactive_tasks; ++i) {
//This may be changed to a shared array
std::vector<double> valstmp( input.ncomponents );
std::size_t task_index = partialTaskList_data[i];
// std::vector<double> derivative( nderivs );
ParallelActionsOutput myout( input.ncomponents,
valstmp.data(),
nderivPerComponent,
derivatives+ndev_per_task*i
);

// Calculate the stuff in the loop for this action
T::performTask( task_index, t_actiondata, input, myout );
// Gather the forces from the values
T::gatherForces<true>( task_index, t_actiondata, input,
ForceInput( input.ncomponents,
value_stash_data+input.ncomponents*task_index,
nderivPerComponent,
derivatives+ndev_per_task*i),
ForceOutput { omp_forces_data,omp_forces_size, forcesForApply_data,forcesForApply_size }
);

// Gather the forces from the values
T::gatherForces( task_index, t_actiondata, input,
ForceInput( input.ncomponents,
value_stash_data+input.ncomponents*task_index,
ndev_per_task,
derivatives),
ForceOutput { omp_forces_data,omp_forces_size, forcesForApply_data,forcesForApply_size }
);


}

// #pragma acc kernels
// for(unsigned n=0; n<9; ++n) {
// double t=0.0;
// unsigned m = 3*input.nindices_per_task+ n;
// // #pragma acc parallel loop reduction(+:t)
// //the reduction on t is implicit
// for(unsigned i=0; i<nactive_tasks; ++i) {
// std::size_t task_index = partialTaskList_data[i];
// View2D<double> deriv{derivatives,input.ncomponents,ndev_per_task};
// View<double> force{value_stash_data+input.ncomponents*task_index,input.ncomponents};
// // std::size_t base = 3*task_index*input.nindices_per_task;
// for(unsigned i=0; i<input.ncomponents; ++i) {

// t += force[i]*deriv[i][m];

// }
// omp_forces_data[n]= t;
// }

// }
}
T::gatherThreads({ omp_forces_data,omp_forces_size, forcesForApply_data,forcesForApply_size });
for (const auto x:omp_forces[0]) {
vdbg (x);
}
for (int i=0; i<nderivs; ++i) {
// if (i%ndev_per_task<(ndev_per_task-9) ) continue;
std::cerr <<"\t" <<i <<":"<< derivative [i] << "\n";
}

vdbg(sizeof(PLMD::colvar::ColvarOutput));
vdbg(2 * sizeof(void*));
#else
plumed_merror("cannot use USEGPU flag if PLUMED has not been compiled with openACC");
#endif
Expand All @@ -413,7 +460,9 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
unsigned nt=OpenMP::getNumThreads();
if( nt*stride*10>nactive_tasks ) nt=nactive_tasks/stride/10;
if( nt==0 ) nt=1;

vdbg(nactive_tasks);
vdbg(nderivatives_per_task);
vdbg(myinput.ncomponents);
#pragma omp parallel num_threads(nt)
{
const unsigned t=OpenMP::getThreadNum();
Expand All @@ -439,7 +488,12 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
derivatives.data()),
forces
);
for (int nd=0; nd<derivatives.size(); ++nd) {
// if (i%ndev_per_task<(ndev_per_task-9) ) continue;
std::cerr <<"\t" <<i <<"-"<<nd<<":"<< derivatives [nd] << "\n";
}
}

#pragma omp critical
T::gatherThreads( forces );
}
Expand Down

0 comments on commit b8a3855

Please sign in to comment.