Skip to content

Commit

Permalink
If we ignore the virial, it looks like we are here
Browse files Browse the repository at this point in the history
  • Loading branch information
Iximiel committed Feb 21, 2025
1 parent ab63e35 commit 35b742a
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 26 deletions.
9 changes: 5 additions & 4 deletions src/colvar/MultiColvarTemplate.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ class MultiColvarTemplate : public ActionWithVector {
/// The number of atoms per task
unsigned natoms_per_task;
public:
static constexpr size_t virialSize = 9;
static void registerKeywords(Keywords&);
explicit MultiColvarTemplate(const ActionOptions&);
unsigned getNumberOfDerivatives() override ;
Expand Down Expand Up @@ -219,8 +220,8 @@ MultiColvarTemplate<T>::MultiColvarTemplate(const ActionOptions&ao):
// Sets up the index list in the task manager
taskmanager.setNumberOfIndicesAndDerivativesPerTask(
natoms_per_task,
3*natoms_per_task + 9 );
taskmanager.setNumberOfThreadedForces( 9 );
3*natoms_per_task + virialSize );
taskmanager.setNumberOfThreadedForces( virialSize );
taskmanager.setActionInput( MultiColvarInput{ usepbc, mode } );
}

Expand Down Expand Up @@ -345,7 +346,7 @@ void MultiColvarTemplate<T>::gatherForces( unsigned task_index,
forces.thread_unsafe[base + m] += ff*fdata.deriv[i][m]; ++m;
}
if constexpr(doVirial) {
for(unsigned n=0; n<9 /*forces.reducedSize*/; ++n) {
for(unsigned n=0; n<virialSize; ++n) {
forces.thread_safe[n] += ff*fdata.deriv[i][m];
++m;
}
Expand Down Expand Up @@ -379,7 +380,7 @@ void MultiColvarTemplate<T>::gatherThreads( const ForceOutput forces ) {

//copies the forces that has been reduced in the place where they will be used
unsigned k=0;
for(unsigned n=forces.other_size-forces.reducedSize; n<forces.other_size; ++n) {
for(unsigned n=forces.other_size-virialSize; n<forces.other_size; ++n) {
forces.thread_unsafe[n] += forces.thread_safe[k];
++k;
}
Expand Down
71 changes: 49 additions & 22 deletions src/core/ParallelTaskManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,10 @@ void ParallelTaskManager<T, D>::runAllTasks() {
std::vector<double> derivatives(devSize);
std::size_t task_number = partialTaskList_data[i];
std::size_t val_pos = task_number*input.ncomponents;
ParallelActionsOutput myout( input.ncomponents, value_stash_data+val_pos, devSize,derivatives.data() );
// myout.values[0]=task_number;
ParallelActionsOutput myout( input.ncomponents,
value_stash_data+val_pos,
devSize,
derivatives.data() );
// Calculate the stuff in the loop for this action
T::performTask( task_number, t_actiondata, input, myout );
}
Expand Down Expand Up @@ -389,7 +391,7 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
}
}

if( useacc &&false ) {
if( useacc ) {
#ifdef __PLUMED_HAS_OPENACC
omp_forces[0].assign( omp_forces[0].size(), 0.0 );
auto value_stash_data = value_stash.data();
Expand All @@ -405,8 +407,14 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
const auto ndev_per_task = nderivatives_per_task;
const auto ncomponents = myinput.ncomponents;

// std::vector<double> derivatives( nderivs );
double * derivatives = nullptr;// derivatives.data();
//To future me/you:
// We need to allocate this on the host so that a larger temporary data array
// can be created on the device.
// If you try plain `double* x = nullptr;`, you will get a runtime failure.
// An alternative is acc_malloc plus deviceptr() in the pragma
// (but then you must remember the matching acc_free).
std::vector<double> derivative(1);
double * derivatives = derivative.data();

ParallelActionsInput input = myinput;
auto myinput_acc = fromToDataHelper(input);
Expand All @@ -426,31 +434,50 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply
#pragma acc parallel loop
for(unsigned i=0; i<nactive_tasks; ++i) {
//I am starting to wonder if it could be smarter to set up only the per-task derivatives here (or to use some raw memory)
std::vector<double> fake_valstmp( input.ncomponents );
std::vector<double> valstmp( input.ncomponents );
std::size_t task_number = partialTaskList_data[i];
std::size_t val_pos = task_number*input.ncomponents;
ParallelActionsOutput myout( input.ncomponents, fake_valstmp.data(), nderivs,derivatives );
// std::vector<double> derivative( nderivs );
ParallelActionsOutput myout( input.ncomponents,
valstmp.data(),
nderivs,
derivatives);
// Calculate the stuff in the loop for this action
T::performTask( task_number, t_actiondata, input, myout );

// Gather the forces from the values
// T::gatherForces<false>( task_number, t_actiondata, input,
// ForceInput( input.ncomponents,
// value_stash_data+input.ncomponents*task_number,
// ndev_per_task, derivatives),
// ForceOutput { omp_forces_data,of_size, forcesForApply_data,ffa_size } );
T::gatherForces<false>( task_number, t_actiondata, input,
ForceInput( input.ncomponents,
value_stash_data+input.ncomponents*task_number,
ndev_per_task, derivatives),
ForceOutput { omp_forces_data,of_size, forcesForApply_data,ffa_size } );

}
// #pragma acc parallel loop
// for(unsigned i=0; i<of_size; ++i) {
// T::gatherBoxForces( i, t_actiondata, input,
// ForceInput( input.ncomponents,
// value_stash_data+input.ncomponents*task_number,
// ndev_per_task, derivatives.data()),
// ForceOutput { omp_forces_data,of_size, forcesForApply_data,ffa_size } );
// }
// #pragma acc parallel loop
// for(unsigned n=0; n<of_size; ++n) {
// View2D<double> deriv (derivatives,input.ncomponents,ndev_per_task);
// #pragma acc loop seq
// for (unsigned task=0; task < nactive_tasks; ++task) {
// std::size_t task_number = partialTaskList_data[task];
// // std::size_t base = 3*task_number*input.nindices_per_task;
// const unsigned m = 3*input.nindices_per_task;
// for(unsigned compID=0; compID<input.ncomponents; ++compID) {
// const double ff = value_stash_data[input.ncomponents*task_number,compID];
// omp_forces_data[n] += ff*
// deriv[compID][m+n];
// }
// }
// }


// T::gatherBoxForces( i, t_actiondata, input,
// ForceInput( input.ncomponents,
// value_stash_data+input.ncomponents*task_number,
// ndev_per_task, derivatives.data()),
// ForceOutput { omp_forces_data,of_size, forcesForApply_data,ffa_size } );

}
T::gatherThreads( { omp_forces[0], forcesForApply});
//T::gatherThreads( { omp_forces[0], forcesForApply});
#else
plumed_merror("cannot use USEGPU flag if PLUMED has not been compiled with openACC");
#endif
Expand Down Expand Up @@ -496,5 +523,5 @@ void ParallelTaskManager<T, D>::applyForces( std::vector<double>& forcesForApply

}

}
} // namespace PLMD
#endif

0 comments on commit 35b742a

Please sign in to comment.