diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 403e8e1..cdfb899 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,7 @@ cmake_minimum_required(VERSION 3.0) project(blade LANGUAGES CXX CUDA) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) file(GLOB_RECURSE sources *.cxx *.cu *.h) diff --git a/src/run/run.cu b/src/run/run.cu index 7588f98..e12b812 100644 --- a/src/run/run.cu +++ b/src/run/run.cu @@ -357,14 +357,20 @@ void Run::set_variable(char *line,char *token,System *system) dxAtomMax=io_nextf(line)*ANGSTROM; } else if (strcmp(token,"dxrmsinit")==0) { dxRMSInit=io_nextf(line)*ANGSTROM; + } else if (strcmp(token, "lbfgs_m")==0){ + lbfgs_m=io_nexti(line); + } else if (strcmp(token, "lbfgs_eps")==0){ + lbfgs_eps=io_nextf(line); } else if (strcmp(token,"mintype")==0) { std::string minString=io_nexts(line); - if (strcmp(minString.c_str(),"sd")==0) { + if (strcmp(minString.c_str(), "lbfgs")==0){ + minType=elbfgs; + } else if (strcmp(minString.c_str(),"sd")==0) { minType=esd; } else if (strcmp(minString.c_str(),"sdfd")==0) { minType=esdfd; } else { - fatal(__FILE__,__LINE__,"Unrecognized token %s for minimization type minType. Options are: sd or sdfd\n",minString.c_str()); + fatal(__FILE__,__LINE__,"Unrecognized token %s for minimization type minType. Options are: lbfgs, sd, or sdfd\n",minString.c_str()); } } else if (strcmp(token,"domdecheuristic")==0) { domdecHeuristic=io_nextb(line); @@ -406,6 +412,7 @@ void Run::energy(char *line,char *token,System *system) system->potential->calc_force(0,system); system->state->recv_energy(); print_nrg(0,system); + display_nrg(system); dynamics_finalize(system); } @@ -501,6 +508,13 @@ void Run::minimize(char *line,char *token,System *system) gpuCheck(cudaPeekAtLastError()); } + if(system->run->minType==elbfgs){ + Run* r = system->run; + printf("L-BFGS (m=%d) did %d force evaluations in %d steps. Reset memory %d times.\n", + r->lbfgs->m, r->lbfgs_energy_evals, r->lbfgs->step_count, r->lbfgs->reset_count); + printf("U0: %f, Uf: %f, Uf - U0: %f\n", r->lbfgs->U0, r->lbfgs->Uf, r->lbfgs->Uf - r->lbfgs->U0); + } + system->state->min_dest(system); dynamics_finalize(system); } diff --git a/src/run/run.h b/src/run/run.h index f7de7d1..184b31e 100644 --- a/src/run/run.h +++ b/src/run/run.h @@ -8,6 +8,7 @@ #include "xdr/xdrfile.h" #include "xdr/xdrfile_xtc.h" +#include "update/lbfgs.h" // Forward delcaration class System; @@ -19,6 +20,7 @@ struct Cutoffs { }; typedef enum emin { + elbfgs, // Limited memory BFGS algo esd, // steepest descent esdfd, // steepest descent with finite difference to choose step length eminend} EMin; @@ -56,6 +58,12 @@ class Run { real dxRMS; EMin minType; // minimization scheme + // L-BFGS variables + LBFGS* lbfgs; + int lbfgs_energy_evals = 0; + int lbfgs_m = 7; // gradient history length*DOF + real lbfgs_eps = 1; // gradient rms convergence criteria + real betaEwald; real rCut; real rSwitch; diff --git a/src/system/state.cxx b/src/system/state.cxx index 7635c0f..d45d914 100644 --- a/src/system/state.cxx +++ b/src/system/state.cxx @@ -42,6 +42,7 @@ State::State(System *system) { cudaMalloc(&(positionBackup_d),(2*nL+3*n)*sizeof(real_x)); forceBuffer=(real_f*)calloc(rootFactor*(2*nL+3*n),sizeof(real_f)); cudaMalloc(&(forceBuffer_d),rootFactor*(2*nL+3*n)*sizeof(real_f)); + cudaMalloc(&(forceBufferX_d),rootFactor*(2*nL+3*n)*sizeof(real_x)); cudaMalloc(&(forceBackup_d),(2*nL+3*n)*sizeof(real_f)); if (system->idCount>0) { // OMP @@ -156,6 +157,7 @@ State::~State() { if (positionBackup_d) cudaFree(positionBackup_d); if (forceBuffer) free(forceBuffer); if (forceBuffer_d) cudaFree(forceBuffer_d); + if (forceBufferX_d) cudaFree(forceBufferX_d); if (forceBackup_d) cudaFree(forceBackup_d); // Other buffers if (energy) free(energy); diff --git a/src/system/state.h b/src/system/state.h index 1270515..66ebf83 100644 --- a/src/system/state.h +++ b/src/system/state.h @@ -67,6 +67,7 @@ class State { real *positionBuffer_omp; real_f *forceBuffer; real_f *forceBuffer_d; + real_x *forceBufferX_d; real_f *forceBackup_d; // For NPT real_f *forceBuffer_omp; diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu new file mode 100644 index 0000000..73d0440 --- /dev/null +++ b/src/update/lbfgs.cu @@ -0,0 +1,331 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include "update/lbfgs.h" +#include "main/real3.h" // for real_sum_reduce + +/* + Cuda Kernels +*/ + +// d = 1/sqrt(d) - element-wise +__global__ void inv_sqrt(int N, real_x *d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + d[i] = 1.0/sqrt(d[i]); + } +} + +// d = 1/d - element-wise +__global__ void recip(int N, real_x *d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + d[i] = 1.0/d[i]; + } +} + +// d = abs(d) - element-wise +__global__ void vector_abs(int N, real_x *d){ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + d[i] = abs(d[i]); + } +} + +// C = u*A + w*d*B, C and A and B can be related +__global__ void vector_add(int N, real_x u, real_x *A, real_x w, real_x *d, real_x *B, real_x *C) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + if(d){ + C[i] = u * A[i] + w * d[0] * B[i]; + } else { + C[i] = u * A[i] + w * B[i]; + } + } +} + +// specific for 2nd lbfgs loop +// C = A + (u-w)*B, C and A and B can be related +__global__ void vector_add(int N, real_x *A, real_x *u, real_x *w, real_x *B, real_x *C) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + C[i] = A[i] + (u[0]-w[0]) * B[i]; + } +} + +// C = A[0]*B - element-wise +__global__ void vector_scale(int N, real_x *A, real_x *B, real_x *C){ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + C[i] = A[0]*B[i]; + } +} + +// B = c*A - element-wise +__global__ void vector_scale(int N, real_x c, real_x *A, real_x *B){ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + B[i] = c*A[i]; + } +} + +__global__ void dot_product(int N, real_x *A, real_x *B, real_x* dot) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + real_x lDot = 0; + extern __shared__ real sDot[]; + if (i < N) { + lDot = A[i]*B[i]; + } + + real_sum_reduce((real)lDot, sDot, dot); +} + +/* + Class setup +*/ + +// +LBFGS::LBFGS(int m, real_x eps, int DOF, std::function user_grad, real_x *position, real_x *gradient) + : m(m), eps_tol(eps), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) { + k = 0; + cudaMalloc(&tmp_d, sizeof(real_x)); + cudaMalloc(&gamma_d, sizeof(real_x)); + cudaMalloc(&rho_d, m*sizeof(real_x)); + cudaMalloc(&alpha_d, m*sizeof(real_x)); + cudaMalloc(&q_d, DOF*sizeof(real_x)); + cudaMalloc(&prev_positions_d, DOF*sizeof(real_x)); + cudaMalloc(&prev_gradient_d, DOF*sizeof(real_x)); + cudaMalloc(&s_d, m*DOF*sizeof(real_x)); + cudaMalloc(&y_d, m*DOF*sizeof(real_x)); + cudaMalloc(&s_tmp_d, DOF*sizeof(real_x)); + cudaMalloc(&y_tmp_d, DOF*sizeof(real_x)); + + U0 = user_grad(); // first energy call by lbfgs + + // Normalize first s.d. step + gamma_norm(); + + // Set up s.d. step + cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); + vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, gamma_d, q_d, q_d); + vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d); +} + +LBFGS::~LBFGS() { + if (tmp_d) cudaFree(tmp_d); + if (gamma_d) cudaFree(gamma_d); + if (rho_d) cudaFree(rho_d); + if (alpha_d) cudaFree(alpha_d); + if (q_d) cudaFree(q_d); + if (prev_positions_d) cudaFree(prev_positions_d); + if (prev_gradient_d) cudaFree(prev_gradient_d); + if (s_d) cudaFree(s_d); + if (y_d) cudaFree(y_d); + if (s_tmp_d) cudaFree(s_tmp_d); + if (y_tmp_d) cudaFree(y_tmp_d); +} + +/* + L-BFGS iteration +*/ + +bool LBFGS::converged(){ + // return true if rmsg minimized + rmsg = 0; + grad_mag = 0; + grad_pos_mag = 0; + + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d); + cudaMemcpy(&rmsg, tmp_d, sizeof(real_x), cudaMemcpyDefault); + grad_mag = sqrt(rmsg); + rmsg = sqrt(rmsg/DOF); + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, X_d, X_d, tmp_d); + cudaMemcpy(&grad_pos_mag, tmp_d, sizeof(real_x), cudaMemcpyDefault); + grad_pos_mag = grad_mag / std::max(1.0, sqrt(grad_pos_mag)); + + printf(" rmsg = %f\n", rmsg); + printf(" |g| = %f\n", grad_mag); + printf("|g|/|x| = %f\n", grad_pos_mag); + if(rmsg < eps_tol){ + printf("Structure minimized!!\n"); + minimized=true; + } + return rmsg < eps_tol; +} + +// set gamma = 1/|g_d| +void LBFGS::gamma_norm(){ + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d); + inv_sqrt<<<1,1>>>(1, tmp_d); + cudaMemcpy(gamma_d, tmp_d, sizeof(real_x), cudaMemcpyDefault); +} + +// Ch7.4, p178 of Nocedal & Wright (Algorithm 7.4) +void LBFGS::minimize_step(real_x f0) { // f0 & G filled from class initializer + // Copy kth positions & gradients + cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real_x), cudaMemcpyDefault); + cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); + if(converged()) return; + + // min_a f(X + a*q) + step_size = linesearch(f0); // X & G left at & evaluated at f(X+a*q) + + // kth position and gradient deltas + update_sk_yk(); // potentially skip decrement k or set k=0 & clear s & y memory + k++; + + // Two-loop recursion + cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); + for (int i = k-1; i >= std::max(0, k-m); i--) { + int index = i % m; + // alpha.i = rho.i*(s.i dot q) + cudaMemset(alpha_d+index, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_d+index*DOF, q_d, alpha_d+index); + vector_scale<<<1,1>>>(1, rho_d+index, alpha_d+index, alpha_d+index); + // q = q - alpha.i*y.i + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, y_d+index*DOF, q_d); + } + // q = gamma.k*q = r + vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, gamma_d, q_d, q_d); + for (int i = std::max(0, k-m); i <= k-1; i++) { + int index = i % m; + // B = rho.i*(y.i dot q) + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_d+index*DOF, q_d, tmp_d); + vector_scale<<<1,1>>>(1, rho_d+index, tmp_d, tmp_d); + // q = q + (alpha.i - B)*s.i = q - B*s.i + alpha.i*s.i + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, q_d, alpha_d+index, tmp_d, s_d+index*DOF, q_d); + } + + // q = -q + vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d); + step_count++; +} + +// this method is overly cautious & slow, likely doesn't need the cpu grad checks or resets +void LBFGS::update_sk_yk() { + if(m == 0) { // Steepest decent, normalize step size via gamma + gamma_norm(); + return; + } + + // s & y tmp + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, prev_positions_d, s_tmp_d); + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, G_d, -1, NULL, prev_gradient_d, y_tmp_d); + + real_x yy = 0; + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_tmp_d, y_tmp_d, tmp_d); + cudaMemcpy(&yy, tmp_d, sizeof(real_x), cudaMemcpyDefault); + real_x sy = 0; + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_tmp_d, y_tmp_d, tmp_d); + cudaMemcpy(&sy, tmp_d, sizeof(real_x), cudaMemcpyDefault); + + // Check curvature s.T H s = s.T y > 0 for positive def matrix satisfying secant eq Hs=y (required for L-BFGS) + if(sy < 1e-10){ + printf("Curvature condition (sy = %e) not satistied! Clearing L-BFGS memory!\n", sy); + cudaMemset(s_d, 0, m*DOF*sizeof(real_x)); + cudaMemset(y_d, 0, m*DOF*sizeof(real_x)); + gamma_norm(); + k = -1; + reset_count++; + return; + } + + // rho = 1/y.s + real_x rho = 1.0/sy; + // gamma_k = s.y/(y.y) + real_x gamma = sy/yy; + cudaMemcpy(gamma_d, &gamma, sizeof(real_x), cudaMemcpyDefault); + int index = k % m; + cudaMemcpy(rho_d+index, &rho, sizeof(real_x), cudaMemcpyDefault); + cudaMemcpy(s_d + index*DOF, s_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault); + cudaMemcpy(y_d + index*DOF, y_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault); + //printf("iter: %d, k: %d, index: %d, rho: %f, gamma: %f\n", step_count, k, index, rho, gamma); +} + +/* + Line Search +*/ + +// return [f(X + alpha*p), df(X+alpha*p)/da] +void LBFGS::phi(real_x alpha, real_x* result){ + cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault); + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, alpha, NULL, q_d, X_d); + result[0] = system_grad(); + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da + cudaMemcpy(&result[1], tmp_d, sizeof(real_x), cudaMemcpyDefault); +} + +real_x sign(real_x value){ + if(value < 0){ + return -1; + } + return 1; +} + +// Ch3.5, p59 of Nocedal & Wright (Equation 3.59) +real_x cubic_interp(real_x a, real_x fa, real_x ga, real_x b, real_x fb, real_x gb){ + real_x d1 = ga + gb - 3*(fa - fb)/(a-b); + real_x d2 = sign(b - a)*sqrt(d1*d1 - ga*gb); + real_x arg = b - (b-a)*(gb + d2 - d1)/(gb - ga + 2*d2); + if(isinf(arg) || isnan(arg) || abs(arg - a) < 1e-7 || abs(arg - b) < 1e-7 || arg < 0){ + //printf("Quadratic interpolation failed! arg: %f, a: %f, b: %f\n", arg, a, b); + return (a + b) / 2.0; + } + return arg; +} + +// Ch3.5, p60 of Nocedal & Wright (Algorithm 3.5) +real_x LBFGS::linesearch(real_x f0){ + real_x max_iter = 5; + real_x aim1 = 0; + real_x ai = 1; + real_x amax = 10; + // Data is already loaded, saves 1 energy & grad call + real_x phi0[2] = {f0, 0}; + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da + cudaMemcpy(&phi0[1], tmp_d, sizeof(real_x), cudaMemcpyDefault); + real_x phiim1[2]; + memcpy(phiim1, phi0, 2*sizeof(real_x)); + // Evaluate at 1 & iterate using cubic interpolation + real_x phii[2]; + phi(ai, phii); + printf("alpha: %f, phi: %f, phi': %f\n", aim1, phi0[0], phi0[1]); + printf("alpha: %f, phi: %f, phi': %f\n", ai, phii[0], phii[1]); + for(int i = 0; i < max_iter; i++){ + // Check strong wolfe condition + if (phii[0] <= phi0[0] + c1*ai*phi0[1] && abs(phii[1]) <= c2*abs(phi0[1])){ + Uf = phii[0]; + return ai; + } + real_x tmp = ai; + ai = cubic_interp(aim1, phiim1[0], phiim1[1], ai, phii[0], phii[1]); + phi(ai, phii); + printf("alpha: %f, phi: %f, phi': %f\n", ai, phii[0], phii[1]); + aim1 = tmp; + memcpy(phiim1, phii, 2*sizeof(real_x)); + if(ai < 0 || ai > amax){ + break; + } + } + + // Failed line search + ai = 1e-5; // default step size, a small enough step should decrease function if s.y > 0 + phi(ai, phii); + Uf = phii[0]; + printf("Linesearch failed! Defaulting to ai = %e, phi: %f, phi': %f\n", ai, phii[0], phii[1]); + return ai; +} \ No newline at end of file diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h new file mode 100644 index 0000000..8fac8eb --- /dev/null +++ b/src/update/lbfgs.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include "main/defines.h" + +// Implementation based on Nocedal & Wright book with references made to OpenMM extern lib implementation +class LBFGS { +public: + int step_count=0; // total number of minimize calls + int reset_count=0; // number of times lbfgs memory was cleared + int m=7; // Number of previous gradients to use for hessian approximation (5-7) + bool minimized=false; + real_x U0, Uf; + + LBFGS(int m, real_x eps, int DOF, std::function user_grad, real_x *position, real_x *gradient); + ~LBFGS(); + void minimize_step(real_x f0); + +private: + // Line search parameters + real_x c1 = 1e-4; // sufficient decent + real_x c2 = .9; // curvature cond. + // Convergence + real_x eps_tol = 1; // rms criteria (1e-1 is very tight and unlikely to work) + real_x rmsg = 0; // |g| / sqrt(DOF) + real_x grad_mag = 0; // |g| + real_x grad_pos_mag = 0; // |g| / max(1, |x|) + + int DOF; // Degrees of freedom to minimize + real_x step_size = 1; // previous step size + // All data stored on GPU + int k = 0; // lbfgs iteration step + real_x *tmp_d; // 1 real temperary storage + real_x *gamma_d; // s[k-1] projected onto y[k-1] + real_x *rho_d; // [m] rho[k] = 1/(s[k]^T * y[k]) + real_x *alpha_d; // [m] alpha[i] = rho[i] * s[i]^T * y[i] + real_x *search_d; // [DOF] L-BFGS search direction + real_x *prev_positions_d; // [DOF] x[k-1] + real_x *prev_gradient_d; // [DOF] g[k-1] + real_x *X_d; // points to user GPU memory, positions to be optimized + real_x *G_d; // points to user GPU memory + real_x *q_d; + real_x *s_d; // [m, DOF] s[k-1] = x[k] - x[k-1] (1D array) + real_x *y_d; // [m, DOF] y[k-1] = grad[k] - grad[k-1] (1D array) + + real_x *s_tmp_d; // [DOF] x[k] - x[k-1], used to check curvature + real_x *y_tmp_d; // [DOF] g[k] - g[k-1], used to check curvature + bool skipped_step = false; // skipped hessian update, if 2 updates are skipped, we clear s & y memory + + std::function system_grad; + + bool converged(); + void gamma_norm(); + void update_sk_yk(); + + void phi(real_x alpha, real_x *result); + real_x linesearch(real_x f0); +}; \ No newline at end of file diff --git a/src/update/minimize.cu b/src/update/minimize.cu index 71547c1..6ed4798 100644 --- a/src/update/minimize.cu +++ b/src/update/minimize.cu @@ -8,10 +8,14 @@ #include "system/potential.h" #include "domdec/domdec.h" #include "holonomic/holonomic.h" +#include "holonomic/rectify.h" #include "io/io.h" #include "main/real3.h" +#include +#include "msld/msld.h" + __global__ void no_mass_weight_kernel(int N,real* masses,real* ones) @@ -82,6 +86,18 @@ __global__ void sdfd_dotproduct_kernel(int N,struct LeapState ls,real_v *minDire real_sum_reduce(3*lDot/N,sDot,dot); } +__global__ void double_onto_float(int N, real_x* doubleBuffer, real* floatBuffer, bool direction) +{ + int i=blockIdx.x*blockDim.x+threadIdx.x; + if (irun->minType==esdfd) { cudaMalloc(&minDirection_d,3*atomCount*sizeof(real_v)); } + if (system->run->minType==elbfgs){ + // Function called by L-BFGS class to get energy & gradient + std::function energy_and_grad = [system](){ + // Copy double precision onto float precision buffer + int DOF = system->state->atomCount*3; + int shift = system->state->lambdaCount; + double_onto_float<<<(DOF+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>( + DOF, system->state->positionBuffer_d+shift, system->state->positionBuffer_fd+shift, true); + // Potential & Grad Eval -> step=0 to calc energy + //system->domdec->update_domdec(system,true); + system->potential->calc_force(0, system); + // grad(F(X)) G already stored + system->state->recv_energy(); + // Copy float precision onto double precision buffer + double_onto_float<<<(DOF+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>( + DOF, system->state->forceBufferX_d+shift, system->state->forceBuffer_d+shift, false); + system->run->lbfgs_energy_evals++; + return system->state->energy[eepotential]; + }; + + int shift = system->state->lambdaCount; + int DOF = system->state->atomCount*3; // only minimize atom positions + system->run->lbfgs = new LBFGS(system->run->lbfgs_m, system->run->lbfgs_eps, DOF, + energy_and_grad, system->state->positionBuffer_d+shift, system->state->forceBufferX_d+shift); + } } void State::min_dest(System *system) @@ -109,9 +150,12 @@ void State::min_dest(System *system) if (system->run->minType==esdfd) { cudaFree(minDirection_d); } + if (system->run->minType==elbfgs){ + system->run->lbfgs->~LBFGS(); + } } -void State::min_move(int step,int nsteps,System *system) +void State::min_move(int step, int nsteps, System *system) { Run *r=system->run; real_e grads2[2]; @@ -120,7 +164,16 @@ void State::min_move(int step,int nsteps,System *system) real scaling, rescaling; real frac; - if (r->minType==esd) { + if (r->minType==elbfgs){ + if (system->id==0){ + recv_energy(); + if (system->verbose>0) display_nrg(system); + currEnergy=energy[eepotential]; + r->lbfgs->minimize_step(currEnergy); + if(r->lbfgs->minimized){ system->run->step = nsteps; } // do we want this? + } + } + else if (r->minType==esd) { if (system->id==0) { recv_energy(); if (system->verbose>0) display_nrg(system);