From f285e2e7e9a82fb45cf85184e566ebe4676bbf2a Mon Sep 17 00:00:00 2001 From: Matthew Speranza Date: Wed, 14 May 2025 11:27:12 -0700 Subject: [PATCH 1/8] Added LBFGS file --- src/update/LBFGS.h | 306 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 src/update/LBFGS.h diff --git a/src/update/LBFGS.h b/src/update/LBFGS.h new file mode 100644 index 0000000..2cc765a --- /dev/null +++ b/src/update/LBFGS.h @@ -0,0 +1,306 @@ + +#ifndef LBFGS_H +#define LBFGS_H +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +template +class LBFGS { + static_assert(std::is_floating_point::value, "L-BFGS can only be used with floating point types"); + +public: + LBFGS(int m, int DOF, std::function user_func, std::function user_grad) : m(m), DOF(DOF), func(user_func), grad(user_grad) { + minimized = false; + min_step_size = sizeof(T) == 4 ? 1e-4 : 1e-7; + max_step_size = 1.0; + rho = (T*) calloc(m, sizeof(T)); + alpha = (T*) malloc(m*sizeof(T)); + gamma = (T*) malloc(sizeof(T)); + beta = (T*) malloc(sizeof(T)); + step_size = 0.0; + + + G = (T*) malloc(DOF*sizeof(T)); + X = (T*) malloc(DOF*sizeof(T)); + + search = (T*) malloc(DOF*sizeof(T)); + prev_positions = (T*)malloc(DOF * sizeof(T)); + prev_gradient = (T*)malloc(DOF * sizeof(T)); + + q = (T*) malloc(DOF*sizeof(T)); + s = (T*) calloc(m*DOF, sizeof(T)); + y = (T*) calloc(m*DOF, sizeof(T)); + } + + /** + * TODO: Free all memory used by LBFGS + */ + ~LBFGS() { + free(rho); + free(alpha); + free(gamma); + free(beta); + free(G); + free(X); + free(search); + free(prev_positions); + free(prev_gradient); + free(q); + free(s); + free(y); + } + +/** + * TODO: Implement steepest decent step + * Sets variables and returns 1 + * @param X Initial Position + * @param G Initial Position Grad + */ +void init(T* X, T* G, T steepest_descent_step_size) { + for (int i = 0; i < DOF; ++i) { + prev_positions[i] = X[i]; + prev_gradient[i] = G[i]; + } + + for (int i = 0; i < DOF; ++i) { + X[i] -= steepest_descent_step_size * G[i]; + s[i + (m - 1) * DOF] = X[i] - prev_positions[i]; + y[i + (m - 1) * DOF] = G[i] - prev_gradient[i]; + this->X[i] = X[i]; + } + grad(this->X, this->G); +} + +/** + * This will loop over minimize step and a line search. Implement later + */ +void minimize() { + bool found_minimum = false; + int j = 0; + int maxcount2 = 1000000; + while (!found_minimum && j < maxcount2) { + ++j; + minimize_step(X, G); + printf("f("); + for (int i = 0; i < DOF; i++) { + if (i != DOF - 1) { + printf("%f, ", X[i]); + } + else { + printf("%f) = %f\n", X[i], func(X)); + } + } + grad(X, G); + printf("grad of f("); + for (int i = 0; i < DOF; i++) { + if (i != DOF - 1) { + printf("%f, ", X[i]); + } + else { + printf("%f) = [", X[i]); + } + } + for (int i = 0; i < DOF; i++) { + if (i != DOF - 1) { + printf("%f, ", G[i]); + } + else { + printf("%f]\n", G[i]); + } + } + double sum_of_residuals = 0; + for (int i = 0; i < DOF; ++i) { + sum_of_residuals += G[i] * G[i]; + + } + sum_of_residuals = sqrtf((sum_of_residuals / DOF)); + if (minimized == true || sum_of_residuals < 1e-7) { + std::cout << "found minimum\n"; + std::cout << "Number of Steps: " << j << "\n"; + found_minimum = true; + } + + } +} + + +/** + * 1. Compute new L-BFGS step direction + * Pseudocode from wikipedia: + * q = g.i // search direction to be updated + * for j = i-1 to i-m: + * alpha.j = rho.j * s.j.T * q // dot product + * q = q - alpha.j * y.j // vector scale & subtraction + * gamma.i = s.i-1.T * y.i-1 / y.i-1.T * y.i-1 // dot products in numerator and denominator + * q = gamma.i * q + * for j = i-m to i-1: + * beta = rho.j * y.j.T * q // dot product + * q = q + (alpha.j - beta) * s.j // vector scale & addition + * q = -q // negate applied above instead of here most likely + * gamma = s.i.T * y.i / y.i.T * y.i + * rho.j = 1 / (y.j.T * s.j) + */ + +void minimize_step(T* X_input, T* G_input) { + for (int i = 0; i < DOF; ++i) { + X[i] = X_input[i]; + G[i] = G_input[i]; + } + for (int i = 0; i < DOF; ++i) { + q[i] = G[i]; + } + update(); + for (int i = m - 1; i >= 0; --i) { + alpha[i] = rho[i] * dot_product(s + i * DOF, q, DOF); + for (int j = 0; j < DOF; ++j) { + q[j] -= alpha[i] * y[i * DOF + j]; + } + } + + T gamma = dot_product(s + (m - 1) * DOF, y + (m - 1) * DOF, DOF) / dot_product(y + (m - 1) * DOF, y + (m - 1) * DOF, DOF); + for (int j = 0; j < DOF; ++j) { + q[j] *= gamma; + } + + for (int i = 0; i < m; ++i) { + T beta = rho[i] * dot_product(y + i * DOF, q, DOF); + + for (int j = 0; j < DOF; ++j) { + q[j] += (alpha[i] - beta) * s[i * DOF + j]; + } + } + BacktrackingLineSearch backtracking(X, G, q, DOF, func); + step_size = backtracking.linesearch(); + std::cout << "step size: " << step_size << "\n"; + for (int j = 0; j < DOF; ++j) { + X[j] += (step_size * -q[j]); + X_input[j] = X[j]; + } + +} + +void update() { + for (int i = 0; i < ((m - 1) * DOF); ++i) { + s[i] = s[i + DOF]; + y[i] = y[i + DOF]; + + } + for (int i = 0; i < DOF; ++i) { + s[(m - 1) * DOF + i] = X[i] - prev_positions[i]; + y[(m - 1) * DOF + i] = G[i] - prev_gradient[i]; + } + for (int i = 0; i < m - 1; ++i) { + rho[i] = rho[i + 1]; + } + + double s_dot_y = dot_product((s + (m - 1) * DOF), (y + (m - 1) * DOF), DOF); + + if (s_dot_y == 0) { + std::cout << "Error: Dividing by zero. Function is likely already at a minimum.\n"; + minimized = true; + } + else { + rho[m - 1] = 1 / s_dot_y; + } + for (int i = 0; i < DOF; ++i) { + prev_positions[i] = X[i]; + prev_gradient[i] = G[i]; + } +} + +private: + int m; // Number of previous gradients to use for hessian approximation (5-7) + int DOF; // Degrees of freedom + T min_step_size; // terminate minimization with steps smaller than this number + T max_step_size; // ensures that lbfgs doesn't overshoot minimum + bool minimized; + + T *beta; + T *gamma; // s projected onto y + T *rho; // [m] rho_{i} = (s_{i}^T * y_{i} + T *alpha; // [m] alpha_{i} = rho_{i} * s_{i}^T * y_{i} + + T *X; + T *G; + T *search; // [DOF] L-BFGS search direction + T *prev_positions; // [DOF] x_{i-1} + T *prev_gradient; // [DOF] g_{i-1} + + T* q; + T *s; // [m*DOF] x_{i+1} - x_{i} = s_{i} + T *y; // [m*DOF] grad_{i+1} - grad_{i} = y_{i} + T step_size; + + std::function func; + std::function grad; + + T dot_product(T* a, T* b, int n) { + T result = 0; + for (int i = 0; i < n; ++i) { + result += a[i] * b[i]; + } + return result; + } +}; + +template +class BacktrackingLineSearch { +public: + BacktrackingLineSearch(T* x, T* g, T* q, int dof, function user_func) : X(x), G(g), DOF(dof), func(user_func) { + x_plus_step = (T*)malloc(DOF * sizeof(T)); + search_direction = (T*)malloc(DOF * sizeof(T)); + for (int i = 0; i < DOF; i++) { + search_direction[i] = -q[i]; + } + + } + T linesearch() { + int max_it = 1000; + T c = 0.5; + T tau = 0.75; + T m = dot_product(G, search_direction, DOF); + T step_size = 1; + for (int i = 0; i < max_it; i++) { + for (int j = 0; j < DOF; ++j) { + x_plus_step[j] = X[j] + (step_size * search_direction[j]); + } + if (func(X) - func(x_plus_step) >= step_size * -c * m) { + return step_size; + } + else { + step_size *= tau; + } + } + return 0; + } + ~BacktrackingLineSearch() { + free(x_plus_step); + free(search_direction); + } +private: + T* X; + T* G; + T* x_plus_step; + T* search_direction; + int DOF; + function func; + + T dot_product(T* a, T* b, int n) { + T result = 0; + for (int i = 0; i < n; ++i) { + result += a[i] * b[i]; + } + return result; + } + +}; + +#endif //LBFGS_H From 64f6e6daec5cb6329dd77d808f6598b6a40ac950 Mon Sep 17 00:00:00 2001 From: Matthew Speranza Date: Wed, 14 May 2025 13:02:29 -0700 Subject: [PATCH 2/8] Initial commit for LBFGS minimze on CPU --- src/CMakeLists.txt | 2 + src/run/run.cu | 4 +- src/run/run.h | 5 ++ src/update/LBFGS.h | 154 +++++++++-------------------------------- src/update/minimize.cu | 33 ++++++++- 5 files changed, 74 insertions(+), 124 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 403e8e1..cdfb899 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,7 @@ cmake_minimum_required(VERSION 3.0) project(blade LANGUAGES CXX CUDA) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) file(GLOB_RECURSE sources *.cxx *.cu *.h) diff --git a/src/run/run.cu b/src/run/run.cu index 4e0e9d8..567e4ae 100644 --- a/src/run/run.cu +++ b/src/run/run.cu @@ -355,7 +355,9 @@ void Run::set_variable(char *line,char *token,System *system) dxRMSInit=io_nextf(line)*ANGSTROM; } else if (strcmp(token,"mintype")==0) { std::string minString=io_nexts(line); - if (strcmp(minString.c_str(),"sd")==0) { + if (strcmp(minString.c_str(), "lbfgs") == 0){ + minType=elbfgs; + } else if (strcmp(minString.c_str(),"sd")==0) { minType=esd; } else if (strcmp(minString.c_str(),"sdfd")==0) { minType=esdfd; diff --git a/src/run/run.h b/src/run/run.h index f7de7d1..15fcd38 100644 --- a/src/run/run.h +++ b/src/run/run.h @@ -8,6 +8,7 @@ #include "xdr/xdrfile.h" #include "xdr/xdrfile_xtc.h" +#include "update/LBFGS.h" // Forward delcaration class System; @@ -19,6 +20,7 @@ struct Cutoffs { }; typedef enum emin { + elbfgs, // Limited memory BFGS algo esd, // steepest descent esdfd, // steepest descent with finite difference to choose step length eminend} EMin; @@ -56,6 +58,9 @@ class Run { real dxRMS; EMin minType; // minimization scheme + // L-BFGS variables + LBFGS* lbfgs; + real betaEwald; real rCut; real rSwitch; diff --git a/src/update/LBFGS.h b/src/update/LBFGS.h index 2cc765a..6697b27 100644 --- a/src/update/LBFGS.h +++ b/src/update/LBFGS.h @@ -1,8 +1,8 @@ #ifndef LBFGS_H #define LBFGS_H -#include +#include #include #include #include @@ -12,12 +12,11 @@ using namespace std; -template -class LBFGS { +template class LBFGS { static_assert(std::is_floating_point::value, "L-BFGS can only be used with floating point types"); public: - LBFGS(int m, int DOF, std::function user_func, std::function user_grad) : m(m), DOF(DOF), func(user_func), grad(user_grad) { + LBFGS(int m, int DOF, std::function user_grad) : m(m), DOF(DOF), grad(user_grad) { minimized = false; min_step_size = sizeof(T) == 4 ? 1e-4 : 1e-7; max_step_size = 1.0; @@ -27,7 +26,6 @@ class LBFGS { beta = (T*) malloc(sizeof(T)); step_size = 0.0; - G = (T*) malloc(DOF*sizeof(T)); X = (T*) malloc(DOF*sizeof(T)); @@ -36,6 +34,7 @@ class LBFGS { prev_gradient = (T*)malloc(DOF * sizeof(T)); q = (T*) malloc(DOF*sizeof(T)); + x_plus_step = (T*) malloc(DOF*sizeof(T)); s = (T*) calloc(m*DOF, sizeof(T)); y = (T*) calloc(m*DOF, sizeof(T)); } @@ -54,6 +53,7 @@ class LBFGS { free(prev_positions); free(prev_gradient); free(q); + free(x_plus_step); free(s); free(y); } @@ -69,7 +69,6 @@ void init(T* X, T* G, T steepest_descent_step_size) { prev_positions[i] = X[i]; prev_gradient[i] = G[i]; } - for (int i = 0; i < DOF; ++i) { X[i] -= steepest_descent_step_size * G[i]; s[i + (m - 1) * DOF] = X[i] - prev_positions[i]; @@ -79,59 +78,6 @@ void init(T* X, T* G, T steepest_descent_step_size) { grad(this->X, this->G); } -/** - * This will loop over minimize step and a line search. Implement later - */ -void minimize() { - bool found_minimum = false; - int j = 0; - int maxcount2 = 1000000; - while (!found_minimum && j < maxcount2) { - ++j; - minimize_step(X, G); - printf("f("); - for (int i = 0; i < DOF; i++) { - if (i != DOF - 1) { - printf("%f, ", X[i]); - } - else { - printf("%f) = %f\n", X[i], func(X)); - } - } - grad(X, G); - printf("grad of f("); - for (int i = 0; i < DOF; i++) { - if (i != DOF - 1) { - printf("%f, ", X[i]); - } - else { - printf("%f) = [", X[i]); - } - } - for (int i = 0; i < DOF; i++) { - if (i != DOF - 1) { - printf("%f, ", G[i]); - } - else { - printf("%f]\n", G[i]); - } - } - double sum_of_residuals = 0; - for (int i = 0; i < DOF; ++i) { - sum_of_residuals += G[i] * G[i]; - - } - sum_of_residuals = sqrtf((sum_of_residuals / DOF)); - if (minimized == true || sum_of_residuals < 1e-7) { - std::cout << "found minimum\n"; - std::cout << "Number of Steps: " << j << "\n"; - found_minimum = true; - } - - } -} - - /** * 1. Compute new L-BFGS step direction * Pseudocode from wikipedia: @@ -148,8 +94,10 @@ void minimize() { * gamma = s.i.T * y.i / y.i.T * y.i * rho.j = 1 / (y.j.T * s.j) */ - void minimize_step(T* X_input, T* G_input) { + if (minimized){ + return; + } for (int i = 0; i < DOF; ++i) { X[i] = X_input[i]; G[i] = G_input[i]; @@ -164,12 +112,10 @@ void minimize_step(T* X_input, T* G_input) { q[j] -= alpha[i] * y[i * DOF + j]; } } - T gamma = dot_product(s + (m - 1) * DOF, y + (m - 1) * DOF, DOF) / dot_product(y + (m - 1) * DOF, y + (m - 1) * DOF, DOF); for (int j = 0; j < DOF; ++j) { q[j] *= gamma; } - for (int i = 0; i < m; ++i) { T beta = rho[i] * dot_product(y + i * DOF, q, DOF); @@ -177,14 +123,12 @@ void minimize_step(T* X_input, T* G_input) { q[j] += (alpha[i] - beta) * s[i * DOF + j]; } } - BacktrackingLineSearch backtracking(X, G, q, DOF, func); - step_size = backtracking.linesearch(); + step_size = linesearch(); std::cout << "step size: " << step_size << "\n"; for (int j = 0; j < DOF; ++j) { X[j] += (step_size * -q[j]); X_input[j] = X[j]; } - } void update() { @@ -200,9 +144,7 @@ void update() { for (int i = 0; i < m - 1; ++i) { rho[i] = rho[i + 1]; } - double s_dot_y = dot_product((s + (m - 1) * DOF), (y + (m - 1) * DOF), DOF); - if (s_dot_y == 0) { std::cout << "Error: Dividing by zero. Function is likely already at a minimum.\n"; minimized = true; @@ -216,6 +158,26 @@ void update() { } } +T linesearch() { + int max_it = 1000; + T c = 0.5; + T tau = 0.75; + T m = dot_product(G, q, DOF); + T step_size = 1; + for (int i = 0; i < max_it; i++) { + for (int j = 0; j < DOF; ++j) { + x_plus_step[j] = X[j] - (step_size * q[j]); + } + if (grad(X, G) - grad(x_plus_step, G) >= step_size * -c * m) { + return step_size; + } + else { + step_size *= tau; + } + } + return 0; +} + private: int m; // Number of previous gradients to use for hessian approximation (5-7) int DOF; // Degrees of freedom @@ -235,63 +197,12 @@ void update() { T *prev_gradient; // [DOF] g_{i-1} T* q; + T* x_plus_step; T *s; // [m*DOF] x_{i+1} - x_{i} = s_{i} T *y; // [m*DOF] grad_{i+1} - grad_{i} = y_{i} T step_size; - std::function func; - std::function grad; - - T dot_product(T* a, T* b, int n) { - T result = 0; - for (int i = 0; i < n; ++i) { - result += a[i] * b[i]; - } - return result; - } -}; - -template -class BacktrackingLineSearch { -public: - BacktrackingLineSearch(T* x, T* g, T* q, int dof, function user_func) : X(x), G(g), DOF(dof), func(user_func) { - x_plus_step = (T*)malloc(DOF * sizeof(T)); - search_direction = (T*)malloc(DOF * sizeof(T)); - for (int i = 0; i < DOF; i++) { - search_direction[i] = -q[i]; - } - - } - T linesearch() { - int max_it = 1000; - T c = 0.5; - T tau = 0.75; - T m = dot_product(G, search_direction, DOF); - T step_size = 1; - for (int i = 0; i < max_it; i++) { - for (int j = 0; j < DOF; ++j) { - x_plus_step[j] = X[j] + (step_size * search_direction[j]); - } - if (func(X) - func(x_plus_step) >= step_size * -c * m) { - return step_size; - } - else { - step_size *= tau; - } - } - return 0; - } - ~BacktrackingLineSearch() { - free(x_plus_step); - free(search_direction); - } -private: - T* X; - T* G; - T* x_plus_step; - T* search_direction; - int DOF; - function func; + std::function grad; T dot_product(T* a, T* b, int n) { T result = 0; @@ -300,7 +211,6 @@ class BacktrackingLineSearch { } return result; } - }; -#endif //LBFGS_H +#endif \ No newline at end of file diff --git a/src/update/minimize.cu b/src/update/minimize.cu index 71547c1..e1362d7 100644 --- a/src/update/minimize.cu +++ b/src/update/minimize.cu @@ -12,6 +12,9 @@ #include "main/real3.h" +#include +#include "msld/msld.h" + __global__ void no_mass_weight_kernel(int N,real* masses,real* ones) @@ -120,7 +123,35 @@ void State::min_move(int step,int nsteps,System *system) real scaling, rescaling; real frac; - if (r->minType==esd) { + if (r->minType==elbfgs){ + if (system->id==0){ + recv_energy(); + if (system->verbose>0) display_nrg(system); + currEnergy=energy[eepotential]; + if (step == 0){ + // Function called by LBFGS class to get energy & gradient + std::function grad = [system](real* X, real* G){ + // Copy X onto CUDA device + int DOF = system->state->atomCount*3 + system->msld->blockCount; + cudaMemcpy(system->state->positionBuffer_d, X, DOF*sizeof(real), cudaMemcpyDefault); + // Potential & Grad Eval + system->potential->calc_force(0, system); + // Copy grad(F(X)) into G + cudaMemcpy(G, system->state->force_d, DOF*sizeof(real), cudaMemcpyDefault); + real energy = system->state->energy[eepotential]; + return energy; + }; + + int DOF = system->state->atomCount*3 + system->msld->blockCount; + int m = 7; // Past grad depth + r->lbfgs = new LBFGS(7, DOF, grad); + // Steepest decent step + r->lbfgs->init((real*)system->state->positionBuffer, (real*) system->state->forceBuffer, 1e-2); + } + r->lbfgs->minimize_step((real*)system->state->positionBuffer, (real*) system->state->forceBuffer); + } + } + else if (r->minType==esd) { if (system->id==0) { recv_energy(); if (system->verbose>0) display_nrg(system); From 31eff07dea50433ee3aade6542662a45c38ecc45 Mon Sep 17 00:00:00 2001 From: Matthew Speranza Date: Wed, 14 May 2025 17:06:54 -0700 Subject: [PATCH 3/8] Minor changes to LBFGS --- src/update/LBFGS.h | 68 ++++++++++++++++++++---------------------- src/update/minimize.cu | 4 +-- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/src/update/LBFGS.h b/src/update/LBFGS.h index 6697b27..d9067f6 100644 --- a/src/update/LBFGS.h +++ b/src/update/LBFGS.h @@ -21,27 +21,21 @@ template class LBFGS { min_step_size = sizeof(T) == 4 ? 1e-4 : 1e-7; max_step_size = 1.0; rho = (T*) calloc(m, sizeof(T)); - alpha = (T*) malloc(m*sizeof(T)); - gamma = (T*) malloc(sizeof(T)); - beta = (T*) malloc(sizeof(T)); + alpha = (T*) calloc(m, sizeof(T)); + gamma = (T*) calloc(m, sizeof(T)); + beta = (T*) calloc(m, sizeof(T)); step_size = 0.0; - - G = (T*) malloc(DOF*sizeof(T)); - X = (T*) malloc(DOF*sizeof(T)); + + search = (T*) calloc(DOF, sizeof(T)); + prev_positions = (T*) calloc(DOF, sizeof(T)); + prev_gradient = (T*) calloc(DOF, sizeof(T)); - search = (T*) malloc(DOF*sizeof(T)); - prev_positions = (T*)malloc(DOF * sizeof(T)); - prev_gradient = (T*)malloc(DOF * sizeof(T)); - - q = (T*) malloc(DOF*sizeof(T)); - x_plus_step = (T*) malloc(DOF*sizeof(T)); + q = (T*) calloc(DOF, sizeof(T)); + x_plus_step = (T*) calloc(DOF, sizeof(T)); s = (T*) calloc(m*DOF, sizeof(T)); y = (T*) calloc(m*DOF, sizeof(T)); } - /** - * TODO: Free all memory used by LBFGS - */ ~LBFGS() { free(rho); free(alpha); @@ -59,23 +53,26 @@ template class LBFGS { } /** - * TODO: Implement steepest decent step - * Sets variables and returns 1 + * Steepest decent step + * * @param X Initial Position * @param G Initial Position Grad */ void init(T* X, T* G, T steepest_descent_step_size) { + // f(X0), g(X0) + grad(X, G); + // Update for (int i = 0; i < DOF; ++i) { prev_positions[i] = X[i]; prev_gradient[i] = G[i]; } for (int i = 0; i < DOF; ++i) { - X[i] -= steepest_descent_step_size * G[i]; s[i + (m - 1) * DOF] = X[i] - prev_positions[i]; y[i + (m - 1) * DOF] = G[i] - prev_gradient[i]; - this->X[i] = X[i]; + X[i] -= steepest_descent_step_size * G[i]; } - grad(this->X, this->G); + // f(X0-size*G), g(X0-size*G) + grad(X, G); } /** @@ -94,18 +91,16 @@ void init(T* X, T* G, T steepest_descent_step_size) { * gamma = s.i.T * y.i / y.i.T * y.i * rho.j = 1 / (y.j.T * s.j) */ -void minimize_step(T* X_input, T* G_input) { +void minimize_step(T* X, T* G) { if (minimized){ return; } - for (int i = 0; i < DOF; ++i) { - X[i] = X_input[i]; - G[i] = G_input[i]; - } + // Eval function at X + T p0 = grad(X, G); for (int i = 0; i < DOF; ++i) { q[i] = G[i]; } - update(); + update(X, G); for (int i = m - 1; i >= 0; --i) { alpha[i] = rho[i] * dot_product(s + i * DOF, q, DOF); for (int j = 0; j < DOF; ++j) { @@ -118,24 +113,23 @@ void minimize_step(T* X_input, T* G_input) { } for (int i = 0; i < m; ++i) { T beta = rho[i] * dot_product(y + i * DOF, q, DOF); - for (int j = 0; j < DOF; ++j) { q[j] += (alpha[i] - beta) * s[i * DOF + j]; } } - step_size = linesearch(); + // min_a f(X + a*q) + step_size = this->linesearch(p0, X, G); std::cout << "step size: " << step_size << "\n"; + // Update system positions for (int j = 0; j < DOF; ++j) { X[j] += (step_size * -q[j]); - X_input[j] = X[j]; } } -void update() { +void update(T* X, T* G) { for (int i = 0; i < ((m - 1) * DOF); ++i) { s[i] = s[i + DOF]; y[i] = y[i + DOF]; - } for (int i = 0; i < DOF; ++i) { s[(m - 1) * DOF + i] = X[i] - prev_positions[i]; @@ -158,21 +152,23 @@ void update() { } } -T linesearch() { +T linesearch(T p0, T* X, T* G) { int max_it = 1000; T c = 0.5; T tau = 0.75; T m = dot_product(G, q, DOF); T step_size = 1; for (int i = 0; i < max_it; i++) { + T sum = 0; for (int j = 0; j < DOF; ++j) { x_plus_step[j] = X[j] - (step_size * q[j]); } - if (grad(X, G) - grad(x_plus_step, G) >= step_size * -c * m) { + T new_value = grad(x_plus_step, G); + if (p0 - new_value >= step_size * -c * m) { return step_size; - } - else { + } else { step_size *= tau; + printf("Step_size: %f, p0: %f, new: %f, x+s: %f\n", step_size, p0, new_value, q[50]); } } return 0; @@ -213,4 +209,4 @@ T linesearch() { } }; -#endif \ No newline at end of file +#endif // LBFGS \ No newline at end of file diff --git a/src/update/minimize.cu b/src/update/minimize.cu index e1362d7..51bd33b 100644 --- a/src/update/minimize.cu +++ b/src/update/minimize.cu @@ -130,7 +130,7 @@ void State::min_move(int step,int nsteps,System *system) currEnergy=energy[eepotential]; if (step == 0){ // Function called by LBFGS class to get energy & gradient - std::function grad = [system](real* X, real* G){ + std::function energy_and_grad = [system](real* X, real* G){ // Copy X onto CUDA device int DOF = system->state->atomCount*3 + system->msld->blockCount; cudaMemcpy(system->state->positionBuffer_d, X, DOF*sizeof(real), cudaMemcpyDefault); @@ -144,7 +144,7 @@ void State::min_move(int step,int nsteps,System *system) int DOF = system->state->atomCount*3 + system->msld->blockCount; int m = 7; // Past grad depth - r->lbfgs = new LBFGS(7, DOF, grad); + r->lbfgs = new LBFGS(7, DOF, energy_and_grad); // Steepest decent step r->lbfgs->init((real*)system->state->positionBuffer, (real*) system->state->forceBuffer, 1e-2); } From f3cd0cfbda7583610b027ce9d132af545914c18e Mon Sep 17 00:00:00 2001 From: Matthew Speranza Date: Tue, 16 Dec 2025 20:38:13 -0800 Subject: [PATCH 4/8] Updated L-BFGS implementation to work on floating point position and gradient buffers directly. Still working out bugs. --- src/run/run.cu | 10 +- src/run/run.h | 5 +- src/system/potential.cxx | 1 + src/update/LBFGS.h | 212 --------------------------------------- src/update/lbfgs.cu | 156 ++++++++++++++++++++++++++++ src/update/lbfgs.h | 38 +++++++ src/update/minimize.cu | 62 ++++++++---- 7 files changed, 246 insertions(+), 238 deletions(-) delete mode 100644 src/update/LBFGS.h create mode 100644 src/update/lbfgs.cu create mode 100644 src/update/lbfgs.h diff --git a/src/run/run.cu b/src/run/run.cu index 4559e6b..c33df91 100644 --- a/src/run/run.cu +++ b/src/run/run.cu @@ -359,14 +359,14 @@ void Run::set_variable(char *line,char *token,System *system) dxRMSInit=io_nextf(line)*ANGSTROM; } else if (strcmp(token,"mintype")==0) { std::string minString=io_nexts(line); - if (strcmp(minString.c_str(), "lbfgs") == 0){ + if (strcmp(minString.c_str(), "lbfgs")==0){ minType=elbfgs; } else if (strcmp(minString.c_str(),"sd")==0) { minType=esd; } else if (strcmp(minString.c_str(),"sdfd")==0) { minType=esdfd; } else { - fatal(__FILE__,__LINE__,"Unrecognized token %s for minimization type minType. Options are: sd or sdfd\n",minString.c_str()); + fatal(__FILE__,__LINE__,"Unrecognized token %s for minimization type minType. Options are: lbfgs, sd, or sdfd\n",minString.c_str()); } } else if (strcmp(token,"domdecheuristic")==0) { domdecHeuristic=io_nextb(line); @@ -503,6 +503,12 @@ void Run::minimize(char *line,char *token,System *system) gpuCheck(cudaPeekAtLastError()); } + if(system->run->minType==elbfgs){ + printf("L-BFGS took %d force evaluations in %d steps.\n", system->run->lbfgs_energy_evals, system->run->lbfgs->step); + printf("U0: %f, Uf: %f, Uf - U0: %f\n", + system->run->lbfgs->U0, system->run->lbfgs->Uf, system->run->lbfgs->Uf - system->run->lbfgs->U0); + } + system->state->min_dest(system); dynamics_finalize(system); } diff --git a/src/run/run.h b/src/run/run.h index 15fcd38..e8269e6 100644 --- a/src/run/run.h +++ b/src/run/run.h @@ -8,7 +8,7 @@ #include "xdr/xdrfile.h" #include "xdr/xdrfile_xtc.h" -#include "update/LBFGS.h" +#include "update/lbfgs.h" // Forward delcaration class System; @@ -59,7 +59,8 @@ class Run { EMin minType; // minimization scheme // L-BFGS variables - LBFGS* lbfgs; + LBFGS* lbfgs; + int lbfgs_energy_evals = 0; real betaEwald; real rCut; diff --git a/src/system/potential.cxx b/src/system/potential.cxx index 4fdd10b..93d75c4 100644 --- a/src/system/potential.cxx +++ b/src/system/potential.cxx @@ -1617,6 +1617,7 @@ void Potential::calc_force(int step,System *system) if (system->run->freqNPT>0) { calcEnergy=(calcEnergy||(step%system->run->freqNPT==0)); } + calcEnergy=true; #ifdef REPLICAEXCHANGE if (system->run->freqREx>0) { calcEnergy=(calcEnergy||(step%system->run->freqREx==0)); diff --git a/src/update/LBFGS.h b/src/update/LBFGS.h deleted file mode 100644 index d9067f6..0000000 --- a/src/update/LBFGS.h +++ /dev/null @@ -1,212 +0,0 @@ - -#ifndef LBFGS_H -#define LBFGS_H - -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -template class LBFGS { - static_assert(std::is_floating_point::value, "L-BFGS can only be used with floating point types"); - -public: - LBFGS(int m, int DOF, std::function user_grad) : m(m), DOF(DOF), grad(user_grad) { - minimized = false; - min_step_size = sizeof(T) == 4 ? 1e-4 : 1e-7; - max_step_size = 1.0; - rho = (T*) calloc(m, sizeof(T)); - alpha = (T*) calloc(m, sizeof(T)); - gamma = (T*) calloc(m, sizeof(T)); - beta = (T*) calloc(m, sizeof(T)); - step_size = 0.0; - - search = (T*) calloc(DOF, sizeof(T)); - prev_positions = (T*) calloc(DOF, sizeof(T)); - prev_gradient = (T*) calloc(DOF, sizeof(T)); - - q = (T*) calloc(DOF, sizeof(T)); - x_plus_step = (T*) calloc(DOF, sizeof(T)); - s = (T*) calloc(m*DOF, sizeof(T)); - y = (T*) calloc(m*DOF, sizeof(T)); - } - - ~LBFGS() { - free(rho); - free(alpha); - free(gamma); - free(beta); - free(G); - free(X); - free(search); - free(prev_positions); - free(prev_gradient); - free(q); - free(x_plus_step); - free(s); - free(y); - } - -/** - * Steepest decent step - * - * @param X Initial Position - * @param G Initial Position Grad - */ -void init(T* X, T* G, T steepest_descent_step_size) { - // f(X0), g(X0) - grad(X, G); - // Update - for (int i = 0; i < DOF; ++i) { - prev_positions[i] = X[i]; - prev_gradient[i] = G[i]; - } - for (int i = 0; i < DOF; ++i) { - s[i + (m - 1) * DOF] = X[i] - prev_positions[i]; - y[i + (m - 1) * DOF] = G[i] - prev_gradient[i]; - X[i] -= steepest_descent_step_size * G[i]; - } - // f(X0-size*G), g(X0-size*G) - grad(X, G); -} - -/** - * 1. Compute new L-BFGS step direction - * Pseudocode from wikipedia: - * q = g.i // search direction to be updated - * for j = i-1 to i-m: - * alpha.j = rho.j * s.j.T * q // dot product - * q = q - alpha.j * y.j // vector scale & subtraction - * gamma.i = s.i-1.T * y.i-1 / y.i-1.T * y.i-1 // dot products in numerator and denominator - * q = gamma.i * q - * for j = i-m to i-1: - * beta = rho.j * y.j.T * q // dot product - * q = q + (alpha.j - beta) * s.j // vector scale & addition - * q = -q // negate applied above instead of here most likely - * gamma = s.i.T * y.i / y.i.T * y.i - * rho.j = 1 / (y.j.T * s.j) - */ -void minimize_step(T* X, T* G) { - if (minimized){ - return; - } - // Eval function at X - T p0 = grad(X, G); - for (int i = 0; i < DOF; ++i) { - q[i] = G[i]; - } - update(X, G); - for (int i = m - 1; i >= 0; --i) { - alpha[i] = rho[i] * dot_product(s + i * DOF, q, DOF); - for (int j = 0; j < DOF; ++j) { - q[j] -= alpha[i] * y[i * DOF + j]; - } - } - T gamma = dot_product(s + (m - 1) * DOF, y + (m - 1) * DOF, DOF) / dot_product(y + (m - 1) * DOF, y + (m - 1) * DOF, DOF); - for (int j = 0; j < DOF; ++j) { - q[j] *= gamma; - } - for (int i = 0; i < m; ++i) { - T beta = rho[i] * dot_product(y + i * DOF, q, DOF); - for (int j = 0; j < DOF; ++j) { - q[j] += (alpha[i] - beta) * s[i * DOF + j]; - } - } - // min_a f(X + a*q) - step_size = this->linesearch(p0, X, G); - std::cout << "step size: " << step_size << "\n"; - // Update system positions - for (int j = 0; j < DOF; ++j) { - X[j] += (step_size * -q[j]); - } -} - -void update(T* X, T* G) { - for (int i = 0; i < ((m - 1) * DOF); ++i) { - s[i] = s[i + DOF]; - y[i] = y[i + DOF]; - } - for (int i = 0; i < DOF; ++i) { - s[(m - 1) * DOF + i] = X[i] - prev_positions[i]; - y[(m - 1) * DOF + i] = G[i] - prev_gradient[i]; - } - for (int i = 0; i < m - 1; ++i) { - rho[i] = rho[i + 1]; - } - double s_dot_y = dot_product((s + (m - 1) * DOF), (y + (m - 1) * DOF), DOF); - if (s_dot_y == 0) { - std::cout << "Error: Dividing by zero. Function is likely already at a minimum.\n"; - minimized = true; - } - else { - rho[m - 1] = 1 / s_dot_y; - } - for (int i = 0; i < DOF; ++i) { - prev_positions[i] = X[i]; - prev_gradient[i] = G[i]; - } -} - -T linesearch(T p0, T* X, T* G) { - int max_it = 1000; - T c = 0.5; - T tau = 0.75; - T m = dot_product(G, q, DOF); - T step_size = 1; - for (int i = 0; i < max_it; i++) { - T sum = 0; - for (int j = 0; j < DOF; ++j) { - x_plus_step[j] = X[j] - (step_size * q[j]); - } - T new_value = grad(x_plus_step, G); - if (p0 - new_value >= step_size * -c * m) { - return step_size; - } else { - step_size *= tau; - printf("Step_size: %f, p0: %f, new: %f, x+s: %f\n", step_size, p0, new_value, q[50]); - } - } - return 0; -} - -private: - int m; // Number of previous gradients to use for hessian approximation (5-7) - int DOF; // Degrees of freedom - T min_step_size; // terminate minimization with steps smaller than this number - T max_step_size; // ensures that lbfgs doesn't overshoot minimum - bool minimized; - - T *beta; - T *gamma; // s projected onto y - T *rho; // [m] rho_{i} = (s_{i}^T * y_{i} - T *alpha; // [m] alpha_{i} = rho_{i} * s_{i}^T * y_{i} - - T *X; - T *G; - T *search; // [DOF] L-BFGS search direction - T *prev_positions; // [DOF] x_{i-1} - T *prev_gradient; // [DOF] g_{i-1} - - T* q; - T* x_plus_step; - T *s; // [m*DOF] x_{i+1} - x_{i} = s_{i} - T *y; // [m*DOF] grad_{i+1} - grad_{i} = y_{i} - T step_size; - - std::function grad; - - T dot_product(T* a, T* b, int n) { - T result = 0; - for (int i = 0; i < n; ++i) { - result += a[i] * b[i]; - } - return result; - } -}; - -#endif // LBFGS \ No newline at end of file diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu new file mode 100644 index 0000000..2bc2528 --- /dev/null +++ b/src/update/lbfgs.cu @@ -0,0 +1,156 @@ +#include +#include +#include +#include +#include +#include + +#include +#include "update/lbfgs.h" +#include "main/real3.h" // for real_sum_reduce + +// C = u*A + w*(d^n)*B, C and A and B can be related +__global__ void vector_add( + int N, real u, real *A, real w, real* d, real n, real *B, + real* C) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + if(d){ + C[i] = u * A[i] + w * pow(*d, n) * B[i]; + } else { + C[i] = u * A[i] + w * B[i]; + } + } +} + + // *dot = w*u^p*(A).(B), w is cpu real, u is gpu real, p is cpu real +__global__ void dot_product( + int N, real w, real* u, real p, real *A, real *B, + real* dot) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + real lDot = 0; + extern __shared__ real sDot[]; + if(i == 0){ + *dot = 0; + } + if (i < N) { + if(u){ + lDot = w*powf(*u, p)*A[i]*B[i]; + } else { + lDot = w*A[i]*B[i]; + } + } + + real_sum_reduce(lDot, sDot, dot); +} + +// +LBFGS::LBFGS(int m, int DOF, std::function user_grad, real *position, real *gradient) + : m(m), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) { + step = 0; + cudaMalloc(&tmp_d, sizeof(real)); + cudaMalloc(&gamma_d, sizeof(real)); + cudaMalloc(&rho_inv_d, m*sizeof(real)); + cudaMalloc(&alpha_d, m*sizeof(real)); + cudaMalloc(&beta_d, m*sizeof(real)); + cudaMalloc(&q_d, DOF*sizeof(real)); + cudaMalloc(&prev_positions_d, DOF*sizeof(real)); + cudaMalloc(&prev_gradient_d, DOF*sizeof(real)); + cudaMalloc(&s_d, m*DOF*sizeof(real)); + cudaMalloc(&y_d, m*DOF*sizeof(real)); + + U0 = user_grad(); // first energy call by lbfgs +} + +LBFGS::~LBFGS() { + if (tmp_d) cudaFree(tmp_d); + if (rho_inv_d) cudaFree(rho_inv_d); + if (alpha_d) cudaFree(alpha_d); + if (gamma_d) cudaFree(gamma_d); + if (beta_d) cudaFree(beta_d); + if (prev_positions_d) cudaFree(prev_positions_d); + if (prev_gradient_d) cudaFree(prev_gradient_d); + if (q_d) cudaFree(q_d); + if (s_d) cudaFree(s_d); + if (y_d) cudaFree(y_d); +} + +void LBFGS::minimize_step(real f0) { // f0 from outer loop + cudaMemcpy(q_d, G_d, DOF*sizeof(real), cudaMemcpyDefault); // G from outer loop + //vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, G_d, 0, NULL, 1, q_d, q_d); + if(step != 0 && m != 0) update_sk(); + // Two-loop recursion - Ch7.4, p178 of Nocedal & Wright + for (int i = step-1; i >= step-m; i--) { + if(i < 0) break; + int index = i % m; + // alpha.i = rho.i*s.i*q + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, rho_inv_d+index, -1, s_d+index*DOF, q_d, alpha_d+index); + // q = q - alpha.i*y.i + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, 1, y_d+index*DOF, q_d); + } + if(step != 0 && m != 0){ + int index = (step-1) % m; + // norm = y.i*y.i + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, y_d+index*DOF, y_d+index*DOF, tmp_d); + // dot = s.i*y.i/norm + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, tmp_d, -1, s_d+index*DOF, y_d+index*DOF, gamma_d); + // q = gamma*q + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 0, q_d, 1, gamma_d, 1, q_d, q_d); + } + for (int i = step-m; i < step; i++) { + if(step == 0) break; + if(i < 0) continue; + int index = i % m; + // B = rho.i*y.i*q + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, rho_inv_d+index, -1, y_d+index*DOF, q_d, tmp_d); + // q = q + s.i*alpha.i + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, +1, alpha_d+index, 1, s_d+index*DOF, q_d); + // q = q - B*s.i + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, tmp_d, 1, s_d+index*DOF, q_d); + } + // q = -q + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, 0, NULL, 1, q_d, q_d); + // min_a f(X + a*q) + linesearch(f0); // backtracking, leaves X & G at final X+a*q + step++; +} + +void LBFGS::update_sk() { + int index = (step - 1) % m; + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, 1, prev_positions_d, s_d+index*DOF); + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, G_d, -1, NULL, 1, prev_gradient_d, y_d+index*DOF); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, y_d+index*DOF, s_d+index*DOF, rho_inv_d+index); + + cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real), cudaMemcpyDefault); + cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real), cudaMemcpyDefault); +} + +// Ch3.2, p37 of Nocedal & Wright +real LBFGS::linesearch(real f0) { + int max_it = 15; + real a = 5e-3; // max step size + real a_prev = 0; + real c1 = 1e-4; // Armijo cond + real tau = 0.25; // labeled rho in reference + real fi = 0; + real m0; + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, G_d, q_d, tmp_d); // df(0)/da + cudaMemcpy(&m0, tmp_d, sizeof(real), cudaMemcpyDefault); + for (int i = 0; i < max_it; i++) { + // Bump positions back and forth + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, (a-a_prev), NULL, 1, q_d, X_d); + fi = system_grad(); // update G + printf("a: %f, U0-Uf: %f, m0: %f\n", a, f0-fi, m0); + if (fi <= f0 + c1*a*m0) { + Uf = fi; + return a; // position & gradient already updated at new point) + } else { + a_prev = a; + a *= tau; + } + } + // Linesearch failed + printf("Linesearch Failed!\n", a); // a0*tau^(max_iter) + Uf = fi; + return a; +} \ No newline at end of file diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h new file mode 100644 index 0000000..bc0d757 --- /dev/null +++ b/src/update/lbfgs.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include "main/defines.h" + +class LBFGS { +public: + int m; // Number of previous gradients to use for hessian approximation (5-7) + int step; + bool minimized; + real U0; + real Uf; + + LBFGS(int m, int DOF, std::function user_grad, real* position, real* gradient); + ~LBFGS(); + void minimize_step(real f0); + +private: + int DOF; // Degrees of freedom to minimize + // All data stored on GPU + real *beta_d; + real *tmp_d; // 1 real temp storage + real *gamma_d; // sk-1 projected onto yk-1 + real *rho_inv_d; // [m] rho_inv[i] = s[i]^T * y[i] + real *alpha_d; // [m] alpha[i] = rho[i] * s[i]^T * y[i] + real *search_d; // [DOF] L-BFGS search direction + real *prev_positions_d; // [DOF] x[i-1] + real *prev_gradient_d; // [DOF] g[i-1] + real *X_d; // points to user GPU memory, positions to be optimized + real *G_d; // points to user GPU memory + real *q_d; + real *s_d; // [m, DOF] s[i] = x[i+1] - x[i] (1D array) + real *y_d; // [m, DOF] y[i] = grad[i+1] - grad[i] (1D array) + std::function system_grad; + + void update_sk(); + real linesearch(real f0); +}; \ No newline at end of file diff --git a/src/update/minimize.cu b/src/update/minimize.cu index 51bd33b..19a23b8 100644 --- a/src/update/minimize.cu +++ b/src/update/minimize.cu @@ -85,6 +85,18 @@ __global__ void sdfd_dotproduct_kernel(int N,struct LeapState ls,real_v *minDire real_sum_reduce(3*lDot/N,sDot,dot); } +__global__ void double_convert_float(int N, real_x* doubleBuffer, real* floatBuffer, bool direction) +{ + int i=blockIdx.x*blockDim.x+threadIdx.x; + if (irun->minType==esdfd) { cudaMalloc(&minDirection_d,3*atomCount*sizeof(real_v)); } + if (system->run->minType==elbfgs){ + // Function called by L-BFGS class to get energy & gradient + std::function energy_and_grad = [system](){ + // Copy X onto double precision buffer + int DOF = system->state->atomCount*3; + int shift = system->state->lambdaCount; + double_convert_float<<<(system->state->atomCount+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>( + DOF, system->state->positionBuffer_d+shift, system->state->positionBuffer_fd+shift, false); + // Potential & Grad Eval -> step=0 to calc energy + system->domdec->update_domdec(system,true); + system->potential->calc_force(0, system); + // grad(F(X)) G already stored + system->state->recv_energy(); + system->run->lbfgs_energy_evals++; + return (real)system->state->energy[eepotential]; + }; + + int shift = system->state->lambdaCount; + int DOF = system->state->atomCount*3; // only minimize atom positions + int m = 7; // Past grad depth + system->state->recv_state(); + system->run->lbfgs = new LBFGS(m, DOF, energy_and_grad, system->state->positionBuffer_fd+shift, system->state->forceBuffer_d+shift); + } } void State::min_dest(System *system) @@ -112,9 +147,12 @@ void State::min_dest(System *system) if (system->run->minType==esdfd) { cudaFree(minDirection_d); } + if (system->run->minType==elbfgs){ + system->run->lbfgs->~LBFGS(); + } } -void State::min_move(int step,int nsteps,System *system) +void State::min_move(int step, int nsteps, System *system) { Run *r=system->run; real_e grads2[2]; @@ -128,27 +166,7 @@ void State::min_move(int step,int nsteps,System *system) recv_energy(); if (system->verbose>0) display_nrg(system); currEnergy=energy[eepotential]; - if (step == 0){ - // Function called by LBFGS class to get energy & gradient - std::function energy_and_grad = [system](real* X, real* G){ - // Copy X onto CUDA device - int DOF = system->state->atomCount*3 + system->msld->blockCount; - cudaMemcpy(system->state->positionBuffer_d, X, DOF*sizeof(real), cudaMemcpyDefault); - // Potential & Grad Eval - system->potential->calc_force(0, system); - // Copy grad(F(X)) into G - cudaMemcpy(G, system->state->force_d, DOF*sizeof(real), cudaMemcpyDefault); - real energy = system->state->energy[eepotential]; - return energy; - }; - - int DOF = system->state->atomCount*3 + system->msld->blockCount; - int m = 7; // Past grad depth - r->lbfgs = new LBFGS(7, DOF, energy_and_grad); - // Steepest decent step - r->lbfgs->init((real*)system->state->positionBuffer, (real*) system->state->forceBuffer, 1e-2); - } - r->lbfgs->minimize_step((real*)system->state->positionBuffer, (real*) system->state->forceBuffer); + r->lbfgs->minimize_step(currEnergy); } } else if (r->minType==esd) { From 7ec444550715780f2993dce8a1ee26a1f7dd1a07 Mon Sep 17 00:00:00 2001 From: Matthew Speranza Date: Sun, 21 Dec 2025 19:59:56 -0800 Subject: [PATCH 5/8] Fixed multiple bugs, most importantly in line search checking of wolfe cond... book reference misleading --- src/run/run.cu | 7 +- src/system/state.cxx | 1 + src/system/state.h | 1 + src/update/lbfgs.cu | 411 ++++++++++++++++++++++++++++++----------- src/update/lbfgs.h | 66 ++++--- src/update/minimize.cu | 25 +-- 6 files changed, 370 insertions(+), 141 deletions(-) diff --git a/src/run/run.cu b/src/run/run.cu index c33df91..493fa96 100644 --- a/src/run/run.cu +++ b/src/run/run.cu @@ -504,9 +504,10 @@ void Run::minimize(char *line,char *token,System *system) } if(system->run->minType==elbfgs){ - printf("L-BFGS took %d force evaluations in %d steps.\n", system->run->lbfgs_energy_evals, system->run->lbfgs->step); - printf("U0: %f, Uf: %f, Uf - U0: %f\n", - system->run->lbfgs->U0, system->run->lbfgs->Uf, system->run->lbfgs->Uf - system->run->lbfgs->U0); + Run* r = system->run; + printf("L-BFGS (m=%d) did %d force evaluations in %d steps. Reset memory %d times.\n", + r->lbfgs->m, r->lbfgs_energy_evals, r->lbfgs->step_count, r->lbfgs->reset_count); + printf("U0: %f, Uf: %f, Uf - U0: %f\n", r->lbfgs->U0, r->lbfgs->Uf, r->lbfgs->Uf - r->lbfgs->U0); } system->state->min_dest(system); diff --git a/src/system/state.cxx b/src/system/state.cxx index 7635c0f..de03543 100644 --- a/src/system/state.cxx +++ b/src/system/state.cxx @@ -42,6 +42,7 @@ State::State(System *system) { cudaMalloc(&(positionBackup_d),(2*nL+3*n)*sizeof(real_x)); forceBuffer=(real_f*)calloc(rootFactor*(2*nL+3*n),sizeof(real_f)); cudaMalloc(&(forceBuffer_d),rootFactor*(2*nL+3*n)*sizeof(real_f)); + cudaMalloc(&(forceBufferX_d),rootFactor*(2*nL+3*n)*sizeof(real_x)); cudaMalloc(&(forceBackup_d),(2*nL+3*n)*sizeof(real_f)); if (system->idCount>0) { // OMP diff --git a/src/system/state.h b/src/system/state.h index 1270515..66ebf83 100644 --- a/src/system/state.h +++ b/src/system/state.h @@ -67,6 +67,7 @@ class State { real *positionBuffer_omp; real_f *forceBuffer; real_f *forceBuffer_d; + real_x *forceBufferX_d; real_f *forceBackup_d; // For NPT real_f *forceBuffer_omp; diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu index 2bc2528..858d0aa 100644 --- a/src/update/lbfgs.cu +++ b/src/update/lbfgs.cu @@ -4,153 +4,356 @@ #include #include #include +#include #include #include "update/lbfgs.h" #include "main/real3.h" // for real_sum_reduce -// C = u*A + w*(d^n)*B, C and A and B can be related -__global__ void vector_add( - int N, real u, real *A, real w, real* d, real n, real *B, - real* C) { +/* + Cuda Kernels +*/ + +// d = 1/sqrt(d) - element-wise +__global__ void inv_sqrt(int N, real_x *d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + d[i] = 1.0/sqrt(d[i]); + } +} + +// d = 1/d - element-wise +__global__ void recip(int N, real_x *d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + d[i] = 1.0/d[i]; + } +} + +// d = abs(d) - element-wise +__global__ void vector_abs(int N, real_x *d){ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + d[i] = abs(d[i]); + } +} + +// C = u*A + w*d*B, C and A and B can be related +__global__ void vector_add(int N, real_x u, real_x *A, real_x w, real_x *d, real_x *B, real_x *C) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < N) { if(d){ - C[i] = u * A[i] + w * pow(*d, n) * B[i]; + C[i] = u * A[i] + w * d[0] * B[i]; } else { C[i] = u * A[i] + w * B[i]; } } } - // *dot = w*u^p*(A).(B), w is cpu real, u is gpu real, p is cpu real -__global__ void dot_product( - int N, real w, real* u, real p, real *A, real *B, - real* dot) { +// C = A[0]*B - element-wise +__global__ void vector_scale(int N, real_x *A, real_x *B, real_x *C){ int i = blockIdx.x * blockDim.x + threadIdx.x; - real lDot = 0; - extern __shared__ real sDot[]; - if(i == 0){ - *dot = 0; + if (i < N) { + C[i] = A[0]*B[i]; + } +} + +// B = c*A - element-wise +__global__ void vector_scale(int N, real_x c, real_x *A, real_x *B){ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + B[i] = c*A[i]; } +} + +__global__ void dot_product(int N, real_x *A, real_x *B, real_x* dot) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + real_x lDot = 0; + extern __shared__ real sDot[]; if (i < N) { - if(u){ - lDot = w*powf(*u, p)*A[i]*B[i]; - } else { - lDot = w*A[i]*B[i]; - } + lDot = A[i]*B[i]; } - real_sum_reduce(lDot, sDot, dot); + real_sum_reduce((real)lDot, sDot, dot); } -// -LBFGS::LBFGS(int m, int DOF, std::function user_grad, real *position, real *gradient) +/* + Class setup +*/ + +// +LBFGS::LBFGS(int m, int DOF, std::function user_grad, real_x *position, real_x *gradient) : m(m), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) { - step = 0; - cudaMalloc(&tmp_d, sizeof(real)); - cudaMalloc(&gamma_d, sizeof(real)); - cudaMalloc(&rho_inv_d, m*sizeof(real)); - cudaMalloc(&alpha_d, m*sizeof(real)); - cudaMalloc(&beta_d, m*sizeof(real)); - cudaMalloc(&q_d, DOF*sizeof(real)); - cudaMalloc(&prev_positions_d, DOF*sizeof(real)); - cudaMalloc(&prev_gradient_d, DOF*sizeof(real)); - cudaMalloc(&s_d, m*DOF*sizeof(real)); - cudaMalloc(&y_d, m*DOF*sizeof(real)); + k = 0; + cudaMalloc(&tmp_d, sizeof(real_x)); + cudaMalloc(&gamma_d, sizeof(real_x)); + cudaMalloc(&rho_d, m*sizeof(real_x)); + cudaMalloc(&alpha_d, m*sizeof(real_x)); + cudaMalloc(&q_d, DOF*sizeof(real_x)); + cudaMalloc(&prev_positions_d, DOF*sizeof(real_x)); + cudaMalloc(&prev_gradient_d, DOF*sizeof(real_x)); + cudaMalloc(&s_d, m*DOF*sizeof(real_x)); + cudaMalloc(&y_d, m*DOF*sizeof(real_x)); + cudaMalloc(&s_tmp_d, DOF*sizeof(real_x)); + cudaMalloc(&y_tmp_d, DOF*sizeof(real_x)); U0 = user_grad(); // first energy call by lbfgs + + // Normalize first s.d. step + gamma_norm(); + + // Set up s.d. step + cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); + vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, gamma_d, q_d, q_d); + vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d); } LBFGS::~LBFGS() { if (tmp_d) cudaFree(tmp_d); - if (rho_inv_d) cudaFree(rho_inv_d); + if (rho_d) cudaFree(rho_d); if (alpha_d) cudaFree(alpha_d); if (gamma_d) cudaFree(gamma_d); - if (beta_d) cudaFree(beta_d); if (prev_positions_d) cudaFree(prev_positions_d); if (prev_gradient_d) cudaFree(prev_gradient_d); if (q_d) cudaFree(q_d); if (s_d) cudaFree(s_d); if (y_d) cudaFree(y_d); + if (s_tmp_d) cudaFree(s_tmp_d); + if (y_tmp_d) cudaFree(y_tmp_d); } -void LBFGS::minimize_step(real f0) { // f0 from outer loop - cudaMemcpy(q_d, G_d, DOF*sizeof(real), cudaMemcpyDefault); // G from outer loop - //vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, G_d, 0, NULL, 1, q_d, q_d); - if(step != 0 && m != 0) update_sk(); - // Two-loop recursion - Ch7.4, p178 of Nocedal & Wright - for (int i = step-1; i >= step-m; i--) { - if(i < 0) break; +/* + L-BFGS iteration +*/ + +bool LBFGS::check_convergence(){ + // return true if rmsg minimized + real_x grad_norm = 0; + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d); + cudaMemcpy(&grad_norm, tmp_d, sizeof(real_x), cudaMemcpyDefault); + grad_norm = sqrt(grad_norm); + return grad_norm < eps; +} + +// set gamma = 1/|g_d| +void LBFGS::gamma_norm(){ + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d); + inv_sqrt<<<1,1>>>(1, tmp_d); + cudaMemcpy(gamma_d, tmp_d, sizeof(real_x), cudaMemcpyDefault); +} + +// Ch7.4, p178 of Nocedal & Wright (Algorithm 7.4) +void LBFGS::minimize_step(real_x f0) { // f0 & G filled from outer minimize loop + // Copy kth positions & gradients + cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real_x), cudaMemcpyDefault); + cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); + + // min_a f(X + a*q) + step_size = linesearch(f0); // X & G left at & evaluated at f(X+a*q) + + // Check convergence + // TODO: implement convergence checks + + // kth position and gradient deltas + update_sk_yk(); // potentially skip decrement k or set k=0 & clear s & y memory + k++; + + // Two-loop recursion + cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); + for (int i = k-1; i >= std::max(0, k-m); i--) { int index = i % m; - // alpha.i = rho.i*s.i*q - dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, rho_inv_d+index, -1, s_d+index*DOF, q_d, alpha_d+index); - // q = q - alpha.i*y.i - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, 1, y_d+index*DOF, q_d); - } - if(step != 0 && m != 0){ - int index = (step-1) % m; - // norm = y.i*y.i - dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, y_d+index*DOF, y_d+index*DOF, tmp_d); - // dot = s.i*y.i/norm - dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, tmp_d, -1, s_d+index*DOF, y_d+index*DOF, gamma_d); - // q = gamma*q - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 0, q_d, 1, gamma_d, 1, q_d, q_d); + // q = q - (rho.i*(s.i dot q))*y.i + cudaMemset(alpha_d+index, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_d+index*DOF, q_d, alpha_d+index); + vector_scale<<<1,1>>>(1, rho_d+index, alpha_d+index, alpha_d+index); + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, y_d+index*DOF, q_d); } - for (int i = step-m; i < step; i++) { - if(step == 0) break; - if(i < 0) continue; + // q = gamma.k*q = r + vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, gamma_d, q_d, q_d); + for (int i = std::max(0, k-m); i <= k-1; i++) { int index = i % m; - // B = rho.i*y.i*q - dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, rho_inv_d+index, -1, y_d+index*DOF, q_d, tmp_d); - // q = q + s.i*alpha.i - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, +1, alpha_d+index, 1, s_d+index*DOF, q_d); - // q = q - B*s.i - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, tmp_d, 1, s_d+index*DOF, q_d); + // B = rho.i*(y.i dot q) + // q = q + (alpha.i - B)*s.i = q - B*s.i + alpha.i*s.i + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_d+index*DOF, q_d, tmp_d); + vector_scale<<<1,1>>>(1, rho_d+index, tmp_d, tmp_d); + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, tmp_d, s_d+index*DOF, q_d); + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, 1, alpha_d+index, s_d+index*DOF, q_d); } + // q = -q - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, 0, NULL, 1, q_d, q_d); - // min_a f(X + a*q) - linesearch(f0); // backtracking, leaves X & G at final X+a*q - step++; -} - -void LBFGS::update_sk() { - int index = (step - 1) % m; - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, 1, prev_positions_d, s_d+index*DOF); - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, G_d, -1, NULL, 1, prev_gradient_d, y_d+index*DOF); - dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, y_d+index*DOF, s_d+index*DOF, rho_inv_d+index); - - cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real), cudaMemcpyDefault); - cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real), cudaMemcpyDefault); -} - -// Ch3.2, p37 of Nocedal & Wright -real LBFGS::linesearch(real f0) { - int max_it = 15; - real a = 5e-3; // max step size - real a_prev = 0; - real c1 = 1e-4; // Armijo cond - real tau = 0.25; // labeled rho in reference - real fi = 0; - real m0; - dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, G_d, q_d, tmp_d); // df(0)/da - cudaMemcpy(&m0, tmp_d, sizeof(real), cudaMemcpyDefault); - for (int i = 0; i < max_it; i++) { - // Bump positions back and forth - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, (a-a_prev), NULL, 1, q_d, X_d); - fi = system_grad(); // update G - printf("a: %f, U0-Uf: %f, m0: %f\n", a, f0-fi, m0); - if (fi <= f0 + c1*a*m0) { - Uf = fi; - return a; // position & gradient already updated at new point) + vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d); + + step_count++; +} + +void LBFGS::update_sk_yk() { + if(m == 0) { // Steepest decent, normalize step size via gamma + gamma_norm(); + return; + } + int index = k % m; + + // s & y tmp + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, prev_positions_d, s_tmp_d); + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, G_d, -1, NULL, prev_gradient_d, y_tmp_d); + + real_x yy = 0; + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_tmp_d, y_tmp_d, tmp_d); + cudaMemcpy(&yy, tmp_d, sizeof(real_x), cudaMemcpyDefault); + + real_x sy = 0; + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_tmp_d, y_tmp_d, tmp_d); + cudaMemcpy(&sy, tmp_d, sizeof(real_x), cudaMemcpyDefault); + + // Check curvature s.T H s = s.T y > 0 for positive def matrix satisfying secant eq Hs=y (required for L-BFGS) + if(sy < 1e-6){ + printf("Curvature condition (sy = %e) not satistied! Clearing L-BFGS memory!\n", sy); + cudaMemset(s_d, 0, m*DOF*sizeof(real_x)); + cudaMemset(y_d, 0, m*DOF*sizeof(real_x)); + gamma_norm(); + k = -1; + reset_count++; + return; + } + + // rho = 1/y.s + real_x rho = 1.0/sy; + // gamma_k = s.y/(y.y) + real_x gamma = sy/yy; + cudaMemcpy(gamma_d, &gamma, sizeof(real_x), cudaMemcpyDefault); + cudaMemcpy(rho_d+index, &rho, sizeof(real_x), cudaMemcpyDefault); + cudaMemcpy(s_d + index*DOF, s_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault); + cudaMemcpy(y_d + index*DOF, y_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault); + printf("iter: %d, k: %d, index: %d, rho: %f, gamma: %f\n", step_count, k, index, rho, gamma); +} + +/* + Line Search +*/ + +// return [f(X + alpha*p), df(X+alpha*p)/da] +void LBFGS::phi(real_x alpha, real_x* result, bool leave_x){ + cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault); + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, alpha, NULL, q_d, X_d); + result[0] = system_grad(); + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da + cudaMemcpy(&result[1], tmp_d, sizeof(real_x), cudaMemcpyDefault); + printf("alpha: %f, phi: %f, phi': %f\n", alpha, result[0], result[1]); + if(!leave_x) { + cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault); + cudaMemcpy(G_d, prev_gradient_d, DOF*sizeof(real_x), cudaMemcpyDefault); + } +} + +real_x sign(real_x value){ + if(value < 0){ + return -1; + } + return 1; +} + +// Ch3.5, p59 of Nocedal & Wright (Equation 3.59) +real_x cubic_interp(real_x a, real_x fa, real_x ga, real_x b, real_x fb, real_x gb){ + real_x d1 = ga + gb - 3*(fb - fa)/(a-b); + real_x d2 = sign(b - a)*sqrt(d1*d1 - ga*gb); + real_x arg = b - (b-a)*(gb + d2 - d1)/(gb - ga + 2*d2); + if(isinf(arg) || isnan(arg) || abs(arg - a) < 1e-7 || abs(arg - b) < 1e-7 || arg < 0){ + printf("Quadratic interpolation failed! arg: %f, a: %f, b: %f\n", arg, a, b); + return (a + b) / 2.0; + } + return arg; +} + +// Ch3.5, p61 of Nocedal & Wright (Algorithm 3.6) +real_x LBFGS::zoom(real_x al, real_x const * const phi_lower, real_x au, real_x const * const phi_upper, real_x const * const phi0){ + printf("Zoom started!\n"); + int max_iter = 5; + real_x phil[2]; + memcpy(phil, phi_lower, 2*sizeof(real_x)); + real_x phiu[2]; + memcpy(phiu, phi_upper, 2*sizeof(real_x)); + real_x phij[2]; + real_x aj; + for(int i = 0; i < max_iter; i++){ + aj = cubic_interp(al, phil[0], phil[1], au, phiu[0], phiu[1]); + phi(aj, phij, false); + if(phij[0] > phi0[0] + c1*aj*phi0[1] || phij[0] >= phil[0]){ + au = aj; + memcpy(phiu, phij, 2*sizeof(real_x)); } else { - a_prev = a; - a *= tau; + // note that checking for strong wolf is important + if (abs(phij[1]) <= c2*abs(phi0[1])){ + return aj; + } + if (phij[1]*(au - al) >= 0){ + au = al; + memcpy(phiu, phil, 2*sizeof(real_x)); + } + al = aj; + memcpy(phil, phij, 2*sizeof(real_x)); + } + } + printf("Zoom reached max iterations! aj = %f, phi: %f, phi': %f\n", aj, phij[0], phij[1]); + return aj; +} + +// Ch3.5, p60 of Nocedal & Wright (Algorithm 3.5) +real_x LBFGS::linesearch(real_x f0){ + printf("Linesearch started!\n"); + real_x max_iter = 10; + real_x aim1 = 0; + real_x ai = 1; + real_x amax = 10; + real_x tau = 1.5; + + real_x phi0[2]; + phi(0, phi0, true); + real_x phiim1[2]; + memcpy(phiim1, phi0, 2*sizeof(real_x)); + real_x phii[2]; + for(int i = 0; i < max_iter; i++){ + phi(ai, phii, false); + if (phii[0] > phi0[0] + c1*ai*phi0[1] || (phii[0] >= phiim1[0] && i > 0)){ + ai = zoom(aim1, phiim1, ai, phii, phi0); + phi(ai, phii, true); + Uf = phii[0]; + break; + } + // note that checking for strong wolf is important + if (abs(phii[1]) >= c2*abs(phi0[1])){ + phi(ai, phii, false); + Uf = phii[0]; + break; } + if (phii[1] >= 0){ + ai = zoom(aim1, phiim1, ai, phii, phi0); + phi(ai, phii, false); + Uf = phii[0]; + break; + } + aim1 = ai; + memcpy(phiim1, phii, 2*sizeof(real_x)); + ai = tau*ai; + if(ai > amax){ + ai /= tau; + printf("Reached Max Step Size!"); + phi(ai, phii, false); + break; + } + } + + // Protect from taking too small or steps that increase potential + if(ai < 1e-5 || phii[0] > phi0[0]){ + ai = 1e-4; // if g.p < 0 this will probably give ok results } - // Linesearch failed - printf("Linesearch Failed!\n", a); // a0*tau^(max_iter) - Uf = fi; - return a; + phi(ai, phii, true); // update positions & leave + Uf = phii[0]; + //printf("Linesearch reached max iterations! Defaulting to alpha: %f - phi: %f, phi': %f\n", ai, phii[0], phii[1]); + return ai; } \ No newline at end of file diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h index bc0d757..d8c46c3 100644 --- a/src/update/lbfgs.h +++ b/src/update/lbfgs.h @@ -3,36 +3,56 @@ #include #include "main/defines.h" +// Implementation based on Nocedal & Wright book with references made to OpenMM extern lib implementation class LBFGS { public: - int m; // Number of previous gradients to use for hessian approximation (5-7) - int step; + int m=5; // Number of previous gradients to use for hessian approximation (5-7) + int step_count; // total number of minimize calls + int reset_count; // number of times lbfgs memory was cleared bool minimized; - real U0; - real Uf; + real_x U0; + real_x Uf; + // Line search parameters + real_x c1 = 1e-4; // sufficient decent + real_x c2 = .9; // curvature cond. + // TODO: Implement convergence/stability checks + real_x eps = 1e-4; // Minimization criteria + real_x delta = 1e-4; // stopping criteria - LBFGS(int m, int DOF, std::function user_grad, real* position, real* gradient); + LBFGS(int m, int DOF, std::function user_grad, real_x *position, real_x *gradient); ~LBFGS(); - void minimize_step(real f0); + void minimize_step(real_x f0); private: int DOF; // Degrees of freedom to minimize + real_x step_size = 1; // previous step size // All data stored on GPU - real *beta_d; - real *tmp_d; // 1 real temp storage - real *gamma_d; // sk-1 projected onto yk-1 - real *rho_inv_d; // [m] rho_inv[i] = s[i]^T * y[i] - real *alpha_d; // [m] alpha[i] = rho[i] * s[i]^T * y[i] - real *search_d; // [DOF] L-BFGS search direction - real *prev_positions_d; // [DOF] x[i-1] - real *prev_gradient_d; // [DOF] g[i-1] - real *X_d; // points to user GPU memory, positions to be optimized - real *G_d; // points to user GPU memory - real *q_d; - real *s_d; // [m, DOF] s[i] = x[i+1] - x[i] (1D array) - real *y_d; // [m, DOF] y[i] = grad[i+1] - grad[i] (1D array) - std::function system_grad; - - void update_sk(); - real linesearch(real f0); + int k = 0; // lbfgs iteration step + real_x *tmp_d; // 1 real temperary storage + real_x *gamma_d; // s[k-1] projected onto y[k-1] + real_x *rho_d; // [m] rho[k] = 1/(s[k]^T * y[k]) + real_x *alpha_d; // [m] alpha[i] = rho[i] * s[i]^T * y[i] + real_x *search_d; // [DOF] L-BFGS search direction + real_x *prev_positions_d; // [DOF] x[k-1] + real_x *prev_gradient_d; // [DOF] g[k-1] + real_x *X_d; // points to user GPU memory, positions to be optimized + real_x *G_d; // points to user GPU memory + real_x *q_d; + real_x *s_d; // [m, DOF] s[k-1] = x[k] - x[k-1] (1D array) + real_x *y_d; // [m, DOF] y[k-1] = grad[k] - grad[k-1] (1D array) + + real_x *s_tmp_d; // [DOF] x[k] - x[k-1], used to check curvature + real_x *y_tmp_d; // [DOF] g[k] - g[k-1], used to check curvature + bool skipped_step = false; // skipped hessian update, if 2 updates are skipped, we clear s & y memory + + std::function system_grad; + + void gamma_norm(); + void update_sk_yk(); + void phi(real_x alpha, real_x *result, bool leave_x); + + bool check_convergence(); + + real_x zoom(real_x al, real_x const * const phi_lower, real_x au, real_x const * const phi_upper, real_x const * const phi0); + real_x linesearch(real_x f0); }; \ No newline at end of file diff --git a/src/update/minimize.cu b/src/update/minimize.cu index 19a23b8..9659848 100644 --- a/src/update/minimize.cu +++ b/src/update/minimize.cu @@ -85,13 +85,13 @@ __global__ void sdfd_dotproduct_kernel(int N,struct LeapState ls,real_v *minDire real_sum_reduce(3*lDot/N,sDot,dot); } -__global__ void double_convert_float(int N, real_x* doubleBuffer, real* floatBuffer, bool direction) +__global__ void double_onto_float(int N, real_x* doubleBuffer, real* floatBuffer, bool direction) { int i=blockIdx.x*blockDim.x+threadIdx.x; if (irun->minType==elbfgs){ // Function called by L-BFGS class to get energy & gradient - std::function energy_and_grad = [system](){ - // Copy X onto double precision buffer + std::function energy_and_grad = [system](){ + // Copy double precision onto float precision buffer int DOF = system->state->atomCount*3; int shift = system->state->lambdaCount; - double_convert_float<<<(system->state->atomCount+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>( - DOF, system->state->positionBuffer_d+shift, system->state->positionBuffer_fd+shift, false); + double_onto_float<<<(DOF+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>( + DOF, system->state->positionBuffer_d+shift, system->state->positionBuffer_fd+shift, true); // Potential & Grad Eval -> step=0 to calc energy - system->domdec->update_domdec(system,true); + //system->domdec->update_domdec(system,true); system->potential->calc_force(0, system); // grad(F(X)) G already stored system->state->recv_energy(); + // Copy float precision onto double precision buffer + double_onto_float<<<(DOF+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>( + DOF, system->state->forceBufferX_d+shift, system->state->forceBuffer_d+shift, false); system->run->lbfgs_energy_evals++; - return (real)system->state->energy[eepotential]; + return system->state->energy[eepotential]; }; int shift = system->state->lambdaCount; int DOF = system->state->atomCount*3; // only minimize atom positions - int m = 7; // Past grad depth + int m = 10; // Past grad depth system->state->recv_state(); - system->run->lbfgs = new LBFGS(m, DOF, energy_and_grad, system->state->positionBuffer_fd+shift, system->state->forceBuffer_d+shift); + system->run->lbfgs = new LBFGS(m, DOF, energy_and_grad, system->state->positionBuffer_d+shift, system->state->forceBufferX_d+shift); } } From be90583be0d97ced2d2d3105b060a9dcb39db316 Mon Sep 17 00:00:00 2001 From: Matthew Speranza Date: Mon, 22 Dec 2025 14:55:44 -0800 Subject: [PATCH 6/8] Added script options for lbfgs and minimize checks on grms --- src/run/run.cu | 4 + src/run/run.h | 2 + src/update/lbfgs.cu | 166 +++++++++++++++++------------------------ src/update/lbfgs.h | 32 ++++---- src/update/minimize.cu | 6 +- 5 files changed, 94 insertions(+), 116 deletions(-) diff --git a/src/run/run.cu b/src/run/run.cu index 493fa96..b73bce3 100644 --- a/src/run/run.cu +++ b/src/run/run.cu @@ -357,6 +357,10 @@ void Run::set_variable(char *line,char *token,System *system) dxAtomMax=io_nextf(line)*ANGSTROM; } else if (strcmp(token,"dxrmsinit")==0) { dxRMSInit=io_nextf(line)*ANGSTROM; + } else if (strcmp(token, "lbfgs_m")==0){ + lbfgs_m=io_nexti(line); + } else if (strcmp(token, "lbfgs_eps")==0){ + lbfgs_eps=io_nextf(line); } else if (strcmp(token,"mintype")==0) { std::string minString=io_nexts(line); if (strcmp(minString.c_str(), "lbfgs")==0){ diff --git a/src/run/run.h b/src/run/run.h index e8269e6..184b31e 100644 --- a/src/run/run.h +++ b/src/run/run.h @@ -61,6 +61,8 @@ class Run { // L-BFGS variables LBFGS* lbfgs; int lbfgs_energy_evals = 0; + int lbfgs_m = 7; // gradient history length*DOF + real lbfgs_eps = 1; // gradient rms convergence criteria real betaEwald; real rCut; diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu index 858d0aa..46238cc 100644 --- a/src/update/lbfgs.cu +++ b/src/update/lbfgs.cu @@ -50,6 +50,15 @@ __global__ void vector_add(int N, real_x u, real_x *A, real_x w, real_x *d, real } } +// specific for 2nd lbfgs loop +// C = A + (u-w)*B, C and A and B can be related +__global__ void vector_add(int N, real_x *A, real_x *u, real_x *w, real_x *B, real_x *C) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + C[i] = A[i] + (u[0]-w[0]) * B[i]; + } +} + // C = A[0]*B - element-wise __global__ void vector_scale(int N, real_x *A, real_x *B, real_x *C){ int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -82,8 +91,8 @@ __global__ void dot_product(int N, real_x *A, real_x *B, real_x* dot) { */ // -LBFGS::LBFGS(int m, int DOF, std::function user_grad, real_x *position, real_x *gradient) - : m(m), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) { +LBFGS::LBFGS(int m, real_x eps, int DOF, std::function user_grad, real_x *position, real_x *gradient) + : m(m), eps_tol(eps), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) { k = 0; cudaMalloc(&tmp_d, sizeof(real_x)); cudaMalloc(&gamma_d, sizeof(real_x)); @@ -126,14 +135,30 @@ LBFGS::~LBFGS() { L-BFGS iteration */ -bool LBFGS::check_convergence(){ +bool LBFGS::converged(){ // return true if rmsg minimized - real_x grad_norm = 0; + rmsg = 0; + grad_mag = 0; + grad_pos_mag = 0; + cudaMemset(tmp_d, 0, sizeof(real_x)); dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d); - cudaMemcpy(&grad_norm, tmp_d, sizeof(real_x), cudaMemcpyDefault); - grad_norm = sqrt(grad_norm); - return grad_norm < eps; + cudaMemcpy(&rmsg, tmp_d, sizeof(real_x), cudaMemcpyDefault); + grad_mag = sqrt(rmsg); + rmsg = sqrt(rmsg/DOF); + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, X_d, X_d, tmp_d); + cudaMemcpy(&grad_pos_mag, tmp_d, sizeof(real_x), cudaMemcpyDefault); + grad_pos_mag = grad_mag / std::max(1.0, sqrt(grad_pos_mag)); + + printf(" rmsg = %f\n", rmsg); + printf(" |g| = %f\n", grad_mag); + printf("|g|/|x| = %f\n", grad_pos_mag); + if(rmsg < eps_tol){ + printf("Structure minimized!!\n"); + minimized=true; + } + return rmsg < eps_tol; } // set gamma = 1/|g_d| @@ -145,17 +170,15 @@ void LBFGS::gamma_norm(){ } // Ch7.4, p178 of Nocedal & Wright (Algorithm 7.4) -void LBFGS::minimize_step(real_x f0) { // f0 & G filled from outer minimize loop +void LBFGS::minimize_step(real_x f0) { // f0 & G filled from class initializer // Copy kth positions & gradients cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real_x), cudaMemcpyDefault); cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); + if(converged()) return; // min_a f(X + a*q) step_size = linesearch(f0); // X & G left at & evaluated at f(X+a*q) - // Check convergence - // TODO: implement convergence checks - // kth position and gradient deltas update_sk_yk(); // potentially skip decrement k or set k=0 & clear s & y memory k++; @@ -164,10 +187,11 @@ void LBFGS::minimize_step(real_x f0) { // f0 & G filled from outer minimize loop cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); for (int i = k-1; i >= std::max(0, k-m); i--) { int index = i % m; - // q = q - (rho.i*(s.i dot q))*y.i + // alpha.i = rho.i*(s.i dot q) cudaMemset(alpha_d+index, 0, sizeof(real_x)); dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_d+index*DOF, q_d, alpha_d+index); vector_scale<<<1,1>>>(1, rho_d+index, alpha_d+index, alpha_d+index); + // q = q - alpha.i*y.i vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, y_d+index*DOF, q_d); } // q = gamma.k*q = r @@ -175,26 +199,24 @@ void LBFGS::minimize_step(real_x f0) { // f0 & G filled from outer minimize loop for (int i = std::max(0, k-m); i <= k-1; i++) { int index = i % m; // B = rho.i*(y.i dot q) - // q = q + (alpha.i - B)*s.i = q - B*s.i + alpha.i*s.i cudaMemset(tmp_d, 0, sizeof(real_x)); dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_d+index*DOF, q_d, tmp_d); vector_scale<<<1,1>>>(1, rho_d+index, tmp_d, tmp_d); - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, tmp_d, s_d+index*DOF, q_d); - vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, 1, alpha_d+index, s_d+index*DOF, q_d); - } + // q = q + (alpha.i - B)*s.i = q - B*s.i + alpha.i*s.i + vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, q_d, alpha_d+index, tmp_d, s_d+index*DOF, q_d); + } // q = -q vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d); - step_count++; } +// this method is overly cautious & slow, likely doesn't need the cpu grad checks or resets void LBFGS::update_sk_yk() { if(m == 0) { // Steepest decent, normalize step size via gamma gamma_norm(); return; } - int index = k % m; // s & y tmp vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, prev_positions_d, s_tmp_d); @@ -204,14 +226,13 @@ void LBFGS::update_sk_yk() { cudaMemset(tmp_d, 0, sizeof(real_x)); dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_tmp_d, y_tmp_d, tmp_d); cudaMemcpy(&yy, tmp_d, sizeof(real_x), cudaMemcpyDefault); - real_x sy = 0; cudaMemset(tmp_d, 0, sizeof(real_x)); dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_tmp_d, y_tmp_d, tmp_d); cudaMemcpy(&sy, tmp_d, sizeof(real_x), cudaMemcpyDefault); // Check curvature s.T H s = s.T y > 0 for positive def matrix satisfying secant eq Hs=y (required for L-BFGS) - if(sy < 1e-6){ + if(sy < 1e-10){ printf("Curvature condition (sy = %e) not satistied! Clearing L-BFGS memory!\n", sy); cudaMemset(s_d, 0, m*DOF*sizeof(real_x)); cudaMemset(y_d, 0, m*DOF*sizeof(real_x)); @@ -226,10 +247,11 @@ void LBFGS::update_sk_yk() { // gamma_k = s.y/(y.y) real_x gamma = sy/yy; cudaMemcpy(gamma_d, &gamma, sizeof(real_x), cudaMemcpyDefault); + int index = k % m; cudaMemcpy(rho_d+index, &rho, sizeof(real_x), cudaMemcpyDefault); cudaMemcpy(s_d + index*DOF, s_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault); cudaMemcpy(y_d + index*DOF, y_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault); - printf("iter: %d, k: %d, index: %d, rho: %f, gamma: %f\n", step_count, k, index, rho, gamma); + //printf("iter: %d, k: %d, index: %d, rho: %f, gamma: %f\n", step_count, k, index, rho, gamma); } /* @@ -237,18 +259,13 @@ void LBFGS::update_sk_yk() { */ // return [f(X + alpha*p), df(X+alpha*p)/da] -void LBFGS::phi(real_x alpha, real_x* result, bool leave_x){ +void LBFGS::phi(real_x alpha, real_x* result){ cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault); vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, alpha, NULL, q_d, X_d); result[0] = system_grad(); cudaMemset(tmp_d, 0, sizeof(real_x)); dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da cudaMemcpy(&result[1], tmp_d, sizeof(real_x), cudaMemcpyDefault); - printf("alpha: %f, phi: %f, phi': %f\n", alpha, result[0], result[1]); - if(!leave_x) { - cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault); - cudaMemcpy(G_d, prev_gradient_d, DOF*sizeof(real_x), cudaMemcpyDefault); - } } real_x sign(real_x value){ @@ -260,100 +277,55 @@ real_x sign(real_x value){ // Ch3.5, p59 of Nocedal & Wright (Equation 3.59) real_x cubic_interp(real_x a, real_x fa, real_x ga, real_x b, real_x fb, real_x gb){ - real_x d1 = ga + gb - 3*(fb - fa)/(a-b); + real_x d1 = ga + gb - 3*(fa - fb)/(a-b); real_x d2 = sign(b - a)*sqrt(d1*d1 - ga*gb); real_x arg = b - (b-a)*(gb + d2 - d1)/(gb - ga + 2*d2); if(isinf(arg) || isnan(arg) || abs(arg - a) < 1e-7 || abs(arg - b) < 1e-7 || arg < 0){ - printf("Quadratic interpolation failed! arg: %f, a: %f, b: %f\n", arg, a, b); + //printf("Quadratic interpolation failed! arg: %f, a: %f, b: %f\n", arg, a, b); return (a + b) / 2.0; } return arg; } -// Ch3.5, p61 of Nocedal & Wright (Algorithm 3.6) -real_x LBFGS::zoom(real_x al, real_x const * const phi_lower, real_x au, real_x const * const phi_upper, real_x const * const phi0){ - printf("Zoom started!\n"); - int max_iter = 5; - real_x phil[2]; - memcpy(phil, phi_lower, 2*sizeof(real_x)); - real_x phiu[2]; - memcpy(phiu, phi_upper, 2*sizeof(real_x)); - real_x phij[2]; - real_x aj; - for(int i = 0; i < max_iter; i++){ - aj = cubic_interp(al, phil[0], phil[1], au, phiu[0], phiu[1]); - phi(aj, phij, false); - if(phij[0] > phi0[0] + c1*aj*phi0[1] || phij[0] >= phil[0]){ - au = aj; - memcpy(phiu, phij, 2*sizeof(real_x)); - } else { - // note that checking for strong wolf is important - if (abs(phij[1]) <= c2*abs(phi0[1])){ - return aj; - } - if (phij[1]*(au - al) >= 0){ - au = al; - memcpy(phiu, phil, 2*sizeof(real_x)); - } - al = aj; - memcpy(phil, phij, 2*sizeof(real_x)); - } - } - printf("Zoom reached max iterations! aj = %f, phi: %f, phi': %f\n", aj, phij[0], phij[1]); - return aj; -} - // Ch3.5, p60 of Nocedal & Wright (Algorithm 3.5) real_x LBFGS::linesearch(real_x f0){ - printf("Linesearch started!\n"); - real_x max_iter = 10; + real_x max_iter = 5; real_x aim1 = 0; real_x ai = 1; real_x amax = 10; - real_x tau = 1.5; - - real_x phi0[2]; - phi(0, phi0, true); + // Data is already loaded, saves 1 energy & grad call + real_x phi0[2] = {f0, 0}; + cudaMemset(tmp_d, 0, sizeof(real_x)); + dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da + cudaMemcpy(&phi0[1], tmp_d, sizeof(real_x), cudaMemcpyDefault); real_x phiim1[2]; memcpy(phiim1, phi0, 2*sizeof(real_x)); + // Evaluate at 1 & iterate using cubic interpolation real_x phii[2]; + phi(ai, phii); + printf("alpha: %f, phi: %f, phi': %f\n", aim1, phi0[0], phi0[1]); + printf("alpha: %f, phi: %f, phi': %f\n", ai, phii[0], phii[1]); for(int i = 0; i < max_iter; i++){ - phi(ai, phii, false); - if (phii[0] > phi0[0] + c1*ai*phi0[1] || (phii[0] >= phiim1[0] && i > 0)){ - ai = zoom(aim1, phiim1, ai, phii, phi0); - phi(ai, phii, true); + // Check strong wolfe condition + if (phii[0] <= phi0[0] + c1*ai*phi0[1] && abs(phii[1]) <= c2*abs(phi0[1])){ Uf = phii[0]; - break; - } - // note that checking for strong wolf is important - if (abs(phii[1]) >= c2*abs(phi0[1])){ - phi(ai, phii, false); - Uf = phii[0]; - break; + return ai; } - if (phii[1] >= 0){ - ai = zoom(aim1, phiim1, ai, phii, phi0); - phi(ai, phii, false); - Uf = phii[0]; - break; - } - aim1 = ai; + real_x tmp = ai; + ai = cubic_interp(aim1, phiim1[0], phiim1[1], ai, phii[0], phii[1]); + phi(ai, phii); + printf("alpha: %f, phi: %f, phi': %f\n", ai, phii[0], phii[1]); + aim1 = tmp; memcpy(phiim1, phii, 2*sizeof(real_x)); - ai = tau*ai; - if(ai > amax){ - ai /= tau; - printf("Reached Max Step Size!"); - phi(ai, phii, false); + if(ai < 0 || ai > amax){ break; } } - // Protect from taking too small or steps that increase potential - if(ai < 1e-5 || phii[0] > phi0[0]){ - ai = 1e-4; // if g.p < 0 this will probably give ok results - } - phi(ai, phii, true); // update positions & leave + // Failed line search + ai = 1e-5; // default step size, a small enough step should decrease function if s.y > 0 + phi(ai, phii); Uf = phii[0]; - //printf("Linesearch reached max iterations! Defaulting to alpha: %f - phi: %f, phi': %f\n", ai, phii[0], phii[1]); + printf("Linesearch failed! Defaulting to ai = %e, phi: %f, phi': %f\n", ai, phii[0], phii[1]); return ai; } \ No newline at end of file diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h index d8c46c3..43c05e5 100644 --- a/src/update/lbfgs.h +++ b/src/update/lbfgs.h @@ -6,24 +6,26 @@ // Implementation based on Nocedal & Wright book with references made to OpenMM extern lib implementation class LBFGS { public: + int step_count=0; // total number of minimize calls + int reset_count=0; // number of times lbfgs memory was cleared int m=5; // Number of previous gradients to use for hessian approximation (5-7) - int step_count; // total number of minimize calls - int reset_count; // number of times lbfgs memory was cleared - bool minimized; - real_x U0; - real_x Uf; - // Line search parameters - real_x c1 = 1e-4; // sufficient decent - real_x c2 = .9; // curvature cond. - // TODO: Implement convergence/stability checks - real_x eps = 1e-4; // Minimization criteria - real_x delta = 1e-4; // stopping criteria + bool minimized=false; + real_x U0, Uf; - LBFGS(int m, int DOF, std::function user_grad, real_x *position, real_x *gradient); + LBFGS(int m, real_x eps, int DOF, std::function user_grad, real_x *position, real_x *gradient); ~LBFGS(); void minimize_step(real_x f0); private: + // Line search parameters + real_x c1 = 1e-4; // sufficient decent + real_x c2 = .9; // curvature cond. + // Convergence + real_x eps_tol = 1; // rms criteria (1e-1 is very tight and unlikely to work) + real_x rmsg = 0; // |g| / sqrt(DOF) + real_x grad_mag = 0; // |g| + real_x grad_pos_mag = 0; // |g| / max(1, |x|) + int DOF; // Degrees of freedom to minimize real_x step_size = 1; // previous step size // All data stored on GPU @@ -47,12 +49,10 @@ class LBFGS { std::function system_grad; + bool converged(); void gamma_norm(); void update_sk_yk(); - void phi(real_x alpha, real_x *result, bool leave_x); - bool check_convergence(); - - real_x zoom(real_x al, real_x const * const phi_lower, real_x au, real_x const * const phi_upper, real_x const * const phi0); + void phi(real_x alpha, real_x *result); real_x linesearch(real_x f0); }; \ No newline at end of file diff --git a/src/update/minimize.cu b/src/update/minimize.cu index 9659848..c0dd28f 100644 --- a/src/update/minimize.cu +++ b/src/update/minimize.cu @@ -133,9 +133,8 @@ void State::min_init(System *system) int shift = system->state->lambdaCount; int DOF = system->state->atomCount*3; // only minimize atom positions - int m = 10; // Past grad depth - system->state->recv_state(); - system->run->lbfgs = new LBFGS(m, DOF, energy_and_grad, system->state->positionBuffer_d+shift, system->state->forceBufferX_d+shift); + system->run->lbfgs = new LBFGS(system->run->lbfgs_m, system->run->lbfgs_eps, DOF, + energy_and_grad, system->state->positionBuffer_d+shift, system->state->forceBufferX_d+shift); } } @@ -170,6 +169,7 @@ void State::min_move(int step, int nsteps, System *system) if (system->verbose>0) display_nrg(system); currEnergy=energy[eepotential]; r->lbfgs->minimize_step(currEnergy); + if(r->lbfgs->minimized){ system->run->step = nsteps; } // do we want this? } } else if (r->minType==esd) { From 5b2582fe87dabbb1ef2c937140972ec036e0556a Mon Sep 17 00:00:00 2001 From: Matthew Speranza Date: Mon, 22 Dec 2025 15:03:37 -0800 Subject: [PATCH 7/8] Remove energy=true from get_force call, free new state double force buffer --- src/system/potential.cxx | 1 - src/system/state.cxx | 1 + src/update/lbfgs.cu | 4 ++-- src/update/lbfgs.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/system/potential.cxx b/src/system/potential.cxx index 93d75c4..4fdd10b 100644 --- a/src/system/potential.cxx +++ b/src/system/potential.cxx @@ -1617,7 +1617,6 @@ void Potential::calc_force(int step,System *system) if (system->run->freqNPT>0) { calcEnergy=(calcEnergy||(step%system->run->freqNPT==0)); } - calcEnergy=true; #ifdef REPLICAEXCHANGE if (system->run->freqREx>0) { calcEnergy=(calcEnergy||(step%system->run->freqREx==0)); diff --git a/src/system/state.cxx b/src/system/state.cxx index de03543..d45d914 100644 --- a/src/system/state.cxx +++ b/src/system/state.cxx @@ -157,6 +157,7 @@ State::~State() { if (positionBackup_d) cudaFree(positionBackup_d); if (forceBuffer) free(forceBuffer); if (forceBuffer_d) cudaFree(forceBuffer_d); + if (forceBufferX_d) cudaFree(forceBufferX_d); if (forceBackup_d) cudaFree(forceBackup_d); // Other buffers if (energy) free(energy); diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu index 46238cc..73d0440 100644 --- a/src/update/lbfgs.cu +++ b/src/update/lbfgs.cu @@ -119,12 +119,12 @@ LBFGS::LBFGS(int m, real_x eps, int DOF, std::function user_grad, real LBFGS::~LBFGS() { if (tmp_d) cudaFree(tmp_d); + if (gamma_d) cudaFree(gamma_d); if (rho_d) cudaFree(rho_d); if (alpha_d) cudaFree(alpha_d); - if (gamma_d) cudaFree(gamma_d); + if (q_d) cudaFree(q_d); if (prev_positions_d) cudaFree(prev_positions_d); if (prev_gradient_d) cudaFree(prev_gradient_d); - if (q_d) cudaFree(q_d); if (s_d) cudaFree(s_d); if (y_d) cudaFree(y_d); if (s_tmp_d) cudaFree(s_tmp_d); diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h index 43c05e5..8fac8eb 100644 --- a/src/update/lbfgs.h +++ b/src/update/lbfgs.h @@ -8,7 +8,7 @@ class LBFGS { public: int step_count=0; // total number of minimize calls int reset_count=0; // number of times lbfgs memory was cleared - int m=5; // Number of previous gradients to use for hessian approximation (5-7) + int m=7; // Number of previous gradients to use for hessian approximation (5-7) bool minimized=false; real_x U0, Uf; From bbbc3ef042c280593b9bfbd88021b2daab38ff7f Mon Sep 17 00:00:00 2001 From: Matthew Speranza Date: Thu, 5 Feb 2026 11:59:28 -0800 Subject: [PATCH 8/8] small updates --- src/run/run.cu | 1 + src/update/minimize.cu | 1 + 2 files changed, 2 insertions(+) diff --git a/src/run/run.cu b/src/run/run.cu index b73bce3..e12b812 100644 --- a/src/run/run.cu +++ b/src/run/run.cu @@ -412,6 +412,7 @@ void Run::energy(char *line,char *token,System *system) system->potential->calc_force(0,system); system->state->recv_energy(); print_nrg(0,system); + display_nrg(system); dynamics_finalize(system); } diff --git a/src/update/minimize.cu b/src/update/minimize.cu index c0dd28f..6ed4798 100644 --- a/src/update/minimize.cu +++ b/src/update/minimize.cu @@ -8,6 +8,7 @@ #include "system/potential.h" #include "domdec/domdec.h" #include "holonomic/holonomic.h" +#include "holonomic/rectify.h" #include "io/io.h" #include "main/real3.h"