From f285e2e7e9a82fb45cf85184e566ebe4676bbf2a Mon Sep 17 00:00:00 2001
From: Matthew Speranza <speranzm@uci.edu>
Date: Wed, 14 May 2025 11:27:12 -0700
Subject: [PATCH 1/8] Added LBFGS file

---
 src/update/LBFGS.h | 306 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 306 insertions(+)
 create mode 100644 src/update/LBFGS.h
diff --git a/src/update/LBFGS.h b/src/update/LBFGS.h
new file mode 100644
index 0000000..2cc765a
--- /dev/null
+++ b/src/update/LBFGS.h
@@ -0,0 +1,306 @@
+
+#ifndef LBFGS_H
+#define LBFGS_H
+#include <type_traits>
+
+#include <iostream>
+#include <cstdlib>
+#include <cassert>
+#include <cmath>
+#include <type_traits>
+#include <functional>
+
+using namespace std;
+
+template <typename T>
+class LBFGS {
+    static_assert(std::is_floating_point<T>::value, "L-BFGS can only be used with floating point types");
+
+public:
+    LBFGS(int m, int DOF, std::function<T(T*)> user_func, std::function<void(T*, T*)> user_grad) : m(m), DOF(DOF), func(user_func), grad(user_grad) {
+        minimized = false;
+        min_step_size = sizeof(T) == 4 ? 1e-4 : 1e-7;
+        max_step_size = 1.0;
+        rho = (T*) calloc(m, sizeof(T));
+        alpha = (T*) malloc(m*sizeof(T));
+        gamma = (T*) malloc(sizeof(T));
+        beta = (T*) malloc(sizeof(T));
+        step_size = 0.0;
+            
+
+        G = (T*) malloc(DOF*sizeof(T));
+        X = (T*) malloc(DOF*sizeof(T));
+
+        search = (T*) malloc(DOF*sizeof(T));
+        prev_positions = (T*)malloc(DOF * sizeof(T));
+        prev_gradient = (T*)malloc(DOF * sizeof(T));
+
+        q = (T*) malloc(DOF*sizeof(T));
+        s = (T*) calloc(m*DOF, sizeof(T));
+        y = (T*) calloc(m*DOF, sizeof(T));
+    }
+
+    /**
+     * TODO: Free all memory used by LBFGS
+     */
+    ~LBFGS() {
+        free(rho);
+        free(alpha);
+        free(gamma);
+        free(beta);
+        free(G);
+        free(X);
+        free(search);
+        free(prev_positions);
+        free(prev_gradient);
+        free(q);
+        free(s);
+        free(y);
+    }
+
+/**
+ * TODO: Implement steepest decent step
+ * Sets variables and returns 1
+ * @param X Initial Position
+ * @param G Initial Position Grad
+ */
+void init(T* X, T* G, T steepest_descent_step_size) {
+    for (int i = 0; i < DOF; ++i) {
+        prev_positions[i] = X[i];
+        prev_gradient[i] = G[i];
+    }
+
+    for (int i = 0; i < DOF; ++i) {
+        X[i] -= steepest_descent_step_size * G[i];
+        s[i + (m - 1) * DOF] = X[i] - prev_positions[i];
+        y[i + (m - 1) * DOF] = G[i] - prev_gradient[i];
+        this->X[i] = X[i];
+    }
+    grad(this->X, this->G);
+}
+
+/**
+ * This will loop over minimize step and a line search. Implement later
+ */
+void minimize() {
+    bool found_minimum = false;
+    int j = 0;
+    int maxcount2 = 1000000;
+    while (!found_minimum && j < maxcount2) {
+        ++j;
+        minimize_step(X, G);
+        printf("f(");
+        for (int i = 0; i < DOF; i++) {
+            if (i != DOF - 1) {
+                printf("%f, ", X[i]);
+            }
+            else {
+                printf("%f) = %f\n", X[i], func(X));
+            }
+        }
+        grad(X, G);
+        printf("grad of f(");
+        for (int i = 0; i < DOF; i++) {
+            if (i != DOF - 1) {
+                printf("%f, ", X[i]);
+            }
+            else {
+                printf("%f) = [", X[i]);
+            }
+        }
+        for (int i = 0; i < DOF; i++) {
+            if (i != DOF - 1) {
+                printf("%f, ", G[i]);
+            }
+            else {
+                printf("%f]\n", G[i]);
+            }
+        }
+        double sum_of_residuals = 0;
+        for (int i = 0; i < DOF; ++i) {
+            sum_of_residuals += G[i] * G[i];
+
+        }
+        sum_of_residuals = sqrtf((sum_of_residuals / DOF));
+        if (minimized == true || sum_of_residuals < 1e-7) {
+            std::cout << "found minimum\n";
+            std::cout << "Number of Steps: " << j << "\n";
+            found_minimum = true;
+        }
+
+    }
+}
+
+
+/**
+ * 1. Compute new L-BFGS step direction
+ *   Pseudocode from wikipedia:
+ *   q = g.i // search direction to be updated
+ *   for j = i-1 to i-m:
+ *     alpha.j = rho.j * s.j.T * q // dot product
+ *     q = q - alpha.j * y.j // vector scale & subtraction
+ *   gamma.i = s.i-1.T * y.i-1 / y.i-1.T * y.i-1 // dot products in numerator and denominator
+ *   q = gamma.i * q
+ *   for j = i-m to i-1:
+ *     beta = rho.j * y.j.T * q // dot product
+ *     q = q + (alpha.j - beta) * s.j // vector scale & addition
+ *   q = -q  // negate applied above instead of here most likely
+ *   gamma = s.i.T * y.i / y.i.T * y.i
+ *   rho.j = 1 / (y.j.T * s.j)
+ */
+
+void minimize_step(T* X_input, T* G_input) {
+    for (int i = 0; i < DOF; ++i) {
+        X[i] = X_input[i];
+        G[i] = G_input[i];
+    }
+    for (int i = 0; i < DOF; ++i) {
+        q[i] = G[i];
+    }
+    update();
+    for (int i = m - 1; i >= 0; --i) {
+        alpha[i] = rho[i] * dot_product(s + i * DOF, q, DOF);
+        for (int j = 0; j < DOF; ++j) {
+            q[j] -= alpha[i] * y[i * DOF + j];
+        }
+    }
+
+    T gamma = dot_product(s + (m - 1) * DOF, y + (m - 1) * DOF, DOF) / dot_product(y + (m - 1) * DOF, y + (m - 1) * DOF, DOF);
+    for (int j = 0; j < DOF; ++j) {
+        q[j] *= gamma;
+    }
+
+    for (int i = 0; i < m; ++i) {
+        T beta = rho[i] * dot_product(y + i * DOF, q, DOF);
+
+        for (int j = 0; j < DOF; ++j) {
+            q[j] += (alpha[i] - beta) * s[i * DOF + j];
+        }
+    }
+    BacktrackingLineSearch<double> backtracking(X, G, q, DOF, func);
+    step_size = backtracking.linesearch(); 
+    std::cout << "step size: " << step_size << "\n";
+    for (int j = 0; j < DOF; ++j) {
+        X[j] += (step_size * -q[j]);
+        X_input[j] = X[j];
+    }
+    
+}
+
+void update() {
+    for (int i = 0; i < ((m - 1) * DOF); ++i) {
+        s[i] = s[i + DOF];
+        y[i] = y[i + DOF];
+
+    }
+    for (int i = 0; i < DOF; ++i) {
+        s[(m - 1) * DOF + i] = X[i] - prev_positions[i];
+        y[(m - 1) * DOF + i] = G[i] - prev_gradient[i];
+    }
+    for (int i = 0; i < m - 1; ++i) {
+        rho[i] = rho[i + 1];
+    }
+
+    double s_dot_y = dot_product((s + (m - 1) * DOF), (y + (m - 1) * DOF), DOF);
+
+    if (s_dot_y == 0) {
+        std::cout << "Error: Dividing by zero. Function is likely already at a minimum.\n";
+        minimized = true;
+    }
+    else {
+        rho[m - 1] = 1 / s_dot_y;
+    }
+    for (int i = 0; i < DOF; ++i) {
+        prev_positions[i] = X[i];
+        prev_gradient[i] = G[i];
+    }
+}
+
+private:
+    int m; // Number of previous gradients to use for hessian approximation (5-7)
+    int DOF; // Degrees of freedom
+    T min_step_size; // terminate minimization with steps smaller than this number
+    T max_step_size; // ensures that lbfgs doesn't overshoot minimum
+    bool minimized;
+
+    T *beta; 
+    T *gamma; // s projected onto y
+    T *rho; // [m] rho_{i} = (s_{i}^T * y_{i}
+    T *alpha; // [m] alpha_{i} = rho_{i} * s_{i}^T * y_{i}
+
+    T *X;
+    T *G;
+    T *search; // [DOF] L-BFGS search direction
+    T *prev_positions; // [DOF] x_{i-1}
+    T *prev_gradient; // [DOF] g_{i-1}
+
+    T* q;
+    T *s; // [m*DOF] x_{i+1} - x_{i} = s_{i}
+    T *y; // [m*DOF] grad_{i+1} - grad_{i} = y_{i}
+    T step_size;
+
+    std::function<T(T*)> func;
+    std::function<void(T*, T*)> grad;
+
+    T dot_product(T* a, T* b, int n) {
+        T result = 0;
+        for (int i = 0; i < n; ++i) {
+            result += a[i] * b[i];
+        }
+        return result;
+    }
+};
+
+template <typename T>
+class BacktrackingLineSearch {
+public:
+    BacktrackingLineSearch(T* x, T* g, T* q, int dof, function<T(T*)> user_func) : X(x), G(g), DOF(dof), func(user_func) {
+        x_plus_step = (T*)malloc(DOF * sizeof(T));
+        search_direction = (T*)malloc(DOF * sizeof(T));
+        for (int i = 0; i < DOF; i++) {
+            search_direction[i] = -q[i];
+        }
+
+    }
+    T linesearch() {
+        int max_it = 1000;
+        T c = 0.5;
+        T tau = 0.75;
+        T m = dot_product(G, search_direction, DOF);
+        T step_size = 1;
+        for (int i = 0; i < max_it; i++) {
+            for (int j = 0; j < DOF; ++j) {
+                x_plus_step[j] = X[j] + (step_size * search_direction[j]);
+            }
+            if (func(X) - func(x_plus_step) >= step_size * -c * m) {
+                return step_size;
+            }
+            else {
+                step_size *= tau;
+            }
+        }
+        return 0;
+    }
+    ~BacktrackingLineSearch() {
+        free(x_plus_step);
+        free(search_direction);
+    }
+private:
+    T* X;
+    T* G;
+    T* x_plus_step;
+    T* search_direction;
+    int DOF;
+    function<T(T*)> func;
+
+    T dot_product(T* a, T* b, int n) {
+        T result = 0;
+        for (int i = 0; i < n; ++i) {
+            result += a[i] * b[i];
+        }
+        return result;
+    }
+
+};
+
+#endif //LBFGS_H

From 64f6e6daec5cb6329dd77d808f6598b6a40ac950 Mon Sep 17 00:00:00 2001
From: Matthew Speranza <speranzm@uci.edu>
Date: Wed, 14 May 2025 13:02:29 -0700
Subject: [PATCH 2/8] Initial commit for LBFGS minimze on CPU

---
 src/CMakeLists.txt     |   2 +
 src/run/run.cu         |   4 +-
 src/run/run.h          |   5 ++
 src/update/LBFGS.h     | 154 +++++++++--------------------------------
 src/update/minimize.cu |  33 ++++++++-
 5 files changed, 74 insertions(+), 124 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 403e8e1..cdfb899 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,7 @@
 cmake_minimum_required(VERSION 3.0)
 project(blade LANGUAGES CXX CUDA)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
 
 file(GLOB_RECURSE sources *.cxx *.cu *.h)
 
diff --git a/src/run/run.cu b/src/run/run.cu
index 4e0e9d8..567e4ae 100644
--- a/src/run/run.cu
+++ b/src/run/run.cu
@@ -355,7 +355,9 @@ void Run::set_variable(char *line,char *token,System *system)
     dxRMSInit=io_nextf(line)*ANGSTROM;
   } else if (strcmp(token,"mintype")==0) {
     std::string minString=io_nexts(line);
-    if (strcmp(minString.c_str(),"sd")==0) {
+    if (strcmp(minString.c_str(), "lbfgs") == 0){
+      minType=elbfgs;
+    } else if (strcmp(minString.c_str(),"sd")==0) {
       minType=esd;
     } else if (strcmp(minString.c_str(),"sdfd")==0) {
       minType=esdfd;
diff --git a/src/run/run.h b/src/run/run.h
index f7de7d1..15fcd38 100644
--- a/src/run/run.h
+++ b/src/run/run.h
@@ -8,6 +8,7 @@
 
 #include "xdr/xdrfile.h"
 #include "xdr/xdrfile_xtc.h"
+#include "update/LBFGS.h"
 
 // Forward delcaration
 class System;
@@ -19,6 +20,7 @@ struct Cutoffs {
 };
 
 typedef enum emin {
+  elbfgs, // Limited memory BFGS algo
   esd, // steepest descent
   esdfd, // steepest descent with finite difference to choose step length
   eminend} EMin;
@@ -56,6 +58,9 @@ class Run {
   real dxRMS;
   EMin minType; // minimization scheme
 
+  // L-BFGS variables
+  LBFGS<real>* lbfgs;
+
   real betaEwald;
   real rCut;
   real rSwitch;
diff --git a/src/update/LBFGS.h b/src/update/LBFGS.h
index 2cc765a..6697b27 100644
--- a/src/update/LBFGS.h
+++ b/src/update/LBFGS.h
@@ -1,8 +1,8 @@
 
 #ifndef LBFGS_H
 #define LBFGS_H
-#include <type_traits>
 
+#include <type_traits>
 #include <iostream>
 #include <cstdlib>
 #include <cassert>
@@ -12,12 +12,11 @@
 
 using namespace std;
 
-template <typename T>
-class LBFGS {
+template <typename T> class LBFGS {
     static_assert(std::is_floating_point<T>::value, "L-BFGS can only be used with floating point types");
 
 public:
-    LBFGS(int m, int DOF, std::function<T(T*)> user_func, std::function<void(T*, T*)> user_grad) : m(m), DOF(DOF), func(user_func), grad(user_grad) {
+    LBFGS(int m, int DOF, std::function<T(T*, T*)> user_grad) : m(m), DOF(DOF), grad(user_grad) {
         minimized = false;
         min_step_size = sizeof(T) == 4 ? 1e-4 : 1e-7;
         max_step_size = 1.0;
@@ -27,7 +26,6 @@ class LBFGS {
         beta = (T*) malloc(sizeof(T));
         step_size = 0.0;
             
-
         G = (T*) malloc(DOF*sizeof(T));
         X = (T*) malloc(DOF*sizeof(T));
 
@@ -36,6 +34,7 @@ class LBFGS {
         prev_gradient = (T*)malloc(DOF * sizeof(T));
 
         q = (T*) malloc(DOF*sizeof(T));
+        x_plus_step = (T*) malloc(DOF*sizeof(T));
         s = (T*) calloc(m*DOF, sizeof(T));
         y = (T*) calloc(m*DOF, sizeof(T));
     }
@@ -54,6 +53,7 @@ class LBFGS {
         free(prev_positions);
         free(prev_gradient);
         free(q);
+        free(x_plus_step);
         free(s);
         free(y);
     }
@@ -69,7 +69,6 @@ void init(T* X, T* G, T steepest_descent_step_size) {
         prev_positions[i] = X[i];
         prev_gradient[i] = G[i];
     }
-
     for (int i = 0; i < DOF; ++i) {
         X[i] -= steepest_descent_step_size * G[i];
         s[i + (m - 1) * DOF] = X[i] - prev_positions[i];
@@ -79,59 +78,6 @@ void init(T* X, T* G, T steepest_descent_step_size) {
     grad(this->X, this->G);
 }
 
-/**
- * This will loop over minimize step and a line search. Implement later
- */
-void minimize() {
-    bool found_minimum = false;
-    int j = 0;
-    int maxcount2 = 1000000;
-    while (!found_minimum && j < maxcount2) {
-        ++j;
-        minimize_step(X, G);
-        printf("f(");
-        for (int i = 0; i < DOF; i++) {
-            if (i != DOF - 1) {
-                printf("%f, ", X[i]);
-            }
-            else {
-                printf("%f) = %f\n", X[i], func(X));
-            }
-        }
-        grad(X, G);
-        printf("grad of f(");
-        for (int i = 0; i < DOF; i++) {
-            if (i != DOF - 1) {
-                printf("%f, ", X[i]);
-            }
-            else {
-                printf("%f) = [", X[i]);
-            }
-        }
-        for (int i = 0; i < DOF; i++) {
-            if (i != DOF - 1) {
-                printf("%f, ", G[i]);
-            }
-            else {
-                printf("%f]\n", G[i]);
-            }
-        }
-        double sum_of_residuals = 0;
-        for (int i = 0; i < DOF; ++i) {
-            sum_of_residuals += G[i] * G[i];
-
-        }
-        sum_of_residuals = sqrtf((sum_of_residuals / DOF));
-        if (minimized == true || sum_of_residuals < 1e-7) {
-            std::cout << "found minimum\n";
-            std::cout << "Number of Steps: " << j << "\n";
-            found_minimum = true;
-        }
-
-    }
-}
-
-
 /**
  * 1. Compute new L-BFGS step direction
  *   Pseudocode from wikipedia:
@@ -148,8 +94,10 @@ void minimize() {
  *   gamma = s.i.T * y.i / y.i.T * y.i
  *   rho.j = 1 / (y.j.T * s.j)
  */
-
 void minimize_step(T* X_input, T* G_input) {
+    if (minimized){
+        return;
+    }
     for (int i = 0; i < DOF; ++i) {
         X[i] = X_input[i];
         G[i] = G_input[i];
@@ -164,12 +112,10 @@ void minimize_step(T* X_input, T* G_input) {
             q[j] -= alpha[i] * y[i * DOF + j];
         }
     }
-
     T gamma = dot_product(s + (m - 1) * DOF, y + (m - 1) * DOF, DOF) / dot_product(y + (m - 1) * DOF, y + (m - 1) * DOF, DOF);
     for (int j = 0; j < DOF; ++j) {
         q[j] *= gamma;
     }
-
     for (int i = 0; i < m; ++i) {
         T beta = rho[i] * dot_product(y + i * DOF, q, DOF);
 
@@ -177,14 +123,12 @@ void minimize_step(T* X_input, T* G_input) {
             q[j] += (alpha[i] - beta) * s[i * DOF + j];
         }
     }
-    BacktrackingLineSearch<double> backtracking(X, G, q, DOF, func);
-    step_size = backtracking.linesearch(); 
+    step_size = linesearch(); 
     std::cout << "step size: " << step_size << "\n";
     for (int j = 0; j < DOF; ++j) {
         X[j] += (step_size * -q[j]);
         X_input[j] = X[j];
     }
-    
 }
 
 void update() {
@@ -200,9 +144,7 @@ void update() {
     for (int i = 0; i < m - 1; ++i) {
         rho[i] = rho[i + 1];
     }
-
     double s_dot_y = dot_product((s + (m - 1) * DOF), (y + (m - 1) * DOF), DOF);
-
     if (s_dot_y == 0) {
         std::cout << "Error: Dividing by zero. Function is likely already at a minimum.\n";
         minimized = true;
@@ -216,6 +158,26 @@ void update() {
     }
 }
 
+T linesearch() {
+    int max_it = 1000;
+    T c = 0.5;
+    T tau = 0.75;
+    T m = dot_product(G, q, DOF);
+    T step_size = 1;
+    for (int i = 0; i < max_it; i++) {
+        for (int j = 0; j < DOF; ++j) {
+            x_plus_step[j] = X[j] - (step_size * q[j]);
+        }
+        if (grad(X, G) - grad(x_plus_step, G) >= step_size * -c * m) {
+            return step_size;
+        }
+        else {
+            step_size *= tau;
+        }
+    }
+    return 0;
+}
+
 private:
     int m; // Number of previous gradients to use for hessian approximation (5-7)
     int DOF; // Degrees of freedom
@@ -235,63 +197,12 @@ void update() {
     T *prev_gradient; // [DOF] g_{i-1}
 
     T* q;
+    T* x_plus_step;
     T *s; // [m*DOF] x_{i+1} - x_{i} = s_{i}
     T *y; // [m*DOF] grad_{i+1} - grad_{i} = y_{i}
     T step_size;
 
-    std::function<T(T*)> func;
-    std::function<void(T*, T*)> grad;
-
-    T dot_product(T* a, T* b, int n) {
-        T result = 0;
-        for (int i = 0; i < n; ++i) {
-            result += a[i] * b[i];
-        }
-        return result;
-    }
-};
-
-template <typename T>
-class BacktrackingLineSearch {
-public:
-    BacktrackingLineSearch(T* x, T* g, T* q, int dof, function<T(T*)> user_func) : X(x), G(g), DOF(dof), func(user_func) {
-        x_plus_step = (T*)malloc(DOF * sizeof(T));
-        search_direction = (T*)malloc(DOF * sizeof(T));
-        for (int i = 0; i < DOF; i++) {
-            search_direction[i] = -q[i];
-        }
-
-    }
-    T linesearch() {
-        int max_it = 1000;
-        T c = 0.5;
-        T tau = 0.75;
-        T m = dot_product(G, search_direction, DOF);
-        T step_size = 1;
-        for (int i = 0; i < max_it; i++) {
-            for (int j = 0; j < DOF; ++j) {
-                x_plus_step[j] = X[j] + (step_size * search_direction[j]);
-            }
-            if (func(X) - func(x_plus_step) >= step_size * -c * m) {
-                return step_size;
-            }
-            else {
-                step_size *= tau;
-            }
-        }
-        return 0;
-    }
-    ~BacktrackingLineSearch() {
-        free(x_plus_step);
-        free(search_direction);
-    }
-private:
-    T* X;
-    T* G;
-    T* x_plus_step;
-    T* search_direction;
-    int DOF;
-    function<T(T*)> func;
+    std::function<T(T*, T*)> grad;
 
     T dot_product(T* a, T* b, int n) {
         T result = 0;
@@ -300,7 +211,6 @@ class BacktrackingLineSearch {
         }
         return result;
     }
-
 };
 
-#endif //LBFGS_H
+#endif
\ No newline at end of file
diff --git a/src/update/minimize.cu b/src/update/minimize.cu
index 71547c1..e1362d7 100644
--- a/src/update/minimize.cu
+++ b/src/update/minimize.cu
@@ -12,6 +12,9 @@
 
 #include "main/real3.h"
 
+#include <functional>
+#include "msld/msld.h"
+
 
 
 __global__ void no_mass_weight_kernel(int N,real* masses,real* ones)
@@ -120,7 +123,35 @@ void State::min_move(int step,int nsteps,System *system)
   real scaling, rescaling;
   real frac;
 
-  if (r->minType==esd) {
+  if (r->minType==elbfgs){
+    if (system->id==0){
+      recv_energy();
+      if (system->verbose>0) display_nrg(system);
+      currEnergy=energy[eepotential];
+      if (step == 0){
+        // Function called by LBFGS class to get energy & gradient 
+        std::function<real(real*, real*)> grad = [system](real* X, real* G){
+          // Copy X onto CUDA device
+          int DOF = system->state->atomCount*3 + system->msld->blockCount;
+          cudaMemcpy(system->state->positionBuffer_d, X, DOF*sizeof(real), cudaMemcpyDefault);
+          // Potential & Grad Eval
+          system->potential->calc_force(0, system);
+          // Copy grad(F(X)) into G
+          cudaMemcpy(G, system->state->force_d, DOF*sizeof(real), cudaMemcpyDefault);
+          real energy = system->state->energy[eepotential];
+          return energy;
+        };
+
+        int DOF = system->state->atomCount*3 + system->msld->blockCount;
+        int m = 7; // Past grad depth
+        r->lbfgs = new LBFGS<real>(7, DOF, grad);
+        // Steepest decent step
+        r->lbfgs->init((real*)system->state->positionBuffer, (real*) system->state->forceBuffer, 1e-2);
+      }
+      r->lbfgs->minimize_step((real*)system->state->positionBuffer, (real*) system->state->forceBuffer);
+    }
+  }
+  else if (r->minType==esd) {
     if (system->id==0) {
       recv_energy();
       if (system->verbose>0) display_nrg(system);

From 31eff07dea50433ee3aade6542662a45c38ecc45 Mon Sep 17 00:00:00 2001
From: Matthew Speranza <speranzm@uci.edu>
Date: Wed, 14 May 2025 17:06:54 -0700
Subject: [PATCH 3/8] Minor changes to LBFGS

---
 src/update/LBFGS.h     | 68 ++++++++++++++++++++----------------------
 src/update/minimize.cu |  4 +--
 2 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/src/update/LBFGS.h b/src/update/LBFGS.h
index 6697b27..d9067f6 100644
--- a/src/update/LBFGS.h
+++ b/src/update/LBFGS.h
@@ -21,27 +21,21 @@ template <typename T> class LBFGS {
         min_step_size = sizeof(T) == 4 ? 1e-4 : 1e-7;
         max_step_size = 1.0;
         rho = (T*) calloc(m, sizeof(T));
-        alpha = (T*) malloc(m*sizeof(T));
-        gamma = (T*) malloc(sizeof(T));
-        beta = (T*) malloc(sizeof(T));
+        alpha = (T*) calloc(m, sizeof(T));
+        gamma = (T*) calloc(m, sizeof(T));
+        beta = (T*) calloc(m, sizeof(T));
         step_size = 0.0;
-            
-        G = (T*) malloc(DOF*sizeof(T));
-        X = (T*) malloc(DOF*sizeof(T));
+         
+        search = (T*) calloc(DOF, sizeof(T));
+        prev_positions = (T*) calloc(DOF, sizeof(T));
+        prev_gradient = (T*) calloc(DOF, sizeof(T));
 
-        search = (T*) malloc(DOF*sizeof(T));
-        prev_positions = (T*)malloc(DOF * sizeof(T));
-        prev_gradient = (T*)malloc(DOF * sizeof(T));
-
-        q = (T*) malloc(DOF*sizeof(T));
-        x_plus_step = (T*) malloc(DOF*sizeof(T));
+        q = (T*) calloc(DOF, sizeof(T));
+        x_plus_step = (T*) calloc(DOF, sizeof(T));
         s = (T*) calloc(m*DOF, sizeof(T));
         y = (T*) calloc(m*DOF, sizeof(T));
     }
 
-    /**
-     * TODO: Free all memory used by LBFGS
-     */
     ~LBFGS() {
         free(rho);
         free(alpha);
@@ -59,23 +53,26 @@ template <typename T> class LBFGS {
     }
 
 /**
- * TODO: Implement steepest decent step
- * Sets variables and returns 1
+ * Steepest decent step
+ * 
  * @param X Initial Position
  * @param G Initial Position Grad
  */
 void init(T* X, T* G, T steepest_descent_step_size) {
+    // f(X0), g(X0)
+    grad(X, G);
+    // Update
     for (int i = 0; i < DOF; ++i) {
         prev_positions[i] = X[i];
         prev_gradient[i] = G[i];
     }
     for (int i = 0; i < DOF; ++i) {
-        X[i] -= steepest_descent_step_size * G[i];
         s[i + (m - 1) * DOF] = X[i] - prev_positions[i];
         y[i + (m - 1) * DOF] = G[i] - prev_gradient[i];
-        this->X[i] = X[i];
+        X[i] -= steepest_descent_step_size * G[i];
     }
-    grad(this->X, this->G);
+    // f(X0-size*G), g(X0-size*G)
+    grad(X, G);
 }
 
 /**
@@ -94,18 +91,16 @@ void init(T* X, T* G, T steepest_descent_step_size) {
  *   gamma = s.i.T * y.i / y.i.T * y.i
  *   rho.j = 1 / (y.j.T * s.j)
  */
-void minimize_step(T* X_input, T* G_input) {
+void minimize_step(T* X, T* G) {
     if (minimized){
         return;
     }
-    for (int i = 0; i < DOF; ++i) {
-        X[i] = X_input[i];
-        G[i] = G_input[i];
-    }
+    // Eval function at X
+    T p0 = grad(X, G);
     for (int i = 0; i < DOF; ++i) {
         q[i] = G[i];
     }
-    update();
+    update(X, G);
     for (int i = m - 1; i >= 0; --i) {
         alpha[i] = rho[i] * dot_product(s + i * DOF, q, DOF);
         for (int j = 0; j < DOF; ++j) {
@@ -118,24 +113,23 @@ void minimize_step(T* X_input, T* G_input) {
     }
     for (int i = 0; i < m; ++i) {
         T beta = rho[i] * dot_product(y + i * DOF, q, DOF);
-
         for (int j = 0; j < DOF; ++j) {
             q[j] += (alpha[i] - beta) * s[i * DOF + j];
         }
     }
-    step_size = linesearch(); 
+    // min_a f(X + a*q)
+    step_size = this->linesearch(p0, X, G); 
     std::cout << "step size: " << step_size << "\n";
+    // Update system positions
     for (int j = 0; j < DOF; ++j) {
         X[j] += (step_size * -q[j]);
-        X_input[j] = X[j];
     }
 }
 
-void update() {
+void update(T* X, T* G) {
     for (int i = 0; i < ((m - 1) * DOF); ++i) {
         s[i] = s[i + DOF];
         y[i] = y[i + DOF];
-
     }
     for (int i = 0; i < DOF; ++i) {
         s[(m - 1) * DOF + i] = X[i] - prev_positions[i];
@@ -158,21 +152,23 @@ void update() {
     }
 }
 
-T linesearch() {
+T linesearch(T p0, T* X, T* G) {
     int max_it = 1000;
     T c = 0.5;
     T tau = 0.75;
     T m = dot_product(G, q, DOF);
     T step_size = 1;
     for (int i = 0; i < max_it; i++) {
+        T sum = 0;
         for (int j = 0; j < DOF; ++j) {
             x_plus_step[j] = X[j] - (step_size * q[j]);
         }
-        if (grad(X, G) - grad(x_plus_step, G) >= step_size * -c * m) {
+        T new_value = grad(x_plus_step, G);
+        if (p0 - new_value >= step_size * -c * m) {
             return step_size;
-        }
-        else {
+        } else {
             step_size *= tau;
+            printf("Step_size: %f, p0: %f, new: %f, x+s: %f\n", step_size, p0, new_value, q[50]);
         }
     }
     return 0;
@@ -213,4 +209,4 @@ T linesearch() {
     }
 };
 
-#endif
\ No newline at end of file
+#endif // LBFGS
\ No newline at end of file
diff --git a/src/update/minimize.cu b/src/update/minimize.cu
index e1362d7..51bd33b 100644
--- a/src/update/minimize.cu
+++ b/src/update/minimize.cu
@@ -130,7 +130,7 @@ void State::min_move(int step,int nsteps,System *system)
       currEnergy=energy[eepotential];
       if (step == 0){
         // Function called by LBFGS class to get energy & gradient 
-        std::function<real(real*, real*)> grad = [system](real* X, real* G){
+        std::function<real(real*, real*)> energy_and_grad = [system](real* X, real* G){
           // Copy X onto CUDA device
           int DOF = system->state->atomCount*3 + system->msld->blockCount;
           cudaMemcpy(system->state->positionBuffer_d, X, DOF*sizeof(real), cudaMemcpyDefault);
@@ -144,7 +144,7 @@ void State::min_move(int step,int nsteps,System *system)
 
         int DOF = system->state->atomCount*3 + system->msld->blockCount;
         int m = 7; // Past grad depth
-        r->lbfgs = new LBFGS<real>(7, DOF, grad);
+        r->lbfgs = new LBFGS<real>(7, DOF, energy_and_grad);
         // Steepest decent step
         r->lbfgs->init((real*)system->state->positionBuffer, (real*) system->state->forceBuffer, 1e-2);
       }

From f3cd0cfbda7583610b027ce9d132af545914c18e Mon Sep 17 00:00:00 2001
From: Matthew Speranza <speranzm@uci.edu>
Date: Tue, 16 Dec 2025 20:38:13 -0800
Subject: [PATCH 4/8] Updated L-BFGS implementation to work on floating point
 position and gradient buffers directly. Still working out bugs.

---
 src/run/run.cu           |  10 +-
 src/run/run.h            |   5 +-
 src/system/potential.cxx |   1 +
 src/update/LBFGS.h       | 212 ---------------------------------------
 src/update/lbfgs.cu      | 156 ++++++++++++++++++++++++++++
 src/update/lbfgs.h       |  38 +++++++
 src/update/minimize.cu   |  62 ++++++++----
 7 files changed, 246 insertions(+), 238 deletions(-)
 delete mode 100644 src/update/LBFGS.h
 create mode 100644 src/update/lbfgs.cu
 create mode 100644 src/update/lbfgs.h

diff --git a/src/run/run.cu b/src/run/run.cu
index 4559e6b..c33df91 100644
--- a/src/run/run.cu
+++ b/src/run/run.cu
@@ -359,14 +359,14 @@ void Run::set_variable(char *line,char *token,System *system)
     dxRMSInit=io_nextf(line)*ANGSTROM;
   } else if (strcmp(token,"mintype")==0) {
     std::string minString=io_nexts(line);
-    if (strcmp(minString.c_str(), "lbfgs") == 0){
+    if (strcmp(minString.c_str(), "lbfgs")==0){
       minType=elbfgs;
     } else if (strcmp(minString.c_str(),"sd")==0) {
       minType=esd;
     } else if (strcmp(minString.c_str(),"sdfd")==0) {
       minType=esdfd;
     } else {
-      fatal(__FILE__,__LINE__,"Unrecognized token %s for minimization type minType. Options are: sd or sdfd\n",minString.c_str());
+      fatal(__FILE__,__LINE__,"Unrecognized token %s for minimization type minType. Options are: lbfgs, sd, or sdfd\n",minString.c_str());
     }
   } else if (strcmp(token,"domdecheuristic")==0) {
     domdecHeuristic=io_nextb(line);
@@ -503,6 +503,12 @@ void Run::minimize(char *line,char *token,System *system)
     gpuCheck(cudaPeekAtLastError());
   }
 
+  if(system->run->minType==elbfgs){
+    printf("L-BFGS took %d force evaluations in %d steps.\n", system->run->lbfgs_energy_evals, system->run->lbfgs->step);
+    printf("U0: %f, Uf: %f, Uf - U0: %f\n", 
+      system->run->lbfgs->U0, system->run->lbfgs->Uf, system->run->lbfgs->Uf - system->run->lbfgs->U0);
+  }
+
   system->state->min_dest(system);
   dynamics_finalize(system);
 }
diff --git a/src/run/run.h b/src/run/run.h
index 15fcd38..e8269e6 100644
--- a/src/run/run.h
+++ b/src/run/run.h
@@ -8,7 +8,7 @@
 
 #include "xdr/xdrfile.h"
 #include "xdr/xdrfile_xtc.h"
-#include "update/LBFGS.h"
+#include "update/lbfgs.h"
 
 // Forward delcaration
 class System;
@@ -59,7 +59,8 @@ class Run {
   EMin minType; // minimization scheme
 
   // L-BFGS variables
-  LBFGS<real>* lbfgs;
+  LBFGS* lbfgs;
+  int lbfgs_energy_evals = 0;
 
   real betaEwald;
   real rCut;
diff --git a/src/system/potential.cxx b/src/system/potential.cxx
index 4fdd10b..93d75c4 100644
--- a/src/system/potential.cxx
+++ b/src/system/potential.cxx
@@ -1617,6 +1617,7 @@ void Potential::calc_force(int step,System *system)
   if (system->run->freqNPT>0) {
     calcEnergy=(calcEnergy||(step%system->run->freqNPT==0));
   }
+  calcEnergy=true;
 #ifdef REPLICAEXCHANGE
   if (system->run->freqREx>0) {
     calcEnergy=(calcEnergy||(step%system->run->freqREx==0));
diff --git a/src/update/LBFGS.h b/src/update/LBFGS.h
deleted file mode 100644
index d9067f6..0000000
--- a/src/update/LBFGS.h
+++ /dev/null
@@ -1,212 +0,0 @@
-
-#ifndef LBFGS_H
-#define LBFGS_H
-
-#include <type_traits>
-#include <iostream>
-#include <cstdlib>
-#include <cassert>
-#include <cmath>
-#include <type_traits>
-#include <functional>
-
-using namespace std;
-
-template <typename T> class LBFGS {
-    static_assert(std::is_floating_point<T>::value, "L-BFGS can only be used with floating point types");
-
-public:
-    LBFGS(int m, int DOF, std::function<T(T*, T*)> user_grad) : m(m), DOF(DOF), grad(user_grad) {
-        minimized = false;
-        min_step_size = sizeof(T) == 4 ? 1e-4 : 1e-7;
-        max_step_size = 1.0;
-        rho = (T*) calloc(m, sizeof(T));
-        alpha = (T*) calloc(m, sizeof(T));
-        gamma = (T*) calloc(m, sizeof(T));
-        beta = (T*) calloc(m, sizeof(T));
-        step_size = 0.0;
-         
-        search = (T*) calloc(DOF, sizeof(T));
-        prev_positions = (T*) calloc(DOF, sizeof(T));
-        prev_gradient = (T*) calloc(DOF, sizeof(T));
-
-        q = (T*) calloc(DOF, sizeof(T));
-        x_plus_step = (T*) calloc(DOF, sizeof(T));
-        s = (T*) calloc(m*DOF, sizeof(T));
-        y = (T*) calloc(m*DOF, sizeof(T));
-    }
-
-    ~LBFGS() {
-        free(rho);
-        free(alpha);
-        free(gamma);
-        free(beta);
-        free(G);
-        free(X);
-        free(search);
-        free(prev_positions);
-        free(prev_gradient);
-        free(q);
-        free(x_plus_step);
-        free(s);
-        free(y);
-    }
-
-/**
- * Steepest decent step
- * 
- * @param X Initial Position
- * @param G Initial Position Grad
- */
-void init(T* X, T* G, T steepest_descent_step_size) {
-    // f(X0), g(X0)
-    grad(X, G);
-    // Update
-    for (int i = 0; i < DOF; ++i) {
-        prev_positions[i] = X[i];
-        prev_gradient[i] = G[i];
-    }
-    for (int i = 0; i < DOF; ++i) {
-        s[i + (m - 1) * DOF] = X[i] - prev_positions[i];
-        y[i + (m - 1) * DOF] = G[i] - prev_gradient[i];
-        X[i] -= steepest_descent_step_size * G[i];
-    }
-    // f(X0-size*G), g(X0-size*G)
-    grad(X, G);
-}
-
-/**
- * 1. Compute new L-BFGS step direction
- *   Pseudocode from wikipedia:
- *   q = g.i // search direction to be updated
- *   for j = i-1 to i-m:
- *     alpha.j = rho.j * s.j.T * q // dot product
- *     q = q - alpha.j * y.j // vector scale & subtraction
- *   gamma.i = s.i-1.T * y.i-1 / y.i-1.T * y.i-1 // dot products in numerator and denominator
- *   q = gamma.i * q
- *   for j = i-m to i-1:
- *     beta = rho.j * y.j.T * q // dot product
- *     q = q + (alpha.j - beta) * s.j // vector scale & addition
- *   q = -q  // negate applied above instead of here most likely
- *   gamma = s.i.T * y.i / y.i.T * y.i
- *   rho.j = 1 / (y.j.T * s.j)
- */
-void minimize_step(T* X, T* G) {
-    if (minimized){
-        return;
-    }
-    // Eval function at X
-    T p0 = grad(X, G);
-    for (int i = 0; i < DOF; ++i) {
-        q[i] = G[i];
-    }
-    update(X, G);
-    for (int i = m - 1; i >= 0; --i) {
-        alpha[i] = rho[i] * dot_product(s + i * DOF, q, DOF);
-        for (int j = 0; j < DOF; ++j) {
-            q[j] -= alpha[i] * y[i * DOF + j];
-        }
-    }
-    T gamma = dot_product(s + (m - 1) * DOF, y + (m - 1) * DOF, DOF) / dot_product(y + (m - 1) * DOF, y + (m - 1) * DOF, DOF);
-    for (int j = 0; j < DOF; ++j) {
-        q[j] *= gamma;
-    }
-    for (int i = 0; i < m; ++i) {
-        T beta = rho[i] * dot_product(y + i * DOF, q, DOF);
-        for (int j = 0; j < DOF; ++j) {
-            q[j] += (alpha[i] - beta) * s[i * DOF + j];
-        }
-    }
-    // min_a f(X + a*q)
-    step_size = this->linesearch(p0, X, G); 
-    std::cout << "step size: " << step_size << "\n";
-    // Update system positions
-    for (int j = 0; j < DOF; ++j) {
-        X[j] += (step_size * -q[j]);
-    }
-}
-
-void update(T* X, T* G) {
-    for (int i = 0; i < ((m - 1) * DOF); ++i) {
-        s[i] = s[i + DOF];
-        y[i] = y[i + DOF];
-    }
-    for (int i = 0; i < DOF; ++i) {
-        s[(m - 1) * DOF + i] = X[i] - prev_positions[i];
-        y[(m - 1) * DOF + i] = G[i] - prev_gradient[i];
-    }
-    for (int i = 0; i < m - 1; ++i) {
-        rho[i] = rho[i + 1];
-    }
-    double s_dot_y = dot_product((s + (m - 1) * DOF), (y + (m - 1) * DOF), DOF);
-    if (s_dot_y == 0) {
-        std::cout << "Error: Dividing by zero. Function is likely already at a minimum.\n";
-        minimized = true;
-    }
-    else {
-        rho[m - 1] = 1 / s_dot_y;
-    }
-    for (int i = 0; i < DOF; ++i) {
-        prev_positions[i] = X[i];
-        prev_gradient[i] = G[i];
-    }
-}
-
-T linesearch(T p0, T* X, T* G) {
-    int max_it = 1000;
-    T c = 0.5;
-    T tau = 0.75;
-    T m = dot_product(G, q, DOF);
-    T step_size = 1;
-    for (int i = 0; i < max_it; i++) {
-        T sum = 0;
-        for (int j = 0; j < DOF; ++j) {
-            x_plus_step[j] = X[j] - (step_size * q[j]);
-        }
-        T new_value = grad(x_plus_step, G);
-        if (p0 - new_value >= step_size * -c * m) {
-            return step_size;
-        } else {
-            step_size *= tau;
-            printf("Step_size: %f, p0: %f, new: %f, x+s: %f\n", step_size, p0, new_value, q[50]);
-        }
-    }
-    return 0;
-}
-
-private:
-    int m; // Number of previous gradients to use for hessian approximation (5-7)
-    int DOF; // Degrees of freedom
-    T min_step_size; // terminate minimization with steps smaller than this number
-    T max_step_size; // ensures that lbfgs doesn't overshoot minimum
-    bool minimized;
-
-    T *beta; 
-    T *gamma; // s projected onto y
-    T *rho; // [m] rho_{i} = (s_{i}^T * y_{i}
-    T *alpha; // [m] alpha_{i} = rho_{i} * s_{i}^T * y_{i}
-
-    T *X;
-    T *G;
-    T *search; // [DOF] L-BFGS search direction
-    T *prev_positions; // [DOF] x_{i-1}
-    T *prev_gradient; // [DOF] g_{i-1}
-
-    T* q;
-    T* x_plus_step;
-    T *s; // [m*DOF] x_{i+1} - x_{i} = s_{i}
-    T *y; // [m*DOF] grad_{i+1} - grad_{i} = y_{i}
-    T step_size;
-
-    std::function<T(T*, T*)> grad;
-
-    T dot_product(T* a, T* b, int n) {
-        T result = 0;
-        for (int i = 0; i < n; ++i) {
-            result += a[i] * b[i];
-        }
-        return result;
-    }
-};
-
-#endif // LBFGS
\ No newline at end of file
diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu
new file mode 100644
index 0000000..2bc2528
--- /dev/null
+++ b/src/update/lbfgs.cu
@@ -0,0 +1,156 @@
+#include <iostream>
+#include <cstdlib>
+#include <cassert>
+#include <cmath>
+#include <type_traits>
+#include <cstdio>
+
+#include <cuda_runtime.h>
+#include "update/lbfgs.h"
+#include "main/real3.h" // for real_sum_reduce
+
+// C = u*A + w*(d^n)*B, C and A and B can be related
+__global__ void vector_add(
+    int N, real u, real *A, real w, real* d, real n, real *B, 
+    real* C) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        if(d){
+            C[i] = u * A[i] + w * pow(*d, n) * B[i];
+        } else {
+            C[i] = u * A[i] + w * B[i];
+        }
+    }
+}
+
+    // *dot = w*u^p*(A).(B), w is cpu real, u is gpu real, p is cpu real
+__global__ void dot_product(
+    int N, real w, real* u, real p, real *A, real *B, 
+    real* dot) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real lDot = 0;
+    extern __shared__ real sDot[];
+    if(i == 0){
+        *dot = 0;
+    }
+    if (i < N) {
+        if(u){
+            lDot = w*powf(*u, p)*A[i]*B[i];
+        } else {
+            lDot = w*A[i]*B[i];
+        }
+    }
+
+    real_sum_reduce(lDot, sDot, dot);
+}
+
+//  <position precision, real precision>
+LBFGS::LBFGS(int m, int DOF, std::function<real()> user_grad, real *position, real *gradient)
+    : m(m), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) {
+    step = 0;
+    cudaMalloc(&tmp_d, sizeof(real));
+    cudaMalloc(&gamma_d, sizeof(real));
+    cudaMalloc(&rho_inv_d, m*sizeof(real));
+    cudaMalloc(&alpha_d, m*sizeof(real));
+    cudaMalloc(&beta_d, m*sizeof(real));
+    cudaMalloc(&q_d, DOF*sizeof(real));
+    cudaMalloc(&prev_positions_d, DOF*sizeof(real));
+    cudaMalloc(&prev_gradient_d, DOF*sizeof(real));
+    cudaMalloc(&s_d, m*DOF*sizeof(real));
+    cudaMalloc(&y_d, m*DOF*sizeof(real));
+
+    U0 = user_grad(); // first energy call by lbfgs
+}
+
+LBFGS::~LBFGS() {
+    if (tmp_d) cudaFree(tmp_d);
+    if (rho_inv_d) cudaFree(rho_inv_d);
+    if (alpha_d) cudaFree(alpha_d);
+    if (gamma_d) cudaFree(gamma_d);
+    if (beta_d) cudaFree(beta_d);
+    if (prev_positions_d) cudaFree(prev_positions_d);
+    if (prev_gradient_d) cudaFree(prev_gradient_d);
+    if (q_d) cudaFree(q_d);
+    if (s_d) cudaFree(s_d);
+    if (y_d) cudaFree(y_d);
+}
+
+void LBFGS::minimize_step(real f0) { // f0 from outer loop
+    cudaMemcpy(q_d, G_d, DOF*sizeof(real), cudaMemcpyDefault); // G from outer loop
+    //vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, G_d, 0, NULL, 1, q_d, q_d);
+    if(step != 0 && m != 0) update_sk();
+    // Two-loop recursion - Ch7.4, p178 of Nocedal & Wright
+    for (int i = step-1; i >= step-m; i--) {
+        if(i < 0) break;
+        int index = i % m;
+        // alpha.i = rho.i*s.i*q
+        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, rho_inv_d+index, -1, s_d+index*DOF, q_d, alpha_d+index);
+        // q = q - alpha.i*y.i
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, 1, y_d+index*DOF, q_d);
+    }
+    if(step != 0 && m != 0){
+        int index = (step-1) % m;
+        // norm = y.i*y.i
+        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, y_d+index*DOF, y_d+index*DOF, tmp_d);
+        // dot = s.i*y.i/norm
+        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, tmp_d, -1, s_d+index*DOF, y_d+index*DOF, gamma_d);
+        // q = gamma*q
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 0, q_d, 1, gamma_d, 1, q_d, q_d);
+    }
+    for (int i = step-m; i < step; i++) {
+        if(step == 0) break;
+        if(i < 0) continue;
+        int index = i % m;
+        // B = rho.i*y.i*q
+        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, rho_inv_d+index, -1, y_d+index*DOF, q_d, tmp_d);
+        // q = q + s.i*alpha.i
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, +1, alpha_d+index, 1, s_d+index*DOF, q_d); 
+        // q = q - B*s.i
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, tmp_d, 1, s_d+index*DOF, q_d);
+    }
+    // q = -q
+    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, 0, NULL, 1, q_d, q_d);
+    // min_a f(X + a*q)
+    linesearch(f0); // backtracking, leaves X & G at final X+a*q
+    step++;
+}
+
+void LBFGS::update_sk() {
+    int index = (step - 1) % m;
+    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, 1, prev_positions_d, s_d+index*DOF);
+    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, G_d, -1, NULL, 1, prev_gradient_d, y_d+index*DOF);
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, y_d+index*DOF, s_d+index*DOF, rho_inv_d+index);
+
+    cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real), cudaMemcpyDefault);
+    cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real), cudaMemcpyDefault);
+}
+
+// Ch3.2, p37 of Nocedal & Wright 
+real LBFGS::linesearch(real f0) {
+    int max_it = 15;
+    real a = 5e-3; // max step size
+    real a_prev = 0;
+    real c1 = 1e-4; // Armijo cond
+    real tau = 0.25; // labeled rho in reference
+    real fi = 0;
+    real m0;
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, G_d, q_d, tmp_d); // df(0)/da
+    cudaMemcpy(&m0, tmp_d, sizeof(real), cudaMemcpyDefault);
+    for (int i = 0; i < max_it; i++) {
+        // Bump positions back and forth
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, (a-a_prev), NULL, 1, q_d, X_d);
+        fi = system_grad(); // update G
+        printf("a: %f, U0-Uf: %f, m0: %f\n", a, f0-fi, m0);
+        if (fi <= f0 + c1*a*m0) {
+            Uf = fi;
+            return a; // position & gradient already updated at new point)
+        } else {
+            a_prev = a;
+            a *= tau;
+        }
+    }
+    // Linesearch failed
+    printf("Linesearch Failed!\n", a); // a0*tau^(max_iter)
+    Uf = fi;
+    return a;
+}
\ No newline at end of file
diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h
new file mode 100644
index 0000000..bc0d757
--- /dev/null
+++ b/src/update/lbfgs.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <functional>
+#include "main/defines.h"
+
+class LBFGS {
+public:
+    int m; // Number of previous gradients to use for hessian approximation (5-7)
+    int step;
+    bool minimized;
+    real U0;
+    real Uf;
+
+    LBFGS(int m, int DOF, std::function<real()> user_grad, real* position, real* gradient);
+    ~LBFGS();
+    void minimize_step(real f0);
+
+private:
+    int DOF; // Degrees of freedom to minimize
+    // All data stored on GPU
+    real *beta_d;
+    real *tmp_d; // 1 real temp storage
+    real *gamma_d; // sk-1 projected onto yk-1
+    real *rho_inv_d; // [m] rho_inv[i] = s[i]^T * y[i]
+    real *alpha_d; // [m] alpha[i] = rho[i] * s[i]^T * y[i]
+    real *search_d; // [DOF] L-BFGS search direction
+    real *prev_positions_d; // [DOF] x[i-1]
+    real *prev_gradient_d; // [DOF] g[i-1]
+    real *X_d; // points to user GPU memory, positions to be optimized
+    real *G_d; // points to user GPU memory
+    real *q_d;
+    real *s_d; // [m, DOF] s[i] = x[i+1] - x[i] (1D array)
+    real *y_d; // [m, DOF] y[i] = grad[i+1] - grad[i] (1D array)
+    std::function<real()> system_grad;
+
+    void update_sk();
+    real linesearch(real f0);
+};
\ No newline at end of file
diff --git a/src/update/minimize.cu b/src/update/minimize.cu
index 51bd33b..19a23b8 100644
--- a/src/update/minimize.cu
+++ b/src/update/minimize.cu
@@ -85,6 +85,18 @@ __global__ void sdfd_dotproduct_kernel(int N,struct LeapState ls,real_v *minDire
   real_sum_reduce(3*lDot/N,sDot,dot);
 }
 
+__global__ void double_convert_float(int N, real_x* doubleBuffer, real* floatBuffer, bool direction)
+{
+  int i=blockIdx.x*blockDim.x+threadIdx.x;
+  if (i<N) {
+    if(direction){ // forward
+      floatBuffer[i] = (real)doubleBuffer[i];
+    } else { // backward
+      doubleBuffer[i] = (real_x)floatBuffer[i];
+    }
+  }
+}
+
 void State::min_init(System *system)
 {
   // Set masses to 1 for shake during minimization, except virtual sites
@@ -99,6 +111,29 @@ void State::min_init(System *system)
   if (system->run->minType==esdfd) {
     cudaMalloc(&minDirection_d,3*atomCount*sizeof(real_v));
   }
+  if (system->run->minType==elbfgs){
+    // Function called by L-BFGS class to get energy & gradient 
+    std::function<real()> energy_and_grad = [system](){
+      // Copy X onto double precision buffer
+      int DOF = system->state->atomCount*3;
+      int shift = system->state->lambdaCount;
+      double_convert_float<<<(system->state->atomCount+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>(
+            DOF, system->state->positionBuffer_d+shift, system->state->positionBuffer_fd+shift, false);
+      // Potential & Grad Eval -> step=0 to calc energy
+      system->domdec->update_domdec(system,true);
+      system->potential->calc_force(0, system);
+      // grad(F(X)) G already stored
+      system->state->recv_energy();
+      system->run->lbfgs_energy_evals++;
+      return (real)system->state->energy[eepotential];
+    };
+
+    int shift = system->state->lambdaCount;
+    int DOF = system->state->atomCount*3; // only minimize atom positions
+    int m = 7; // Past grad depth
+    system->state->recv_state();
+    system->run->lbfgs = new LBFGS(m, DOF, energy_and_grad, system->state->positionBuffer_fd+shift, system->state->forceBuffer_d+shift);
+  }
 }
 
 void State::min_dest(System *system)
@@ -112,9 +147,12 @@ void State::min_dest(System *system)
   if (system->run->minType==esdfd) {
     cudaFree(minDirection_d);
   }
+  if (system->run->minType==elbfgs){
+    system->run->lbfgs->~LBFGS();
+  }
 }
 
-void State::min_move(int step,int nsteps,System *system)
+void State::min_move(int step, int nsteps, System *system)
 {
   Run *r=system->run;
   real_e grads2[2];
@@ -128,27 +166,7 @@ void State::min_move(int step,int nsteps,System *system)
       recv_energy();
       if (system->verbose>0) display_nrg(system);
       currEnergy=energy[eepotential];
-      if (step == 0){
-        // Function called by LBFGS class to get energy & gradient 
-        std::function<real(real*, real*)> energy_and_grad = [system](real* X, real* G){
-          // Copy X onto CUDA device
-          int DOF = system->state->atomCount*3 + system->msld->blockCount;
-          cudaMemcpy(system->state->positionBuffer_d, X, DOF*sizeof(real), cudaMemcpyDefault);
-          // Potential & Grad Eval
-          system->potential->calc_force(0, system);
-          // Copy grad(F(X)) into G
-          cudaMemcpy(G, system->state->force_d, DOF*sizeof(real), cudaMemcpyDefault);
-          real energy = system->state->energy[eepotential];
-          return energy;
-        };
-
-        int DOF = system->state->atomCount*3 + system->msld->blockCount;
-        int m = 7; // Past grad depth
-        r->lbfgs = new LBFGS<real>(7, DOF, energy_and_grad);
-        // Steepest decent step
-        r->lbfgs->init((real*)system->state->positionBuffer, (real*) system->state->forceBuffer, 1e-2);
-      }
-      r->lbfgs->minimize_step((real*)system->state->positionBuffer, (real*) system->state->forceBuffer);
+      r->lbfgs->minimize_step(currEnergy);
     }
   }
   else if (r->minType==esd) {

From 7ec444550715780f2993dce8a1ee26a1f7dd1a07 Mon Sep 17 00:00:00 2001
From: Matthew Speranza <speranzm@uci.edu>
Date: Sun, 21 Dec 2025 19:59:56 -0800
Subject: [PATCH 5/8] Fixed multiple bugs, most importantly in line search
 checking of wolfe cond... book reference misleading

---
 src/run/run.cu         |   7 +-
 src/system/state.cxx   |   1 +
 src/system/state.h     |   1 +
 src/update/lbfgs.cu    | 411 ++++++++++++++++++++++++++++++-----------
 src/update/lbfgs.h     |  66 ++++---
 src/update/minimize.cu |  25 +--
 6 files changed, 370 insertions(+), 141 deletions(-)

diff --git a/src/run/run.cu b/src/run/run.cu
index c33df91..493fa96 100644
--- a/src/run/run.cu
+++ b/src/run/run.cu
@@ -504,9 +504,10 @@ void Run::minimize(char *line,char *token,System *system)
   }
 
   if(system->run->minType==elbfgs){
-    printf("L-BFGS took %d force evaluations in %d steps.\n", system->run->lbfgs_energy_evals, system->run->lbfgs->step);
-    printf("U0: %f, Uf: %f, Uf - U0: %f\n", 
-      system->run->lbfgs->U0, system->run->lbfgs->Uf, system->run->lbfgs->Uf - system->run->lbfgs->U0);
+    Run* r = system->run;
+    printf("L-BFGS (m=%d) did %d force evaluations in %d steps. Reset memory %d times.\n", 
+      r->lbfgs->m, r->lbfgs_energy_evals, r->lbfgs->step_count, r->lbfgs->reset_count);
+    printf("U0: %f, Uf: %f, Uf - U0: %f\n", r->lbfgs->U0, r->lbfgs->Uf, r->lbfgs->Uf - r->lbfgs->U0);
   }
 
   system->state->min_dest(system);
diff --git a/src/system/state.cxx b/src/system/state.cxx
index 7635c0f..de03543 100644
--- a/src/system/state.cxx
+++ b/src/system/state.cxx
@@ -42,6 +42,7 @@ State::State(System *system) {
   cudaMalloc(&(positionBackup_d),(2*nL+3*n)*sizeof(real_x));
   forceBuffer=(real_f*)calloc(rootFactor*(2*nL+3*n),sizeof(real_f));
   cudaMalloc(&(forceBuffer_d),rootFactor*(2*nL+3*n)*sizeof(real_f));
+  cudaMalloc(&(forceBufferX_d),rootFactor*(2*nL+3*n)*sizeof(real_x));
   cudaMalloc(&(forceBackup_d),(2*nL+3*n)*sizeof(real_f));
 
   if (system->idCount>0) { // OMP
diff --git a/src/system/state.h b/src/system/state.h
index 1270515..66ebf83 100644
--- a/src/system/state.h
+++ b/src/system/state.h
@@ -67,6 +67,7 @@ class State {
   real *positionBuffer_omp;
   real_f *forceBuffer;
   real_f *forceBuffer_d;
+  real_x *forceBufferX_d; 
   real_f *forceBackup_d; // For NPT
   real_f *forceBuffer_omp;
 
diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu
index 2bc2528..858d0aa 100644
--- a/src/update/lbfgs.cu
+++ b/src/update/lbfgs.cu
@@ -4,153 +4,356 @@
 #include <cmath>
 #include <type_traits>
 #include <cstdio>
+#include <algorithm>
 
 #include <cuda_runtime.h>
 #include "update/lbfgs.h"
 #include "main/real3.h" // for real_sum_reduce
 
-// C = u*A + w*(d^n)*B, C and A and B can be related
-__global__ void vector_add(
-    int N, real u, real *A, real w, real* d, real n, real *B, 
-    real* C) {
+/*
+    Cuda Kernels
+*/
+
+// d = 1/sqrt(d) - element-wise
+__global__ void inv_sqrt(int N, real_x *d) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        d[i] = 1.0/sqrt(d[i]);
+    }
+}
+
+// d = 1/d - element-wise
+__global__ void recip(int N, real_x *d) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        d[i] = 1.0/d[i];
+    }
+}
+
+// d = abs(d) - element-wise
+__global__ void vector_abs(int N, real_x *d){
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        d[i] = abs(d[i]);
+    }
+}
+
+// C = u*A + w*d*B, C and A and B can be related
+__global__ void vector_add(int N, real_x u, real_x *A, real_x w, real_x *d, real_x *B, real_x *C) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < N) {
         if(d){
-            C[i] = u * A[i] + w * pow(*d, n) * B[i];
+            C[i] = u * A[i] + w * d[0] * B[i];
         } else {
             C[i] = u * A[i] + w * B[i];
         }
     }
 }
 
-    // *dot = w*u^p*(A).(B), w is cpu real, u is gpu real, p is cpu real
-__global__ void dot_product(
-    int N, real w, real* u, real p, real *A, real *B, 
-    real* dot) {
+// C = A[0]*B - element-wise
+__global__ void vector_scale(int N, real_x *A, real_x *B, real_x *C){
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real lDot = 0;
-    extern __shared__ real sDot[];
-    if(i == 0){
-        *dot = 0;
+    if (i < N) {
+        C[i] = A[0]*B[i];
+    }
+}
+
+// B = c*A - element-wise
+__global__ void vector_scale(int N, real_x c, real_x *A, real_x *B){
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        B[i] = c*A[i];
     }
+}
+
+__global__ void dot_product(int N, real_x *A, real_x *B, real_x* dot) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real_x lDot = 0;
+    extern __shared__ real sDot[];
     if (i < N) {
-        if(u){
-            lDot = w*powf(*u, p)*A[i]*B[i];
-        } else {
-            lDot = w*A[i]*B[i];
-        }
+        lDot = A[i]*B[i];
     }
 
-    real_sum_reduce(lDot, sDot, dot);
+    real_sum_reduce((real)lDot, sDot, dot);
 }
 
-//  <position precision, real precision>
-LBFGS::LBFGS(int m, int DOF, std::function<real()> user_grad, real *position, real *gradient)
+/*
+    Class setup
+*/
+
+//  <position precision, real_x precision>
+LBFGS::LBFGS(int m, int DOF, std::function<real_x()> user_grad, real_x *position, real_x *gradient)
     : m(m), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) {
-    step = 0;
-    cudaMalloc(&tmp_d, sizeof(real));
-    cudaMalloc(&gamma_d, sizeof(real));
-    cudaMalloc(&rho_inv_d, m*sizeof(real));
-    cudaMalloc(&alpha_d, m*sizeof(real));
-    cudaMalloc(&beta_d, m*sizeof(real));
-    cudaMalloc(&q_d, DOF*sizeof(real));
-    cudaMalloc(&prev_positions_d, DOF*sizeof(real));
-    cudaMalloc(&prev_gradient_d, DOF*sizeof(real));
-    cudaMalloc(&s_d, m*DOF*sizeof(real));
-    cudaMalloc(&y_d, m*DOF*sizeof(real));
+    k = 0;
+    cudaMalloc(&tmp_d, sizeof(real_x));
+    cudaMalloc(&gamma_d, sizeof(real_x));
+    cudaMalloc(&rho_d, m*sizeof(real_x));
+    cudaMalloc(&alpha_d, m*sizeof(real_x));
+    cudaMalloc(&q_d, DOF*sizeof(real_x));
+    cudaMalloc(&prev_positions_d, DOF*sizeof(real_x));
+    cudaMalloc(&prev_gradient_d, DOF*sizeof(real_x));
+    cudaMalloc(&s_d, m*DOF*sizeof(real_x));
+    cudaMalloc(&y_d, m*DOF*sizeof(real_x));
+    cudaMalloc(&s_tmp_d, DOF*sizeof(real_x));
+    cudaMalloc(&y_tmp_d, DOF*sizeof(real_x));
 
     U0 = user_grad(); // first energy call by lbfgs
+
+    // Normalize first s.d. step
+    gamma_norm();
+
+    // Set up s.d. step
+    cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, gamma_d, q_d, q_d);
+    vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d);
 }
 
 LBFGS::~LBFGS() {
     if (tmp_d) cudaFree(tmp_d);
-    if (rho_inv_d) cudaFree(rho_inv_d);
+    if (rho_d) cudaFree(rho_d);
     if (alpha_d) cudaFree(alpha_d);
     if (gamma_d) cudaFree(gamma_d);
-    if (beta_d) cudaFree(beta_d);
     if (prev_positions_d) cudaFree(prev_positions_d);
     if (prev_gradient_d) cudaFree(prev_gradient_d);
     if (q_d) cudaFree(q_d);
     if (s_d) cudaFree(s_d);
     if (y_d) cudaFree(y_d);
+    if (s_tmp_d) cudaFree(s_tmp_d);
+    if (y_tmp_d) cudaFree(y_tmp_d);
 }
 
-void LBFGS::minimize_step(real f0) { // f0 from outer loop
-    cudaMemcpy(q_d, G_d, DOF*sizeof(real), cudaMemcpyDefault); // G from outer loop
-    //vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, G_d, 0, NULL, 1, q_d, q_d);
-    if(step != 0 && m != 0) update_sk();
-    // Two-loop recursion - Ch7.4, p178 of Nocedal & Wright
-    for (int i = step-1; i >= step-m; i--) {
-        if(i < 0) break;
+/*
+    L-BFGS iteration
+*/
+
+bool LBFGS::check_convergence(){
+    // return true if rmsg minimized
+    real_x grad_norm = 0;
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d);
+    cudaMemcpy(&grad_norm, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+    grad_norm = sqrt(grad_norm);
+    return grad_norm < eps;
+}
+
+// set gamma = 1/|g_d|
+void LBFGS::gamma_norm(){
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d);
+    inv_sqrt<<<1,1>>>(1, tmp_d);
+    cudaMemcpy(gamma_d, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+}
+
+// Ch7.4, p178 of Nocedal & Wright (Algorithm 7.4)
+void LBFGS::minimize_step(real_x f0) { // f0 & G filled from outer minimize loop
+    // Copy kth positions & gradients
+    cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+
+    // min_a f(X + a*q)
+    step_size = linesearch(f0); // X & G left at & evaluated at f(X+a*q)
+
+    // Check convergence
+    // TODO: implement convergence checks
+
+    // kth position and gradient deltas
+    update_sk_yk(); // potentially skip decrement k or set k=0 & clear s & y memory
+    k++;
+
+    // Two-loop recursion
+    cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); 
+    for (int i = k-1; i >= std::max(0, k-m); i--) {
         int index = i % m;
-        // alpha.i = rho.i*s.i*q
-        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, rho_inv_d+index, -1, s_d+index*DOF, q_d, alpha_d+index);
-        // q = q - alpha.i*y.i
-        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, 1, y_d+index*DOF, q_d);
-    }
-    if(step != 0 && m != 0){
-        int index = (step-1) % m;
-        // norm = y.i*y.i
-        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, y_d+index*DOF, y_d+index*DOF, tmp_d);
-        // dot = s.i*y.i/norm
-        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, tmp_d, -1, s_d+index*DOF, y_d+index*DOF, gamma_d);
-        // q = gamma*q
-        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 0, q_d, 1, gamma_d, 1, q_d, q_d);
+        // q = q - (rho.i*(s.i dot q))*y.i
+        cudaMemset(alpha_d+index, 0, sizeof(real_x));
+        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_d+index*DOF, q_d, alpha_d+index);
+        vector_scale<<<1,1>>>(1, rho_d+index, alpha_d+index, alpha_d+index);
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, y_d+index*DOF, q_d);
     }
-    for (int i = step-m; i < step; i++) {
-        if(step == 0) break;
-        if(i < 0) continue;
+    // q = gamma.k*q = r
+    vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, gamma_d, q_d, q_d);
+    for (int i = std::max(0, k-m); i <= k-1; i++) {
         int index = i % m;
-        // B = rho.i*y.i*q
-        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, rho_inv_d+index, -1, y_d+index*DOF, q_d, tmp_d);
-        // q = q + s.i*alpha.i
-        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, +1, alpha_d+index, 1, s_d+index*DOF, q_d); 
-        // q = q - B*s.i
-        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, tmp_d, 1, s_d+index*DOF, q_d);
+        // B = rho.i*(y.i dot q)
+        // q = q + (alpha.i - B)*s.i = q - B*s.i + alpha.i*s.i
+        cudaMemset(tmp_d, 0, sizeof(real_x));
+        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_d+index*DOF, q_d, tmp_d);
+        vector_scale<<<1,1>>>(1, rho_d+index, tmp_d, tmp_d);
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, tmp_d, s_d+index*DOF, q_d);
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, 1, alpha_d+index, s_d+index*DOF, q_d); 
     }
+
     // q = -q
-    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, 0, NULL, 1, q_d, q_d);
-    // min_a f(X + a*q)
-    linesearch(f0); // backtracking, leaves X & G at final X+a*q
-    step++;
-}
-
-void LBFGS::update_sk() {
-    int index = (step - 1) % m;
-    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, 1, prev_positions_d, s_d+index*DOF);
-    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, G_d, -1, NULL, 1, prev_gradient_d, y_d+index*DOF);
-    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, y_d+index*DOF, s_d+index*DOF, rho_inv_d+index);
-
-    cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real), cudaMemcpyDefault);
-    cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real), cudaMemcpyDefault);
-}
-
-// Ch3.2, p37 of Nocedal & Wright 
-real LBFGS::linesearch(real f0) {
-    int max_it = 15;
-    real a = 5e-3; // max step size
-    real a_prev = 0;
-    real c1 = 1e-4; // Armijo cond
-    real tau = 0.25; // labeled rho in reference
-    real fi = 0;
-    real m0;
-    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real)/32, 0>>>(DOF, 1, NULL, 1, G_d, q_d, tmp_d); // df(0)/da
-    cudaMemcpy(&m0, tmp_d, sizeof(real), cudaMemcpyDefault);
-    for (int i = 0; i < max_it; i++) {
-        // Bump positions back and forth
-        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, (a-a_prev), NULL, 1, q_d, X_d);
-        fi = system_grad(); // update G
-        printf("a: %f, U0-Uf: %f, m0: %f\n", a, f0-fi, m0);
-        if (fi <= f0 + c1*a*m0) {
-            Uf = fi;
-            return a; // position & gradient already updated at new point)
+    vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d);
+
+    step_count++;
+}
+
+void LBFGS::update_sk_yk() {
+    if(m == 0) { // Steepest decent, normalize step size via gamma
+        gamma_norm();
+        return;
+    } 
+    int index = k % m;
+
+    // s & y tmp
+    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, prev_positions_d, s_tmp_d);
+    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, G_d, -1, NULL, prev_gradient_d, y_tmp_d);
+
+    real_x yy = 0;
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_tmp_d, y_tmp_d, tmp_d);
+    cudaMemcpy(&yy, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+
+    real_x sy = 0;
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_tmp_d, y_tmp_d, tmp_d);
+    cudaMemcpy(&sy, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+
+    // Check curvature s.T H s = s.T y > 0 for positive def matrix satisfying secant eq Hs=y (required for L-BFGS)
+    if(sy < 1e-6){ 
+        printf("Curvature condition (sy = %e) not satistied! Clearing L-BFGS memory!\n", sy);
+        cudaMemset(s_d, 0, m*DOF*sizeof(real_x));
+        cudaMemset(y_d, 0, m*DOF*sizeof(real_x));
+        gamma_norm();
+        k = -1;
+        reset_count++;
+        return;
+    } 
+
+    // rho = 1/y.s
+    real_x rho = 1.0/sy;
+    // gamma_k = s.y/(y.y)
+    real_x gamma = sy/yy;
+    cudaMemcpy(gamma_d, &gamma, sizeof(real_x), cudaMemcpyDefault);
+    cudaMemcpy(rho_d+index, &rho, sizeof(real_x), cudaMemcpyDefault);
+    cudaMemcpy(s_d + index*DOF, s_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    cudaMemcpy(y_d + index*DOF, y_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    printf("iter: %d, k: %d, index: %d, rho: %f, gamma: %f\n", step_count, k, index, rho, gamma);
+}
+
+/*
+    Line Search
+*/
+
+// return [f(X + alpha*p), df(X+alpha*p)/da]
+void LBFGS::phi(real_x alpha, real_x* result, bool leave_x){
+    cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, alpha, NULL, q_d, X_d);
+    result[0] = system_grad();
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da
+    cudaMemcpy(&result[1], tmp_d, sizeof(real_x), cudaMemcpyDefault);
+    printf("alpha: %f, phi: %f, phi': %f\n", alpha, result[0], result[1]);
+    if(!leave_x) {
+        cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+        cudaMemcpy(G_d, prev_gradient_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    }
+}
+
+real_x sign(real_x value){
+    if(value < 0){
+        return -1;
+    }
+    return 1;
+}
+
+// Ch3.5, p59 of Nocedal & Wright (Equation 3.59) 
+real_x cubic_interp(real_x a, real_x fa, real_x ga, real_x b, real_x fb, real_x gb){
+    real_x d1 = ga + gb - 3*(fb - fa)/(a-b);
+    real_x d2 = sign(b - a)*sqrt(d1*d1 - ga*gb);
+    real_x arg = b - (b-a)*(gb + d2 - d1)/(gb - ga + 2*d2);
+    if(isinf(arg) || isnan(arg) || abs(arg - a) < 1e-7 || abs(arg - b) < 1e-7 || arg < 0){
+        printf("Quadratic interpolation failed! arg: %f, a: %f, b: %f\n", arg, a, b);
+        return (a + b) / 2.0;
+    }
+    return arg;
+}
+
+// Ch3.5, p61 of Nocedal & Wright (Algorithm 3.6)
+real_x LBFGS::zoom(real_x al, real_x const * const phi_lower, real_x au, real_x const * const phi_upper, real_x const * const phi0){
+    printf("Zoom started!\n");
+    int max_iter = 5;
+    real_x phil[2];
+    memcpy(phil, phi_lower, 2*sizeof(real_x));
+    real_x phiu[2];
+    memcpy(phiu, phi_upper, 2*sizeof(real_x));
+    real_x phij[2];
+    real_x aj;
+    for(int i = 0; i < max_iter; i++){
+        aj = cubic_interp(al, phil[0], phil[1], au, phiu[0], phiu[1]);
+        phi(aj, phij, false);
+        if(phij[0] > phi0[0] + c1*aj*phi0[1] || phij[0] >= phil[0]){
+            au = aj;
+            memcpy(phiu, phij, 2*sizeof(real_x));
         } else {
-            a_prev = a;
-            a *= tau;
+            // note that checking for strong wolf is important
+            if (abs(phij[1]) <= c2*abs(phi0[1])){
+                return aj;
+            }
+            if (phij[1]*(au - al) >= 0){
+                au = al;
+                memcpy(phiu, phil, 2*sizeof(real_x));
+            }
+            al = aj;
+            memcpy(phil, phij, 2*sizeof(real_x));
+        }
+    }
+    printf("Zoom reached max iterations! aj = %f, phi: %f, phi': %f\n", aj, phij[0], phij[1]);
+    return aj;
+}
+
+// Ch3.5, p60 of Nocedal & Wright (Algorithm 3.5)
+real_x LBFGS::linesearch(real_x f0){
+    printf("Linesearch started!\n");
+    real_x max_iter = 10;
+    real_x aim1 = 0; 
+    real_x ai = 1;
+    real_x amax = 10;
+    real_x tau = 1.5;
+
+    real_x phi0[2];
+    phi(0, phi0, true);
+    real_x phiim1[2];
+    memcpy(phiim1, phi0, 2*sizeof(real_x));
+    real_x phii[2];
+    for(int i = 0; i < max_iter; i++){
+        phi(ai, phii, false);
+        if (phii[0] > phi0[0] + c1*ai*phi0[1] || (phii[0] >= phiim1[0] && i > 0)){
+            ai = zoom(aim1, phiim1, ai, phii, phi0);
+            phi(ai, phii, true); 
+            Uf = phii[0];
+            break;
+        }
+        // note that checking for strong wolf is important
+        if (abs(phii[1]) >= c2*abs(phi0[1])){
+            phi(ai, phii, false);
+            Uf = phii[0];
+            break;
         }
+        if (phii[1] >= 0){
+            ai = zoom(aim1, phiim1, ai, phii, phi0);
+            phi(ai, phii, false); 
+            Uf = phii[0];
+            break;
+        }
+        aim1 = ai;
+        memcpy(phiim1, phii, 2*sizeof(real_x));
+        ai = tau*ai;
+        if(ai > amax){
+            ai /= tau;
+            printf("Reached Max Step Size!");
+            phi(ai, phii, false); 
+            break;
+        }
+    }
+
+    // Protect from taking too small or steps that increase potential 
+    if(ai < 1e-5 || phii[0] > phi0[0]){
+        ai = 1e-4; // if g.p < 0 this will probably give ok results
     }
-    // Linesearch failed
-    printf("Linesearch Failed!\n", a); // a0*tau^(max_iter)
-    Uf = fi;
-    return a;
+    phi(ai, phii, true); // update positions & leave
+    Uf = phii[0];
+    //printf("Linesearch reached max iterations! Defaulting to alpha: %f - phi: %f, phi': %f\n", ai, phii[0], phii[1]);
+    return ai;
 }
\ No newline at end of file
diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h
index bc0d757..d8c46c3 100644
--- a/src/update/lbfgs.h
+++ b/src/update/lbfgs.h
@@ -3,36 +3,56 @@
 #include <functional>
 #include "main/defines.h"
 
+// Implementation based on Nocedal & Wright book with references made to OpenMM extern lib implementation
 class LBFGS {
 public:
-    int m; // Number of previous gradients to use for hessian approximation (5-7)
-    int step;
+    int m=5; // Number of previous gradients to use for hessian approximation (5-7)
+    int step_count; // total number of minimize calls
+    int reset_count; // number of times lbfgs memory was cleared
     bool minimized;
-    real U0;
-    real Uf;
+    real_x U0;
+    real_x Uf;
+    // Line search parameters
+    real_x c1 = 1e-4; // sufficient decent
+    real_x c2 = .9; // curvature cond.
+    // TODO: Implement convergence/stability checks
+    real_x eps = 1e-4; // Minimization criteria 
+    real_x delta = 1e-4; // stopping criteria
 
-    LBFGS(int m, int DOF, std::function<real()> user_grad, real* position, real* gradient);
+    LBFGS(int m, int DOF, std::function<real_x()> user_grad, real_x *position, real_x *gradient);
     ~LBFGS();
-    void minimize_step(real f0);
+    void minimize_step(real_x f0);
 
 private:
     int DOF; // Degrees of freedom to minimize
+    real_x step_size = 1; // previous step size
     // All data stored on GPU
-    real *beta_d;
-    real *tmp_d; // 1 real temp storage
-    real *gamma_d; // sk-1 projected onto yk-1
-    real *rho_inv_d; // [m] rho_inv[i] = s[i]^T * y[i]
-    real *alpha_d; // [m] alpha[i] = rho[i] * s[i]^T * y[i]
-    real *search_d; // [DOF] L-BFGS search direction
-    real *prev_positions_d; // [DOF] x[i-1]
-    real *prev_gradient_d; // [DOF] g[i-1]
-    real *X_d; // points to user GPU memory, positions to be optimized
-    real *G_d; // points to user GPU memory
-    real *q_d;
-    real *s_d; // [m, DOF] s[i] = x[i+1] - x[i] (1D array)
-    real *y_d; // [m, DOF] y[i] = grad[i+1] - grad[i] (1D array)
-    std::function<real()> system_grad;
-
-    void update_sk();
-    real linesearch(real f0);
+    int k = 0; // lbfgs iteration step
+    real_x *tmp_d; // 1 real temperary storage
+    real_x *gamma_d; // s[k-1] projected onto y[k-1]
+    real_x *rho_d; // [m] rho[k] = 1/(s[k]^T * y[k])
+    real_x *alpha_d; // [m] alpha[i] = rho[i] * s[i]^T * y[i]
+    real_x *search_d; // [DOF] L-BFGS search direction
+    real_x *prev_positions_d; // [DOF] x[k-1]
+    real_x *prev_gradient_d; // [DOF] g[k-1]
+    real_x *X_d; // points to user GPU memory, positions to be optimized
+    real_x *G_d; // points to user GPU memory
+    real_x *q_d;
+    real_x *s_d; // [m, DOF] s[k-1] = x[k] - x[k-1] (1D array)
+    real_x *y_d; // [m, DOF] y[k-1] = grad[k] - grad[k-1] (1D array)
+
+    real_x *s_tmp_d; // [DOF] x[k] - x[k-1], used to check curvature 
+    real_x *y_tmp_d; // [DOF] g[k] - g[k-1], used to check curvature
+    bool skipped_step = false; // skipped hessian update, if 2 updates are skipped, we clear s & y memory
+
+    std::function<real_x()> system_grad;
+
+    void gamma_norm();
+    void update_sk_yk();
+    void phi(real_x alpha, real_x *result, bool leave_x);
+
+    bool check_convergence();
+    
+    real_x zoom(real_x al, real_x const * const phi_lower, real_x au, real_x const * const phi_upper, real_x const * const phi0);
+    real_x linesearch(real_x f0);
 };
\ No newline at end of file
diff --git a/src/update/minimize.cu b/src/update/minimize.cu
index 19a23b8..9659848 100644
--- a/src/update/minimize.cu
+++ b/src/update/minimize.cu
@@ -85,13 +85,13 @@ __global__ void sdfd_dotproduct_kernel(int N,struct LeapState ls,real_v *minDire
   real_sum_reduce(3*lDot/N,sDot,dot);
 }
 
-__global__ void double_convert_float(int N, real_x* doubleBuffer, real* floatBuffer, bool direction)
+__global__ void double_onto_float(int N, real_x* doubleBuffer, real* floatBuffer, bool direction)
 {
   int i=blockIdx.x*blockDim.x+threadIdx.x;
   if (i<N) {
-    if(direction){ // forward
+    if(direction){ // forward - float = double 
       floatBuffer[i] = (real)doubleBuffer[i];
-    } else { // backward
+    } else { // backward - double = float
       doubleBuffer[i] = (real_x)floatBuffer[i];
     }
   }
@@ -113,26 +113,29 @@ void State::min_init(System *system)
   }
   if (system->run->minType==elbfgs){
     // Function called by L-BFGS class to get energy & gradient 
-    std::function<real()> energy_and_grad = [system](){
-      // Copy X onto double precision buffer
+    std::function<real_x()> energy_and_grad = [system](){
+      // Copy double precision onto float precision buffer
       int DOF = system->state->atomCount*3;
       int shift = system->state->lambdaCount;
-      double_convert_float<<<(system->state->atomCount+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>(
-            DOF, system->state->positionBuffer_d+shift, system->state->positionBuffer_fd+shift, false);
+      double_onto_float<<<(DOF+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>(
+            DOF, system->state->positionBuffer_d+shift, system->state->positionBuffer_fd+shift, true);
       // Potential & Grad Eval -> step=0 to calc energy
-      system->domdec->update_domdec(system,true);
+      //system->domdec->update_domdec(system,true);
       system->potential->calc_force(0, system);
       // grad(F(X)) G already stored
       system->state->recv_energy();
+      // Copy float precision onto double precision buffer
+      double_onto_float<<<(DOF+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>(
+            DOF, system->state->forceBufferX_d+shift, system->state->forceBuffer_d+shift, false);
       system->run->lbfgs_energy_evals++;
-      return (real)system->state->energy[eepotential];
+      return system->state->energy[eepotential];
     };
 
     int shift = system->state->lambdaCount;
     int DOF = system->state->atomCount*3; // only minimize atom positions
-    int m = 7; // Past grad depth
+    int m = 10; // Past grad depth
     system->state->recv_state();
-    system->run->lbfgs = new LBFGS(m, DOF, energy_and_grad, system->state->positionBuffer_fd+shift, system->state->forceBuffer_d+shift);
+    system->run->lbfgs = new LBFGS(m, DOF, energy_and_grad, system->state->positionBuffer_d+shift, system->state->forceBufferX_d+shift);
   }
 }
 

From be90583be0d97ced2d2d3105b060a9dcb39db316 Mon Sep 17 00:00:00 2001
From: Matthew Speranza <speranzm@uci.edu>
Date: Mon, 22 Dec 2025 14:55:44 -0800
Subject: [PATCH 6/8] Added script options for lbfgs and minimize checks on
 grms

---
 src/run/run.cu         |   4 +
 src/run/run.h          |   2 +
 src/update/lbfgs.cu    | 166 +++++++++++++++++------------------------
 src/update/lbfgs.h     |  32 ++++----
 src/update/minimize.cu |   6 +-
 5 files changed, 94 insertions(+), 116 deletions(-)

diff --git a/src/run/run.cu b/src/run/run.cu
index 493fa96..b73bce3 100644
--- a/src/run/run.cu
+++ b/src/run/run.cu
@@ -357,6 +357,10 @@ void Run::set_variable(char *line,char *token,System *system)
     dxAtomMax=io_nextf(line)*ANGSTROM;
   } else if (strcmp(token,"dxrmsinit")==0) {
     dxRMSInit=io_nextf(line)*ANGSTROM;
+  } else if (strcmp(token, "lbfgs_m")==0){
+    lbfgs_m=io_nexti(line);
+  } else if (strcmp(token, "lbfgs_eps")==0){
+    lbfgs_eps=io_nextf(line);
   } else if (strcmp(token,"mintype")==0) {
     std::string minString=io_nexts(line);
     if (strcmp(minString.c_str(), "lbfgs")==0){
diff --git a/src/run/run.h b/src/run/run.h
index e8269e6..184b31e 100644
--- a/src/run/run.h
+++ b/src/run/run.h
@@ -61,6 +61,8 @@ class Run {
   // L-BFGS variables
   LBFGS* lbfgs;
   int lbfgs_energy_evals = 0;
+  int lbfgs_m = 7; // gradient history length*DOF
+  real lbfgs_eps = 1; // gradient rms convergence criteria
 
   real betaEwald;
   real rCut;
diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu
index 858d0aa..46238cc 100644
--- a/src/update/lbfgs.cu
+++ b/src/update/lbfgs.cu
@@ -50,6 +50,15 @@ __global__ void vector_add(int N, real_x u, real_x *A, real_x w, real_x *d, real
     }
 }
 
+// specific for 2nd lbfgs loop
+// C = A + (u-w)*B, C and A and B can be related
+__global__ void vector_add(int N, real_x *A, real_x *u, real_x *w, real_x *B, real_x *C) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        C[i] = A[i] + (u[0]-w[0]) * B[i];
+    }
+}
+
 // C = A[0]*B - element-wise
 __global__ void vector_scale(int N, real_x *A, real_x *B, real_x *C){
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -82,8 +91,8 @@ __global__ void dot_product(int N, real_x *A, real_x *B, real_x* dot) {
 */
 
 //  <position precision, real_x precision>
-LBFGS::LBFGS(int m, int DOF, std::function<real_x()> user_grad, real_x *position, real_x *gradient)
-    : m(m), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) {
+LBFGS::LBFGS(int m, real_x eps, int DOF, std::function<real_x()> user_grad, real_x *position, real_x *gradient)
+    : m(m), eps_tol(eps), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) {
     k = 0;
     cudaMalloc(&tmp_d, sizeof(real_x));
     cudaMalloc(&gamma_d, sizeof(real_x));
@@ -126,14 +135,30 @@ LBFGS::~LBFGS() {
     L-BFGS iteration
 */
 
-bool LBFGS::check_convergence(){
+bool LBFGS::converged(){
     // return true if rmsg minimized
-    real_x grad_norm = 0;
+    rmsg = 0;
+    grad_mag = 0;
+    grad_pos_mag = 0;
+
     cudaMemset(tmp_d, 0, sizeof(real_x));
     dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d);
-    cudaMemcpy(&grad_norm, tmp_d, sizeof(real_x), cudaMemcpyDefault);
-    grad_norm = sqrt(grad_norm);
-    return grad_norm < eps;
+    cudaMemcpy(&rmsg, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+    grad_mag = sqrt(rmsg);
+    rmsg = sqrt(rmsg/DOF);
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, X_d, X_d, tmp_d);
+    cudaMemcpy(&grad_pos_mag, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+    grad_pos_mag = grad_mag / std::max(1.0, sqrt(grad_pos_mag));
+
+    printf("   rmsg = %f\n", rmsg);
+    printf("    |g| = %f\n", grad_mag);
+    printf("|g|/|x| = %f\n", grad_pos_mag);
+    if(rmsg < eps_tol){
+        printf("Structure minimized!!\n");
+        minimized=true;
+    }
+    return rmsg < eps_tol;
 }
 
 // set gamma = 1/|g_d|
@@ -145,17 +170,15 @@ void LBFGS::gamma_norm(){
 }
 
 // Ch7.4, p178 of Nocedal & Wright (Algorithm 7.4)
-void LBFGS::minimize_step(real_x f0) { // f0 & G filled from outer minimize loop
+void LBFGS::minimize_step(real_x f0) { // f0 & G filled from class initializer
     // Copy kth positions & gradients
     cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real_x), cudaMemcpyDefault);
     cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    if(converged()) return;
 
     // min_a f(X + a*q)
     step_size = linesearch(f0); // X & G left at & evaluated at f(X+a*q)
 
-    // Check convergence
-    // TODO: implement convergence checks
-
     // kth position and gradient deltas
     update_sk_yk(); // potentially skip decrement k or set k=0 & clear s & y memory
     k++;
@@ -164,10 +187,11 @@ void LBFGS::minimize_step(real_x f0) { // f0 & G filled from outer minimize loop
     cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); 
     for (int i = k-1; i >= std::max(0, k-m); i--) {
         int index = i % m;
-        // q = q - (rho.i*(s.i dot q))*y.i
+        // alpha.i = rho.i*(s.i dot q)
         cudaMemset(alpha_d+index, 0, sizeof(real_x));
         dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_d+index*DOF, q_d, alpha_d+index);
         vector_scale<<<1,1>>>(1, rho_d+index, alpha_d+index, alpha_d+index);
+        // q = q - alpha.i*y.i
         vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, y_d+index*DOF, q_d);
     }
     // q = gamma.k*q = r
@@ -175,26 +199,24 @@ void LBFGS::minimize_step(real_x f0) { // f0 & G filled from outer minimize loop
     for (int i = std::max(0, k-m); i <= k-1; i++) {
         int index = i % m;
         // B = rho.i*(y.i dot q)
-        // q = q + (alpha.i - B)*s.i = q - B*s.i + alpha.i*s.i
         cudaMemset(tmp_d, 0, sizeof(real_x));
         dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_d+index*DOF, q_d, tmp_d);
         vector_scale<<<1,1>>>(1, rho_d+index, tmp_d, tmp_d);
-        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, tmp_d, s_d+index*DOF, q_d);
-        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, 1, alpha_d+index, s_d+index*DOF, q_d); 
-    }
+        // q = q + (alpha.i - B)*s.i = q - B*s.i + alpha.i*s.i
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, q_d, alpha_d+index, tmp_d, s_d+index*DOF, q_d);
+   }
 
     // q = -q
     vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d);
-
     step_count++;
 }
 
+// this method is overly cautious & slow, likely doesn't need the cpu grad checks or resets
 void LBFGS::update_sk_yk() {
     if(m == 0) { // Steepest decent, normalize step size via gamma
         gamma_norm();
         return;
     } 
-    int index = k % m;
 
     // s & y tmp
     vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, prev_positions_d, s_tmp_d);
@@ -204,14 +226,13 @@ void LBFGS::update_sk_yk() {
     cudaMemset(tmp_d, 0, sizeof(real_x));
     dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_tmp_d, y_tmp_d, tmp_d);
     cudaMemcpy(&yy, tmp_d, sizeof(real_x), cudaMemcpyDefault);
-
     real_x sy = 0;
     cudaMemset(tmp_d, 0, sizeof(real_x));
     dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_tmp_d, y_tmp_d, tmp_d);
     cudaMemcpy(&sy, tmp_d, sizeof(real_x), cudaMemcpyDefault);
 
     // Check curvature s.T H s = s.T y > 0 for positive def matrix satisfying secant eq Hs=y (required for L-BFGS)
-    if(sy < 1e-6){ 
+    if(sy < 1e-10){ 
         printf("Curvature condition (sy = %e) not satistied! Clearing L-BFGS memory!\n", sy);
         cudaMemset(s_d, 0, m*DOF*sizeof(real_x));
         cudaMemset(y_d, 0, m*DOF*sizeof(real_x));
@@ -226,10 +247,11 @@ void LBFGS::update_sk_yk() {
     // gamma_k = s.y/(y.y)
     real_x gamma = sy/yy;
     cudaMemcpy(gamma_d, &gamma, sizeof(real_x), cudaMemcpyDefault);
+    int index = k % m;
     cudaMemcpy(rho_d+index, &rho, sizeof(real_x), cudaMemcpyDefault);
     cudaMemcpy(s_d + index*DOF, s_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault);
     cudaMemcpy(y_d + index*DOF, y_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault);
-    printf("iter: %d, k: %d, index: %d, rho: %f, gamma: %f\n", step_count, k, index, rho, gamma);
+    //printf("iter: %d, k: %d, index: %d, rho: %f, gamma: %f\n", step_count, k, index, rho, gamma);
 }
 
 /*
@@ -237,18 +259,13 @@ void LBFGS::update_sk_yk() {
 */
 
 // return [f(X + alpha*p), df(X+alpha*p)/da]
-void LBFGS::phi(real_x alpha, real_x* result, bool leave_x){
+void LBFGS::phi(real_x alpha, real_x* result){
     cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault);
     vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, alpha, NULL, q_d, X_d);
     result[0] = system_grad();
     cudaMemset(tmp_d, 0, sizeof(real_x));
     dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da
     cudaMemcpy(&result[1], tmp_d, sizeof(real_x), cudaMemcpyDefault);
-    printf("alpha: %f, phi: %f, phi': %f\n", alpha, result[0], result[1]);
-    if(!leave_x) {
-        cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault);
-        cudaMemcpy(G_d, prev_gradient_d, DOF*sizeof(real_x), cudaMemcpyDefault);
-    }
 }
 
 real_x sign(real_x value){
@@ -260,100 +277,55 @@ real_x sign(real_x value){
 
 // Ch3.5, p59 of Nocedal & Wright (Equation 3.59) 
 real_x cubic_interp(real_x a, real_x fa, real_x ga, real_x b, real_x fb, real_x gb){
-    real_x d1 = ga + gb - 3*(fb - fa)/(a-b);
+    real_x d1 = ga + gb - 3*(fa - fb)/(a-b);
     real_x d2 = sign(b - a)*sqrt(d1*d1 - ga*gb);
     real_x arg = b - (b-a)*(gb + d2 - d1)/(gb - ga + 2*d2);
     if(isinf(arg) || isnan(arg) || abs(arg - a) < 1e-7 || abs(arg - b) < 1e-7 || arg < 0){
-        printf("Quadratic interpolation failed! arg: %f, a: %f, b: %f\n", arg, a, b);
+        //printf("Quadratic interpolation failed! arg: %f, a: %f, b: %f\n", arg, a, b);
         return (a + b) / 2.0;
     }
     return arg;
 }
 
-// Ch3.5, p61 of Nocedal & Wright (Algorithm 3.6)
-real_x LBFGS::zoom(real_x al, real_x const * const phi_lower, real_x au, real_x const * const phi_upper, real_x const * const phi0){
-    printf("Zoom started!\n");
-    int max_iter = 5;
-    real_x phil[2];
-    memcpy(phil, phi_lower, 2*sizeof(real_x));
-    real_x phiu[2];
-    memcpy(phiu, phi_upper, 2*sizeof(real_x));
-    real_x phij[2];
-    real_x aj;
-    for(int i = 0; i < max_iter; i++){
-        aj = cubic_interp(al, phil[0], phil[1], au, phiu[0], phiu[1]);
-        phi(aj, phij, false);
-        if(phij[0] > phi0[0] + c1*aj*phi0[1] || phij[0] >= phil[0]){
-            au = aj;
-            memcpy(phiu, phij, 2*sizeof(real_x));
-        } else {
-            // note that checking for strong wolf is important
-            if (abs(phij[1]) <= c2*abs(phi0[1])){
-                return aj;
-            }
-            if (phij[1]*(au - al) >= 0){
-                au = al;
-                memcpy(phiu, phil, 2*sizeof(real_x));
-            }
-            al = aj;
-            memcpy(phil, phij, 2*sizeof(real_x));
-        }
-    }
-    printf("Zoom reached max iterations! aj = %f, phi: %f, phi': %f\n", aj, phij[0], phij[1]);
-    return aj;
-}
-
 // Ch3.5, p60 of Nocedal & Wright (Algorithm 3.5)
 real_x LBFGS::linesearch(real_x f0){
-    printf("Linesearch started!\n");
-    real_x max_iter = 10;
+    real_x max_iter = 5;
     real_x aim1 = 0; 
     real_x ai = 1;
     real_x amax = 10;
-    real_x tau = 1.5;
-
-    real_x phi0[2];
-    phi(0, phi0, true);
+    // Data is already loaded, saves 1 energy & grad call
+    real_x phi0[2] = {f0, 0};
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da
+    cudaMemcpy(&phi0[1], tmp_d, sizeof(real_x), cudaMemcpyDefault);
     real_x phiim1[2];
     memcpy(phiim1, phi0, 2*sizeof(real_x));
+    // Evaluate at 1 & iterate using cubic interpolation
     real_x phii[2];
+    phi(ai, phii);
+    printf("alpha: %f, phi: %f, phi': %f\n", aim1, phi0[0], phi0[1]);
+    printf("alpha: %f, phi: %f, phi': %f\n", ai, phii[0], phii[1]);
     for(int i = 0; i < max_iter; i++){
-        phi(ai, phii, false);
-        if (phii[0] > phi0[0] + c1*ai*phi0[1] || (phii[0] >= phiim1[0] && i > 0)){
-            ai = zoom(aim1, phiim1, ai, phii, phi0);
-            phi(ai, phii, true); 
+        // Check strong wolfe condition
+        if (phii[0] <= phi0[0] + c1*ai*phi0[1] && abs(phii[1]) <= c2*abs(phi0[1])){
             Uf = phii[0];
-            break;
-        }
-        // note that checking for strong wolf is important
-        if (abs(phii[1]) >= c2*abs(phi0[1])){
-            phi(ai, phii, false);
-            Uf = phii[0];
-            break;
+            return ai;
         }
-        if (phii[1] >= 0){
-            ai = zoom(aim1, phiim1, ai, phii, phi0);
-            phi(ai, phii, false); 
-            Uf = phii[0];
-            break;
-        }
-        aim1 = ai;
+        real_x tmp = ai;
+        ai = cubic_interp(aim1, phiim1[0], phiim1[1], ai, phii[0], phii[1]);
+        phi(ai, phii);
+        printf("alpha: %f, phi: %f, phi': %f\n", ai, phii[0], phii[1]);
+        aim1 = tmp;
         memcpy(phiim1, phii, 2*sizeof(real_x));
-        ai = tau*ai;
-        if(ai > amax){
-            ai /= tau;
-            printf("Reached Max Step Size!");
-            phi(ai, phii, false); 
+        if(ai < 0 || ai > amax){
             break;
         }
     }
 
-    // Protect from taking too small or steps that increase potential 
-    if(ai < 1e-5 || phii[0] > phi0[0]){
-        ai = 1e-4; // if g.p < 0 this will probably give ok results
-    }
-    phi(ai, phii, true); // update positions & leave
+    // Failed line search
+    ai = 1e-5; // default step size, a small enough step should decrease function if s.y > 0 
+    phi(ai, phii);
     Uf = phii[0];
-    //printf("Linesearch reached max iterations! Defaulting to alpha: %f - phi: %f, phi': %f\n", ai, phii[0], phii[1]);
+    printf("Linesearch failed! Defaulting to ai = %e, phi: %f, phi': %f\n", ai, phii[0], phii[1]);
     return ai;
 }
\ No newline at end of file
diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h
index d8c46c3..43c05e5 100644
--- a/src/update/lbfgs.h
+++ b/src/update/lbfgs.h
@@ -6,24 +6,26 @@
 // Implementation based on Nocedal & Wright book with references made to OpenMM extern lib implementation
 class LBFGS {
 public:
+    int step_count=0; // total number of minimize calls
+    int reset_count=0; // number of times lbfgs memory was cleared
     int m=5; // Number of previous gradients to use for hessian approximation (5-7)
-    int step_count; // total number of minimize calls
-    int reset_count; // number of times lbfgs memory was cleared
-    bool minimized;
-    real_x U0;
-    real_x Uf;
-    // Line search parameters
-    real_x c1 = 1e-4; // sufficient decent
-    real_x c2 = .9; // curvature cond.
-    // TODO: Implement convergence/stability checks
-    real_x eps = 1e-4; // Minimization criteria 
-    real_x delta = 1e-4; // stopping criteria
+    bool minimized=false;
+    real_x U0, Uf;
 
-    LBFGS(int m, int DOF, std::function<real_x()> user_grad, real_x *position, real_x *gradient);
+    LBFGS(int m, real_x eps, int DOF, std::function<real_x()> user_grad, real_x *position, real_x *gradient);
     ~LBFGS();
     void minimize_step(real_x f0);
 
 private:
+    // Line search parameters
+    real_x c1 = 1e-4; // sufficient decent
+    real_x c2 = .9; // curvature cond.
+    // Convergence
+    real_x eps_tol = 1; // rms criteria (1e-1 is very tight and unlikely to work)
+    real_x rmsg = 0; // |g| / sqrt(DOF)
+    real_x grad_mag = 0; // |g| 
+    real_x grad_pos_mag = 0; // |g| / max(1, |x|)
+
     int DOF; // Degrees of freedom to minimize
     real_x step_size = 1; // previous step size
     // All data stored on GPU
@@ -47,12 +49,10 @@ class LBFGS {
 
     std::function<real_x()> system_grad;
 
+    bool converged();
     void gamma_norm();
     void update_sk_yk();
-    void phi(real_x alpha, real_x *result, bool leave_x);
 
-    bool check_convergence();
-    
-    real_x zoom(real_x al, real_x const * const phi_lower, real_x au, real_x const * const phi_upper, real_x const * const phi0);
+    void phi(real_x alpha, real_x *result);
     real_x linesearch(real_x f0);
 };
\ No newline at end of file
diff --git a/src/update/minimize.cu b/src/update/minimize.cu
index 9659848..c0dd28f 100644
--- a/src/update/minimize.cu
+++ b/src/update/minimize.cu
@@ -133,9 +133,8 @@ void State::min_init(System *system)
 
     int shift = system->state->lambdaCount;
     int DOF = system->state->atomCount*3; // only minimize atom positions
-    int m = 10; // Past grad depth
-    system->state->recv_state();
-    system->run->lbfgs = new LBFGS(m, DOF, energy_and_grad, system->state->positionBuffer_d+shift, system->state->forceBufferX_d+shift);
+    system->run->lbfgs = new LBFGS(system->run->lbfgs_m, system->run->lbfgs_eps, DOF, 
+      energy_and_grad, system->state->positionBuffer_d+shift, system->state->forceBufferX_d+shift);
   }
 }
 
@@ -170,6 +169,7 @@ void State::min_move(int step, int nsteps, System *system)
       if (system->verbose>0) display_nrg(system);
       currEnergy=energy[eepotential];
       r->lbfgs->minimize_step(currEnergy);
+      if(r->lbfgs->minimized){ system->run->step = nsteps; } // do we want this?
     }
   }
   else if (r->minType==esd) {

From 5b2582fe87dabbb1ef2c937140972ec036e0556a Mon Sep 17 00:00:00 2001
From: Matthew Speranza <speranzm@uci.edu>
Date: Mon, 22 Dec 2025 15:03:37 -0800
Subject: [PATCH 7/8] Remove energy=true from get_force call, free new state
 double force buffer

---
 src/system/potential.cxx | 1 -
 src/system/state.cxx     | 1 +
 src/update/lbfgs.cu      | 4 ++--
 src/update/lbfgs.h       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/system/potential.cxx b/src/system/potential.cxx
index 93d75c4..4fdd10b 100644
--- a/src/system/potential.cxx
+++ b/src/system/potential.cxx
@@ -1617,7 +1617,6 @@ void Potential::calc_force(int step,System *system)
   if (system->run->freqNPT>0) {
     calcEnergy=(calcEnergy||(step%system->run->freqNPT==0));
   }
-  calcEnergy=true;
 #ifdef REPLICAEXCHANGE
   if (system->run->freqREx>0) {
     calcEnergy=(calcEnergy||(step%system->run->freqREx==0));
diff --git a/src/system/state.cxx b/src/system/state.cxx
index de03543..d45d914 100644
--- a/src/system/state.cxx
+++ b/src/system/state.cxx
@@ -157,6 +157,7 @@ State::~State() {
   if (positionBackup_d) cudaFree(positionBackup_d);
   if (forceBuffer) free(forceBuffer);
   if (forceBuffer_d) cudaFree(forceBuffer_d);
+  if (forceBufferX_d) cudaFree(forceBufferX_d);
   if (forceBackup_d) cudaFree(forceBackup_d);
   // Other buffers
   if (energy) free(energy);
diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu
index 46238cc..73d0440 100644
--- a/src/update/lbfgs.cu
+++ b/src/update/lbfgs.cu
@@ -119,12 +119,12 @@ LBFGS::LBFGS(int m, real_x eps, int DOF, std::function<real_x()> user_grad, real
 
 LBFGS::~LBFGS() {
     if (tmp_d) cudaFree(tmp_d);
+    if (gamma_d) cudaFree(gamma_d);
     if (rho_d) cudaFree(rho_d);
     if (alpha_d) cudaFree(alpha_d);
-    if (gamma_d) cudaFree(gamma_d);
+    if (q_d) cudaFree(q_d);
     if (prev_positions_d) cudaFree(prev_positions_d);
     if (prev_gradient_d) cudaFree(prev_gradient_d);
-    if (q_d) cudaFree(q_d);
     if (s_d) cudaFree(s_d);
     if (y_d) cudaFree(y_d);
     if (s_tmp_d) cudaFree(s_tmp_d);
diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h
index 43c05e5..8fac8eb 100644
--- a/src/update/lbfgs.h
+++ b/src/update/lbfgs.h
@@ -8,7 +8,7 @@ class LBFGS {
 public:
     int step_count=0; // total number of minimize calls
     int reset_count=0; // number of times lbfgs memory was cleared
-    int m=5; // Number of previous gradients to use for hessian approximation (5-7)
+    int m=7; // Number of previous gradients to use for hessian approximation (5-7)
     bool minimized=false;
     real_x U0, Uf;
 

From bbbc3ef042c280593b9bfbd88021b2daab38ff7f Mon Sep 17 00:00:00 2001
From: Matthew Speranza <speranzm@uci.edu>
Date: Thu, 5 Feb 2026 11:59:28 -0800
Subject: [PATCH 8/8] small updates

---
 src/run/run.cu         | 1 +
 src/update/minimize.cu | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/run/run.cu b/src/run/run.cu
index b73bce3..e12b812 100644
--- a/src/run/run.cu
+++ b/src/run/run.cu
@@ -412,6 +412,7 @@ void Run::energy(char *line,char *token,System *system)
   system->potential->calc_force(0,system);
   system->state->recv_energy();
   print_nrg(0,system);
+  display_nrg(system);
   dynamics_finalize(system);
 }
 
diff --git a/src/update/minimize.cu b/src/update/minimize.cu
index c0dd28f..6ed4798 100644
--- a/src/update/minimize.cu
+++ b/src/update/minimize.cu
@@ -8,6 +8,7 @@
 #include "system/potential.h"
 #include "domdec/domdec.h"
 #include "holonomic/holonomic.h"
+#include "holonomic/rectify.h"
 #include "io/io.h"
 
 #include "main/real3.h"