diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 403e8e1..cdfb899 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,7 @@
 cmake_minimum_required(VERSION 3.0)
 project(blade LANGUAGES CXX CUDA)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
 
 file(GLOB_RECURSE sources *.cxx *.cu *.h)
 
diff --git a/src/run/run.cu b/src/run/run.cu
index 7588f98..e12b812 100644
--- a/src/run/run.cu
+++ b/src/run/run.cu
@@ -357,14 +357,20 @@ void Run::set_variable(char *line,char *token,System *system)
     dxAtomMax=io_nextf(line)*ANGSTROM;
   } else if (strcmp(token,"dxrmsinit")==0) {
     dxRMSInit=io_nextf(line)*ANGSTROM;
+  } else if (strcmp(token, "lbfgs_m")==0){
+    lbfgs_m=io_nexti(line);
+  } else if (strcmp(token, "lbfgs_eps")==0){
+    lbfgs_eps=io_nextf(line);
   } else if (strcmp(token,"mintype")==0) {
     std::string minString=io_nexts(line);
-    if (strcmp(minString.c_str(),"sd")==0) {
+    if (strcmp(minString.c_str(), "lbfgs")==0){
+      minType=elbfgs;
+    } else if (strcmp(minString.c_str(),"sd")==0) {
       minType=esd;
     } else if (strcmp(minString.c_str(),"sdfd")==0) {
       minType=esdfd;
     } else {
-      fatal(__FILE__,__LINE__,"Unrecognized token %s for minimization type minType. Options are: sd or sdfd\n",minString.c_str());
+      fatal(__FILE__,__LINE__,"Unrecognized token %s for minimization type minType. Options are: lbfgs, sd, or sdfd\n",minString.c_str());
     }
   } else if (strcmp(token,"domdecheuristic")==0) {
     domdecHeuristic=io_nextb(line);
@@ -406,6 +412,7 @@ void Run::energy(char *line,char *token,System *system)
   system->potential->calc_force(0,system);
   system->state->recv_energy();
   print_nrg(0,system);
+  display_nrg(system);
   dynamics_finalize(system);
 }
 
@@ -501,6 +508,13 @@ void Run::minimize(char *line,char *token,System *system)
     gpuCheck(cudaPeekAtLastError());
   }
 
+  if(system->run->minType==elbfgs){
+    Run* r = system->run;
+    printf("L-BFGS (m=%d) did %d force evaluations in %d steps. Reset memory %d times.\n", 
+      r->lbfgs->m, r->lbfgs_energy_evals, r->lbfgs->step_count, r->lbfgs->reset_count);
+    printf("U0: %f, Uf: %f, Uf - U0: %f\n", r->lbfgs->U0, r->lbfgs->Uf, r->lbfgs->Uf - r->lbfgs->U0);
+  }
+
   system->state->min_dest(system);
   dynamics_finalize(system);
 }
diff --git a/src/run/run.h b/src/run/run.h
index f7de7d1..184b31e 100644
--- a/src/run/run.h
+++ b/src/run/run.h
@@ -8,6 +8,7 @@
 
 #include "xdr/xdrfile.h"
 #include "xdr/xdrfile_xtc.h"
+#include "update/lbfgs.h"
 
 // Forward delcaration
 class System;
@@ -19,6 +20,7 @@ struct Cutoffs {
 };
 
 typedef enum emin {
+  elbfgs, // Limited memory BFGS algo
   esd, // steepest descent
   esdfd, // steepest descent with finite difference to choose step length
   eminend} EMin;
@@ -56,6 +58,12 @@ class Run {
   real dxRMS;
   EMin minType; // minimization scheme
 
+  // L-BFGS variables
+  LBFGS* lbfgs;
+  int lbfgs_energy_evals = 0;
+  int lbfgs_m = 7; // gradient history length*DOF
+  real lbfgs_eps = 1; // gradient rms convergence criteria
+
   real betaEwald;
   real rCut;
   real rSwitch;
diff --git a/src/system/state.cxx b/src/system/state.cxx
index 7635c0f..d45d914 100644
--- a/src/system/state.cxx
+++ b/src/system/state.cxx
@@ -42,6 +42,7 @@ State::State(System *system) {
   cudaMalloc(&(positionBackup_d),(2*nL+3*n)*sizeof(real_x));
   forceBuffer=(real_f*)calloc(rootFactor*(2*nL+3*n),sizeof(real_f));
   cudaMalloc(&(forceBuffer_d),rootFactor*(2*nL+3*n)*sizeof(real_f));
+  cudaMalloc(&(forceBufferX_d),rootFactor*(2*nL+3*n)*sizeof(real_x));
   cudaMalloc(&(forceBackup_d),(2*nL+3*n)*sizeof(real_f));
 
   if (system->idCount>0) { // OMP
@@ -156,6 +157,7 @@ State::~State() {
   if (positionBackup_d) cudaFree(positionBackup_d);
   if (forceBuffer) free(forceBuffer);
   if (forceBuffer_d) cudaFree(forceBuffer_d);
+  if (forceBufferX_d) cudaFree(forceBufferX_d);
   if (forceBackup_d) cudaFree(forceBackup_d);
   // Other buffers
   if (energy) free(energy);
diff --git a/src/system/state.h b/src/system/state.h
index 1270515..66ebf83 100644
--- a/src/system/state.h
+++ b/src/system/state.h
@@ -67,6 +67,7 @@ class State {
   real *positionBuffer_omp;
   real_f *forceBuffer;
   real_f *forceBuffer_d;
+  real_x *forceBufferX_d; 
   real_f *forceBackup_d; // For NPT
   real_f *forceBuffer_omp;
 
diff --git a/src/update/lbfgs.cu b/src/update/lbfgs.cu
new file mode 100644
index 0000000..73d0440
--- /dev/null
+++ b/src/update/lbfgs.cu
@@ -0,0 +1,331 @@
+#include <iostream>
+#include <cstdlib>
+#include <cassert>
+#include <cmath>
+#include <type_traits>
+#include <cstdio>
+#include <algorithm>
+
+#include <cuda_runtime.h>
+#include "update/lbfgs.h"
+#include "main/real3.h" // for real_sum_reduce
+
+/*
+    Cuda Kernels
+*/
+
+// d = 1/sqrt(d) - element-wise
+__global__ void inv_sqrt(int N, real_x *d) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        d[i] = 1.0/sqrt(d[i]);
+    }
+}
+
+// d = 1/d - element-wise
+__global__ void recip(int N, real_x *d) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        d[i] = 1.0/d[i];
+    }
+}
+
+// d = abs(d) - element-wise
+__global__ void vector_abs(int N, real_x *d){
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        d[i] = abs(d[i]);
+    }
+}
+
+// C = u*A + w*d*B, C and A and B can be related
+__global__ void vector_add(int N, real_x u, real_x *A, real_x w, real_x *d, real_x *B, real_x *C) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        if(d){
+            C[i] = u * A[i] + w * d[0] * B[i];
+        } else {
+            C[i] = u * A[i] + w * B[i];
+        }
+    }
+}
+
+// specific for 2nd lbfgs loop
+// C = A + (u-w)*B, C and A and B can be related
+__global__ void vector_add(int N, real_x *A, real_x *u, real_x *w, real_x *B, real_x *C) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        C[i] = A[i] + (u[0]-w[0]) * B[i];
+    }
+}
+
+// C = A[0]*B - element-wise
+__global__ void vector_scale(int N, real_x *A, real_x *B, real_x *C){
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        C[i] = A[0]*B[i];
+    }
+}
+
+// B = c*A - element-wise
+__global__ void vector_scale(int N, real_x c, real_x *A, real_x *B){
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) {
+        B[i] = c*A[i];
+    }
+}
+
+__global__ void dot_product(int N, real_x *A, real_x *B, real_x* dot) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real_x lDot = 0;
+    extern __shared__ real sDot[];
+    if (i < N) {
+        lDot = A[i]*B[i];
+    }
+
+    real_sum_reduce((real)lDot, sDot, dot);
+}
+
+/*
+    Class setup
+*/
+
+//  <position precision, real_x precision>
+LBFGS::LBFGS(int m, real_x eps, int DOF, std::function<real_x()> user_grad, real_x *position, real_x *gradient)
+    : m(m), eps_tol(eps), DOF(DOF), system_grad(user_grad), X_d(position), G_d(gradient) {
+    k = 0;
+    cudaMalloc(&tmp_d, sizeof(real_x));
+    cudaMalloc(&gamma_d, sizeof(real_x));
+    cudaMalloc(&rho_d, m*sizeof(real_x));
+    cudaMalloc(&alpha_d, m*sizeof(real_x));
+    cudaMalloc(&q_d, DOF*sizeof(real_x));
+    cudaMalloc(&prev_positions_d, DOF*sizeof(real_x));
+    cudaMalloc(&prev_gradient_d, DOF*sizeof(real_x));
+    cudaMalloc(&s_d, m*DOF*sizeof(real_x));
+    cudaMalloc(&y_d, m*DOF*sizeof(real_x));
+    cudaMalloc(&s_tmp_d, DOF*sizeof(real_x));
+    cudaMalloc(&y_tmp_d, DOF*sizeof(real_x));
+
+    U0 = user_grad(); // first energy call by lbfgs
+
+    // Normalize first s.d. step
+    gamma_norm();
+
+    // Set up s.d. step
+    cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, gamma_d, q_d, q_d);
+    vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d);
+}
+
+LBFGS::~LBFGS() {
+    if (tmp_d) cudaFree(tmp_d);
+    if (gamma_d) cudaFree(gamma_d);
+    if (rho_d) cudaFree(rho_d);
+    if (alpha_d) cudaFree(alpha_d);
+    if (q_d) cudaFree(q_d);
+    if (prev_positions_d) cudaFree(prev_positions_d);
+    if (prev_gradient_d) cudaFree(prev_gradient_d);
+    if (s_d) cudaFree(s_d);
+    if (y_d) cudaFree(y_d);
+    if (s_tmp_d) cudaFree(s_tmp_d);
+    if (y_tmp_d) cudaFree(y_tmp_d);
+}
+
+/*
+    L-BFGS iteration
+*/
+
+bool LBFGS::converged(){
+    // return true if rmsg minimized
+    rmsg = 0;
+    grad_mag = 0;
+    grad_pos_mag = 0;
+
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d);
+    cudaMemcpy(&rmsg, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+    grad_mag = sqrt(rmsg);
+    rmsg = sqrt(rmsg/DOF);
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, X_d, X_d, tmp_d);
+    cudaMemcpy(&grad_pos_mag, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+    grad_pos_mag = grad_mag / std::max(1.0, sqrt(grad_pos_mag));
+
+    printf("   rmsg = %f\n", rmsg);
+    printf("    |g| = %f\n", grad_mag);
+    printf("|g|/|x| = %f\n", grad_pos_mag);
+    if(rmsg < eps_tol){
+        printf("Structure minimized!!\n");
+        minimized=true;
+    }
+    return rmsg < eps_tol;
+}
+
+// set gamma = 1/|g_d|
+void LBFGS::gamma_norm(){
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, G_d, tmp_d);
+    inv_sqrt<<<1,1>>>(1, tmp_d);
+    cudaMemcpy(gamma_d, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+}
+
+// Ch7.4, p178 of Nocedal & Wright (Algorithm 7.4)
+void LBFGS::minimize_step(real_x f0) { // f0 & G filled from class initializer
+    // Copy kth positions & gradients
+    cudaMemcpy(prev_positions_d, X_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    cudaMemcpy(prev_gradient_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    if(converged()) return;
+
+    // min_a f(X + a*q)
+    step_size = linesearch(f0); // X & G left at & evaluated at f(X+a*q)
+
+    // kth position and gradient deltas
+    update_sk_yk(); // potentially skip decrement k or set k=0 & clear s & y memory
+    k++;
+
+    // Two-loop recursion
+    cudaMemcpy(q_d, G_d, DOF*sizeof(real_x), cudaMemcpyDefault); 
+    for (int i = k-1; i >= std::max(0, k-m); i--) {
+        int index = i % m;
+        // alpha.i = rho.i*(s.i dot q)
+        cudaMemset(alpha_d+index, 0, sizeof(real_x));
+        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_d+index*DOF, q_d, alpha_d+index);
+        vector_scale<<<1,1>>>(1, rho_d+index, alpha_d+index, alpha_d+index);
+        // q = q - alpha.i*y.i
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, q_d, -1, alpha_d+index, y_d+index*DOF, q_d);
+    }
+    // q = gamma.k*q = r
+    vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, gamma_d, q_d, q_d);
+    for (int i = std::max(0, k-m); i <= k-1; i++) {
+        int index = i % m;
+        // B = rho.i*(y.i dot q)
+        cudaMemset(tmp_d, 0, sizeof(real_x));
+        dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_d+index*DOF, q_d, tmp_d);
+        vector_scale<<<1,1>>>(1, rho_d+index, tmp_d, tmp_d);
+        // q = q + (alpha.i - B)*s.i = q - B*s.i + alpha.i*s.i
+        vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, q_d, alpha_d+index, tmp_d, s_d+index*DOF, q_d);
+   }
+
+    // q = -q
+    vector_scale<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, -1, q_d, q_d);
+    step_count++;
+}
+
+// this method is overly cautious & slow, likely doesn't need the cpu grad checks or resets
+void LBFGS::update_sk_yk() {
+    if(m == 0) { // Steepest decent, normalize step size via gamma
+        gamma_norm();
+        return;
+    } 
+
+    // s & y tmp
+    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, -1, NULL, prev_positions_d, s_tmp_d);
+    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, G_d, -1, NULL, prev_gradient_d, y_tmp_d);
+
+    real_x yy = 0;
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, y_tmp_d, y_tmp_d, tmp_d);
+    cudaMemcpy(&yy, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+    real_x sy = 0;
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, s_tmp_d, y_tmp_d, tmp_d);
+    cudaMemcpy(&sy, tmp_d, sizeof(real_x), cudaMemcpyDefault);
+
+    // Check curvature s.T H s = s.T y > 0 for positive def matrix satisfying secant eq Hs=y (required for L-BFGS)
+    if(sy < 1e-10){ 
+        printf("Curvature condition (sy = %e) not satistied! Clearing L-BFGS memory!\n", sy);
+        cudaMemset(s_d, 0, m*DOF*sizeof(real_x));
+        cudaMemset(y_d, 0, m*DOF*sizeof(real_x));
+        gamma_norm();
+        k = -1;
+        reset_count++;
+        return;
+    } 
+
+    // rho = 1/y.s
+    real_x rho = 1.0/sy;
+    // gamma_k = s.y/(y.y)
+    real_x gamma = sy/yy;
+    cudaMemcpy(gamma_d, &gamma, sizeof(real_x), cudaMemcpyDefault);
+    int index = k % m;
+    cudaMemcpy(rho_d+index, &rho, sizeof(real_x), cudaMemcpyDefault);
+    cudaMemcpy(s_d + index*DOF, s_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    cudaMemcpy(y_d + index*DOF, y_tmp_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    //printf("iter: %d, k: %d, index: %d, rho: %f, gamma: %f\n", step_count, k, index, rho, gamma);
+}
+
+/*
+    Line Search
+*/
+
+// return [f(X + alpha*p), df(X+alpha*p)/da]
+void LBFGS::phi(real_x alpha, real_x* result){
+    cudaMemcpy(X_d, prev_positions_d, DOF*sizeof(real_x), cudaMemcpyDefault);
+    vector_add<<<(DOF+BLUP-1)/BLUP,BLUP>>>(DOF, 1, X_d, alpha, NULL, q_d, X_d);
+    result[0] = system_grad();
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da
+    cudaMemcpy(&result[1], tmp_d, sizeof(real_x), cudaMemcpyDefault);
+}
+
+real_x sign(real_x value){
+    if(value < 0){
+        return -1;
+    }
+    return 1;
+}
+
+// Ch3.5, p59 of Nocedal & Wright (Equation 3.59) 
+real_x cubic_interp(real_x a, real_x fa, real_x ga, real_x b, real_x fb, real_x gb){
+    real_x d1 = ga + gb - 3*(fa - fb)/(a-b);
+    real_x d2 = sign(b - a)*sqrt(d1*d1 - ga*gb);
+    real_x arg = b - (b-a)*(gb + d2 - d1)/(gb - ga + 2*d2);
+    if(isinf(arg) || isnan(arg) || abs(arg - a) < 1e-7 || abs(arg - b) < 1e-7 || arg < 0){
+        //printf("Quadratic interpolation failed! arg: %f, a: %f, b: %f\n", arg, a, b);
+        return (a + b) / 2.0;
+    }
+    return arg;
+}
+
+// Ch3.5, p60 of Nocedal & Wright (Algorithm 3.5)
+real_x LBFGS::linesearch(real_x f0){
+    real_x max_iter = 5;
+    real_x aim1 = 0; 
+    real_x ai = 1;
+    real_x amax = 10;
+    // Data is already loaded, saves 1 energy & grad call
+    real_x phi0[2] = {f0, 0};
+    cudaMemset(tmp_d, 0, sizeof(real_x));
+    dot_product<<<(DOF+BLUP-1)/BLUP,BLUP,BLUP*sizeof(real_x)/32, 0>>>(DOF, G_d, q_d, tmp_d); // df(0)/da
+    cudaMemcpy(&phi0[1], tmp_d, sizeof(real_x), cudaMemcpyDefault);
+    real_x phiim1[2];
+    memcpy(phiim1, phi0, 2*sizeof(real_x));
+    // Evaluate at 1 & iterate using cubic interpolation
+    real_x phii[2];
+    phi(ai, phii);
+    printf("alpha: %f, phi: %f, phi': %f\n", aim1, phi0[0], phi0[1]);
+    printf("alpha: %f, phi: %f, phi': %f\n", ai, phii[0], phii[1]);
+    for(int i = 0; i < max_iter; i++){
+        // Check strong wolfe condition
+        if (phii[0] <= phi0[0] + c1*ai*phi0[1] && abs(phii[1]) <= c2*abs(phi0[1])){
+            Uf = phii[0];
+            return ai;
+        }
+        real_x tmp = ai;
+        ai = cubic_interp(aim1, phiim1[0], phiim1[1], ai, phii[0], phii[1]);
+        phi(ai, phii);
+        printf("alpha: %f, phi: %f, phi': %f\n", ai, phii[0], phii[1]);
+        aim1 = tmp;
+        memcpy(phiim1, phii, 2*sizeof(real_x));
+        if(ai < 0 || ai > amax){
+            break;
+        }
+    }
+
+    // Failed line search
+    ai = 1e-5; // default step size, a small enough step should decrease function if s.y > 0 
+    phi(ai, phii);
+    Uf = phii[0];
+    printf("Linesearch failed! Defaulting to ai = %e, phi: %f, phi': %f\n", ai, phii[0], phii[1]);
+    return ai;
+}
\ No newline at end of file
diff --git a/src/update/lbfgs.h b/src/update/lbfgs.h
new file mode 100644
index 0000000..8fac8eb
--- /dev/null
+++ b/src/update/lbfgs.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <functional>
+#include "main/defines.h"
+
+// Implementation based on Nocedal & Wright book with references made to OpenMM extern lib implementation
+class LBFGS {
+public:
+    int step_count=0; // total number of minimize calls
+    int reset_count=0; // number of times lbfgs memory was cleared
+    int m=7; // Number of previous gradients to use for hessian approximation (5-7)
+    bool minimized=false;
+    real_x U0, Uf;
+
+    LBFGS(int m, real_x eps, int DOF, std::function<real_x()> user_grad, real_x *position, real_x *gradient);
+    ~LBFGS();
+    void minimize_step(real_x f0);
+
+private:
+    // Line search parameters
+    real_x c1 = 1e-4; // sufficient decent
+    real_x c2 = .9; // curvature cond.
+    // Convergence
+    real_x eps_tol = 1; // rms criteria (1e-1 is very tight and unlikely to work)
+    real_x rmsg = 0; // |g| / sqrt(DOF)
+    real_x grad_mag = 0; // |g| 
+    real_x grad_pos_mag = 0; // |g| / max(1, |x|)
+
+    int DOF; // Degrees of freedom to minimize
+    real_x step_size = 1; // previous step size
+    // All data stored on GPU
+    int k = 0; // lbfgs iteration step
+    real_x *tmp_d; // 1 real temperary storage
+    real_x *gamma_d; // s[k-1] projected onto y[k-1]
+    real_x *rho_d; // [m] rho[k] = 1/(s[k]^T * y[k])
+    real_x *alpha_d; // [m] alpha[i] = rho[i] * s[i]^T * y[i]
+    real_x *search_d; // [DOF] L-BFGS search direction
+    real_x *prev_positions_d; // [DOF] x[k-1]
+    real_x *prev_gradient_d; // [DOF] g[k-1]
+    real_x *X_d; // points to user GPU memory, positions to be optimized
+    real_x *G_d; // points to user GPU memory
+    real_x *q_d;
+    real_x *s_d; // [m, DOF] s[k-1] = x[k] - x[k-1] (1D array)
+    real_x *y_d; // [m, DOF] y[k-1] = grad[k] - grad[k-1] (1D array)
+
+    real_x *s_tmp_d; // [DOF] x[k] - x[k-1], used to check curvature 
+    real_x *y_tmp_d; // [DOF] g[k] - g[k-1], used to check curvature
+    bool skipped_step = false; // skipped hessian update, if 2 updates are skipped, we clear s & y memory
+
+    std::function<real_x()> system_grad;
+
+    bool converged();
+    void gamma_norm();
+    void update_sk_yk();
+
+    void phi(real_x alpha, real_x *result);
+    real_x linesearch(real_x f0);
+};
\ No newline at end of file
diff --git a/src/update/minimize.cu b/src/update/minimize.cu
index 71547c1..6ed4798 100644
--- a/src/update/minimize.cu
+++ b/src/update/minimize.cu
@@ -8,10 +8,14 @@
 #include "system/potential.h"
 #include "domdec/domdec.h"
 #include "holonomic/holonomic.h"
+#include "holonomic/rectify.h"
 #include "io/io.h"
 
 #include "main/real3.h"
 
+#include <functional>
+#include "msld/msld.h"
+
 
 
 __global__ void no_mass_weight_kernel(int N,real* masses,real* ones)
@@ -82,6 +86,18 @@ __global__ void sdfd_dotproduct_kernel(int N,struct LeapState ls,real_v *minDire
   real_sum_reduce(3*lDot/N,sDot,dot);
 }
 
+__global__ void double_onto_float(int N, real_x* doubleBuffer, real* floatBuffer, bool direction)
+{
+  int i=blockIdx.x*blockDim.x+threadIdx.x;
+  if (i<N) {
+    if(direction){ // forward - float = double 
+      floatBuffer[i] = (real)doubleBuffer[i];
+    } else { // backward - double = float
+      doubleBuffer[i] = (real_x)floatBuffer[i];
+    }
+  }
+}
+
 void State::min_init(System *system)
 {
   // Set masses to 1 for shake during minimization, except virtual sites
@@ -96,6 +112,31 @@ void State::min_init(System *system)
   if (system->run->minType==esdfd) {
     cudaMalloc(&minDirection_d,3*atomCount*sizeof(real_v));
   }
+  if (system->run->minType==elbfgs){
+    // Function called by L-BFGS class to get energy & gradient 
+    std::function<real_x()> energy_and_grad = [system](){
+      // Copy double precision onto float precision buffer
+      int DOF = system->state->atomCount*3;
+      int shift = system->state->lambdaCount;
+      double_onto_float<<<(DOF+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>(
+            DOF, system->state->positionBuffer_d+shift, system->state->positionBuffer_fd+shift, true);
+      // Potential & Grad Eval -> step=0 to calc energy
+      //system->domdec->update_domdec(system,true);
+      system->potential->calc_force(0, system);
+      // grad(F(X)) G already stored
+      system->state->recv_energy();
+      // Copy float precision onto double precision buffer
+      double_onto_float<<<(DOF+BLUP-1)/BLUP,BLUP,0,system->run->updateStream>>>(
+            DOF, system->state->forceBufferX_d+shift, system->state->forceBuffer_d+shift, false);
+      system->run->lbfgs_energy_evals++;
+      return system->state->energy[eepotential];
+    };
+
+    int shift = system->state->lambdaCount;
+    int DOF = system->state->atomCount*3; // only minimize atom positions
+    system->run->lbfgs = new LBFGS(system->run->lbfgs_m, system->run->lbfgs_eps, DOF, 
+      energy_and_grad, system->state->positionBuffer_d+shift, system->state->forceBufferX_d+shift);
+  }
 }
 
 void State::min_dest(System *system)
@@ -109,9 +150,12 @@ void State::min_dest(System *system)
   if (system->run->minType==esdfd) {
     cudaFree(minDirection_d);
   }
+  if (system->run->minType==elbfgs){
+    system->run->lbfgs->~LBFGS();
+  }
 }
 
-void State::min_move(int step,int nsteps,System *system)
+void State::min_move(int step, int nsteps, System *system)
 {
   Run *r=system->run;
   real_e grads2[2];
@@ -120,7 +164,16 @@ void State::min_move(int step,int nsteps,System *system)
   real scaling, rescaling;
   real frac;
 
-  if (r->minType==esd) {
+  if (r->minType==elbfgs){
+    if (system->id==0){
+      recv_energy();
+      if (system->verbose>0) display_nrg(system);
+      currEnergy=energy[eepotential];
+      r->lbfgs->minimize_step(currEnergy);
+      if(r->lbfgs->minimized){ system->run->step = nsteps; } // do we want this?
+    }
+  }
+  else if (r->minType==esd) {
     if (system->id==0) {
       recv_energy();
       if (system->verbose>0) display_nrg(system);