From 0fafa8c8395815358d564cdfa1d897addcda8ba1 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 10 May 2021 18:37:24 -0600 Subject: [PATCH 01/26] "Fix" off by one iepoch --- nequip/train/trainer.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 5aa7fc36..74d822fd 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -619,19 +619,23 @@ def train(self): self.wall = perf_counter() stop = False + if not self.restart: self.best_val_metrics = float("inf") self.best_epoch = 0 self.iepoch = 0 + else: + # if a restart, iepoch is the index of the last epoch that was *completed* + # our first epoch will be the epoch after that + self.iepoch += 1 + self.init_metrics() while self.iepoch < self.max_epochs and not stop: - early_stop = self.epoch_step() if early_stop: stop = False self.stop_arg = "early stop" - self.iepoch += 1 if not stop: @@ -642,7 +646,12 @@ def train(self): self.final_log() + # This is a painful hack to avoid an off-by-one error when restarting a run that ran out of frames. + # `iepoch` is supposed to be the index of the last completed epoch when `.save()` is called. + # But this `.save()` comes after `iepoch += 1` in the training loop above. + self.iepoch -= 1 self.save(self.trainer_save_path) + self.iepoch += 1 def batch_step(self, data, validation=False): # no need to have gradients from old steps taking up memory From 93ed096d105aeca3af2be04972c4a5b94ebde709 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 10 May 2021 18:38:10 -0600 Subject: [PATCH 02/26] Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d257816..8386af6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Most recent change on the bottom. 
## [Unreleased] +### Fixed +- `iepoch` is no longer off-by-one when restarting a training run that hit `max_epochs` ## [0.3.0] - 2021-05-07 ### Added From 77fd2695c3eb3e0a152c477c07ae0843c8d46422 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 11 May 2021 12:56:29 -0600 Subject: [PATCH 03/26] add more orthogonal inits --- nequip/utils/initialization.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/nequip/utils/initialization.py b/nequip/utils/initialization.py index 39d98694..09aff503 100644 --- a/nequip/utils/initialization.py +++ b/nequip/utils/initialization.py @@ -51,3 +51,17 @@ def orthogonal_initialize_linears(mod: torch.nn.Module): if isinstance(mod, e3nn.o3.Linear) and mod.internal_weights: for w in mod.weight_views(): unit_uniform_init_(w) + + +def orthogonal_initialize_fcs(mod: torch.nn.Module): + if isinstance(mod, e3nn.nn.FullyConnectedNet): + for w in mod.weights: + torch.nn.init.orthogonal_(w) + elif isinstance(mod, torch.nn.Linear): + torch.nn.init.orthogonal_(mod.weight) + + +def unit_orthogonal_initialize_e3nn_fcs(mod: torch.nn.Module): + if isinstance(mod, e3nn.nn.FullyConnectedNet): + for w in mod.weights: + unit_orthogonal_init_(w) From c0f294f909bdc00cb5650b4483f0c8bc95d3807c Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 11 May 2021 15:14:28 -0600 Subject: [PATCH 04/26] docstrings --- nequip/utils/initialization.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nequip/utils/initialization.py b/nequip/utils/initialization.py index 09aff503..2e8c0e02 100644 --- a/nequip/utils/initialization.py +++ b/nequip/utils/initialization.py @@ -8,10 +8,12 @@ # == Uniform init == def unit_uniform_init_(t: torch.Tensor): + """Uniform initialization with = 1""" t.uniform_(-math.sqrt(3), math.sqrt(3)) def uniform_initialize_fcs(mod: torch.nn.Module): + """Initialize ``e3nn.nn.FullyConnectedNet``s with ``unit_uniform_init_``""" if isinstance(mod, e3nn.nn.FullyConnectedNet): for w in mod.weights: unit_uniform_init_(w) @@ -19,17 +21,20 @@ def uniform_initialize_fcs(mod: torch.nn.Module): def uniform_initialize_linears(mod: torch.nn.Module): + """Initialize ``e3nn.o3.Linear``s with ``unit_uniform_init_``""" if isinstance(mod, e3nn.o3.Linear) and mod.internal_weights: unit_uniform_init_(mod.weight) def uniform_initialize_tps(mod: torch.nn.Module): + """Initialize ``e3nn.o3.TensorProduct``s with ``unit_uniform_init_``""" if isinstance(mod, e3nn.o3.TensorProduct) and mod.internal_weights: unit_uniform_init_(mod.weight) # == Xavier == def xavier_initialize_fcs(mod: torch.nn.Module): + """Initialize ``e3nn.nn.FullyConnectedNet``s and ``torch.nn.Linear``s with Xavier uniform initialization""" if isinstance(mod, e3nn.nn.FullyConnectedNet): for w in mod.weights: # in FC: @@ -42,18 +47,20 @@ def xavier_initialize_fcs(mod: torch.nn.Module): # == Orthogonal == def unit_orthogonal_init_(t: torch.Tensor): - """Orthogonal init with = N""" + """Orthogonal init with = 1""" assert t.ndim == 2 torch.nn.init.orthogonal_(t, gain=math.sqrt(max(t.shape))) def orthogonal_initialize_linears(mod: torch.nn.Module): + """Initialize ``e3nn.o3.Linear``s with ``unit_orthogonal_init_``""" if isinstance(mod, e3nn.o3.Linear) and mod.internal_weights: for w in mod.weight_views(): unit_uniform_init_(w) def orthogonal_initialize_fcs(mod: torch.nn.Module): + """Initialize ``e3nn.nn.FullyConnectedNet``s and ``torch.nn.Linear``s with orthogonal initialization""" if 
isinstance(mod, e3nn.nn.FullyConnectedNet): for w in mod.weights: torch.nn.init.orthogonal_(w) @@ -62,6 +69,7 @@ def orthogonal_initialize_fcs(mod: torch.nn.Module): def unit_orthogonal_initialize_e3nn_fcs(mod: torch.nn.Module): + """Initialize only ``e3nn.nn.FullyConnectedNet``s with ``unit_orthogonal_init_``""" if isinstance(mod, e3nn.nn.FullyConnectedNet): for w in mod.weights: unit_orthogonal_init_(w) From 179911dc5e8c23ecc4e425b89cb0fbfce539fcfd Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 12:08:07 -0400 Subject: [PATCH 05/26] move save and log out from the epoch_step --- nequip/train/trainer.py | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 74d822fd..05873eea 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -618,27 +618,23 @@ def train(self): self.init_log() self.wall = perf_counter() - stop = False - if not self.restart: self.best_val_metrics = float("inf") self.best_epoch = 0 self.iepoch = 0 - else: - # if a restart, iepoch is the index of the last epoch that was *completed* - # our first epoch will be the epoch after that - self.iepoch += 1 self.init_metrics() - while self.iepoch < self.max_epochs and not stop: + early_stop = False + while self.iepoch < self.max_epochs and not early_stop: + early_stop = self.epoch_step() - if early_stop: - stop = False - self.stop_arg = "early stop" self.iepoch += 1 - if not stop: + self.end_of_epoch_log() + self.end_of_epoch_save() + + if not early_stop: self.stop_arg = "max epochs" for callback in self.final_callbacks: @@ -646,12 +642,7 @@ def train(self): self.final_log() - # This is a painful hack to avoid an off-by-one error when restarting a run that ran out of frames. - # `iepoch` is supposed to be the index of the last completed epoch when `.save()` is called. - # But this `.save()` comes after `iepoch += 1` in the training loop above. - self.iepoch -= 1 self.save(self.trainer_save_path) - self.iepoch += 1 def batch_step(self, data, validation=False): # no need to have gradients from old steps taking up memory @@ -727,6 +718,7 @@ def early_stop_cond(self): if self.early_stop_lower_threshold is not None: if self.best_val_metrics < self.early_stop_lower_threshold: + self.stop_arg = "reach lower_thrdshold" return True return False @@ -766,9 +758,6 @@ def epoch_step(self): for callback in self.end_of_train_callbacks: callback(self) - self.end_of_epoch_log() - self.end_of_epoch_save() - if self.lr_scheduler_name == "ReduceLROnPlateau": self.lr_sched.step( metrics=self.mae_dict[f"{VALIDATION}_{self.metrics_key}"] @@ -793,8 +782,8 @@ def end_of_batch_log(self, batch_type: str): store all the loss/mae of each batch """ - mat_str = f"{self.iepoch+1:5d}, {self.ibatch+1:5d}" - log_str = f"{self.iepoch+1:5d} {self.ibatch+1:5d}" + mat_str = f"{self.iepoch:5d}, {self.ibatch+1:5d}" + log_str = f"{self.iepoch:5d} {self.ibatch+1:5d}" header = "epoch, batch" log_header = "# Epoch batch" @@ -864,7 +853,7 @@ def end_of_epoch_save(self): torch.save(self.model, save_path) self.logger.info( - f"! Best model {self.best_epoch+1:8d} {self.best_val_metrics:8.3f}" + f"! 
Best model {self.best_epoch:8d} {self.best_val_metrics:8.3f}" ) if (self.iepoch + 1) % self.log_epoch_freq == 0: @@ -905,11 +894,11 @@ def end_of_epoch_log(self): log_str = {} strings = ["Epoch", "wal", "LR"] - mat_str = f"{self.iepoch+1:10d}, {wall:8.3f}, {lr:8.3g}" + mat_str = f"{self.iepoch:10d}, {wall:8.3f}, {lr:8.3g}" for cat in categories: log_header[cat] = "# " log_header[cat] += " ".join([f"{s:>8s}" for s in strings]) - log_str[cat] = f"{self.iepoch+1:10d} {wall:8.3f} {lr:8.3g}" + log_str[cat] = f"{self.iepoch:10d} {wall:8.3f} {lr:8.3g}" for category in categories: From 9881e945f17514822751c48a7324970ffae45be4 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 12:12:45 -0400 Subject: [PATCH 06/26] remove auto-increase of max_epochs during restart --- nequip/scripts/restart.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/nequip/scripts/restart.py b/nequip/scripts/restart.py index ac023895..288b424b 100644 --- a/nequip/scripts/restart.py +++ b/nequip/scripts/restart.py @@ -63,8 +63,13 @@ def restart(file_name, config, mode="update"): if "progress" in dictionary: stop_args = dictionary["progress"].pop("stop_arg", None) if stop_args is not None: - dictionary["progress"]["stop_arg"] = None - dictionary["max_epochs"] *= 2 + if stop_args == "max epochs": + raise RuntimeError( + f"Previous run reach max epochs. Please use the update config to increase max epochs" + ) + else: + logging.warning(f"Restart by ignoring previous stop {stop_args}") + dictionary["progress"]["stop_arg"] = None if config.wandb: from nequip.train.trainer_wandb import TrainerWandB From 0763a5a20fd0f6544336e59ad0d6a1914f0ad193 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 12:19:45 -0400 Subject: [PATCH 07/26] rename --- nequip/train/trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 05873eea..4c6cbbaf 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -221,7 +221,7 @@ def __init__( loss_coeffs: Union[dict, str] = AtomicDataDict.TOTAL_ENERGY_KEY, metrics_components: Optional[Union[dict, str]] = None, metrics_key: str = ABBREV.get(LOSS_KEY, LOSS_KEY), - early_stop_lower_threshold: Optional[float] = None, + early_stop_threshold: Optional[float] = None, max_epochs: int = 1000000, lr_sched=None, learning_rate: float = 1e-2, @@ -716,9 +716,9 @@ def batch_step(self, data, validation=False): def early_stop_cond(self): """ kill the training early """ - if self.early_stop_lower_threshold is not None: - if self.best_val_metrics < self.early_stop_lower_threshold: - self.stop_arg = "reach lower_thrdshold" + if self.early_stop_threshold is not None: + if self.best_val_metrics < self.early_stop_threshold: + self.stop_arg = "reach early stop thrdshold" return True return False From a5a305d7ba55336f7e4d55450603a2b0e1e23b20 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 12:23:01 -0400 Subject: [PATCH 08/26] fix batch epoch log problem --- nequip/train/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 05873eea..ed52374a 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -782,8 +782,8 @@ def end_of_batch_log(self, batch_type: str): store all the loss/mae of each batch """ - mat_str = f"{self.iepoch:5d}, {self.ibatch+1:5d}" - log_str = f"{self.iepoch:5d} {self.ibatch+1:5d}" + mat_str = f"{self.iepoch+1:5d}, {self.ibatch+1:5d}" + log_str = 
f"{self.iepoch+1:5d} {self.ibatch+1:5d}" header = "epoch, batch" log_header = "# Epoch batch" From 38457f606b8c436ba6e3391c40efb90f94333491 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 13:00:09 -0400 Subject: [PATCH 09/26] update sanity check for stop args --- nequip/scripts/restart.py | 12 ------------ nequip/train/trainer.py | 33 +++++++++++++++++++-------------- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/nequip/scripts/restart.py b/nequip/scripts/restart.py index 288b424b..26f14e7c 100644 --- a/nequip/scripts/restart.py +++ b/nequip/scripts/restart.py @@ -59,18 +59,6 @@ def restart(file_name, config, mode="update"): {"float32": torch.float32, "float64": torch.float64}[config.default_dtype] ) - # increase max_epochs if training has hit maximum epochs - if "progress" in dictionary: - stop_args = dictionary["progress"].pop("stop_arg", None) - if stop_args is not None: - if stop_args == "max epochs": - raise RuntimeError( - f"Previous run reach max epochs. Please use the update config to increase max epochs" - ) - else: - logging.warning(f"Restart by ignoring previous stop {stop_args}") - dictionary["progress"]["stop_arg"] = None - if config.wandb: from nequip.train.trainer_wandb import TrainerWandB diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index ed52374a..882c4867 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -468,12 +468,6 @@ def from_dict(cls, dictionary, append: Optional[bool] = None): model = d.pop("model") elif "progress" in d: progress = d["progress"] - stop_arg = progress.pop("stop_arg", None) - if stop_arg is not None: - raise RuntimeError( - f"The previous run has properly stopped with {stop_arg}." - "Please either increase the max_epoch or change early stop criteria" - ) # load the model from file iepoch = progress["iepoch"] @@ -515,11 +509,20 @@ def from_dict(cls, dictionary, append: Optional[bool] = None): if "progress" in d: trainer.best_val_metrics = progress["best_val_metrics"] trainer.best_epoch = progress["best_epoch"] + stop_arg = progress.pop("stop_arg", None) else: trainer.best_val_metrics = float("inf") trainer.best_epoch = 0 + stop_arg = None trainer.iepoch = iepoch + # final sanity check + if trainer.stop_cond: + raise RuntimeError( + f"The previous run has properly stopped with {stop_arg}." 
+ "Please either increase the max_epoch or change early stop criteria" + ) + return trainer def init(self): @@ -625,18 +628,15 @@ def train(self): self.init_metrics() - early_stop = False - while self.iepoch < self.max_epochs and not early_stop: + stop = False + while not stop: - early_stop = self.epoch_step() + stop = self.epoch_step() self.iepoch += 1 self.end_of_epoch_log() self.end_of_epoch_save() - if not early_stop: - self.stop_arg = "max epochs" - for callback in self.final_callbacks: callback(self) @@ -713,13 +713,18 @@ def batch_step(self, data, validation=False): self.batch_metrics = self.metrics(pred=out, ref=data) @property - def early_stop_cond(self): + def stop_cond(self): """ kill the training early """ if self.early_stop_lower_threshold is not None: if self.best_val_metrics < self.early_stop_lower_threshold: self.stop_arg = "reach lower_thrdshold" return True + + if self.iepoch >= self.max_epochs: + self.stop_arg = "max epochs" + return True + return False def reset_metrics(self): @@ -766,7 +771,7 @@ def epoch_step(self): for callback in self.end_of_epoch_callbacks: callback(self) - return self.early_stop_cond + return self.stop_cond def log_dictionary(self, dictionary: dict, name: str = ""): """ From 0dcfa37c57f55079f0c443bb0fd13158fb7f137d Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 14:04:37 -0400 Subject: [PATCH 10/26] fix max_epochs --- nequip/train/trainer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 882c4867..2f1d56bb 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -628,8 +628,7 @@ def train(self): self.init_metrics() - stop = False - while not stop: + while not self.stop_cond: stop = self.epoch_step() self.iepoch += 1 @@ -771,7 +770,6 @@ def epoch_step(self): for callback in self.end_of_epoch_callbacks: callback(self) - return self.stop_cond def log_dictionary(self, dictionary: dict, name: str = ""): """ From 9b91364905e7d88a49d93523143e4b002bf19c3a Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 14:29:06 -0400 Subject: [PATCH 11/26] add early stop class --- nequip/train/early_stopping.py | 107 +++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 nequip/train/early_stopping.py diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py new file mode 100644 index 00000000..e0c460c3 --- /dev/null +++ b/nequip/train/early_stopping.py @@ -0,0 +1,107 @@ +from collections import OrderedDict +from copy import deepcopy +from typing import Callable, Mapping, Optional, cast + + +class EarlyStopping: + """ + Early stop conditions + + There are three early stopping conditions: + + 1. a value lower than a defined lower bound + 2. a value higher than a defined upper bound + 3. a value hasn't decreased for x epochs within min_delta range + + Args: + + lower_bounds (dict): define the key and lower bound for condition 1 + upper_bounds (dict): define the key and lower bound for condition 2 + patiences (dict): defined the x epochs for condition 3 + min_delta (dict): defined the delta range for condition 3. defaults are 0.0 + cumulative_delta (bool): if True, the minimum value recorded for condition 3 + will not be updated when the newer value only decreases + for a tiny value (< min_delta). 
default False + """ + + def __init__( + self, + lower_bounds: dict = {}, + upper_bounds: dict = {}, + patiences: dict = {}, + min_delta: dict = {}, + cumulative_delta: bool = False, + ): + + self.patiences = deepcopy(patiences) + self.lower_bounds = deepcopy(lower_bounds) + self.upper_bounds = deepcopy(upper_bounds) + self.cumulative_delta = cumulative_delta + + # self.keys = set(list(self.lower_bounds.keys())) + set(list(self.upper_bounds.keys()))+set(list(self.patiences.keys())) + + self.min_delta = {} + self.counter = {} + self.minimums = {} + for key, pat in self.patiences.items(): + self.patiences[key] = int(pat) + self.counter[key] = 0 + self.minimums[key] = None + self.min_delta[key] = min_delta.get(key, 0.0) + + if pat < 1: + raise ValueError(f"Argument patience for {key} should be positive integer.") + if self.min_delta[key] < 0.0: + raise ValueError("Argument min_delta should not be a negative number.") + + for key in self.min_delta: + if key not in self.patiences: + raise ValueError(f"patience for {key} should be defined") + + def __call__(self, metrics) -> None: + + stop = False + stop_args = "Early stopping:" + debug_args = None + + # check whether key in metrics hasn't reduced for x epochs + for key, pat in self.patiences.items(): + + value = metrics[key] + minimums = self.minimums[key] + min_delta = self.min_delta[key] + + if minimums is None: + minimums = value + elif value >= (minimums - self.min_delta[key]): + if not self.cumulative_delta and value > minimums: + self.minimums[key] = value + self.counter[key] += 1 + debug_args = f"EarlyStopping: {self.counter[key]} / {pat}" + if self.counter[key] >= pat: + stop_args += " {key} has not reduced for {pat} epochs") + stop = True + else: + self.minimums[key] = value + self.counter[key] = 0 + + for key, bound in self.lower_bounds.items(): + if metrics[key] < bound: + stop_args += f" {key} is smaller than {bound}" + stop = True + + for key, bound in self.upper_bounds.items(): + if metrics[key] > bound: + stop_args += f" {key} is larger than {bound}" + stop = True + + return stop, stop_args, debug_args + + def state_dict(self) -> "OrderedDict[dict, dict]": + return OrderedDict( + [("counter", self.counter), ("minimums", self.minimums)] + ) + + def load_state_dict(self, state_dict: Mapping) -> None: + self.counter = state_dict["counter"] + self.minimums = state_dict["minimums"] From 6ae6eb5e3f308368eb3ebffd40548832f1bba4d9 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 14:43:09 -0400 Subject: [PATCH 12/26] insert early_stopping init --- nequip/train/trainer.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index a3bde7f2..608fcbb6 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -39,6 +39,7 @@ from .loss import Loss, LossStat from .metrics import Metrics from ._key import ABBREV, LOSS_KEY, TRAIN, VALIDATION +from .early_stopping import EarlyStopping class Trainer: @@ -221,7 +222,8 @@ def __init__( loss_coeffs: Union[dict, str] = AtomicDataDict.TOTAL_ENERGY_KEY, metrics_components: Optional[Union[dict, str]] = None, metrics_key: str = ABBREV.get(LOSS_KEY, LOSS_KEY), - early_stop_threshold: Optional[float] = None, + early_stopping: Optional[EarlyStopping] = None, + early_stopping_kwargs: Optional[dict] = None, max_epochs: int = 1000000, lr_sched=None, learning_rate: float = 1e-2, @@ -301,6 +303,7 @@ def __init__( self.kwargs = deepcopy(kwargs) self.optimizer_kwargs = deepcopy(optimizer_kwargs) 
self.lr_scheduler_kwargs = deepcopy(lr_scheduler_kwargs) + self.early_stopping_kwargs = deepcopy(early_stopping_kwargs) # initialize the optimizer and scheduler, the params will be updated in the function self.init() @@ -580,6 +583,19 @@ def init(self): self.loss_stat = LossStat(keys=list(self.loss.funcs.keys())) self._initialized = True + if self.early_stopping is None: + key_mapping, kwargs = instantiate( + EarlyStopping, + prefix="early_stopping", + optional_args=self.early_stopping_kwargs, + all_args=self.kwargs, + return_args_only=True + ) + for key, item in kwargs.items(): + if key not in ['cumulative_delta']: + kwargs["{VALIDATION}_{key}"] + self.early_stopping_kwargs[] + def init_metrics(self): if self.metrics_components is None: self.metrics_components = [] From b68f59cade87a0de98af9efdfd16cb0b0cd85192 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 14:51:27 -0400 Subject: [PATCH 13/26] move log back to epoch_step --- configs/minimal.yaml | 2 +- nequip/train/trainer.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/configs/minimal.yaml b/configs/minimal.yaml index f3a792c3..7967060e 100644 --- a/configs/minimal.yaml +++ b/configs/minimal.yaml @@ -21,7 +21,7 @@ dataset: aspirin dataset_file_name: benchmark_data/aspirin_ccsd-train.npz # logging -wandb: false +wandb: true wandb_project: aspirin # verbose: debug diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 2f1d56bb..ec4bb3a0 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -630,10 +630,7 @@ def train(self): while not self.stop_cond: - stop = self.epoch_step() - self.iepoch += 1 - - self.end_of_epoch_log() + self.epoch_step() self.end_of_epoch_save() for callback in self.final_callbacks: @@ -733,6 +730,7 @@ def reset_metrics(self): self.metrics.to(self.device) def epoch_step(self): + datasets = [self.dl_train, self.dl_val] categories = [TRAIN, VALIDATION] self.metrics_dict = {} @@ -762,6 +760,10 @@ def epoch_step(self): for callback in self.end_of_train_callbacks: callback(self) + self.iepoch += 1 + + self.end_of_epoch_log() + if self.lr_scheduler_name == "ReduceLROnPlateau": self.lr_sched.step( metrics=self.mae_dict[f"{VALIDATION}_{self.metrics_key}"] @@ -770,7 +772,6 @@ def epoch_step(self): for callback in self.end_of_epoch_callbacks: callback(self) - def log_dictionary(self, dictionary: dict, name: str = ""): """ dump the keys and values of a dictionary From 083856e73b3d6f948d845c8986d1137e90ae8f5c Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:14:52 -0400 Subject: [PATCH 14/26] add early stopping in training loops --- nequip/train/early_stopping.py | 10 +++---- nequip/train/trainer.py | 49 +++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py index e0c460c3..a1913107 100644 --- a/nequip/train/early_stopping.py +++ b/nequip/train/early_stopping.py @@ -50,7 +50,9 @@ def __init__( self.min_delta[key] = min_delta.get(key, 0.0) if pat < 1: - raise ValueError(f"Argument patience for {key} should be positive integer.") + raise ValueError( + f"Argument patience for {key} should be positive integer." 
+ ) if self.min_delta[key] < 0.0: raise ValueError("Argument min_delta should not be a negative number.") @@ -79,7 +81,7 @@ def __call__(self, metrics) -> None: self.counter[key] += 1 debug_args = f"EarlyStopping: {self.counter[key]} / {pat}" if self.counter[key] >= pat: - stop_args += " {key} has not reduced for {pat} epochs") + stop_args += " {key} has not reduced for {pat} epochs" stop = True else: self.minimums[key] = value @@ -98,9 +100,7 @@ def __call__(self, metrics) -> None: return stop, stop_args, debug_args def state_dict(self) -> "OrderedDict[dict, dict]": - return OrderedDict( - [("counter", self.counter), ("minimums", self.minimums)] - ) + return OrderedDict([("counter", self.counter), ("minimums", self.minimums)]) def load_state_dict(self, state_dict: Mapping) -> None: self.counter = state_dict["counter"] diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 0b6a778d..fe0335d1 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -367,6 +367,10 @@ def as_dict(self, state_dict: bool = False, training_progress: bool = False): ) if self.use_ema: dictionary["state_dict"]["ema_state"] = self.ema.state_dict() + if self.early_stopping is not None: + dictionary["state_dict"][ + "early_stopping" + ] = self.early_stopping.state_dict() if hasattr(self.model, "save") and not issubclass( type(self.model), torch.jit.ScriptModule @@ -502,6 +506,9 @@ def from_dict(cls, dictionary, append: Optional[bool] = None): if trainer.lr_sched is not None: trainer.lr_sched.load_state_dict(state_dict["lr_sched"]) + if trainer.early_stopping is not None: + trainer.early_stopping.load_state_dict(state_dict["early_stopping"]) + torch.set_rng_state(state_dict["rng_state"]) if torch.cuda.is_available(): torch.cuda.set_rng_state(state_dict["cuda_rng_state"]) @@ -585,16 +592,27 @@ def init(self): if self.early_stopping is None: key_mapping, kwargs = instantiate( - EarlyStopping, + EarlyStopping, prefix="early_stopping", optional_args=self.early_stopping_kwargs, all_args=self.kwargs, - return_args_only=True + return_args_only=True, ) for key, item in kwargs.items(): - if key not in ['cumulative_delta']: - kwargs["{VALIDATION}_{key}"] - self.early_stopping_kwargs[] + # prepand VALIDATION string if k is not with + if isinstance(item, dict): + new_dict = {} + for k, v in item.items(): + if ( + k.startswith(VALIDATION) + or k.startswith(TRAIN) + or k in ["LR", "wall"] + ): + new_dict[k] = item[k] + else: + new_dict[f"{VALIDATION}_{k}"] = item[k] + kwargs[key] = new_dict + self.early_stopping = EarlyStopping(**kwargs) def init_metrics(self): if self.metrics_components is None: @@ -613,6 +631,12 @@ def init_metrics(self): all_args=self.kwargs, ) + if not ( + self.metrics_key.startswith(VALIDATION) + or self.metrics_key.startswith(TRAIN) + ): + self.metrics_key = f"{VALIDATION}_{self.metrics_key}" + def init_model(self): logger = self.logger logger.info( @@ -728,9 +752,12 @@ def batch_step(self, data, validation=False): def stop_cond(self): """ kill the training early """ - if self.early_stop_threshold is not None: - if self.best_val_metrics < self.early_stop_threshold: - self.stop_arg = "reach early stop thrdshold" + if self.early_stopping is not None and hasattr(self, mae_dict): + early_stop, early_stop_args, debug_args = self.early_stopping(mae_dict) + if debug_args is not None: + self.logger.debug(debug_args) + if early_stop: + self.stop_args = early_stop_args return True if self.iepoch >= self.max_epochs: @@ -781,9 +808,7 @@ def epoch_step(self): self.end_of_epoch_log() if 
self.lr_scheduler_name == "ReduceLROnPlateau": - self.lr_sched.step( - metrics=self.mae_dict[f"{VALIDATION}_{self.metrics_key}"] - ) + self.lr_sched.step(metrics=self.mae_dict[self.metrics_key]) for callback in self.end_of_epoch_callbacks: callback(self) @@ -851,7 +876,7 @@ def end_of_epoch_save(self): save model and trainer details """ - val_metrics = self.mae_dict[f"{VALIDATION}_{self.metrics_key}"] + val_metrics = self.mae_dict[self.metrics_key] if val_metrics < self.best_val_metrics: self.best_val_metrics = val_metrics self.best_epoch = self.iepoch From d49a7edfc764e05efa77064cba28ff31b246fe26 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:16:58 -0400 Subject: [PATCH 15/26] add to test --- nequip/train/trainer.py | 4 ++-- tests/trainer/test_trainer.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index fe0335d1..fd85c2bd 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -752,8 +752,8 @@ def batch_step(self, data, validation=False): def stop_cond(self): """ kill the training early """ - if self.early_stopping is not None and hasattr(self, mae_dict): - early_stop, early_stop_args, debug_args = self.early_stopping(mae_dict) + if self.early_stopping is not None and hasattr(self, "mae_dict"): + early_stop, early_stop_args, debug_args = self.early_stopping(self.mae_dict) if debug_args is not None: self.logger.debug(debug_args) if early_stop: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index c0acfc1d..9f09f607 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -10,7 +10,7 @@ import torch from torch.nn import Linear -from nequip.data import NpzDataset, AtomicDataDict, AtomicData +from nequip.data import AtomicDataDict from nequip.train.trainer import Trainer from nequip.utils.savenload import load_file from nequip.nn import GraphModuleMixin @@ -32,6 +32,7 @@ T_0=50, T_mult=2, loss_coeffs={"forces": 2}, + early_stopping_patience={"LR": 1e-10}, ) configs_to_test = [dict(), minimal_config] loop_config = pytest.mark.parametrize("trainer", configs_to_test, indirect=True) From afb0fc4fa4a79fa52cc4d00cb2ee7fea36941cd3 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:18:29 -0400 Subject: [PATCH 16/26] fix test profile --- nequip/train/trainer.py | 4 +++- tests/trainer/test_trainer.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index fd85c2bd..eda85f4e 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -598,6 +598,7 @@ def init(self): all_args=self.kwargs, return_args_only=True, ) + n_args = 0 for key, item in kwargs.items(): # prepand VALIDATION string if k is not with if isinstance(item, dict): @@ -612,7 +613,8 @@ def init(self): else: new_dict[f"{VALIDATION}_{k}"] = item[k] kwargs[key] = new_dict - self.early_stopping = EarlyStopping(**kwargs) + n_args += len(new_dict) + self.early_stopping = EarlyStopping(**kwargs) if n_args > 0 else None def init_metrics(self): if self.metrics_components is None: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9f09f607..992a97ac 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -32,7 +32,8 @@ T_0=50, T_mult=2, loss_coeffs={"forces": 2}, - early_stopping_patience={"LR": 1e-10}, + early_stopping_patiences={"loss": 50}, + early_stopping_lower_bounds={"LR": 1e-10}, ) configs_to_test = [dict(), minimal_config] loop_config = 
pytest.mark.parametrize("trainer", configs_to_test, indirect=True) From e4833bec14e801ea994163f3b781262e7b52243f Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:38:03 -0400 Subject: [PATCH 17/26] fix update bug --- nequip/train/early_stopping.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py index a1913107..db84f2d3 100644 --- a/nequip/train/early_stopping.py +++ b/nequip/train/early_stopping.py @@ -1,6 +1,6 @@ from collections import OrderedDict from copy import deepcopy -from typing import Callable, Mapping, Optional, cast +from typing import Mapping, Optional, cast class EarlyStopping: @@ -41,11 +41,11 @@ def __init__( # self.keys = set(list(self.lower_bounds.keys())) + set(list(self.upper_bounds.keys()))+set(list(self.patiences.keys())) self.min_delta = {} - self.counter = {} + self.counters = {} self.minimums = {} for key, pat in self.patiences.items(): self.patiences[key] = int(pat) - self.counter[key] = 0 + self.counters[key] = 0 self.minimums[key] = None self.min_delta[key] = min_delta.get(key, 0.0) @@ -70,22 +70,22 @@ def __call__(self, metrics) -> None: for key, pat in self.patiences.items(): value = metrics[key] - minimums = self.minimums[key] + minimum = self.minimums[key] min_delta = self.min_delta[key] - if minimums is None: - minimums = value - elif value >= (minimums - self.min_delta[key]): - if not self.cumulative_delta and value > minimums: + if minimum is None: + self.minimums[key] = value + elif value >= (minimum - min_delta): + if not self.cumulative_delta and value > minimum: self.minimums[key] = value - self.counter[key] += 1 - debug_args = f"EarlyStopping: {self.counter[key]} / {pat}" - if self.counter[key] >= pat: + self.counters[key] += 1 + debug_args = f"EarlyStopping: {self.counters[key]} / {pat}" + if self.counters[key] >= pat: stop_args += " {key} has not reduced for {pat} epochs" stop = True else: self.minimums[key] = value - self.counter[key] = 0 + self.counters[key] = 0 for key, bound in self.lower_bounds.items(): if metrics[key] < bound: @@ -100,8 +100,8 @@ def __call__(self, metrics) -> None: return stop, stop_args, debug_args def state_dict(self) -> "OrderedDict[dict, dict]": - return OrderedDict([("counter", self.counter), ("minimums", self.minimums)]) + return OrderedDict([("counters", self.counters), ("minimums", self.minimums)]) def load_state_dict(self, state_dict: Mapping) -> None: - self.counter = state_dict["counter"] + self.counters = state_dict["counters"] self.minimums = state_dict["minimums"] From 6dc5229ac29b13f3ff7866b87ae6211492a70765 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:52:04 -0400 Subject: [PATCH 18/26] update full.yaml --- configs/full.yaml | 15 +++++++++++++++ nequip/train/early_stopping.py | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/configs/full.yaml b/configs/full.yaml index eecec050..f273e7a0 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -87,6 +87,21 @@ use_ema: false ema_decay: 0.999 # ema weight, commonly set to 0.999 ema_use_num_updates: true # whether to use number of updates when computing averages +# early stopping based on metrics values. +# LR, wall and any keys printed in the log file can be used. +# The key can start with Training or Validation. If not defined, the validation value will be used. 
+early_stopping_patiences: # stop early if a metric value stopped decreasing for n epochs + Validation_loss: 50 # + Training_loss: 100 # + mae: 100 # +early_stopping_delta: # If delta is defined, a tiny decrease smaller than delta will not be considered as a decrease + Training_loss: 0.005 # +early_stopping_cumulative_delta: false # If True, the minimum value recorded will not be updated when the decrease is smaller than delta +early_stopping_lower_bounds: # stop early if a metric value is lower than the bound + LR: 1e-10 # +early_stopping_upper_bounds: # stop early if a metric value is higher than the bound + wall: 1e100 # + # loss function loss_coeffs: # different weights to use in a weighted loss functions forces: 100 # for MD applications, we recommed a force weight of 100 and an energy weight of 1 diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py index db84f2d3..65e97c10 100644 --- a/nequip/train/early_stopping.py +++ b/nequip/train/early_stopping.py @@ -11,17 +11,17 @@ class EarlyStopping: 1. a value lower than a defined lower bound 2. a value higher than a defined upper bound - 3. a value hasn't decreased for x epochs within min_delta range + 3. a value hasn't decreased for x epochs within delta range Args: lower_bounds (dict): define the key and lower bound for condition 1 upper_bounds (dict): define the key and lower bound for condition 2 patiences (dict): defined the x epochs for condition 3 - min_delta (dict): defined the delta range for condition 3. defaults are 0.0 + delta (dict): defined the delta range for condition 3. defaults are 0.0 cumulative_delta (bool): if True, the minimum value recorded for condition 3 will not be updated when the newer value only decreases - for a tiny value (< min_delta). default False + for a tiny value (< delta). default False """ def __init__( @@ -29,7 +29,7 @@ def __init__( lower_bounds: dict = {}, upper_bounds: dict = {}, patiences: dict = {}, - min_delta: dict = {}, + delta: dict = {}, cumulative_delta: bool = False, ): @@ -40,23 +40,23 @@ def __init__( # self.keys = set(list(self.lower_bounds.keys())) + set(list(self.upper_bounds.keys()))+set(list(self.patiences.keys())) - self.min_delta = {} + self.delta = {} self.counters = {} self.minimums = {} for key, pat in self.patiences.items(): self.patiences[key] = int(pat) self.counters[key] = 0 self.minimums[key] = None - self.min_delta[key] = min_delta.get(key, 0.0) + self.delta[key] = delta.get(key, 0.0) if pat < 1: raise ValueError( f"Argument patience for {key} should be positive integer." 
) - if self.min_delta[key] < 0.0: - raise ValueError("Argument min_delta should not be a negative number.") + if self.delta[key] < 0.0: + raise ValueError("Argument delta should not be a negative number.") - for key in self.min_delta: + for key in self.delta: if key not in self.patiences: raise ValueError(f"patience for {key} should be defined") @@ -71,11 +71,11 @@ def __call__(self, metrics) -> None: value = metrics[key] minimum = self.minimums[key] - min_delta = self.min_delta[key] + delta = self.delta[key] if minimum is None: self.minimums[key] = value - elif value >= (minimum - min_delta): + elif value >= (minimum - delta): if not self.cumulative_delta and value > minimum: self.minimums[key] = value self.counters[key] += 1 From 5037276741ef20f06ddedefd1f7575b255a7ca46 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 16:00:15 -0400 Subject: [PATCH 19/26] update change log --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8386af6d..e477d307 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ Most recent change on the bottom. ## [Unreleased] ### Fixed - `iepoch` is no longer off-by-one when restarting a training run that hit `max_epochs` +### Added +- `early_stopping_xxx` arguments added to enable early stop for platued values or values that out of lower/upper bounds. ## [0.3.0] - 2021-05-07 ### Added From f73afe9c0b35d5b58e691610fc683514c15d5438 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 16:08:10 -0400 Subject: [PATCH 20/26] remove comments --- nequip/train/early_stopping.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py index 65e97c10..c1aee9e5 100644 --- a/nequip/train/early_stopping.py +++ b/nequip/train/early_stopping.py @@ -38,8 +38,6 @@ def __init__( self.upper_bounds = deepcopy(upper_bounds) self.cumulative_delta = cumulative_delta - # self.keys = set(list(self.lower_bounds.keys())) + set(list(self.upper_bounds.keys()))+set(list(self.patiences.keys())) - self.delta = {} self.counters = {} self.minimums = {} From 9fdabaf2bfe8911149b5a9523e4f9d1419d9f32b Mon Sep 17 00:00:00 2001 From: Lixin Sun Date: Thu, 13 May 2021 16:12:24 -0400 Subject: [PATCH 21/26] Update minimal.yaml --- configs/minimal.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/minimal.yaml b/configs/minimal.yaml index 7967060e..e8cb2917 100644 --- a/configs/minimal.yaml +++ b/configs/minimal.yaml @@ -21,8 +21,7 @@ dataset: aspirin dataset_file_name: benchmark_data/aspirin_ccsd-train.npz # logging -wandb: true -wandb_project: aspirin +wandb: false # verbose: debug # training From d4b7f77f31903f440f38a7ace026a5b6f76fc7a8 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Fri, 14 May 2021 15:26:02 -0400 Subject: [PATCH 22/26] support class name as prefix --- nequip/utils/auto_init.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nequip/utils/auto_init.py b/nequip/utils/auto_init.py index 497c4ced..2ac039ac 100644 --- a/nequip/utils/auto_init.py +++ b/nequip/utils/auto_init.py @@ -141,10 +141,11 @@ def instantiate( return_args_only (bool): if True, do not instantiate, only return the arguments """ + prefix_list = [builder.__name__] if inspect.isclass(builder)) else [] if isinstance(prefix, str): - prefix_list = [prefix] + prefix_list += [prefix] else: - prefix_list = prefix + prefix_list += prefix # detect the input parameters needed from params config = Config.from_class(builder, remove_kwargs=remove_kwargs) 
@@ -226,8 +227,8 @@ def instantiate( sub_prefix_list = [sub_builder.__name__, key] for prefix in prefix_list: sub_prefix_list = sub_prefix_list + [ - prefix + "_" + key, prefix, + prefix + "_" + key, ] nested_km, nested_kwargs = instantiate( From 438320d4b0c76375c911eb221716f3a4079bcb5e Mon Sep 17 00:00:00 2001 From: nw13slx Date: Fri, 14 May 2021 15:27:47 -0400 Subject: [PATCH 23/26] fix ) --- nequip/utils/auto_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nequip/utils/auto_init.py b/nequip/utils/auto_init.py index 2ac039ac..a913dd1e 100644 --- a/nequip/utils/auto_init.py +++ b/nequip/utils/auto_init.py @@ -141,7 +141,7 @@ def instantiate( return_args_only (bool): if True, do not instantiate, only return the arguments """ - prefix_list = [builder.__name__] if inspect.isclass(builder)) else [] + prefix_list = [builder.__name__] if inspect.isclass(builder) else [] if isinstance(prefix, str): prefix_list += [prefix] else: From 151ce42ed4430da3473e5844ae3fe83ab25cd863 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Fri, 14 May 2021 14:02:31 -0600 Subject: [PATCH 24/26] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e477d307..be85e793 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Most recent change on the bottom. ## [Unreleased] ### Fixed - `iepoch` is no longer off-by-one when restarting a training run that hit `max_epochs` +- Builders, and not just sub-builders, use the class name as a default prefix ### Added - `early_stopping_xxx` arguments added to enable early stop for platued values or values that out of lower/upper bounds. From f9af82381960faba78b0cf80ee726105c3140daf Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Fri, 14 May 2021 15:15:09 -0600 Subject: [PATCH 25/26] fix number formats --- configs/full.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/full.yaml b/configs/full.yaml index f273e7a0..e80340c0 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -93,14 +93,14 @@ ema_use_num_updates: true early_stopping_patiences: # stop early if a metric value stopped decreasing for n epochs Validation_loss: 50 # Training_loss: 100 # - mae: 100 # + e_mae: 100 # early_stopping_delta: # If delta is defined, a tiny decrease smaller than delta will not be considered as a decrease Training_loss: 0.005 # early_stopping_cumulative_delta: false # If True, the minimum value recorded will not be updated when the decrease is smaller than delta early_stopping_lower_bounds: # stop early if a metric value is lower than the bound - LR: 1e-10 # + LR: 1.0e-10 # early_stopping_upper_bounds: # stop early if a metric value is higher than the bound - wall: 1e100 # + wall: 1.0e+100 # # loss function loss_coeffs: # different weights to use in a weighted loss functions From 9e9a3fae898af22460b838b22c9af629c4383834 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Fri, 14 May 2021 15:16:28 -0600 Subject: [PATCH 26/26] Bump version --- CHANGELOG.md | 2 ++ nequip/_version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be85e793..48bbf0b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Most recent change on the bottom. 
## [Unreleased] + +## [0.3.1] ### Fixed - `iepoch` is no longer off-by-one when restarting a training run that hit `max_epochs` - Builders, and not just sub-builders, use the class name as a default prefix diff --git a/nequip/_version.py b/nequip/_version.py index 355845c6..ea12af4d 100644 --- a/nequip/_version.py +++ b/nequip/_version.py @@ -2,4 +2,4 @@ # See Python packaging guide # https://packaging.python.org/guides/single-sourcing-package-version/ -__version__ = "0.3.0" +__version__ = "0.3.1"
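Taken together, patches 11 through 20 leave `nequip/train/early_stopping.py` with an `EarlyStopping` class that can also be exercised outside the `Trainer`. The sketch below is one plausible standalone use, assuming only what this series defines (module path, constructor keywords, and the `(stop, stop_args, debug_args)` return of `__call__`); the metric values are invented for illustration. Note that inside the `Trainer` (patch 14), any key other than `LR`/`wall` that does not already start with a `Training_`/`Validation_` prefix is given the `Validation_` prefix before being looked up.

    from nequip.train.early_stopping import EarlyStopping

    # Mirror the three condition types documented in configs/full.yaml (patch 18):
    # patience on the validation loss, a lower bound on LR, an upper bound on wall time.
    es = EarlyStopping(
        patiences={"Validation_loss": 50},
        delta={"Validation_loss": 0.005},
        lower_bounds={"LR": 1.0e-10},
        upper_bounds={"wall": 1.0e100},
    )

    # Per-epoch metrics dictionary; the Trainer passes its mae_dict here.
    # These numbers are made up for the example.
    metrics = {"Validation_loss": 0.12, "LR": 3.0e-4, "wall": 1.8e4}

    stop, stop_args, debug_args = es(metrics)
    if debug_args is not None:
        print(debug_args)   # patience counter progress, e.g. "EarlyStopping: 1 / 50"
    if stop:
        print(stop_args)    # reason(s) the run should stop

In the config-driven path, the same conditions are supplied as the `early_stopping_*` keys shown in the full.yaml hunk above and are collected by `instantiate` into the `EarlyStopping` constructor arguments during `Trainer.init()`.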