From 0fafa8c8395815358d564cdfa1d897addcda8ba1 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 10 May 2021 18:37:24 -0600 Subject: [PATCH 01/26] "Fix" off by one iepoch --- nequip/train/trainer.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 5aa7fc36..74d822fd 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -619,19 +619,23 @@ def train(self): self.wall = perf_counter() stop = False + if not self.restart: self.best_val_metrics = float("inf") self.best_epoch = 0 self.iepoch = 0 + else: + # if a restart, iepoch is the index of the last epoch that was *completed* + # our first epoch will be the epoch after that + self.iepoch += 1 + self.init_metrics() while self.iepoch < self.max_epochs and not stop: - early_stop = self.epoch_step() if early_stop: stop = False self.stop_arg = "early stop" - self.iepoch += 1 if not stop: @@ -642,7 +646,12 @@ def train(self): self.final_log() + # This is a painful hack to avoid an off-by-one error when restarting a run that ran out of frames. + # `iepoch` is supposed to be the index of the last completed epoch when `.save()` is called. + # But this `.save()` comes after `iepoch += 1` in the training loop above. + self.iepoch -= 1 self.save(self.trainer_save_path) + self.iepoch += 1 def batch_step(self, data, validation=False): # no need to have gradients from old steps taking up memory From 93ed096d105aeca3af2be04972c4a5b94ebde709 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 10 May 2021 18:38:10 -0600 Subject: [PATCH 02/26] Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d257816..8386af6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Most recent change on the bottom. 
## [Unreleased] +### Fixed +- `iepoch` is no longer off-by-one when restarting a training run that hit `max_epochs` ## [0.3.0] - 2021-05-07 ### Added From 77fd2695c3eb3e0a152c477c07ae0843c8d46422 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 11 May 2021 12:56:29 -0600 Subject: [PATCH 03/26] add more orthogonal inits --- nequip/utils/initialization.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/nequip/utils/initialization.py b/nequip/utils/initialization.py index 39d98694..09aff503 100644 --- a/nequip/utils/initialization.py +++ b/nequip/utils/initialization.py @@ -51,3 +51,17 @@ def orthogonal_initialize_linears(mod: torch.nn.Module): if isinstance(mod, e3nn.o3.Linear) and mod.internal_weights: for w in mod.weight_views(): unit_uniform_init_(w) + + +def orthogonal_initialize_fcs(mod: torch.nn.Module): + if isinstance(mod, e3nn.nn.FullyConnectedNet): + for w in mod.weights: + torch.nn.init.orthogonal_(w) + elif isinstance(mod, torch.nn.Linear): + torch.nn.init.orthogonal_(mod.weight) + + +def unit_orthogonal_initialize_e3nn_fcs(mod: torch.nn.Module): + if isinstance(mod, e3nn.nn.FullyConnectedNet): + for w in mod.weights: + unit_orthogonal_init_(w) From c0f294f909bdc00cb5650b4483f0c8bc95d3807c Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 11 May 2021 15:14:28 -0600 Subject: [PATCH 04/26] docstrings --- nequip/utils/initialization.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nequip/utils/initialization.py b/nequip/utils/initialization.py index 09aff503..2e8c0e02 100644 --- a/nequip/utils/initialization.py +++ b/nequip/utils/initialization.py @@ -8,10 +8,12 @@ # == Uniform init == def unit_uniform_init_(t: torch.Tensor): + """Uniform initialization with = 1""" t.uniform_(-math.sqrt(3), math.sqrt(3)) def uniform_initialize_fcs(mod: torch.nn.Module): + """Initialize ``e3nn.nn.FullyConnectedNet``s with ``unit_uniform_init_``""" if isinstance(mod, e3nn.nn.FullyConnectedNet): for w in mod.weights: unit_uniform_init_(w) @@ -19,17 +21,20 @@ def uniform_initialize_fcs(mod: torch.nn.Module): def uniform_initialize_linears(mod: torch.nn.Module): + """Initialize ``e3nn.o3.Linear``s with ``unit_uniform_init_``""" if isinstance(mod, e3nn.o3.Linear) and mod.internal_weights: unit_uniform_init_(mod.weight) def uniform_initialize_tps(mod: torch.nn.Module): + """Initialize ``e3nn.o3.TensorProduct``s with ``unit_uniform_init_``""" if isinstance(mod, e3nn.o3.TensorProduct) and mod.internal_weights: unit_uniform_init_(mod.weight) # == Xavier == def xavier_initialize_fcs(mod: torch.nn.Module): + """Initialize ``e3nn.nn.FullyConnectedNet``s and ``torch.nn.Linear``s with Xavier uniform initialization""" if isinstance(mod, e3nn.nn.FullyConnectedNet): for w in mod.weights: # in FC: @@ -42,18 +47,20 @@ def xavier_initialize_fcs(mod: torch.nn.Module): # == Orthogonal == def unit_orthogonal_init_(t: torch.Tensor): - """Orthogonal init with = N""" + """Orthogonal init with = 1""" assert t.ndim == 2 torch.nn.init.orthogonal_(t, gain=math.sqrt(max(t.shape))) def orthogonal_initialize_linears(mod: torch.nn.Module): + """Initialize ``e3nn.o3.Linear``s with ``unit_orthogonal_init_``""" if isinstance(mod, e3nn.o3.Linear) and mod.internal_weights: for w in mod.weight_views(): unit_uniform_init_(w) def orthogonal_initialize_fcs(mod: torch.nn.Module): + """Initialize ``e3nn.nn.FullyConnectedNet``s and ``torch.nn.Linear``s with orthogonal initialization""" if 
isinstance(mod, e3nn.nn.FullyConnectedNet): for w in mod.weights: torch.nn.init.orthogonal_(w) @@ -62,6 +69,7 @@ def orthogonal_initialize_fcs(mod: torch.nn.Module): def unit_orthogonal_initialize_e3nn_fcs(mod: torch.nn.Module): + """Initialize only ``e3nn.nn.FullyConnectedNet``s with ``unit_orthogonal_init_``""" if isinstance(mod, e3nn.nn.FullyConnectedNet): for w in mod.weights: unit_orthogonal_init_(w) From 179911dc5e8c23ecc4e425b89cb0fbfce539fcfd Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 12:08:07 -0400 Subject: [PATCH 05/26] move save and log out from the epoch_step --- nequip/train/trainer.py | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 74d822fd..05873eea 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -618,27 +618,23 @@ def train(self): self.init_log() self.wall = perf_counter() - stop = False - if not self.restart: self.best_val_metrics = float("inf") self.best_epoch = 0 self.iepoch = 0 - else: - # if a restart, iepoch is the index of the last epoch that was *completed* - # our first epoch will be the epoch after that - self.iepoch += 1 self.init_metrics() - while self.iepoch < self.max_epochs and not stop: + early_stop = False + while self.iepoch < self.max_epochs and not early_stop: + early_stop = self.epoch_step() - if early_stop: - stop = False - self.stop_arg = "early stop" self.iepoch += 1 - if not stop: + self.end_of_epoch_log() + self.end_of_epoch_save() + + if not early_stop: self.stop_arg = "max epochs" for callback in self.final_callbacks: @@ -646,12 +642,7 @@ def train(self): self.final_log() - # This is a painful hack to avoid an off-by-one error when restarting a run that ran out of frames. - # `iepoch` is supposed to be the index of the last completed epoch when `.save()` is called. - # But this `.save()` comes after `iepoch += 1` in the training loop above. - self.iepoch -= 1 self.save(self.trainer_save_path) - self.iepoch += 1 def batch_step(self, data, validation=False): # no need to have gradients from old steps taking up memory @@ -727,6 +718,7 @@ def early_stop_cond(self): if self.early_stop_lower_threshold is not None: if self.best_val_metrics < self.early_stop_lower_threshold: + self.stop_arg = "reach lower_thrdshold" return True return False @@ -766,9 +758,6 @@ def epoch_step(self): for callback in self.end_of_train_callbacks: callback(self) - self.end_of_epoch_log() - self.end_of_epoch_save() - if self.lr_scheduler_name == "ReduceLROnPlateau": self.lr_sched.step( metrics=self.mae_dict[f"{VALIDATION}_{self.metrics_key}"] @@ -793,8 +782,8 @@ def end_of_batch_log(self, batch_type: str): store all the loss/mae of each batch """ - mat_str = f"{self.iepoch+1:5d}, {self.ibatch+1:5d}" - log_str = f"{self.iepoch+1:5d} {self.ibatch+1:5d}" + mat_str = f"{self.iepoch:5d}, {self.ibatch+1:5d}" + log_str = f"{self.iepoch:5d} {self.ibatch+1:5d}" header = "epoch, batch" log_header = "# Epoch batch" @@ -864,7 +853,7 @@ def end_of_epoch_save(self): torch.save(self.model, save_path) self.logger.info( - f"! Best model {self.best_epoch+1:8d} {self.best_val_metrics:8.3f}" + f"! 
Best model {self.best_epoch:8d} {self.best_val_metrics:8.3f}" ) if (self.iepoch + 1) % self.log_epoch_freq == 0: @@ -905,11 +894,11 @@ def end_of_epoch_log(self): log_str = {} strings = ["Epoch", "wal", "LR"] - mat_str = f"{self.iepoch+1:10d}, {wall:8.3f}, {lr:8.3g}" + mat_str = f"{self.iepoch:10d}, {wall:8.3f}, {lr:8.3g}" for cat in categories: log_header[cat] = "# " log_header[cat] += " ".join([f"{s:>8s}" for s in strings]) - log_str[cat] = f"{self.iepoch+1:10d} {wall:8.3f} {lr:8.3g}" + log_str[cat] = f"{self.iepoch:10d} {wall:8.3f} {lr:8.3g}" for category in categories: From 9881e945f17514822751c48a7324970ffae45be4 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 12:12:45 -0400 Subject: [PATCH 06/26] remove auto-increase of max_epochs during restart --- nequip/scripts/restart.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/nequip/scripts/restart.py b/nequip/scripts/restart.py index ac023895..288b424b 100644 --- a/nequip/scripts/restart.py +++ b/nequip/scripts/restart.py @@ -63,8 +63,13 @@ def restart(file_name, config, mode="update"): if "progress" in dictionary: stop_args = dictionary["progress"].pop("stop_arg", None) if stop_args is not None: - dictionary["progress"]["stop_arg"] = None - dictionary["max_epochs"] *= 2 + if stop_args == "max epochs": + raise RuntimeError( + f"Previous run reach max epochs. Please use the update config to increase max epochs" + ) + else: + logging.warning(f"Restart by ignoring previous stop {stop_args}") + dictionary["progress"]["stop_arg"] = None if config.wandb: from nequip.train.trainer_wandb import TrainerWandB From 0763a5a20fd0f6544336e59ad0d6a1914f0ad193 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 12:19:45 -0400 Subject: [PATCH 07/26] rename --- nequip/train/trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 05873eea..4c6cbbaf 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -221,7 +221,7 @@ def __init__( loss_coeffs: Union[dict, str] = AtomicDataDict.TOTAL_ENERGY_KEY, metrics_components: Optional[Union[dict, str]] = None, metrics_key: str = ABBREV.get(LOSS_KEY, LOSS_KEY), - early_stop_lower_threshold: Optional[float] = None, + early_stop_threshold: Optional[float] = None, max_epochs: int = 1000000, lr_sched=None, learning_rate: float = 1e-2, @@ -716,9 +716,9 @@ def batch_step(self, data, validation=False): def early_stop_cond(self): """ kill the training early """ - if self.early_stop_lower_threshold is not None: - if self.best_val_metrics < self.early_stop_lower_threshold: - self.stop_arg = "reach lower_thrdshold" + if self.early_stop_threshold is not None: + if self.best_val_metrics < self.early_stop_threshold: + self.stop_arg = "reach early stop thrdshold" return True return False From a5a305d7ba55336f7e4d55450603a2b0e1e23b20 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 12:23:01 -0400 Subject: [PATCH 08/26] fix batch epoch log problem --- nequip/train/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 05873eea..ed52374a 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -782,8 +782,8 @@ def end_of_batch_log(self, batch_type: str): store all the loss/mae of each batch """ - mat_str = f"{self.iepoch:5d}, {self.ibatch+1:5d}" - log_str = f"{self.iepoch:5d} {self.ibatch+1:5d}" + mat_str = f"{self.iepoch+1:5d}, {self.ibatch+1:5d}" + log_str = 
f"{self.iepoch+1:5d} {self.ibatch+1:5d}" header = "epoch, batch" log_header = "# Epoch batch" From 38457f606b8c436ba6e3391c40efb90f94333491 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 13:00:09 -0400 Subject: [PATCH 09/26] update sanity check for stop args --- nequip/scripts/restart.py | 12 ------------ nequip/train/trainer.py | 33 +++++++++++++++++++-------------- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/nequip/scripts/restart.py b/nequip/scripts/restart.py index 288b424b..26f14e7c 100644 --- a/nequip/scripts/restart.py +++ b/nequip/scripts/restart.py @@ -59,18 +59,6 @@ def restart(file_name, config, mode="update"): {"float32": torch.float32, "float64": torch.float64}[config.default_dtype] ) - # increase max_epochs if training has hit maximum epochs - if "progress" in dictionary: - stop_args = dictionary["progress"].pop("stop_arg", None) - if stop_args is not None: - if stop_args == "max epochs": - raise RuntimeError( - f"Previous run reach max epochs. Please use the update config to increase max epochs" - ) - else: - logging.warning(f"Restart by ignoring previous stop {stop_args}") - dictionary["progress"]["stop_arg"] = None - if config.wandb: from nequip.train.trainer_wandb import TrainerWandB diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index ed52374a..882c4867 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -468,12 +468,6 @@ def from_dict(cls, dictionary, append: Optional[bool] = None): model = d.pop("model") elif "progress" in d: progress = d["progress"] - stop_arg = progress.pop("stop_arg", None) - if stop_arg is not None: - raise RuntimeError( - f"The previous run has properly stopped with {stop_arg}." - "Please either increase the max_epoch or change early stop criteria" - ) # load the model from file iepoch = progress["iepoch"] @@ -515,11 +509,20 @@ def from_dict(cls, dictionary, append: Optional[bool] = None): if "progress" in d: trainer.best_val_metrics = progress["best_val_metrics"] trainer.best_epoch = progress["best_epoch"] + stop_arg = progress.pop("stop_arg", None) else: trainer.best_val_metrics = float("inf") trainer.best_epoch = 0 + stop_arg = None trainer.iepoch = iepoch + # final sanity check + if trainer.stop_cond: + raise RuntimeError( + f"The previous run has properly stopped with {stop_arg}." 
+ "Please either increase the max_epoch or change early stop criteria" + ) + return trainer def init(self): @@ -625,18 +628,15 @@ def train(self): self.init_metrics() - early_stop = False - while self.iepoch < self.max_epochs and not early_stop: + stop = False + while not stop: - early_stop = self.epoch_step() + stop = self.epoch_step() self.iepoch += 1 self.end_of_epoch_log() self.end_of_epoch_save() - if not early_stop: - self.stop_arg = "max epochs" - for callback in self.final_callbacks: callback(self) @@ -713,13 +713,18 @@ def batch_step(self, data, validation=False): self.batch_metrics = self.metrics(pred=out, ref=data) @property - def early_stop_cond(self): + def stop_cond(self): """ kill the training early """ if self.early_stop_lower_threshold is not None: if self.best_val_metrics < self.early_stop_lower_threshold: self.stop_arg = "reach lower_thrdshold" return True + + if self.iepoch >= self.max_epochs: + self.stop_arg = "max epochs" + return True + return False def reset_metrics(self): @@ -766,7 +771,7 @@ def epoch_step(self): for callback in self.end_of_epoch_callbacks: callback(self) - return self.early_stop_cond + return self.stop_cond def log_dictionary(self, dictionary: dict, name: str = ""): """ From 0dcfa37c57f55079f0c443bb0fd13158fb7f137d Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 14:04:37 -0400 Subject: [PATCH 10/26] fix max_epochs --- nequip/train/trainer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 882c4867..2f1d56bb 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -628,8 +628,7 @@ def train(self): self.init_metrics() - stop = False - while not stop: + while not self.stop_cond: stop = self.epoch_step() self.iepoch += 1 @@ -771,7 +770,6 @@ def epoch_step(self): for callback in self.end_of_epoch_callbacks: callback(self) - return self.stop_cond def log_dictionary(self, dictionary: dict, name: str = ""): """ From 9b91364905e7d88a49d93523143e4b002bf19c3a Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 14:29:06 -0400 Subject: [PATCH 11/26] add early stop class --- nequip/train/early_stopping.py | 107 +++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 nequip/train/early_stopping.py diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py new file mode 100644 index 00000000..e0c460c3 --- /dev/null +++ b/nequip/train/early_stopping.py @@ -0,0 +1,107 @@ +from collections import OrderedDict +from copy import deepcopy +from typing import Callable, Mapping, Optional, cast + + +class EarlyStopping: + """ + Early stop conditions + + There are three early stopping conditions: + + 1. a value lower than a defined lower bound + 2. a value higher than a defined upper bound + 3. a value hasn't decreased for x epochs within min_delta range + + Args: + + lower_bounds (dict): define the key and lower bound for condition 1 + upper_bounds (dict): define the key and lower bound for condition 2 + patiences (dict): defined the x epochs for condition 3 + min_delta (dict): defined the delta range for condition 3. defaults are 0.0 + cumulative_delta (bool): if True, the minimum value recorded for condition 3 + will not be updated when the newer value only decreases + for a tiny value (< min_delta). 
default False + """ + + def __init__( + self, + lower_bounds: dict = {}, + upper_bounds: dict = {}, + patiences: dict = {}, + min_delta: dict = {}, + cumulative_delta: bool = False, + ): + + self.patiences = deepcopy(patiences) + self.lower_bounds = deepcopy(lower_bounds) + self.upper_bounds = deepcopy(upper_bounds) + self.cumulative_delta = cumulative_delta + + # self.keys = set(list(self.lower_bounds.keys())) + set(list(self.upper_bounds.keys()))+set(list(self.patiences.keys())) + + self.min_delta = {} + self.counter = {} + self.minimums = {} + for key, pat in self.patiences.items(): + self.patiences[key] = int(pat) + self.counter[key] = 0 + self.minimums[key] = None + self.min_delta[key] = min_delta.get(key, 0.0) + + if pat < 1: + raise ValueError(f"Argument patience for {key} should be positive integer.") + if self.min_delta[key] < 0.0: + raise ValueError("Argument min_delta should not be a negative number.") + + for key in self.min_delta: + if key not in self.patiences: + raise ValueError(f"patience for {key} should be defined") + + def __call__(self, metrics) -> None: + + stop = False + stop_args = "Early stopping:" + debug_args = None + + # check whether key in metrics hasn't reduced for x epochs + for key, pat in self.patiences.items(): + + value = metrics[key] + minimums = self.minimums[key] + min_delta = self.min_delta[key] + + if minimums is None: + minimums = value + elif value >= (minimums - self.min_delta[key]): + if not self.cumulative_delta and value > minimums: + self.minimums[key] = value + self.counter[key] += 1 + debug_args = f"EarlyStopping: {self.counter[key]} / {pat}" + if self.counter[key] >= pat: + stop_args += " {key} has not reduced for {pat} epochs") + stop = True + else: + self.minimums[key] = value + self.counter[key] = 0 + + for key, bound in self.lower_bounds.items(): + if metrics[key] < bound: + stop_args += f" {key} is smaller than {bound}" + stop = True + + for key, bound in self.upper_bounds.items(): + if metrics[key] > bound: + stop_args += f" {key} is larger than {bound}" + stop = True + + return stop, stop_args, debug_args + + def state_dict(self) -> "OrderedDict[dict, dict]": + return OrderedDict( + [("counter", self.counter), ("minimums", self.minimums)] + ) + + def load_state_dict(self, state_dict: Mapping) -> None: + self.counter = state_dict["counter"] + self.minimums = state_dict["minimums"] From 6ae6eb5e3f308368eb3ebffd40548832f1bba4d9 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 14:43:09 -0400 Subject: [PATCH 12/26] insert early_stopping init --- nequip/train/trainer.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index a3bde7f2..608fcbb6 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -39,6 +39,7 @@ from .loss import Loss, LossStat from .metrics import Metrics from ._key import ABBREV, LOSS_KEY, TRAIN, VALIDATION +from .early_stopping import EarlyStopping class Trainer: @@ -221,7 +222,8 @@ def __init__( loss_coeffs: Union[dict, str] = AtomicDataDict.TOTAL_ENERGY_KEY, metrics_components: Optional[Union[dict, str]] = None, metrics_key: str = ABBREV.get(LOSS_KEY, LOSS_KEY), - early_stop_threshold: Optional[float] = None, + early_stopping: Optional[EarlyStopping] = None, + early_stopping_kwargs: Optional[dict] = None, max_epochs: int = 1000000, lr_sched=None, learning_rate: float = 1e-2, @@ -301,6 +303,7 @@ def __init__( self.kwargs = deepcopy(kwargs) self.optimizer_kwargs = deepcopy(optimizer_kwargs) 
self.lr_scheduler_kwargs = deepcopy(lr_scheduler_kwargs) + self.early_stopping_kwargs = deepcopy(early_stopping_kwargs) # initialize the optimizer and scheduler, the params will be updated in the function self.init() @@ -580,6 +583,19 @@ def init(self): self.loss_stat = LossStat(keys=list(self.loss.funcs.keys())) self._initialized = True + if self.early_stopping is None: + key_mapping, kwargs = instantiate( + EarlyStopping, + prefix="early_stopping", + optional_args=self.early_stopping_kwargs, + all_args=self.kwargs, + return_args_only=True + ) + for key, item in kwargs.items(): + if key not in ['cumulative_delta']: + kwargs["{VALIDATION}_{key}"] + self.early_stopping_kwargs[] + def init_metrics(self): if self.metrics_components is None: self.metrics_components = [] From b68f59cade87a0de98af9efdfd16cb0b0cd85192 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 14:51:27 -0400 Subject: [PATCH 13/26] move log back to epoch_step --- configs/minimal.yaml | 2 +- nequip/train/trainer.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/configs/minimal.yaml b/configs/minimal.yaml index f3a792c3..7967060e 100644 --- a/configs/minimal.yaml +++ b/configs/minimal.yaml @@ -21,7 +21,7 @@ dataset: aspirin dataset_file_name: benchmark_data/aspirin_ccsd-train.npz # logging -wandb: false +wandb: true wandb_project: aspirin # verbose: debug diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 2f1d56bb..ec4bb3a0 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -630,10 +630,7 @@ def train(self): while not self.stop_cond: - stop = self.epoch_step() - self.iepoch += 1 - - self.end_of_epoch_log() + self.epoch_step() self.end_of_epoch_save() for callback in self.final_callbacks: @@ -733,6 +730,7 @@ def reset_metrics(self): self.metrics.to(self.device) def epoch_step(self): + datasets = [self.dl_train, self.dl_val] categories = [TRAIN, VALIDATION] self.metrics_dict = {} @@ -762,6 +760,10 @@ def epoch_step(self): for callback in self.end_of_train_callbacks: callback(self) + self.iepoch += 1 + + self.end_of_epoch_log() + if self.lr_scheduler_name == "ReduceLROnPlateau": self.lr_sched.step( metrics=self.mae_dict[f"{VALIDATION}_{self.metrics_key}"] @@ -770,7 +772,6 @@ def epoch_step(self): for callback in self.end_of_epoch_callbacks: callback(self) - def log_dictionary(self, dictionary: dict, name: str = ""): """ dump the keys and values of a dictionary From 083856e73b3d6f948d845c8986d1137e90ae8f5c Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:14:52 -0400 Subject: [PATCH 14/26] add early stopping in training loops --- nequip/train/early_stopping.py | 10 +++---- nequip/train/trainer.py | 49 +++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py index e0c460c3..a1913107 100644 --- a/nequip/train/early_stopping.py +++ b/nequip/train/early_stopping.py @@ -50,7 +50,9 @@ def __init__( self.min_delta[key] = min_delta.get(key, 0.0) if pat < 1: - raise ValueError(f"Argument patience for {key} should be positive integer.") + raise ValueError( + f"Argument patience for {key} should be positive integer." 
+ ) if self.min_delta[key] < 0.0: raise ValueError("Argument min_delta should not be a negative number.") @@ -79,7 +81,7 @@ def __call__(self, metrics) -> None: self.counter[key] += 1 debug_args = f"EarlyStopping: {self.counter[key]} / {pat}" if self.counter[key] >= pat: - stop_args += " {key} has not reduced for {pat} epochs") + stop_args += " {key} has not reduced for {pat} epochs" stop = True else: self.minimums[key] = value @@ -98,9 +100,7 @@ def __call__(self, metrics) -> None: return stop, stop_args, debug_args def state_dict(self) -> "OrderedDict[dict, dict]": - return OrderedDict( - [("counter", self.counter), ("minimums", self.minimums)] - ) + return OrderedDict([("counter", self.counter), ("minimums", self.minimums)]) def load_state_dict(self, state_dict: Mapping) -> None: self.counter = state_dict["counter"] diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 0b6a778d..fe0335d1 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -367,6 +367,10 @@ def as_dict(self, state_dict: bool = False, training_progress: bool = False): ) if self.use_ema: dictionary["state_dict"]["ema_state"] = self.ema.state_dict() + if self.early_stopping is not None: + dictionary["state_dict"][ + "early_stopping" + ] = self.early_stopping.state_dict() if hasattr(self.model, "save") and not issubclass( type(self.model), torch.jit.ScriptModule @@ -502,6 +506,9 @@ def from_dict(cls, dictionary, append: Optional[bool] = None): if trainer.lr_sched is not None: trainer.lr_sched.load_state_dict(state_dict["lr_sched"]) + if trainer.early_stopping is not None: + trainer.early_stopping.load_state_dict(state_dict["early_stopping"]) + torch.set_rng_state(state_dict["rng_state"]) if torch.cuda.is_available(): torch.cuda.set_rng_state(state_dict["cuda_rng_state"]) @@ -585,16 +592,27 @@ def init(self): if self.early_stopping is None: key_mapping, kwargs = instantiate( - EarlyStopping, + EarlyStopping, prefix="early_stopping", optional_args=self.early_stopping_kwargs, all_args=self.kwargs, - return_args_only=True + return_args_only=True, ) for key, item in kwargs.items(): - if key not in ['cumulative_delta']: - kwargs["{VALIDATION}_{key}"] - self.early_stopping_kwargs[] + # prepand VALIDATION string if k is not with + if isinstance(item, dict): + new_dict = {} + for k, v in item.items(): + if ( + k.startswith(VALIDATION) + or k.startswith(TRAIN) + or k in ["LR", "wall"] + ): + new_dict[k] = item[k] + else: + new_dict[f"{VALIDATION}_{k}"] = item[k] + kwargs[key] = new_dict + self.early_stopping = EarlyStopping(**kwargs) def init_metrics(self): if self.metrics_components is None: @@ -613,6 +631,12 @@ def init_metrics(self): all_args=self.kwargs, ) + if not ( + self.metrics_key.startswith(VALIDATION) + or self.metrics_key.startswith(TRAIN) + ): + self.metrics_key = f"{VALIDATION}_{self.metrics_key}" + def init_model(self): logger = self.logger logger.info( @@ -728,9 +752,12 @@ def batch_step(self, data, validation=False): def stop_cond(self): """ kill the training early """ - if self.early_stop_threshold is not None: - if self.best_val_metrics < self.early_stop_threshold: - self.stop_arg = "reach early stop thrdshold" + if self.early_stopping is not None and hasattr(self, mae_dict): + early_stop, early_stop_args, debug_args = self.early_stopping(mae_dict) + if debug_args is not None: + self.logger.debug(debug_args) + if early_stop: + self.stop_args = early_stop_args return True if self.iepoch >= self.max_epochs: @@ -781,9 +808,7 @@ def epoch_step(self): self.end_of_epoch_log() if 
self.lr_scheduler_name == "ReduceLROnPlateau": - self.lr_sched.step( - metrics=self.mae_dict[f"{VALIDATION}_{self.metrics_key}"] - ) + self.lr_sched.step(metrics=self.mae_dict[self.metrics_key]) for callback in self.end_of_epoch_callbacks: callback(self) @@ -851,7 +876,7 @@ def end_of_epoch_save(self): save model and trainer details """ - val_metrics = self.mae_dict[f"{VALIDATION}_{self.metrics_key}"] + val_metrics = self.mae_dict[self.metrics_key] if val_metrics < self.best_val_metrics: self.best_val_metrics = val_metrics self.best_epoch = self.iepoch From d49a7edfc764e05efa77064cba28ff31b246fe26 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:16:58 -0400 Subject: [PATCH 15/26] add to test --- nequip/train/trainer.py | 4 ++-- tests/trainer/test_trainer.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index fe0335d1..fd85c2bd 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -752,8 +752,8 @@ def batch_step(self, data, validation=False): def stop_cond(self): """ kill the training early """ - if self.early_stopping is not None and hasattr(self, mae_dict): - early_stop, early_stop_args, debug_args = self.early_stopping(mae_dict) + if self.early_stopping is not None and hasattr(self, "mae_dict"): + early_stop, early_stop_args, debug_args = self.early_stopping(self.mae_dict) if debug_args is not None: self.logger.debug(debug_args) if early_stop: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index c0acfc1d..9f09f607 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -10,7 +10,7 @@ import torch from torch.nn import Linear -from nequip.data import NpzDataset, AtomicDataDict, AtomicData +from nequip.data import AtomicDataDict from nequip.train.trainer import Trainer from nequip.utils.savenload import load_file from nequip.nn import GraphModuleMixin @@ -32,6 +32,7 @@ T_0=50, T_mult=2, loss_coeffs={"forces": 2}, + early_stopping_patience={"LR": 1e-10}, ) configs_to_test = [dict(), minimal_config] loop_config = pytest.mark.parametrize("trainer", configs_to_test, indirect=True) From afb0fc4fa4a79fa52cc4d00cb2ee7fea36941cd3 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:18:29 -0400 Subject: [PATCH 16/26] fix test profile --- nequip/train/trainer.py | 4 +++- tests/trainer/test_trainer.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index fd85c2bd..eda85f4e 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -598,6 +598,7 @@ def init(self): all_args=self.kwargs, return_args_only=True, ) + n_args = 0 for key, item in kwargs.items(): # prepand VALIDATION string if k is not with if isinstance(item, dict): @@ -612,7 +613,8 @@ def init(self): else: new_dict[f"{VALIDATION}_{k}"] = item[k] kwargs[key] = new_dict - self.early_stopping = EarlyStopping(**kwargs) + n_args += len(new_dict) + self.early_stopping = EarlyStopping(**kwargs) if n_args > 0 else None def init_metrics(self): if self.metrics_components is None: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9f09f607..992a97ac 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -32,7 +32,8 @@ T_0=50, T_mult=2, loss_coeffs={"forces": 2}, - early_stopping_patience={"LR": 1e-10}, + early_stopping_patiences={"loss": 50}, + early_stopping_lower_bounds={"LR": 1e-10}, ) configs_to_test = [dict(), minimal_config] loop_config = 
pytest.mark.parametrize("trainer", configs_to_test, indirect=True) From e4833bec14e801ea994163f3b781262e7b52243f Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:38:03 -0400 Subject: [PATCH 17/26] fix update bug --- nequip/train/early_stopping.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py index a1913107..db84f2d3 100644 --- a/nequip/train/early_stopping.py +++ b/nequip/train/early_stopping.py @@ -1,6 +1,6 @@ from collections import OrderedDict from copy import deepcopy -from typing import Callable, Mapping, Optional, cast +from typing import Mapping, Optional, cast class EarlyStopping: @@ -41,11 +41,11 @@ def __init__( # self.keys = set(list(self.lower_bounds.keys())) + set(list(self.upper_bounds.keys()))+set(list(self.patiences.keys())) self.min_delta = {} - self.counter = {} + self.counters = {} self.minimums = {} for key, pat in self.patiences.items(): self.patiences[key] = int(pat) - self.counter[key] = 0 + self.counters[key] = 0 self.minimums[key] = None self.min_delta[key] = min_delta.get(key, 0.0) @@ -70,22 +70,22 @@ def __call__(self, metrics) -> None: for key, pat in self.patiences.items(): value = metrics[key] - minimums = self.minimums[key] + minimum = self.minimums[key] min_delta = self.min_delta[key] - if minimums is None: - minimums = value - elif value >= (minimums - self.min_delta[key]): - if not self.cumulative_delta and value > minimums: + if minimum is None: + self.minimums[key] = value + elif value >= (minimum - min_delta): + if not self.cumulative_delta and value > minimum: self.minimums[key] = value - self.counter[key] += 1 - debug_args = f"EarlyStopping: {self.counter[key]} / {pat}" - if self.counter[key] >= pat: + self.counters[key] += 1 + debug_args = f"EarlyStopping: {self.counters[key]} / {pat}" + if self.counters[key] >= pat: stop_args += " {key} has not reduced for {pat} epochs" stop = True else: self.minimums[key] = value - self.counter[key] = 0 + self.counters[key] = 0 for key, bound in self.lower_bounds.items(): if metrics[key] < bound: @@ -100,8 +100,8 @@ def __call__(self, metrics) -> None: return stop, stop_args, debug_args def state_dict(self) -> "OrderedDict[dict, dict]": - return OrderedDict([("counter", self.counter), ("minimums", self.minimums)]) + return OrderedDict([("counters", self.counters), ("minimums", self.minimums)]) def load_state_dict(self, state_dict: Mapping) -> None: - self.counter = state_dict["counter"] + self.counters = state_dict["counters"] self.minimums = state_dict["minimums"] From 6dc5229ac29b13f3ff7866b87ae6211492a70765 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 15:52:04 -0400 Subject: [PATCH 18/26] update full.yaml --- configs/full.yaml | 15 +++++++++++++++ nequip/train/early_stopping.py | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/configs/full.yaml b/configs/full.yaml index eecec050..f273e7a0 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -87,6 +87,21 @@ use_ema: false ema_decay: 0.999 # ema weight, commonly set to 0.999 ema_use_num_updates: true # whether to use number of updates when computing averages +# early stopping based on metrics values. +# LR, wall and any keys printed in the log file can be used. +# The key can start with Training or Validation. If not defined, the validation value will be used. 
+early_stopping_patiences: # stop early if a metric value stopped decreasing for n epochs + Validation_loss: 50 # + Training_loss: 100 # + mae: 100 # +early_stopping_delta: # If delta is defined, a tiny decrease smaller than delta will not be considered as a decrease + Training_loss: 0.005 # +early_stopping_cumulative_delta: false # If True, the minimum value recorded will not be updated when the decrease is smaller than delta +early_stopping_lower_bounds: # stop early if a metric value is lower than the bound + LR: 1e-10 # +early_stopping_upper_bounds: # stop early if a metric value is higher than the bound + wall: 1e100 # + # loss function loss_coeffs: # different weights to use in a weighted loss functions forces: 100 # for MD applications, we recommed a force weight of 100 and an energy weight of 1 diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py index db84f2d3..65e97c10 100644 --- a/nequip/train/early_stopping.py +++ b/nequip/train/early_stopping.py @@ -11,17 +11,17 @@ class EarlyStopping: 1. a value lower than a defined lower bound 2. a value higher than a defined upper bound - 3. a value hasn't decreased for x epochs within min_delta range + 3. a value hasn't decreased for x epochs within delta range Args: lower_bounds (dict): define the key and lower bound for condition 1 upper_bounds (dict): define the key and lower bound for condition 2 patiences (dict): defined the x epochs for condition 3 - min_delta (dict): defined the delta range for condition 3. defaults are 0.0 + delta (dict): defined the delta range for condition 3. defaults are 0.0 cumulative_delta (bool): if True, the minimum value recorded for condition 3 will not be updated when the newer value only decreases - for a tiny value (< min_delta). default False + for a tiny value (< delta). default False """ def __init__( @@ -29,7 +29,7 @@ def __init__( lower_bounds: dict = {}, upper_bounds: dict = {}, patiences: dict = {}, - min_delta: dict = {}, + delta: dict = {}, cumulative_delta: bool = False, ): @@ -40,23 +40,23 @@ def __init__( # self.keys = set(list(self.lower_bounds.keys())) + set(list(self.upper_bounds.keys()))+set(list(self.patiences.keys())) - self.min_delta = {} + self.delta = {} self.counters = {} self.minimums = {} for key, pat in self.patiences.items(): self.patiences[key] = int(pat) self.counters[key] = 0 self.minimums[key] = None - self.min_delta[key] = min_delta.get(key, 0.0) + self.delta[key] = delta.get(key, 0.0) if pat < 1: raise ValueError( f"Argument patience for {key} should be positive integer." 
) - if self.min_delta[key] < 0.0: - raise ValueError("Argument min_delta should not be a negative number.") + if self.delta[key] < 0.0: + raise ValueError("Argument delta should not be a negative number.") - for key in self.min_delta: + for key in self.delta: if key not in self.patiences: raise ValueError(f"patience for {key} should be defined") @@ -71,11 +71,11 @@ def __call__(self, metrics) -> None: value = metrics[key] minimum = self.minimums[key] - min_delta = self.min_delta[key] + delta = self.delta[key] if minimum is None: self.minimums[key] = value - elif value >= (minimum - min_delta): + elif value >= (minimum - delta): if not self.cumulative_delta and value > minimum: self.minimums[key] = value self.counters[key] += 1 From 5037276741ef20f06ddedefd1f7575b255a7ca46 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 16:00:15 -0400 Subject: [PATCH 19/26] update change log --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8386af6d..e477d307 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ Most recent change on the bottom. ## [Unreleased] ### Fixed - `iepoch` is no longer off-by-one when restarting a training run that hit `max_epochs` +### Added +- `early_stopping_xxx` arguments added to enable early stop for platued values or values that out of lower/upper bounds. ## [0.3.0] - 2021-05-07 ### Added From f73afe9c0b35d5b58e691610fc683514c15d5438 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Thu, 13 May 2021 16:08:10 -0400 Subject: [PATCH 20/26] remove comments --- nequip/train/early_stopping.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nequip/train/early_stopping.py b/nequip/train/early_stopping.py index 65e97c10..c1aee9e5 100644 --- a/nequip/train/early_stopping.py +++ b/nequip/train/early_stopping.py @@ -38,8 +38,6 @@ def __init__( self.upper_bounds = deepcopy(upper_bounds) self.cumulative_delta = cumulative_delta - # self.keys = set(list(self.lower_bounds.keys())) + set(list(self.upper_bounds.keys()))+set(list(self.patiences.keys())) - self.delta = {} self.counters = {} self.minimums = {} From 9fdabaf2bfe8911149b5a9523e4f9d1419d9f32b Mon Sep 17 00:00:00 2001 From: Lixin Sun Date: Thu, 13 May 2021 16:12:24 -0400 Subject: [PATCH 21/26] Update minimal.yaml --- configs/minimal.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/minimal.yaml b/configs/minimal.yaml index 7967060e..e8cb2917 100644 --- a/configs/minimal.yaml +++ b/configs/minimal.yaml @@ -21,8 +21,7 @@ dataset: aspirin dataset_file_name: benchmark_data/aspirin_ccsd-train.npz # logging -wandb: true -wandb_project: aspirin +wandb: false # verbose: debug # training From d4b7f77f31903f440f38a7ace026a5b6f76fc7a8 Mon Sep 17 00:00:00 2001 From: nw13slx Date: Fri, 14 May 2021 15:26:02 -0400 Subject: [PATCH 22/26] support class name as prefix --- nequip/utils/auto_init.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nequip/utils/auto_init.py b/nequip/utils/auto_init.py index 497c4ced..2ac039ac 100644 --- a/nequip/utils/auto_init.py +++ b/nequip/utils/auto_init.py @@ -141,10 +141,11 @@ def instantiate( return_args_only (bool): if True, do not instantiate, only return the arguments """ + prefix_list = [builder.__name__] if inspect.isclass(builder)) else [] if isinstance(prefix, str): - prefix_list = [prefix] + prefix_list += [prefix] else: - prefix_list = prefix + prefix_list += prefix # detect the input parameters needed from params config = Config.from_class(builder, remove_kwargs=remove_kwargs) 
@@ -226,8 +227,8 @@ def instantiate( sub_prefix_list = [sub_builder.__name__, key] for prefix in prefix_list: sub_prefix_list = sub_prefix_list + [ - prefix + "_" + key, prefix, + prefix + "_" + key, ] nested_km, nested_kwargs = instantiate( From 438320d4b0c76375c911eb221716f3a4079bcb5e Mon Sep 17 00:00:00 2001 From: nw13slx Date: Fri, 14 May 2021 15:27:47 -0400 Subject: [PATCH 23/26] fix ) --- nequip/utils/auto_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nequip/utils/auto_init.py b/nequip/utils/auto_init.py index 2ac039ac..a913dd1e 100644 --- a/nequip/utils/auto_init.py +++ b/nequip/utils/auto_init.py @@ -141,7 +141,7 @@ def instantiate( return_args_only (bool): if True, do not instantiate, only return the arguments """ - prefix_list = [builder.__name__] if inspect.isclass(builder)) else [] + prefix_list = [builder.__name__] if inspect.isclass(builder) else [] if isinstance(prefix, str): prefix_list += [prefix] else: From 151ce42ed4430da3473e5844ae3fe83ab25cd863 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Fri, 14 May 2021 14:02:31 -0600 Subject: [PATCH 24/26] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e477d307..be85e793 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Most recent change on the bottom. ## [Unreleased] ### Fixed - `iepoch` is no longer off-by-one when restarting a training run that hit `max_epochs` +- Builders, and not just sub-builders, use the class name as a default prefix ### Added - `early_stopping_xxx` arguments added to enable early stop for platued values or values that out of lower/upper bounds. From f9af82381960faba78b0cf80ee726105c3140daf Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Fri, 14 May 2021 15:15:09 -0600 Subject: [PATCH 25/26] fix number formats --- configs/full.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/full.yaml b/configs/full.yaml index f273e7a0..e80340c0 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -93,14 +93,14 @@ ema_use_num_updates: true early_stopping_patiences: # stop early if a metric value stopped decreasing for n epochs Validation_loss: 50 # Training_loss: 100 # - mae: 100 # + e_mae: 100 # early_stopping_delta: # If delta is defined, a tiny decrease smaller than delta will not be considered as a decrease Training_loss: 0.005 # early_stopping_cumulative_delta: false # If True, the minimum value recorded will not be updated when the decrease is smaller than delta early_stopping_lower_bounds: # stop early if a metric value is lower than the bound - LR: 1e-10 # + LR: 1.0e-10 # early_stopping_upper_bounds: # stop early if a metric value is higher than the bound - wall: 1e100 # + wall: 1.0e+100 # # loss function loss_coeffs: # different weights to use in a weighted loss functions From 9e9a3fae898af22460b838b22c9af629c4383834 Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Fri, 14 May 2021 15:16:28 -0600 Subject: [PATCH 26/26] Bump version --- CHANGELOG.md | 2 ++ nequip/_version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be85e793..48bbf0b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Most recent change on the bottom. 
## [Unreleased] + +## [0.3.1] ### Fixed - `iepoch` is no longer off-by-one when restarting a training run that hit `max_epochs` - Builders, and not just sub-builders, use the class name as a default prefix diff --git a/nequip/_version.py b/nequip/_version.py index 355845c6..ea12af4d 100644 --- a/nequip/_version.py +++ b/nequip/_version.py @@ -2,4 +2,4 @@ # See Python packaging guide # https://packaging.python.org/guides/single-sourcing-package-version/ -__version__ = "0.3.0" +__version__ = "0.3.1"
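Taken together, patches 11 through 20 leave `nequip/train/early_stopping.py` with an `EarlyStopping` class that can also be exercised outside the `Trainer`. The sketch below is one plausible standalone use, assuming only what this series defines (module path, constructor keywords, and the `(stop, stop_args, debug_args)` return of `__call__`); the metric values are invented for illustration. Note that inside the `Trainer` (patch 14), any key other than `LR`/`wall` that does not already start with a `Training_`/`Validation_` prefix is given the `Validation_` prefix before being looked up.

    from nequip.train.early_stopping import EarlyStopping

    # Mirror the three condition types documented in configs/full.yaml (patch 18):
    # patience on the validation loss, a lower bound on LR, an upper bound on wall time.
    es = EarlyStopping(
        patiences={"Validation_loss": 50},
        delta={"Validation_loss": 0.005},
        lower_bounds={"LR": 1.0e-10},
        upper_bounds={"wall": 1.0e100},
    )

    # Per-epoch metrics dictionary; the Trainer passes its mae_dict here.
    # These numbers are made up for the example.
    metrics = {"Validation_loss": 0.12, "LR": 3.0e-4, "wall": 1.8e4}

    stop, stop_args, debug_args = es(metrics)
    if debug_args is not None:
        print(debug_args)   # patience counter progress, e.g. "EarlyStopping: 1 / 50"
    if stop:
        print(stop_args)    # reason(s) the run should stop

In the config-driven path, the same conditions are supplied as the `early_stopping_*` keys shown in the full.yaml hunk above and are collected by `instantiate` into the `EarlyStopping` constructor arguments during `Trainer.init()`.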