From 41b00a32daa45c5ee768a1a56664cf2c3e5b6021 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Wed, 8 May 2024 14:58:22 +0800 Subject: [PATCH 01/19] feat: enable users to customize training loss func and val metric func; --- pypots/base.py | 54 ++++++++- pypots/classification/base.py | 62 +++++++--- pypots/classification/brits/core.py | 8 +- pypots/classification/brits/model.py | 22 +++- pypots/classification/grud/core.py | 6 +- pypots/classification/grud/model.py | 18 ++- pypots/classification/raindrop/core.py | 6 +- pypots/classification/raindrop/model.py | 18 ++- pypots/classification/template/model.py | 2 + pypots/clustering/base.py | 26 ++-- pypots/clustering/crli/core.py | 5 - pypots/clustering/crli/model.py | 16 ++- pypots/clustering/template/model.py | 2 + pypots/clustering/vader/core.py | 113 +++++++++--------- pypots/clustering/vader/model.py | 20 +++- pypots/forecasting/base.py | 26 ++-- pypots/forecasting/csdi/core.py | 12 +- pypots/forecasting/csdi/model.py | 18 +-- pypots/forecasting/template/model.py | 7 +- pypots/imputation/autoformer/core.py | 4 +- pypots/imputation/autoformer/model.py | 17 ++- pypots/imputation/base.py | 38 ++++-- pypots/imputation/brits/core.py | 4 +- pypots/imputation/brits/model.py | 15 ++- pypots/imputation/crossformer/core.py | 4 +- pypots/imputation/crossformer/model.py | 17 ++- pypots/imputation/csdi/core.py | 12 +- pypots/imputation/csdi/model.py | 18 +-- pypots/imputation/dlinear/core.py | 4 +- pypots/imputation/dlinear/model.py | 17 ++- pypots/imputation/etsformer/core.py | 4 +- pypots/imputation/etsformer/model.py | 17 ++- pypots/imputation/fedformer/core.py | 4 +- pypots/imputation/fedformer/model.py | 17 ++- pypots/imputation/film/core.py | 4 +- pypots/imputation/film/model.py | 16 ++- pypots/imputation/frets/core.py | 4 +- pypots/imputation/frets/model.py | 17 ++- pypots/imputation/gpvae/core.py | 4 +- pypots/imputation/gpvae/model.py | 18 +-- pypots/imputation/informer/core.py | 4 +- pypots/imputation/informer/model.py | 17 ++- pypots/imputation/itransformer/core.py | 4 +- pypots/imputation/itransformer/model.py | 15 ++- pypots/imputation/mrnn/core.py | 4 +- pypots/imputation/mrnn/model.py | 15 ++- .../nonstationary_transformer/core.py | 4 +- .../nonstationary_transformer/model.py | 16 ++- pypots/imputation/patchtst/core.py | 4 +- pypots/imputation/patchtst/model.py | 16 ++- pypots/imputation/pyraformer/core.py | 4 +- pypots/imputation/pyraformer/model.py | 17 ++- pypots/imputation/saits/core.py | 19 ++- pypots/imputation/saits/model.py | 35 ++++-- pypots/imputation/template/model.py | 7 +- pypots/imputation/timesnet/core.py | 4 +- pypots/imputation/timesnet/model.py | 17 ++- pypots/imputation/transformer/core.py | 4 +- pypots/imputation/transformer/model.py | 15 ++- pypots/imputation/usgan/core.py | 17 +-- pypots/imputation/usgan/model.py | 8 +- pypots/nn/modules/csdi/backbone.py | 12 +- pypots/nn/modules/usgan/backbone.py | 3 +- 63 files changed, 664 insertions(+), 293 deletions(-) diff --git a/pypots/base.py b/pypots/base.py index d10c7c6e..402fb0b8 100644 --- a/pypots/base.py +++ b/pypots/base.py @@ -9,7 +9,7 @@ from abc import ABC from abc import abstractmethod from datetime import datetime -from typing import Optional, Union, Iterable +from typing import Optional, Union, Iterable, Callable import torch from torch.utils.tensorboard import SummaryWriter @@ -219,7 +219,9 @@ def _save_log_into_tb_file(self, step: int, stage: str, loss_dict: dict) -> None # save all items containing "loss" or "error" in the name # WDU: may enable 
customization keywords in the future if ("loss" in item_name) or ("error" in item_name): - self.summary_writer.add_scalar(f"{stage}/{item_name}", loss.sum(), step) + if isinstance(loss, torch.Tensor): + loss = loss.sum() + self.summary_writer.add_scalar(f"{stage}/{item_name}", loss, step) def _auto_save_model_if_necessary( self, @@ -414,9 +416,17 @@ class BaseNNModel(BaseModel): Training epochs, i.e. the maximum rounds of the model to be trained with. patience : - Number of epochs the training procedure will keep if loss doesn't decrease. - Once exceeding the number, the training will stop. - Must be smaller than or equal to the value of ``epochs``. + The patience for the early-stopping mechanism. Given a positive integer, the training process will be + stopped when the model does not perform better after that number of epochs. + Leaving it default as None will disable the early-stopping. + + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. num_workers : The number of subprocesses to use for data loading. @@ -471,6 +481,8 @@ def __init__( batch_size: int, epochs: int, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, saving_path: str = None, @@ -482,6 +494,7 @@ def __init__( model_saving_strategy, ) + # check patience if patience is None: patience = -1 # early stopping on patience won't work if it is set as < 0 else: @@ -489,10 +502,39 @@ def __init__( patience <= epochs ), f"patience must be smaller than epochs which is {epochs}, but got patience={patience}" - # training hype-parameters + # check train_loss_func and val_metric_func + train_loss_func_name, val_metric_func_name = "default", "loss (default)" + if train_loss_func is not None: + assert ( + len(train_loss_func) == 1 + ), f"train_loss_func should have only 1 item, but got {len(train_loss_func)}" + train_loss_func_name, train_loss_func = train_loss_func.popitem() + assert isinstance( + train_loss_func, Callable + ), "train_loss_func should be a callable function" + logger.info( + f"Using customized {train_loss_func_name} as the training loss function." + ) + if val_metric_func is not None: + assert ( + len(val_metric_func) == 1 + ), f"val_metric_func should have only 1 item, but got {len(val_metric_func)}" + val_metric_func_name, val_metric_func = val_metric_func.popitem() + assert isinstance( + val_metric_func, Callable + ), "val_metric_func should be a callable function" + logger.info( + f"Using customized {val_metric_func_name} as the validation metric function." 
+            )
+
+        # set up the hyper-parameters
         self.batch_size = batch_size
         self.epochs = epochs
         self.patience = patience
+        self.train_loss_func = train_loss_func
+        self.train_loss_func_name = train_loss_func_name
+        self.val_metric_func = val_metric_func
+        self.val_metric_func_name = val_metric_func_name
         self.original_patience = patience
         self.num_workers = num_workers

diff --git a/pypots/classification/base.py b/pypots/classification/base.py
index a758fed3..37732f79 100644
--- a/pypots/classification/base.py
+++ b/pypots/classification/base.py
@@ -16,6 +16,7 @@

 from ..base import BaseModel, BaseNNModel
 from ..utils.logging import logger
+from ..utils.metrics import calc_acc

 try:
     import nni
@@ -151,9 +152,17 @@ class BaseNNClassifier(BaseNNModel):
         Training epochs, i.e. the maximum rounds of the model to be trained with.

     patience :
-        Number of epochs the training procedure will keep if loss doesn't decrease.
-        Once exceeding the number, the training will stop.
-        Must be smaller than or equal to the value of ``epochs``.
+        The patience for the early-stopping mechanism. Given a positive integer, the training process will be
+        stopped when the model does not perform better after that number of epochs.
+        Leaving it default as None will disable the early-stopping.
+
+    train_loss_func:
+        The customized loss function designed by users for training the model.
+        If not given, will use cross-entropy as the default training loss.
+
+    val_metric_func:
+        The customized metric function designed by users for validating the model.
+        If not given, will use accuracy as the default validation metric.

     num_workers :
         The number of subprocesses to use for data loading.
@@ -196,6 +205,8 @@ def __init__(
         batch_size: int,
         epochs: int,
         patience: Optional[int] = None,
+        train_loss_func: Optional[dict] = None,
+        val_metric_func: Optional[dict] = None,
         num_workers: int = 0,
         device: Optional[Union[str, torch.device, list]] = None,
         saving_path: str = None,
@@ -205,6 +216,8 @@ def __init__(
             batch_size,
             epochs,
             patience,
+            train_loss_func,
+            val_metric_func,
             num_workers,
             device,
             saving_path,
@@ -212,6 +225,14 @@ def __init__(
         )
         self.n_classes = n_classes

+        # set default training loss function and validation metric function if not given
+        if train_loss_func is None:
+            self.train_loss_func = torch.nn.functional.cross_entropy
+            self.train_loss_func_name = "CrossEntropy"
+        if val_metric_func is None:
+            self.val_metric_func = calc_acc
+            self.val_metric_func_name = "Accuracy"
+
     @abstractmethod
     def _assemble_input_for_training(self, data: list) -> dict:
        """Assemble the given data into a dictionary for training input.
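To make the single-item dict format validated in BaseNNModel.__init__ above concrete, here is a minimal usage sketch (calc_mae is an existing helper in pypots.utils.metrics; the commented-out model call is hypothetical):

    from pypots.utils.metrics import calc_mae

    # one display name mapped to one callable, as the asserts above require
    val_metric_func = {"MAE": calc_mae}
    name, func = val_metric_func.popitem()  # -> ("MAE", <function calc_mae>)
    assert callable(func)

    # hypothetical usage with any BaseNNModel subclass:
    # model = SomeModel(..., train_loss_func={"MAE": calc_mae}, val_metric_func={"MAE": calc_mae})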
@@ -300,33 +321,48 @@ def _train_model(

             if val_loader is not None:
                 self.model.eval()
-                epoch_val_loss_collector = []
+                epoch_val_pred_collector = []
+                epoch_val_label_collector = []
                 with torch.no_grad():
                     for idx, data in enumerate(val_loader):
                         inputs = self._assemble_input_for_validating(data)
-                        results = self.model.forward(inputs)
-                        epoch_val_loss_collector.append(
-                            results["loss"].sum().item()
+                        results = self.model(inputs)
+                        epoch_val_pred_collector.append(
+                            results["classification_pred"]
                         )
+                        epoch_val_label_collector.append(inputs["y"])
+
+                    epoch_val_pred_collector = torch.cat(
+                        epoch_val_pred_collector, dim=0
+                    )
+                    epoch_val_label_collector = torch.cat(
+                        epoch_val_label_collector, dim=0
+                    )

-                mean_val_loss = np.mean(epoch_val_loss_collector)
+                    # TODO: refactor the following code to a function
+                    epoch_val_pred_collector = torch.argmax(
+                        epoch_val_pred_collector, dim=1
+                    ).cpu().numpy()
+                    epoch_val_label_collector = epoch_val_label_collector.cpu().numpy()
+                    mean_val_loss = self.val_metric_func(
+                        epoch_val_pred_collector, epoch_val_label_collector
+                    )

                 # save validation loss logs into the tensorboard file for every epoch if in need
                 if self.summary_writer is not None:
                     val_loss_dict = {
-                        "classification_loss": mean_val_loss,
+                        self.val_metric_func_name: mean_val_loss,
                     }
                     self._save_log_into_tb_file(epoch, "validating", val_loss_dict)

                 logger.info(
                     f"Epoch {epoch:03d} - "
-                    f"training loss: {mean_train_loss:.4f}, "
-                    f"validation loss: {mean_val_loss:.4f}"
+                    f"training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}, "
+                    f"validation {self.val_metric_func_name}: {mean_val_loss:.4f}"
                 )
                 mean_loss = mean_val_loss
             else:
                 logger.info(
-                    f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}"
+                    f"Epoch {epoch:03d} - training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}"
                 )
                 mean_loss = mean_train_loss
@@ -431,8 +467,6 @@ def classify(
     ) -> np.ndarray:
         """Classify the input data with the trained model.
-
-

         Parameters
         ----------
         test_set :
diff --git a/pypots/classification/brits/core.py b/pypots/classification/brits/core.py
index ebd1bae3..3b676bfb 100644
--- a/pypots/classification/brits/core.py
+++ b/pypots/classification/brits/core.py
@@ -36,7 +36,7 @@ def __init__(
         self.f_classifier = nn.Linear(self.rnn_hidden_size, n_classes)
         self.b_classifier = nn.Linear(self.rnn_hidden_size, n_classes)

-    def forward(self, inputs: dict, training: bool = True) -> dict:
+    def forward(self, inputs: dict) -> dict:
         (
             imputed_data,
             f_reconstruction,
@@ -59,11 +59,11 @@ def forward(self, inputs: dict) -> dict:
         }

         # if in training mode, return results with losses
-        if training:
+        if self.training:
             results["consistency_loss"] = consistency_loss
             results["reconstruction_loss"] = reconstruction_loss
-            f_classification_loss = F.nll_loss(torch.log(f_prediction), inputs["label"])
-            b_classification_loss = F.nll_loss(torch.log(b_prediction), inputs["label"])
+            f_classification_loss = F.nll_loss(torch.log(f_prediction), inputs["y"])
+            b_classification_loss = F.nll_loss(torch.log(b_prediction), inputs["y"])
             classification_loss = (f_classification_loss + b_classification_loss) / 2
             loss = (
                 consistency_loss
diff --git a/pypots/classification/brits/model.py b/pypots/classification/brits/model.py
index b52bf4d5..a818ed63 100644
--- a/pypots/classification/brits/model.py
+++ b/pypots/classification/brits/model.py
@@ -53,6 +53,14 @@ class BRITS(BaseNNClassifier):
         stopped when the model does not perform better after that number of epochs.
         Leaving it default as None will disable the early-stopping.
+    train_loss_func:
+        The customized loss function designed by users for training the model.
+        If not given, will use the default loss as claimed in the original paper.
+
+    val_metric_func:
+        The customized metric function designed by users for validating the model.
+        If not given, will use accuracy as the default validation metric.
+
     optimizer :
         The optimizer for model training.
         If not given, will use a default Adam optimizer.
@@ -94,6 +102,8 @@ def __init__(
         batch_size: int = 32,
         epochs: int = 100,
         patience: Optional[int] = None,
+        train_loss_func: Optional[dict] = None,
+        val_metric_func: Optional[dict] = None,
         optimizer: Optional[Optimizer] = Adam(),
         num_workers: int = 0,
         device: Optional[Union[str, torch.device, list]] = None,
@@ -105,6 +115,8 @@ def __init__(
             batch_size,
             epochs,
             patience,
+            train_loss_func,
+            val_metric_func,
             num_workers,
             device,
             saving_path,
@@ -117,6 +129,10 @@ def __init__(
         self.classification_weight = classification_weight
         self.reconstruction_weight = reconstruction_weight

+        # BRITS has its own defined training loss, so we set train_loss_func as None here
+        self.train_loss_func = None
+        self.train_loss_func_name = "default"
+
         # set up the model
         self.model = _BRITS(
             self.n_steps,
@@ -143,13 +159,13 @@ def _assemble_input_for_training(self, data: list) -> dict:
             back_X,
             back_missing_mask,
             back_deltas,
-            label,
+            y,
         ) = self._send_data_to_given_device(data)

         # assemble input data
         inputs = {
             "indices": indices,
-            "label": label,
+            "y": y,
             "forward": {
                 "X": X,
                 "missing_mask": missing_mask,
@@ -244,7 +260,7 @@ def predict(
         with torch.no_grad():
             for idx, data in enumerate(test_loader):
                 inputs = self._assemble_input_for_testing(data)
-                results = self.model.forward(inputs, training=False)
+                results = self.model.forward(inputs)
                 classification_pred = results["classification_pred"]
                 classification_collector.append(classification_pred)

diff --git a/pypots/classification/grud/core.py b/pypots/classification/grud/core.py
index 16cd2723..1af7e56d 100644
--- a/pypots/classification/grud/core.py
+++ b/pypots/classification/grud/core.py
@@ -40,7 +40,7 @@ def __init__(
         )
         self.classifier = nn.Linear(self.rnn_hidden_size, self.n_classes)

-    def forward(self, inputs: dict, training: bool = True) -> dict:
+    def forward(self, inputs: dict) -> dict:
         """Forward processing of GRU-D.

         Parameters
@@ -71,9 +71,9 @@ def forward(self, inputs: dict) -> dict:
         results = {"classification_pred": classification_pred}

         # if in training mode, return results with losses
-        if training:
+        if self.training:
             classification_loss = F.nll_loss(
-                torch.log(classification_pred), inputs["label"]
+                torch.log(classification_pred), inputs["y"]
             )
             results["loss"] = classification_loss

diff --git a/pypots/classification/grud/model.py b/pypots/classification/grud/model.py
index f6413d9e..fc5c0123 100644
--- a/pypots/classification/grud/model.py
+++ b/pypots/classification/grud/model.py
@@ -48,6 +48,14 @@ class GRUD(BaseNNClassifier):
         stopped when the model does not perform better after that number of epochs.
         Leaving it default as None will disable the early-stopping.

+    train_loss_func:
+        The customized loss function designed by users for training the model.
+        If not given, will use the default loss as claimed in the original paper.
+
+    val_metric_func:
+        The customized metric function designed by users for validating the model.
+        If not given, will use accuracy as the default validation metric.
+
     optimizer :
         The optimizer for model training.
         If not given, will use a default Adam optimizer.
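Because the validation loop in BaseNNClassifier argmaxes the class probabilities before calling val_metric_func, a customized classification metric only needs the same (class_predictions, targets) interface as calc_acc. A sketch (balanced accuracy is an illustration, not something this patch ships):

    import numpy as np

    def calc_balanced_acc(class_predictions: np.ndarray, targets: np.ndarray) -> float:
        # mean of per-class recalls; same interface as calc_acc
        classes = np.unique(targets)
        recalls = [(class_predictions[targets == c] == c).mean() for c in classes]
        return float(np.mean(recalls))

    # hypothetical: BRITS(..., val_metric_func={"BalancedACC": calc_balanced_acc})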
@@ -87,6 +95,8 @@ def __init__(
         batch_size: int = 32,
         epochs: int = 100,
         patience: Optional[int] = None,
+        train_loss_func: Optional[dict] = None,
+        val_metric_func: Optional[dict] = None,
         optimizer: Optional[Optimizer] = Adam(),
         num_workers: int = 0,
         device: Optional[Union[str, torch.device, list]] = None,
@@ -98,6 +108,8 @@ def __init__(
             batch_size,
             epochs,
             patience,
+            train_loss_func,
+            val_metric_func,
             num_workers,
             device,
             saving_path,
@@ -132,7 +144,7 @@ def _assemble_input_for_training(self, data: list) -> dict:
             missing_mask,
             deltas,
             empirical_mean,
-            label,
+            y,
         ) = self._send_data_to_given_device(data)

         # assemble input data
@@ -143,7 +155,7 @@
             "missing_mask": missing_mask,
             "deltas": deltas,
             "empirical_mean": empirical_mean,
-            "label": label,
+            "y": y,
         }

         return inputs
@@ -221,7 +233,7 @@ def predict(
         with torch.no_grad():
             for idx, data in enumerate(test_loader):
                 inputs = self._assemble_input_for_testing(data)
-                results = self.model.forward(inputs, training=False)
+                results = self.model.forward(inputs)
                 prediction = results["classification_pred"]
                 classification_collector.append(prediction)

diff --git a/pypots/classification/raindrop/core.py b/pypots/classification/raindrop/core.py
index 5e6deb99..e6be79e9 100644
--- a/pypots/classification/raindrop/core.py
+++ b/pypots/classification/raindrop/core.py
@@ -64,7 +64,7 @@ def __init__(
             nn.Linear(d_final, n_classes),
         )

-    def forward(self, inputs, training=True):
+    def forward(self, inputs):
         X, missing_mask, static, timestamps, lengths = (
             inputs["X"],
             inputs["missing_mask"],
@@ -115,9 +115,9 @@
         results = {"classification_pred": classification_pred}

         # if in training mode, return results with losses
-        if training:
+        if self.training:
             classification_loss = F.nll_loss(
-                torch.log(classification_pred), inputs["label"]
+                torch.log(classification_pred), inputs["y"]
             )
             results["loss"] = classification_loss

diff --git a/pypots/classification/raindrop/model.py b/pypots/classification/raindrop/model.py
index 78d64267..d9e28c58 100644
--- a/pypots/classification/raindrop/model.py
+++ b/pypots/classification/raindrop/model.py
@@ -74,6 +74,14 @@ class Raindrop(BaseNNClassifier):
         stopped when the model does not perform better after that number of epochs.
         Leaving it default as None will disable the early-stopping.

+    train_loss_func:
+        The customized loss function designed by users for training the model.
+        If not given, will use the default loss as claimed in the original paper.
+
+    val_metric_func:
+        The customized metric function designed by users for validating the model.
+        If not given, will use accuracy as the default validation metric.
+
     optimizer :
         The optimizer for model training.
         If not given, will use a default Adam optimizer.
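The cross-entropy default set in BaseNNClassifier also fixes the signature a customized train_loss_func should follow. A sketch of an alternative with the same (input, target) interface (focal loss; an illustration only, not part of this patch):

    import torch.nn.functional as F
    from torch import Tensor

    def focal_loss(logits: Tensor, targets: Tensor, gamma: float = 2.0) -> Tensor:
        # down-weights well-classified examples relative to plain cross-entropy
        log_prob = F.log_softmax(logits, dim=-1)
        prob = log_prob.exp()
        return F.nll_loss(((1 - prob) ** gamma) * log_prob, targets)

    # hypothetical: Raindrop(..., train_loss_func={"FocalLoss": focal_loss})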
@@ -121,6 +129,8 @@ def __init__( batch_size=32, epochs=100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -132,6 +142,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -172,7 +184,7 @@ def _assemble_input_for_training(self, data: list) -> dict: missing_mask, deltas, empirical_mean, - label, + y, ) = self._send_data_to_given_device(data) bz, n_steps, n_features = X.shape @@ -185,7 +197,7 @@ def _assemble_input_for_training(self, data: list) -> dict: "timestamps": times, "lengths": lengths, "missing_mask": missing_mask, - "label": label, + "y": y, } return inputs @@ -265,7 +277,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) prediction = results["classification_pred"] classification_collector.append(prediction) diff --git a/pypots/classification/template/model.py b/pypots/classification/template/model.py index dec46806..8445b4a3 100644 --- a/pypots/classification/template/model.py +++ b/pypots/classification/template/model.py @@ -35,6 +35,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, diff --git a/pypots/clustering/base.py b/pypots/clustering/base.py index bdf68645..0df0f298 100644 --- a/pypots/clustering/base.py +++ b/pypots/clustering/base.py @@ -150,9 +150,17 @@ class BaseNNClusterer(BaseNNModel): Training epochs, i.e. the maximum rounds of the model to be trained with. patience : - Number of epochs the training procedure will keep if loss doesn't decrease. - Once exceeding the number, the training will stop. - Must be smaller than or equal to the value of ``epochs``. + The patience for the early-stopping mechanism. Given a positive integer, the training process will be + stopped when the model does not perform better after that number of epochs. + Leaving it default as None will disable the early-stopping. + + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. num_workers : The number of subprocesses to use for data loading. 
@@ -193,8 +201,10 @@ def __init__( self, n_clusters: int, batch_size: int, - epochs: int, + epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, saving_path: str = None, @@ -204,6 +214,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -319,13 +331,13 @@ def _train_model( mean_val_loss = np.mean(epoch_val_loss_collector) logger.info( f"Epoch {epoch:03d} - " - f"training loss: {mean_train_loss:.4f}, " - f"validation loss: {mean_val_loss:.4f}" + f"training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}, " + f"validation {self.val_metric_func_name}: {mean_val_loss:.4f}" ) mean_loss = mean_val_loss else: logger.info( - f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}" + f"Epoch {epoch:03d} - training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}" ) mean_loss = mean_train_loss diff --git a/pypots/clustering/crli/core.py b/pypots/clustering/crli/core.py index 755d9ff7..74bc7605 100644 --- a/pypots/clustering/crli/core.py +++ b/pypots/clustering/crli/core.py @@ -55,7 +55,6 @@ def forward( self, inputs: dict, training_object: str = "generator", - training: bool = True, ) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] imputation_latent, discrimination, reconstruction, fcn_latent = self.backbone( @@ -68,10 +67,6 @@ def forward( "fcn_latent": fcn_latent, } - # return results directly, skip loss calculation to reduce inference time - if not training: - return results - if training_object == "discriminator": l_D = F.binary_cross_entropy_with_logits(discrimination, missing_mask) results["discrimination_loss"] = l_D diff --git a/pypots/clustering/crli/model.py b/pypots/clustering/crli/model.py index abe4e655..640e3e0c 100644 --- a/pypots/clustering/crli/model.py +++ b/pypots/clustering/crli/model.py @@ -74,6 +74,14 @@ class CRLI(BaseNNClusterer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + G_optimizer : The optimizer for the generator training. If not given, will use a default Adam optimizer. 
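With the training keyword removed from forward() throughout this patch, the loss branches now key off nn.Module's built-in self.training flag, so callers switch behavior with train()/eval() instead of passing an argument. A minimal runnable sketch of the pattern:

    import torch
    import torch.nn as nn

    class TinyCore(nn.Module):
        def __init__(self):
            super().__init__()
            self.proj = nn.Linear(4, 4)

        def forward(self, inputs: dict) -> dict:
            reconstruction = self.proj(inputs["X"])
            results = {"reconstruction": reconstruction}
            if self.training:  # replaces the old `training: bool = True` argument
                results["loss"] = ((reconstruction - inputs["X"]) ** 2).mean()
            return results

    core = TinyCore()
    batch = {"X": torch.randn(8, 4)}
    core.train()
    assert "loss" in core(batch)      # as in _train_model()
    core.eval()
    assert "loss" not in core(batch)  # as in the predict() methods above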
@@ -123,6 +131,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, G_optimizer: Optional[Optimizer] = Adam(), D_optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, @@ -135,6 +145,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -259,7 +271,7 @@ def _train_model( with torch.no_grad(): for idx, data in enumerate(val_loader): inputs = self._assemble_input_for_validating(data) - results = self.model.forward(inputs, training=True) + results = self.model.forward(inputs) epoch_val_loss_G_collector.append( results["generation_loss"].sum().item() ) @@ -415,7 +427,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - inputs = self.model.forward(inputs, training=False) + inputs = self.model.forward(inputs) clustering_latent_collector.append(inputs["fcn_latent"]) if return_latent_vars: imputation_collector.append(inputs["imputation_latent"]) diff --git a/pypots/clustering/template/model.py b/pypots/clustering/template/model.py index 0ed75220..f86e172c 100644 --- a/pypots/clustering/template/model.py +++ b/pypots/clustering/template/model.py @@ -35,6 +35,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, diff --git a/pypots/clustering/vader/core.py b/pypots/clustering/vader/core.py index 52843b72..bb19c1e3 100644 --- a/pypots/clustering/vader/core.py +++ b/pypots/clustering/vader/core.py @@ -75,7 +75,6 @@ def forward( self, inputs: dict, pretrain: bool = False, - training: bool = True, ) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] device = X.device @@ -113,63 +112,59 @@ def forward( results["loss"] = reconstruction_loss return results - # if in training mode, return results with losses - if training: - # calculate the latent loss for model training - var_tilde = torch.exp(stddev_tilde) - stddev_c = torch.log(var_c + self.eps) - log_2pi = torch.log(torch.tensor([2 * torch.pi], device=device)) - log_phi_c = torch.log(phi_c + self.eps) - - batch_size = z.shape[0] - - ii, jj = torch.meshgrid( - torch.arange(self.n_clusters, dtype=torch.int64, device=device), - torch.arange(batch_size, dtype=torch.int64, device=device), - indexing="ij", - ) - ii = ii.flatten() - jj = jj.flatten() - - lsc_b = stddev_c.index_select(dim=0, index=ii) - mc_b = mu_c.index_select(dim=0, index=ii) - sc_b = var_c.index_select(dim=0, index=ii) - z_b = z.index_select(dim=0, index=jj) - log_pdf_z = -0.5 * (lsc_b + log_2pi + torch.square(z_b - mc_b) / sc_b) - log_pdf_z = log_pdf_z.reshape( - [batch_size, self.n_clusters, self.d_mu_stddev] - ) - - log_p = log_phi_c + log_pdf_z.sum(dim=2) - lse_p = log_p.logsumexp(dim=1, keepdim=True) - log_gamma_c = log_p - lse_p - gamma_c = torch.exp(log_gamma_c) - - term1 = torch.log(var_c + self.eps) - st_b = var_tilde.index_select(dim=0, index=jj) - sc_b = var_c.index_select(dim=0, index=ii) - term2 = torch.reshape( - st_b / (sc_b + self.eps), - [batch_size, self.n_clusters, self.d_mu_stddev], - ) - mt_b = mu_tilde.index_select(dim=0, index=jj) - mc_b = mu_c.index_select(dim=0, index=ii) - term3 = torch.reshape( - torch.square(mt_b - mc_b) / (sc_b + self.eps), 
- [batch_size, self.n_clusters, self.d_mu_stddev], - ) - - latent_loss1 = 0.5 * torch.sum( - gamma_c * torch.sum(term1 + term2 + term3, dim=2), dim=1 - ) - latent_loss2 = -torch.sum(gamma_c * (log_phi_c - log_gamma_c), dim=1) - latent_loss3 = -0.5 * torch.sum(1 + stddev_tilde, dim=1) - - latent_loss1 = latent_loss1.mean() - latent_loss2 = latent_loss2.mean() - latent_loss3 = latent_loss3.mean() - latent_loss = latent_loss1 + latent_loss2 + latent_loss3 - - results["loss"] = reconstruction_loss + self.alpha * latent_loss + # calculate the latent loss for model training + var_tilde = torch.exp(stddev_tilde) + stddev_c = torch.log(var_c + self.eps) + log_2pi = torch.log(torch.tensor([2 * torch.pi], device=device)) + log_phi_c = torch.log(phi_c + self.eps) + + batch_size = z.shape[0] + + ii, jj = torch.meshgrid( + torch.arange(self.n_clusters, dtype=torch.int64, device=device), + torch.arange(batch_size, dtype=torch.int64, device=device), + indexing="ij", + ) + ii = ii.flatten() + jj = jj.flatten() + + lsc_b = stddev_c.index_select(dim=0, index=ii) + mc_b = mu_c.index_select(dim=0, index=ii) + sc_b = var_c.index_select(dim=0, index=ii) + z_b = z.index_select(dim=0, index=jj) + log_pdf_z = -0.5 * (lsc_b + log_2pi + torch.square(z_b - mc_b) / sc_b) + log_pdf_z = log_pdf_z.reshape([batch_size, self.n_clusters, self.d_mu_stddev]) + + log_p = log_phi_c + log_pdf_z.sum(dim=2) + lse_p = log_p.logsumexp(dim=1, keepdim=True) + log_gamma_c = log_p - lse_p + gamma_c = torch.exp(log_gamma_c) + + term1 = torch.log(var_c + self.eps) + st_b = var_tilde.index_select(dim=0, index=jj) + sc_b = var_c.index_select(dim=0, index=ii) + term2 = torch.reshape( + st_b / (sc_b + self.eps), + [batch_size, self.n_clusters, self.d_mu_stddev], + ) + mt_b = mu_tilde.index_select(dim=0, index=jj) + mc_b = mu_c.index_select(dim=0, index=ii) + term3 = torch.reshape( + torch.square(mt_b - mc_b) / (sc_b + self.eps), + [batch_size, self.n_clusters, self.d_mu_stddev], + ) + + latent_loss1 = 0.5 * torch.sum( + gamma_c * torch.sum(term1 + term2 + term3, dim=2), dim=1 + ) + latent_loss2 = -torch.sum(gamma_c * (log_phi_c - log_gamma_c), dim=1) + latent_loss3 = -0.5 * torch.sum(1 + stddev_tilde, dim=1) + + latent_loss1 = latent_loss1.mean() + latent_loss2 = latent_loss2.mean() + latent_loss3 = latent_loss3.mean() + latent_loss = latent_loss1 + latent_loss2 + latent_loss3 + + results["loss"] = reconstruction_loss + self.alpha * latent_loss return results diff --git a/pypots/clustering/vader/model.py b/pypots/clustering/vader/model.py index cfc85f97..22116682 100644 --- a/pypots/clustering/vader/model.py +++ b/pypots/clustering/vader/model.py @@ -63,6 +63,14 @@ class VaDER(BaseNNClusterer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
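The three latent terms in the de-indented block above assemble the Gaussian-mixture KL of the VaDER ELBO; a recap in the code's own tensor names:

    # gamma_c: responsibilities, softmax over clusters of log phi_c + sum_d log N(z_d | mu_c, var_c)
    # latent_loss1 = E_gamma[ 0.5 * sum_d( log var_c + var_tilde / var_c + (mu_tilde - mu_c)^2 / var_c ) ]
    # latent_loss2 = -sum_c gamma_c * (log phi_c - log gamma_c)
    # latent_loss3 = -0.5 * sum_d(1 + stddev_tilde), where stddev_tilde is the log-variance
    # loss = reconstruction_loss + alpha * (latent_loss1 + latent_loss2 + latent_loss3)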
@@ -101,6 +109,8 @@ def __init__( epochs: int = 100, pretrain_epochs: int = 10, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -112,6 +122,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -288,13 +300,13 @@ def _train_model( logger.info( f"Epoch {epoch:03d} - " - f"training loss: {mean_train_loss:.4f}, " - f"validation loss: {mean_val_loss:.4f}" + f"training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}, " + f"validation {self.val_metric_func_name}: {mean_val_loss:.4f}" ) mean_loss = mean_val_loss else: logger.info( - f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}" + f"Epoch {epoch:03d} - training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}" ) mean_loss = mean_train_loss @@ -432,7 +444,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) mu_tilde = results["mu_tilde"].cpu().numpy() mu_tilde_collector.append(mu_tilde) diff --git a/pypots/forecasting/base.py b/pypots/forecasting/base.py index 0b8a153d..85964dda 100644 --- a/pypots/forecasting/base.py +++ b/pypots/forecasting/base.py @@ -141,9 +141,17 @@ class BaseNNForecaster(BaseNNModel): Training epochs, i.e. the maximum rounds of the model to be trained with. patience : - Number of epochs the training procedure will keep if loss doesn't decrease. - Once exceeding the number, the training will stop. - Must be smaller than or equal to the value of ``epochs``. + The patience for the early-stopping mechanism. Given a positive integer, the training process will be + stopped when the model does not perform better after that number of epochs. + Leaving it default as None will disable the early-stopping. + + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. num_workers : The number of subprocesses to use for data loading. 
@@ -184,6 +192,8 @@ def __init__( batch_size: int, epochs: int, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, saving_path: str = None, @@ -193,6 +203,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -291,7 +303,7 @@ def _train_model( with torch.no_grad(): for idx, data in enumerate(val_loader): inputs = self._assemble_input_for_validating(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) forecasting_mse = ( calc_mse( results["forecasting_data"], @@ -315,13 +327,13 @@ def _train_model( logger.info( f"Epoch {epoch:03d} - " - f"training loss: {mean_train_loss:.4f}, " - f"validation loss: {mean_val_loss:.4f}" + f"training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}, " + f"validation {self.val_metric_func_name}: {mean_val_loss:.4f}" ) mean_loss = mean_val_loss else: logger.info( - f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}" + f"Epoch {epoch:03d} - training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}" ) mean_loss = mean_train_loss diff --git a/pypots/forecasting/csdi/core.py b/pypots/forecasting/csdi/core.py index e488cb20..d869f2c0 100644 --- a/pypots/forecasting/csdi/core.py +++ b/pypots/forecasting/csdi/core.py @@ -96,9 +96,9 @@ def get_side_info(self, observed_tp, cond_mask, feature_id): return side_info - def forward(self, inputs, training=True, n_sampling_times=1): + def forward(self, inputs, n_sampling_times=1): results = {} - if training: # for training + if self.training: # for training (observed_data, indicating_mask, cond_mask, observed_tp, feature_id) = ( inputs["X_ori"], inputs["indicating_mask"], @@ -108,10 +108,10 @@ def forward(self, inputs, training=True, n_sampling_times=1): ) side_info = self.get_side_info(observed_tp, cond_mask, feature_id) training_loss = self.backbone.calc_loss( - observed_data, cond_mask, indicating_mask, side_info, training + observed_data, cond_mask, indicating_mask, side_info ) results["loss"] = training_loss - elif not training and n_sampling_times == 0: # for validating + elif not self.training and n_sampling_times == 0: # for validating (observed_data, indicating_mask, cond_mask, observed_tp, feature_id) = ( inputs["X_ori"], inputs["indicating_mask"], @@ -121,10 +121,10 @@ def forward(self, inputs, training=True, n_sampling_times=1): ) side_info = self.get_side_info(observed_tp, cond_mask, feature_id) validating_loss = self.backbone.calc_loss_valid( - observed_data, cond_mask, indicating_mask, side_info, training + observed_data, cond_mask, indicating_mask, side_info ) results["loss"] = validating_loss - elif not training and n_sampling_times > 0: # for testing + elif not self.training and n_sampling_times > 0: # for testing observed_data, cond_mask, observed_tp, feature_id = ( inputs["X"], inputs["cond_mask"], diff --git a/pypots/forecasting/csdi/model.py b/pypots/forecasting/csdi/model.py index 77d32d86..a9915ee7 100644 --- a/pypots/forecasting/csdi/model.py +++ b/pypots/forecasting/csdi/model.py @@ -152,6 +152,8 @@ def __init__( batch_size, epochs, patience, + None, + None, num_workers, device, saving_path, @@ -168,6 +170,11 @@ def __init__( self.n_pred_steps = n_pred_steps self.n_pred_features = n_pred_features self.target_strategy = target_strategy + # CSDI has its own defined loss function and validation loss, so we set them as None 
here + self.train_loss_func = None + self.train_loss_func_name = "default" + self.val_metric_func = None + self.val_metric_func_name = "loss (default)" # set up the model self.model = _CSDI( @@ -268,9 +275,7 @@ def _train_model( with torch.no_grad(): for idx, data in enumerate(val_loader): inputs = self._assemble_input_for_validating(data) - results = self.model.forward( - inputs, training=False, n_sampling_times=0 - ) + results = self.model.forward(inputs, n_sampling_times=0) val_loss_collector.append(results["loss"].sum().item()) mean_val_loss = np.asarray(val_loss_collector).mean() @@ -284,13 +289,13 @@ def _train_model( logger.info( f"Epoch {epoch:03d} - " - f"training loss: {mean_train_loss:.4f}, " - f"validation loss: {mean_val_loss:.4f}" + f"training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}, " + f"validation {self.val_metric_func_name}: {mean_val_loss:.4f}" ) mean_loss = mean_val_loss else: logger.info( - f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}" + f"Epoch {epoch:03d} - training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}" ) mean_loss = mean_train_loss @@ -441,7 +446,6 @@ def predict( inputs = self._assemble_input_for_testing(data) results = self.model( inputs, - training=False, n_sampling_times=n_sampling_times, ) forecasting_data = results["forecasting_data"][ diff --git a/pypots/forecasting/template/model.py b/pypots/forecasting/template/model.py index fd817694..60595bd9 100644 --- a/pypots/forecasting/template/model.py +++ b/pypots/forecasting/template/model.py @@ -34,6 +34,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -44,12 +46,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, - ) - # set up the hyper-parameters + ) # set up the hyper-parameters # TODO: set up your model's hyper-parameters here # set up the model diff --git a/pypots/imputation/autoformer/core.py b/pypots/imputation/autoformer/core.py index fb883c4e..a382be5b 100644 --- a/pypots/imputation/autoformer/core.py +++ b/pypots/imputation/autoformer/core.py @@ -53,7 +53,7 @@ def __init__( self.output_projection = nn.Linear(d_model, n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original Autoformer paper isn't proposed for imputation task. Hence the model doesn't take @@ -74,7 +74,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/autoformer/model.py b/pypots/imputation/autoformer/model.py index dcdc8b64..5fd0f818 100644 --- a/pypots/imputation/autoformer/model.py +++ b/pypots/imputation/autoformer/model.py @@ -71,6 +71,14 @@ class Autoformer(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. 
+ train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. @@ -116,7 +124,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -127,12 +137,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -284,7 +295,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index f08d310f..e79d7e23 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -142,9 +142,17 @@ class BaseNNImputer(BaseNNModel): Training epochs, i.e. the maximum rounds of the model to be trained with. patience : - Number of epochs the training procedure will keep if loss doesn't decrease. - Once exceeding the number, the training will stop. - Must be smaller than or equal to the value of ``epochs``. + The patience for the early-stopping mechanism. Given a positive integer, the training process will be + stopped when the model does not perform better after that number of epochs. + Leaving it default as None will disable the early-stopping. + + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. num_workers : The number of subprocesses to use for data loading. @@ -185,6 +193,8 @@ def __init__( batch_size: int, epochs: int, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, saving_path: str = None, @@ -194,12 +204,22 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) + # set default training loss function and validation metric function if not given + if train_loss_func is None: + self.train_loss_func = calc_mse + self.train_loss_func_name = "MSE" + if val_metric_func is None: + self.val_metric_func = calc_mse + self.val_metric_func_name = "MSE" + @abstractmethod def _assemble_input_for_training(self, data: list) -> dict: """Assemble the given data into a dictionary for training input. 
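Both imputation defaults above resolve to calc_mse, i.e. a masked MSE that only counts error where the mask is 1. A self-contained sketch of the equivalent computation (assuming torch tensors):

    import torch

    def masked_mse(predictions: torch.Tensor, targets: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
        # error is only accumulated on positions where masks == 1
        return torch.sum(((predictions - targets) * masks) ** 2) / (torch.sum(masks) + 1e-12)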
@@ -293,8 +313,8 @@ def _train_model( with torch.no_grad(): for idx, data in enumerate(val_loader): inputs = self._assemble_input_for_validating(data) - results = self.model.forward(inputs, training=False) - imputation_mse = ( + results = self.model.forward(inputs) + imputation_error = ( calc_mse( results["imputed_data"], inputs["X_ori"], @@ -304,7 +324,7 @@ def _train_model( .detach() .item() ) - imputation_loss_collector.append(imputation_mse) + imputation_loss_collector.append(imputation_error) mean_val_loss = np.mean(imputation_loss_collector) @@ -317,13 +337,13 @@ def _train_model( logger.info( f"Epoch {epoch:03d} - " - f"training loss: {mean_train_loss:.4f}, " - f"validation loss: {mean_val_loss:.4f}" + f"training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}, " + f"validation {self.val_metric_func_name}: {mean_val_loss:.4f}" ) mean_loss = mean_val_loss else: logger.info( - f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}" + f"Epoch {epoch:03d} - training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}" ) mean_loss = mean_train_loss diff --git a/pypots/imputation/brits/core.py b/pypots/imputation/brits/core.py index 9d1734c4..c6869c83 100644 --- a/pypots/imputation/brits/core.py +++ b/pypots/imputation/brits/core.py @@ -41,7 +41,7 @@ def __init__( self.model = BackboneBRITS(n_steps, n_features, rnn_hidden_size) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: ( imputed_data, f_reconstruction, @@ -57,7 +57,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: results["consistency_loss"] = consistency_loss results["reconstruction_loss"] = reconstruction_loss loss = consistency_loss + reconstruction_loss diff --git a/pypots/imputation/brits/model.py b/pypots/imputation/brits/model.py index 5f1676cf..d22bf86d 100644 --- a/pypots/imputation/brits/model.py +++ b/pypots/imputation/brits/model.py @@ -45,6 +45,14 @@ class BRITS(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
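Per the interface documented above, swapping the validation metric on an imputation model is meant to look like this (a hypothetical call; the hyperparameter values are illustrative, and calc_mae is a real helper):

    from pypots.imputation import BRITS
    from pypots.utils.metrics import calc_mae

    model = BRITS(
        n_steps=48,
        n_features=37,
        rnn_hidden_size=128,
        epochs=10,
        val_metric_func={"MAE": calc_mae},  # report MAE instead of the default MSE
    )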
@@ -83,6 +91,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -93,12 +103,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features self.rnn_hidden_size = rnn_hidden_size @@ -238,7 +249,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputed_data = results["imputed_data"] imputation_collector.append(imputed_data) diff --git a/pypots/imputation/crossformer/core.py b/pypots/imputation/crossformer/core.py index e26f27ca..48d449b0 100644 --- a/pypots/imputation/crossformer/core.py +++ b/pypots/imputation/crossformer/core.py @@ -83,7 +83,7 @@ def __init__( # apply SAITS loss function to Crossformer on the imputation task self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original Crossformer paper isn't proposed for imputation task. Hence the model doesn't take @@ -113,7 +113,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/crossformer/model.py b/pypots/imputation/crossformer/model.py index 7db9aaba..6e826c56 100644 --- a/pypots/imputation/crossformer/model.py +++ b/pypots/imputation/crossformer/model.py @@ -74,6 +74,14 @@ class Crossformer(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
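The SaitsLoss applied to Crossformer above (and to the other imputation backbones in this patch) is a weighted sum of two masked-error terms. A sketch of the combination, assuming the calc_mse helper:

    from pypots.utils.metrics import calc_mse

    def saits_style_loss(reconstruction, X_ori, missing_mask, indicating_mask,
                         ORT_weight=1.0, MIT_weight=1.0):
        ORT_loss = calc_mse(reconstruction, X_ori, missing_mask)     # observed values
        MIT_loss = calc_mse(reconstruction, X_ori, indicating_mask)  # artificially-masked values
        return ORT_weight * ORT_loss + MIT_weight * MIT_loss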
@@ -120,7 +128,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -131,12 +141,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -290,7 +301,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/csdi/core.py b/pypots/imputation/csdi/core.py index a80acce3..0c6ec279 100644 --- a/pypots/imputation/csdi/core.py +++ b/pypots/imputation/csdi/core.py @@ -88,9 +88,9 @@ def get_side_info(self, observed_tp, cond_mask): return side_info - def forward(self, inputs, training=True, n_sampling_times=1): + def forward(self, inputs, n_sampling_times=1): results = {} - if training: # for training + if self.training: # for training (observed_data, indicating_mask, cond_mask, observed_tp) = ( inputs["X_ori"], inputs["indicating_mask"], @@ -99,10 +99,10 @@ def forward(self, inputs, training=True, n_sampling_times=1): ) side_info = self.get_side_info(observed_tp, cond_mask) training_loss = self.backbone.calc_loss( - observed_data, cond_mask, indicating_mask, side_info, training + observed_data, cond_mask, indicating_mask, side_info ) results["loss"] = training_loss - elif not training and n_sampling_times == 0: # for validating + elif not self.training and n_sampling_times == 0: # for validating (observed_data, indicating_mask, cond_mask, observed_tp) = ( inputs["X_ori"], inputs["indicating_mask"], @@ -111,10 +111,10 @@ def forward(self, inputs, training=True, n_sampling_times=1): ) side_info = self.get_side_info(observed_tp, cond_mask) validating_loss = self.backbone.calc_loss_valid( - observed_data, cond_mask, indicating_mask, side_info, training + observed_data, cond_mask, indicating_mask, side_info ) results["loss"] = validating_loss - elif not training and n_sampling_times > 0: # for testing + elif not self.training and n_sampling_times > 0: # for testing observed_data, cond_mask, observed_tp = ( inputs["X"], inputs["cond_mask"], diff --git a/pypots/imputation/csdi/model.py b/pypots/imputation/csdi/model.py index 33c1535b..832e6dc4 100644 --- a/pypots/imputation/csdi/model.py +++ b/pypots/imputation/csdi/model.py @@ -144,6 +144,8 @@ def __init__( batch_size, epochs, patience, + None, + None, num_workers, device, saving_path, @@ -153,6 +155,11 @@ def __init__( assert schedule in ["quad", "linear"] self.n_steps = n_steps self.target_strategy = target_strategy + # CSDI has its own defined loss function and validation loss, so we set them as None here + self.train_loss_func = None + self.train_loss_func_name = "default" + self.val_metric_func = None + self.val_metric_func_name = "loss (default)" # set up the model self.model = _CSDI( @@ -248,9 +255,7 @@ def _train_model( with torch.no_grad(): for idx, data in enumerate(val_loader): inputs = self._assemble_input_for_validating(data) - results = self.model.forward( - inputs, training=False, n_sampling_times=0 
- ) + results = self.model.forward(inputs, n_sampling_times=0) val_loss_collector.append(results["loss"].sum().item()) mean_val_loss = np.asarray(val_loss_collector).mean() @@ -264,13 +269,13 @@ def _train_model( logger.info( f"Epoch {epoch:03d} - " - f"training loss: {mean_train_loss:.4f}, " - f"validation loss: {mean_val_loss:.4f}" + f"training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}, " + f"validation {self.val_metric_func_name}: {mean_val_loss:.4f}" ) mean_loss = mean_val_loss else: logger.info( - f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}" + f"Epoch {epoch:03d} - training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}" ) mean_loss = mean_train_loss @@ -420,7 +425,6 @@ def predict( inputs = self._assemble_input_for_testing(data) results = self.model( inputs, - training=False, n_sampling_times=n_sampling_times, ) imputed_data = results["imputed_data"] diff --git a/pypots/imputation/dlinear/core.py b/pypots/imputation/dlinear/core.py index 78d3bcbd..3f43f4a0 100644 --- a/pypots/imputation/dlinear/core.py +++ b/pypots/imputation/dlinear/core.py @@ -48,7 +48,7 @@ def __init__( # apply SAITS loss function to Transformer on the imputation task self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # input preprocessing and embedding for DLinear @@ -78,7 +78,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/dlinear/model.py b/pypots/imputation/dlinear/model.py index 3721eead..cb48708d 100644 --- a/pypots/imputation/dlinear/model.py +++ b/pypots/imputation/dlinear/model.py @@ -60,6 +60,14 @@ class DLinear(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
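For the CSDI imputer above, forward() now picks its code path from self.training plus n_sampling_times alone: train mode yields the training loss, eval mode with n_sampling_times=0 yields the validation loss, and eval mode with n_sampling_times>0 generates samples. A hypothetical inference sketch (assumes `model` is a fitted _CSDI instance and `inputs` an assembled batch):

    import torch

    model.eval()
    with torch.no_grad():
        results = model(inputs, n_sampling_times=10)  # draw 10 diffusion samples
    imputed_data = results["imputed_data"]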
@@ -101,7 +109,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -112,12 +122,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -261,7 +272,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/etsformer/core.py b/pypots/imputation/etsformer/core.py index 92c61f5d..162b6502 100644 --- a/pypots/imputation/etsformer/core.py +++ b/pypots/imputation/etsformer/core.py @@ -77,7 +77,7 @@ def __init__( # apply SAITS loss function to ETSformer on the imputation task self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original ETSformer paper isn't proposed for imputation task. Hence the model doesn't take @@ -98,7 +98,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/etsformer/model.py b/pypots/imputation/etsformer/model.py index 6dbb2fbc..8dbd9009 100644 --- a/pypots/imputation/etsformer/model.py +++ b/pypots/imputation/etsformer/model.py @@ -71,6 +71,14 @@ class ETSformer(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
@@ -116,7 +124,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -127,12 +137,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -284,7 +295,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/fedformer/core.py b/pypots/imputation/fedformer/core.py index 617a1462..679d5868 100644 --- a/pypots/imputation/fedformer/core.py +++ b/pypots/imputation/fedformer/core.py @@ -58,7 +58,7 @@ def __init__( # apply SAITS loss function to ETSformer on the imputation task self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original FEDformer paper isn't proposed for imputation task. Hence the model doesn't take @@ -78,7 +78,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/fedformer/model.py b/pypots/imputation/fedformer/model.py index 2d8ca073..94da1858 100644 --- a/pypots/imputation/fedformer/model.py +++ b/pypots/imputation/fedformer/model.py @@ -79,6 +79,14 @@ class FEDformer(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
@@ -126,7 +134,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -137,12 +147,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -298,7 +309,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/film/core.py b/pypots/imputation/film/core.py index 2e48f8c2..63270edb 100644 --- a/pypots/imputation/film/core.py +++ b/pypots/imputation/film/core.py @@ -48,7 +48,7 @@ def __init__( self.output_projection = nn.Linear(d_model, n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original FiLM paper isn't proposed for imputation task. Hence the model doesn't take @@ -69,7 +69,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/film/model.py b/pypots/imputation/film/model.py index 8caae0d5..2bbe7e6d 100644 --- a/pypots/imputation/film/model.py +++ b/pypots/imputation/film/model.py @@ -67,6 +67,14 @@ class FiLM(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
@@ -111,7 +119,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -122,6 +132,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -278,7 +290,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/frets/core.py b/pypots/imputation/frets/core.py index 488880d9..3247adb4 100644 --- a/pypots/imputation/frets/core.py +++ b/pypots/imputation/frets/core.py @@ -45,7 +45,7 @@ def __init__( self.output_projection = nn.Linear(embed_size, n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original FreTS paper isn't proposed for imputation task. Hence the model doesn't take @@ -65,7 +65,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/frets/model.py b/pypots/imputation/frets/model.py index 42101ac4..15c06f5a 100644 --- a/pypots/imputation/frets/model.py +++ b/pypots/imputation/frets/model.py @@ -59,6 +59,14 @@ class FreTS(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
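The `patience` entries repeated in these docstrings describe a standard early-stopping rule, which can be sketched in a few lines. This is a simplification, not the actual PyPOTS training loop, and train_one_epoch is a hypothetical helper:

best_loss, n_bad_epochs = float("inf"), 0
for epoch in range(epochs):
    mean_loss = train_one_epoch()               # hypothetical helper
    if mean_loss < best_loss:
        best_loss, n_bad_epochs = mean_loss, 0  # any improvement resets the counter
    else:
        n_bad_epochs += 1
    if n_bad_epochs == patience:                # with early stopping disabled, never fires
        break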
@@ -100,7 +108,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -111,12 +121,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -260,7 +271,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/gpvae/core.py b/pypots/imputation/gpvae/core.py index 79b8f724..0a99fe69 100644 --- a/pypots/imputation/gpvae/core.py +++ b/pypots/imputation/gpvae/core.py @@ -89,11 +89,11 @@ def __init__( window_size, ) - def forward(self, inputs, training=True, n_sampling_times=1): + def forward(self, inputs, n_sampling_times=1): X, missing_mask = inputs["X"], inputs["missing_mask"] results = {} - if training: + if self.training: elbo_loss = self.backbone(X, missing_mask) results["loss"] = elbo_loss else: diff --git a/pypots/imputation/gpvae/model.py b/pypots/imputation/gpvae/model.py index 0af6a73d..471e3996 100644 --- a/pypots/imputation/gpvae/model.py +++ b/pypots/imputation/gpvae/model.py @@ -132,6 +132,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -142,6 +144,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -266,9 +270,7 @@ def _train_model( with torch.no_grad(): for idx, data in enumerate(val_loader): inputs = self._assemble_input_for_validating(data) - results = self.model.forward( - inputs, training=False, n_sampling_times=1 - ) + results = self.model.forward(inputs, n_sampling_times=1) imputed_data = results["imputed_data"].mean(axis=1) imputation_mse = ( calc_mse( @@ -293,13 +295,13 @@ def _train_model( logger.info( f"Epoch {epoch:03d} - " - f"training loss: {mean_train_loss:.4f}, " - f"validation loss: {mean_val_loss:.4f}" + f"training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}, " + f"validation {self.val_metric_func_name}: {mean_val_loss:.4f}" ) mean_loss = mean_val_loss else: logger.info( - f"Epoch {epoch:03d} - training loss: {mean_train_loss:.4f}" + f"Epoch {epoch:03d} - training loss ({self.train_loss_func_name}): {mean_train_loss:.4f}" ) mean_loss = mean_train_loss @@ -440,9 +442,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward( - inputs, training=False, n_sampling_times=n_sampling_times - ) + results = self.model.forward(inputs, n_sampling_times=n_sampling_times) imputed_data = results["imputed_data"] imputation_collector.append(imputed_data) diff --git a/pypots/imputation/informer/core.py b/pypots/imputation/informer/core.py index e9199b02..a06eb709 100644 --- a/pypots/imputation/informer/core.py 
+++ b/pypots/imputation/informer/core.py @@ -69,7 +69,7 @@ def __init__( self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original Informer paper isn't proposed for imputation task. Hence the model doesn't take @@ -91,7 +91,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/informer/model.py b/pypots/imputation/informer/model.py index 85b2b1be..df72b5ed 100644 --- a/pypots/imputation/informer/model.py +++ b/pypots/imputation/informer/model.py @@ -68,6 +68,14 @@ class Informer(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. @@ -112,7 +120,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -123,12 +133,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -278,7 +289,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/itransformer/core.py b/pypots/imputation/itransformer/core.py index 5747f12e..cba9d416 100644 --- a/pypots/imputation/itransformer/core.py +++ b/pypots/imputation/itransformer/core.py @@ -53,7 +53,7 @@ def __init__( # apply SAITS loss function to Transformer on the imputation task self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original Informer paper isn't proposed for imputation task. 
Hence the model doesn't take @@ -79,7 +79,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/itransformer/model.py b/pypots/imputation/itransformer/model.py index 045bd2dc..adae4068 100644 --- a/pypots/imputation/itransformer/model.py +++ b/pypots/imputation/itransformer/model.py @@ -81,6 +81,14 @@ class iTransformer(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. @@ -128,6 +136,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -138,12 +148,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - if d_model != n_heads * d_k: logger.warning( "‼️ d_model must = n_heads * d_k, it should be divisible by n_heads " @@ -283,7 +294,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputed_data = results["imputed_data"] imputation_collector.append(imputed_data) diff --git a/pypots/imputation/mrnn/core.py b/pypots/imputation/mrnn/core.py index 0cdf7084..2092d10e 100644 --- a/pypots/imputation/mrnn/core.py +++ b/pypots/imputation/mrnn/core.py @@ -18,7 +18,7 @@ def __init__(self, n_steps, n_features, rnn_hidden_size): super().__init__() self.backbone = BackboneMRNN(n_steps, n_features, rnn_hidden_size) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X = inputs["forward"]["X"] M = inputs["forward"]["missing_mask"] @@ -30,7 +30,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: RNN_loss = calc_rmse(RNN_estimation, X, M) FCN_loss = calc_rmse(FCN_estimation, RNN_imputed_data) reconstruction_loss = RNN_loss + FCN_loss diff --git a/pypots/imputation/mrnn/model.py b/pypots/imputation/mrnn/model.py index e3527432..e4326aab 100644 --- a/pypots/imputation/mrnn/model.py +++ b/pypots/imputation/mrnn/model.py @@ -46,6 +46,14 @@ class MRNN(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. 
+ + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. @@ -84,6 +92,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -94,12 +104,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features self.rnn_hidden_size = rnn_hidden_size @@ -240,7 +251,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputed_data = results["imputed_data"] imputation_collector.append(imputed_data) diff --git a/pypots/imputation/nonstationary_transformer/core.py b/pypots/imputation/nonstationary_transformer/core.py index 9ca21e1d..cf17c8f8 100644 --- a/pypots/imputation/nonstationary_transformer/core.py +++ b/pypots/imputation/nonstationary_transformer/core.py @@ -72,7 +72,7 @@ def __init__( self.output_projection = nn.Linear(d_model, n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] X_enc, means, stdev = nonstationary_norm(X, missing_mask) @@ -98,7 +98,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/nonstationary_transformer/model.py b/pypots/imputation/nonstationary_transformer/model.py index 9786ccd7..12c455cf 100644 --- a/pypots/imputation/nonstationary_transformer/model.py +++ b/pypots/imputation/nonstationary_transformer/model.py @@ -73,6 +73,14 @@ class NonstationaryTransformer(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
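For readers unfamiliar with the nonstationary_norm/nonstationary_denorm pair in the core.py hunk above: the idea, borrowed from the non-stationary Transformer literature, is to standardize each sample using statistics over its observed time points only, run the network in the normalized space, and restore the original scale afterwards. A rough sketch of the normalization half, assuming X is (batch, time, features) and missing_mask is 1 at observed points; this is an illustration, not the PyPOTS implementation:

import torch

def nonstationary_norm_sketch(X, missing_mask):
    n_obs = missing_mask.sum(dim=1, keepdim=True).clamp(min=1)
    means = (X * missing_mask).sum(dim=1, keepdim=True) / n_obs
    centered = (X - means) * missing_mask        # keep missing slots at zero
    stdev = torch.sqrt((centered**2).sum(dim=1, keepdim=True) / n_obs + 1e-5)
    return centered / stdev, means, stdev        # denorm: X_hat * stdev + means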
@@ -118,7 +126,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -129,6 +139,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -290,7 +302,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/patchtst/core.py b/pypots/imputation/patchtst/core.py index 9a356173..f51c18eb 100644 --- a/pypots/imputation/patchtst/core.py +++ b/pypots/imputation/patchtst/core.py @@ -53,7 +53,7 @@ def __init__( self.output_projection = nn.Linear(d_model, n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original PatchTST paper isn't proposed for imputation task. Hence the model doesn't take @@ -80,7 +80,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: "imputed_data": imputed_data, } - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/patchtst/model.py b/pypots/imputation/patchtst/model.py index f4033a49..745cc20b 100644 --- a/pypots/imputation/patchtst/model.py +++ b/pypots/imputation/patchtst/model.py @@ -85,6 +85,14 @@ class PatchTST(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
@@ -133,7 +141,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -144,6 +154,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -316,7 +328,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/pyraformer/core.py b/pypots/imputation/pyraformer/core.py index 3087d90a..7acb0817 100644 --- a/pypots/imputation/pyraformer/core.py +++ b/pypots/imputation/pyraformer/core.py @@ -52,7 +52,7 @@ def __init__( self.output_projection = nn.Linear((len(window_size) + 1) * d_model, n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original Pyraformer paper isn't proposed for imputation task. Hence the model doesn't take @@ -73,7 +73,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git a/pypots/imputation/pyraformer/model.py b/pypots/imputation/pyraformer/model.py index 757e96f3..7bc351bf 100644 --- a/pypots/imputation/pyraformer/model.py +++ b/pypots/imputation/pyraformer/model.py @@ -74,6 +74,14 @@ class Pyraformer(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
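A note on the `training` argument that these hunks keep deleting: every core model here subclasses torch.nn.Module, which already maintains a boolean `training` attribute toggled by .train() and .eval(), so forward() can branch on self.training instead of threading an extra flag through every call site. A minimal sketch of the pattern; the Toy class is illustrative only:

import torch
import torch.nn as nn

class Toy(nn.Module):
    def forward(self, x):
        if self.training:                    # True after model.train()
            return {"loss": (x ** 2).mean()}
        return {"imputed_data": x}           # False after model.eval()

model = Toy()
model.train()
assert "loss" in model(torch.zeros(2))
model.eval()
assert "imputed_data" in model(torch.zeros(2))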
@@ -120,7 +128,9 @@ def __init__( MIT_weight: float = 1, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -131,12 +141,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -290,7 +301,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/saits/core.py b/pypots/imputation/saits/core.py index f5189ab3..93f79aea 100644 --- a/pypots/imputation/saits/core.py +++ b/pypots/imputation/saits/core.py @@ -32,7 +32,7 @@ def __init__( diagonal_attention_mask: bool = True, ORT_weight: float = 1, MIT_weight: float = 1, - customized_loss_func: Callable = calc_mae, + loss_func: Callable = calc_mae, ): super().__init__() self.n_layers = n_layers @@ -40,7 +40,7 @@ def __init__( self.diagonal_attention_mask = diagonal_attention_mask self.ORT_weight = ORT_weight self.MIT_weight = MIT_weight - self.customized_loss_func = customized_loss_func + self.loss_func = loss_func self.encoder = BackboneSAITS( n_steps, @@ -59,13 +59,12 @@ def forward( self, inputs: dict, diagonal_attention_mask: bool = True, - training: bool = True, ) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # determine the attention mask - if (training and self.diagonal_attention_mask) or ( - (not training) and diagonal_attention_mask + if (self.training and self.diagonal_attention_mask) or ( + (not self.training) and diagonal_attention_mask ): diagonal_attention_mask = (1 - torch.eye(self.n_steps)).to(X.device) # then broadcast on the batch axis @@ -95,21 +94,21 @@ def forward( } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] # calculate loss for the observed reconstruction task (ORT) # this calculation is more complicated that pypots.nn.modules.saits.SaitsLoss because # SAITS model structure has three parts of representation ORT_loss = 0 - ORT_loss += self.customized_loss_func(X_tilde_1, X, missing_mask) - ORT_loss += self.customized_loss_func(X_tilde_2, X, missing_mask) - ORT_loss += self.customized_loss_func(X_tilde_3, X, missing_mask) + ORT_loss += self.loss_func(X_tilde_1, X, missing_mask) + ORT_loss += self.loss_func(X_tilde_2, X, missing_mask) + ORT_loss += self.loss_func(X_tilde_3, X, missing_mask) ORT_loss /= 3 ORT_loss = self.ORT_weight * ORT_loss # calculate loss for the masked imputation task (MIT) - MIT_loss = self.MIT_weight * self.customized_loss_func( + MIT_loss = self.MIT_weight * self.loss_func( X_tilde_3, X_ori, indicating_mask ) # `loss` is always the item for backward propagating to update the model diff --git a/pypots/imputation/saits/model.py b/pypots/imputation/saits/model.py index ad0fd97b..bf523e05 100644 --- a/pypots/imputation/saits/model.py +++ b/pypots/imputation/saits/model.py @@ -6,7 +6,7 @@ # Created by Wenjie Du # License: BSD-3-Clause -from typing import Union, Optional, Callable 
+from typing import Union, Optional import numpy as np import torch @@ -20,7 +20,7 @@ from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger -from ...utils.metrics import calc_mae +from ...utils.metrics import calc_mae, calc_mse class SAITS(BaseNNImputer): @@ -84,9 +84,13 @@ class SAITS(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. - customized_loss_func: - The customized loss function designed by users for the model to optimize. - If not given, will use the default MAE loss as claimed in the original paper. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. optimizer : The optimizer for model training. @@ -136,7 +140,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, - customized_loss_func: Callable = calc_mae, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -147,6 +152,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -163,6 +170,14 @@ def __init__( f"⚠️ d_model is reset to {d_model} = n_heads ({n_heads}) * d_k ({d_k})" ) + # set default training loss function and validation metric function if not given + if train_loss_func is None: + self.train_loss_func = calc_mae + self.train_loss_func_name = "MAE" + if val_metric_func is None: + self.val_metric_func = calc_mse + self.val_metric_func_name = "MSE" + self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -193,13 +208,11 @@ def __init__( self.diagonal_attention_mask, self.ORT_weight, self.MIT_weight, + self.train_loss_func, ) self._print_model_size() self._send_model_to_given_device() - # set up the loss function - self.customized_loss_func = customized_loss_func - # set up the optimizer self.optimizer = optimizer self.optimizer.init_optimizer(self.model.parameters()) @@ -332,9 +345,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward( - inputs, diagonal_attention_mask, training=False - ) + results = self.model.forward(inputs, diagonal_attention_mask) imputation_collector.append(results["imputed_data"]) if return_latent_vars: diff --git a/pypots/imputation/template/model.py b/pypots/imputation/template/model.py index 9f135893..577e631e 100644 --- a/pypots/imputation/template/model.py +++ b/pypots/imputation/template/model.py @@ -34,6 +34,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -44,12 +46,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, - ) - # set up the hyper-parameters + ) # set up the hyper-parameters # TODO: set up your model's hyper-parameters here # set up the model diff --git 
a/pypots/imputation/timesnet/core.py b/pypots/imputation/timesnet/core.py index 15aefecf..10503bbb 100644 --- a/pypots/imputation/timesnet/core.py +++ b/pypots/imputation/timesnet/core.py @@ -51,7 +51,7 @@ def __init__( # for the imputation task, the output dim is the same as input dim self.projection = nn.Linear(d_model, n_features) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] if self.apply_nonstationary_norm: @@ -75,7 +75,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: "imputed_data": imputed_data, } - if training: + if self.training: # `loss` is always the item for backward propagating to update the model loss = calc_mse(dec_out, inputs["X_ori"], inputs["indicating_mask"]) results["loss"] = loss diff --git a/pypots/imputation/timesnet/model.py b/pypots/imputation/timesnet/model.py index 34aba691..a38e46a5 100644 --- a/pypots/imputation/timesnet/model.py +++ b/pypots/imputation/timesnet/model.py @@ -67,6 +67,14 @@ class TimesNet(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. @@ -110,7 +118,9 @@ def __init__( apply_nonstationary_norm: bool = False, batch_size: int = 32, epochs: int = 100, - patience: int = None, + patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -121,12 +131,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - self.n_steps = n_steps self.n_features = n_features # model hype-parameters @@ -274,7 +285,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/transformer/core.py b/pypots/imputation/transformer/core.py index e769a3aa..75ea8e2d 100644 --- a/pypots/imputation/transformer/core.py +++ b/pypots/imputation/transformer/core.py @@ -56,7 +56,7 @@ def __init__( # apply SAITS loss function to Transformer on the imputation task self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # apply the SAITS embedding strategy, concatenate X and missing mask for input @@ -76,7 +76,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func( reconstruction, X_ori, missing_mask, indicating_mask diff --git 
a/pypots/imputation/transformer/model.py b/pypots/imputation/transformer/model.py index d7d59097..9393ccff 100644 --- a/pypots/imputation/transformer/model.py +++ b/pypots/imputation/transformer/model.py @@ -82,6 +82,14 @@ class Transformer(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. @@ -129,6 +137,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -139,12 +149,13 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, model_saving_strategy, ) - if d_model != n_heads * d_k: logger.warning( "‼️ d_model must = n_heads * d_k, it should be divisible by n_heads " @@ -284,7 +295,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputed_data = results["imputed_data"] imputation_collector.append(imputed_data) diff --git a/pypots/imputation/usgan/core.py b/pypots/imputation/usgan/core.py index a0b04ade..5522821c 100644 --- a/pypots/imputation/usgan/core.py +++ b/pypots/imputation/usgan/core.py @@ -37,7 +37,6 @@ def forward( self, inputs: dict, training_object: str = "generator", - training: bool = True, ) -> dict: assert training_object in [ "generator", @@ -45,26 +44,18 @@ def forward( ], 'training_object should be "generator" or "discriminator"' results = {} - if training: + if self.training: if training_object == "discriminator": imputed_data, discrimination_loss = self.backbone( - inputs, training_object, training + inputs, training_object ) loss = discrimination_loss else: - imputed_data, generation_loss = self.backbone( - inputs, - training_object, - training, - ) + imputed_data, generation_loss = self.backbone(inputs, training_object) loss = generation_loss results["loss"] = loss else: - imputed_data = self.backbone( - inputs, - training_object, - training, - ) + imputed_data = self.backbone(inputs, training_object) results["imputed_data"] = imputed_data return results diff --git a/pypots/imputation/usgan/model.py b/pypots/imputation/usgan/model.py index 69f3bdd1..7c715758 100644 --- a/pypots/imputation/usgan/model.py +++ b/pypots/imputation/usgan/model.py @@ -114,6 +114,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, G_optimizer: Optional[Optimizer] = Adam(), D_optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, @@ -125,6 +127,8 @@ def __init__( batch_size, epochs, patience, + train_loss_func, + val_metric_func, num_workers, device, saving_path, @@ -282,7 +286,7 @@ def _train_model( with torch.no_grad(): for idx, data in enumerate(val_loader): inputs = 
self._assemble_input_for_validating(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_mse = ( calc_mse( results["imputed_data"], @@ -425,7 +429,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputed_data = results["imputed_data"] imputation_collector.append(imputed_data) diff --git a/pypots/nn/modules/csdi/backbone.py b/pypots/nn/modules/csdi/backbone.py index 26051060..ca6bf27b 100644 --- a/pypots/nn/modules/csdi/backbone.py +++ b/pypots/nn/modules/csdi/backbone.py @@ -83,23 +83,19 @@ def set_input_to_diffmodel(self, noisy_data, observed_data, cond_mask): return total_input - def calc_loss_valid( - self, observed_data, cond_mask, indicating_mask, side_info, is_train - ): + def calc_loss_valid(self, observed_data, cond_mask, indicating_mask, side_info): loss_sum = 0 for t in range(self.n_diffusion_steps): # calculate loss for all t loss = self.calc_loss( - observed_data, cond_mask, indicating_mask, side_info, is_train, set_t=t + observed_data, cond_mask, indicating_mask, side_info, set_t=t ) loss_sum += loss.detach() return loss_sum / self.n_diffusion_steps - def calc_loss( - self, observed_data, cond_mask, indicating_mask, side_info, is_train, set_t=-1 - ): + def calc_loss(self, observed_data, cond_mask, indicating_mask, side_info, set_t=-1): B, K, L = observed_data.shape device = observed_data.device - if is_train != 1: # for validation + if not self.training: # for validation t = (torch.ones(B) * set_t).long().to(device) else: t = torch.randint(0, self.n_diffusion_steps, [B]).to(device) diff --git a/pypots/nn/modules/usgan/backbone.py b/pypots/nn/modules/usgan/backbone.py index 42d7f430..fdc5bcbd 100644 --- a/pypots/nn/modules/usgan/backbone.py +++ b/pypots/nn/modules/usgan/backbone.py @@ -43,7 +43,6 @@ def forward( self, inputs: dict, training_object: str = "generator", - training: bool = True, ) -> Tuple[torch.Tensor, ...]: ( imputed_data, @@ -56,7 +55,7 @@ def forward( ) = self.generator(inputs) # if in training mode, return results with losses - if training: + if self.training: forward_X = inputs["forward"]["X"] forward_missing_mask = inputs["forward"]["missing_mask"] From 620a95c610be65347c13e3595722d18e1206e569 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Wed, 8 May 2024 18:50:33 +0800 Subject: [PATCH 02/19] refactor: move pypots.utils.metrics to pypots.nn.functional; --- pypots/classification/base.py | 2 +- pypots/clustering/crli/core.py | 2 +- pypots/clustering/vader/core.py | 2 +- pypots/clustering/vader/model.py | 2 +- pypots/forecasting/base.py | 2 +- pypots/imputation/base.py | 2 +- pypots/imputation/gpvae/data.py | 1 + pypots/imputation/gpvae/model.py | 2 +- pypots/imputation/mrnn/core.py | 2 +- .../nonstationary_transformer/core.py | 2 +- pypots/imputation/saits/core.py | 2 +- pypots/imputation/saits/model.py | 2 +- pypots/imputation/timesnet/core.py | 2 +- pypots/imputation/transformer/model.py | 2 +- pypots/imputation/usgan/model.py | 2 +- pypots/nn/functional/__init__.py | 49 +++++++++++++++++++ .../functional}/classification.py | 0 .../metrics => nn/functional}/clustering.py | 0 .../{utils/metrics => nn/functional}/error.py | 0 pypots/nn/modules/film/__init__.py | 2 +- pypots/nn/modules/transformer/attention.py | 2 +- pypots/optim/lr_scheduler/__init__.py | 9 ++-- pypots/utils/metrics/__init__.py | 13 +++-- 23 files changed, 80
insertions(+), 24 deletions(-) rename pypots/{utils/metrics => nn/functional}/classification.py (100%) rename pypots/{utils/metrics => nn/functional}/clustering.py (100%) rename pypots/{utils/metrics => nn/functional}/error.py (100%) diff --git a/pypots/classification/base.py b/pypots/classification/base.py index fef6675b..81340288 100644 --- a/pypots/classification/base.py +++ b/pypots/classification/base.py @@ -15,8 +15,8 @@ from torch.utils.data import DataLoader from ..base import BaseModel, BaseNNModel +from ..nn.functional import calc_acc from ..utils.logging import logger -from ..utils.metrics import calc_acc try: import nni diff --git a/pypots/clustering/crli/core.py b/pypots/clustering/crli/core.py index 74bc7605..fb90ad39 100644 --- a/pypots/clustering/crli/core.py +++ b/pypots/clustering/crli/core.py @@ -13,8 +13,8 @@ import torch.nn.functional as F from sklearn.cluster import KMeans +from ...nn.functional import calc_mse from ...nn.modules.crli import BackboneCRLI -from ...utils.metrics import calc_mse class _CRLI(nn.Module): diff --git a/pypots/clustering/vader/core.py b/pypots/clustering/vader/core.py index bb19c1e3..0268c7cd 100644 --- a/pypots/clustering/vader/core.py +++ b/pypots/clustering/vader/core.py @@ -11,8 +11,8 @@ import torch import torch.nn as nn +from ...nn.functional import calc_mse from ...nn.modules.vader import BackboneVaDER -from ...utils.metrics import calc_mse def inverse_softplus(x: np.ndarray) -> np.ndarray: diff --git a/pypots/clustering/vader/model.py b/pypots/clustering/vader/model.py index d44257db..caff4453 100644 --- a/pypots/clustering/vader/model.py +++ b/pypots/clustering/vader/model.py @@ -16,8 +16,8 @@ from sklearn.mixture import GaussianMixture from torch.utils.data import DataLoader -from .data import DatasetForVaDER from .core import inverse_softplus, _VaDER +from .data import DatasetForVaDER from ..base import BaseNNClusterer from ...optim.adam import Adam from ...optim.base import Optimizer diff --git a/pypots/forecasting/base.py b/pypots/forecasting/base.py index fb88f7f0..427544b4 100644 --- a/pypots/forecasting/base.py +++ b/pypots/forecasting/base.py @@ -14,8 +14,8 @@ from torch.utils.data import DataLoader from ..base import BaseModel, BaseNNModel +from ..nn.functional import calc_mse from ..utils.logging import logger -from ..utils.metrics.error import calc_mse try: import nni diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index a7335a6c..828c7b58 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -15,8 +15,8 @@ from torch.utils.data import DataLoader from ..base import BaseModel, BaseNNModel +from ..nn.functional import calc_mse from ..utils.logging import logger -from ..utils.metrics import calc_mse try: import nni diff --git a/pypots/imputation/gpvae/data.py b/pypots/imputation/gpvae/data.py index af61ace3..4b00a10a 100644 --- a/pypots/imputation/gpvae/data.py +++ b/pypots/imputation/gpvae/data.py @@ -9,6 +9,7 @@ import torch from pygrinder import fill_and_get_mask_torch + from ...data.dataset import BaseDataset diff --git a/pypots/imputation/gpvae/model.py b/pypots/imputation/gpvae/model.py index 0ef68d40..b85ea782 100644 --- a/pypots/imputation/gpvae/model.py +++ b/pypots/imputation/gpvae/model.py @@ -27,7 +27,7 @@ from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger -from ...utils.metrics import calc_mse +from ...nn.functional import calc_mse class GPVAE(BaseNNImputer): diff --git a/pypots/imputation/mrnn/core.py 
b/pypots/imputation/mrnn/core.py index 2092d10e..b9490c29 100644 --- a/pypots/imputation/mrnn/core.py +++ b/pypots/imputation/mrnn/core.py @@ -9,8 +9,8 @@ import torch.nn as nn +from ...nn.functional import calc_rmse from ...nn.modules.mrnn import BackboneMRNN -from ...utils.metrics import calc_rmse class _MRNN(nn.Module): diff --git a/pypots/imputation/nonstationary_transformer/core.py b/pypots/imputation/nonstationary_transformer/core.py index cf17c8f8..cf631777 100644 --- a/pypots/imputation/nonstationary_transformer/core.py +++ b/pypots/imputation/nonstationary_transformer/core.py @@ -8,12 +8,12 @@ import torch.nn as nn +from ...nn.functional import nonstationary_norm, nonstationary_denorm from ...nn.modules.nonstationary_transformer import ( NonstationaryTransformerEncoder, Projector, ) from ...nn.modules.saits import SaitsLoss, SaitsEmbedding -from ...nn.functional.normalization import nonstationary_norm, nonstationary_denorm class _NonstationaryTransformer(nn.Module): diff --git a/pypots/imputation/saits/core.py b/pypots/imputation/saits/core.py index 93f79aea..9c7f327b 100644 --- a/pypots/imputation/saits/core.py +++ b/pypots/imputation/saits/core.py @@ -12,8 +12,8 @@ import torch import torch.nn as nn +from ...nn.functional import calc_mae from ...nn.modules.saits import BackboneSAITS -from ...utils.metrics import calc_mae class _SAITS(nn.Module): diff --git a/pypots/imputation/saits/model.py b/pypots/imputation/saits/model.py index bf523e05..6340503f 100644 --- a/pypots/imputation/saits/model.py +++ b/pypots/imputation/saits/model.py @@ -17,10 +17,10 @@ from ..base import BaseNNImputer from ...data.checking import key_in_data_set from ...data.dataset import BaseDataset +from ...nn.functional import calc_mae, calc_mse from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger -from ...utils.metrics import calc_mae, calc_mse class SAITS(BaseNNImputer): diff --git a/pypots/imputation/timesnet/core.py b/pypots/imputation/timesnet/core.py index 10503bbb..cd49ba30 100644 --- a/pypots/imputation/timesnet/core.py +++ b/pypots/imputation/timesnet/core.py @@ -7,10 +7,10 @@ import torch.nn as nn +from ...nn.functional import calc_mse from ...nn.functional import nonstationary_norm, nonstationary_denorm from ...nn.modules.timesnet import BackboneTimesNet from ...nn.modules.transformer.embedding import DataEmbedding -from ...utils.metrics import calc_mse class _TimesNet(nn.Module): diff --git a/pypots/imputation/transformer/model.py b/pypots/imputation/transformer/model.py index 9393ccff..d2676194 100644 --- a/pypots/imputation/transformer/model.py +++ b/pypots/imputation/transformer/model.py @@ -15,8 +15,8 @@ from .core import _Transformer from .data import DatasetForTransformer from ..base import BaseNNImputer -from ...data.dataset import BaseDataset from ...data.checking import key_in_data_set +from ...data.dataset import BaseDataset from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger diff --git a/pypots/imputation/usgan/model.py b/pypots/imputation/usgan/model.py index cb890c95..8b497d30 100644 --- a/pypots/imputation/usgan/model.py +++ b/pypots/imputation/usgan/model.py @@ -17,10 +17,10 @@ from .data import DatasetForUSGAN from ..base import BaseNNImputer from ...data.checking import key_in_data_set +from ...nn.functional import calc_mse from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger -from ...utils.metrics import calc_mse try: import nni 
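Taken together, the import rewrites in this commit mean user code should prefer the pypots.nn.functional namespace from here on; the old pypots.utils.metrics path is kept as a shim (see the __init__.py hunks that follow) but warns on import:

# preferred location after this refactor
from pypots.nn.functional import calc_mae, calc_mse, calc_rmse

# still importable for backward compatibility, but emits a deprecation warning
from pypots.utils.metrics import calc_mre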
diff --git a/pypots/nn/functional/__init__.py b/pypots/nn/functional/__init__.py index 36df2bc6..368260ab 100644 --- a/pypots/nn/functional/__init__.py +++ b/pypots/nn/functional/__init__.py @@ -5,10 +5,59 @@ # Created by Wenjie Du # License: BSD-3-Clause +from .classification import ( + calc_binary_classification_metrics, + calc_precision_recall_f1, + calc_pr_auc, + calc_roc_auc, + calc_acc, +) +from .clustering import ( + calc_rand_index, + calc_adjusted_rand_index, + calc_cluster_purity, + calc_nmi, + calc_chs, + calc_dbs, + calc_silhouette, + calc_internal_cluster_validation_metrics, + calc_external_cluster_validation_metrics, +) +from .error import ( + calc_mae, + calc_mse, + calc_rmse, + calc_mre, + calc_quantile_crps, + calc_quantile_crps_sum, +) from .normalization import nonstationary_norm, nonstationary_denorm __all__ = [ # normalization functions "nonstationary_norm", "nonstationary_denorm", + # error + "calc_mae", + "calc_mse", + "calc_rmse", + "calc_mre", + "calc_quantile_crps", + "calc_quantile_crps_sum", + # classification + "calc_binary_classification_metrics", + "calc_precision_recall_f1", + "calc_pr_auc", + "calc_roc_auc", + "calc_acc", + # clustering + "calc_rand_index", + "calc_adjusted_rand_index", + "calc_cluster_purity", + "calc_nmi", + "calc_chs", + "calc_dbs", + "calc_silhouette", + "calc_internal_cluster_validation_metrics", + "calc_external_cluster_validation_metrics", ] diff --git a/pypots/utils/metrics/classification.py b/pypots/nn/functional/classification.py similarity index 100% rename from pypots/utils/metrics/classification.py rename to pypots/nn/functional/classification.py diff --git a/pypots/utils/metrics/clustering.py b/pypots/nn/functional/clustering.py similarity index 100% rename from pypots/utils/metrics/clustering.py rename to pypots/nn/functional/clustering.py diff --git a/pypots/utils/metrics/error.py b/pypots/nn/functional/error.py similarity index 100% rename from pypots/utils/metrics/error.py rename to pypots/nn/functional/error.py diff --git a/pypots/nn/modules/film/__init__.py b/pypots/nn/modules/film/__init__.py index 4f97f20b..8d12cd09 100644 --- a/pypots/nn/modules/film/__init__.py +++ b/pypots/nn/modules/film/__init__.py @@ -17,8 +17,8 @@ # License: BSD-3-Clause -from .layers import HiPPO_LegT, SpectralConv1d from .backbone import BackboneFiLM +from .layers import HiPPO_LegT, SpectralConv1d __all__ = [ "HiPPO_LegT", diff --git a/pypots/nn/modules/transformer/attention.py b/pypots/nn/modules/transformer/attention.py index ecc4f85e..20f1c3f9 100644 --- a/pypots/nn/modules/transformer/attention.py +++ b/pypots/nn/modules/transformer/attention.py @@ -11,12 +11,12 @@ # Created by Wenjie Du # License: BSD-3-Clause +from abc import abstractmethod from typing import Tuple, Optional import torch import torch.nn as nn import torch.nn.functional as F -from abc import abstractmethod class AttentionOperator(nn.Module): diff --git a/pypots/optim/lr_scheduler/__init__.py b/pypots/optim/lr_scheduler/__init__.py index 02d5cfe4..1a90802e 100644 --- a/pypots/optim/lr_scheduler/__init__.py +++ b/pypots/optim/lr_scheduler/__init__.py @@ -9,14 +9,13 @@ # Created by Wenjie Du # License: BSD-3-Clause -from .lambda_lrs import LambdaLR -from .multiplicative_lrs import MultiplicativeLR -from .step_lrs import StepLR -from .multistep_lrs import MultiStepLR from .constant_lrs import ConstantLR from .exponential_lrs import ExponentialLR +from .lambda_lrs import LambdaLR from .linear_lrs import LinearLR - +from .multiplicative_lrs import MultiplicativeLR +from 
.multistep_lrs import MultiStepLR +from .step_lrs import StepLR __all__ = [ "LambdaLR", diff --git a/pypots/utils/metrics/__init__.py b/pypots/utils/metrics/__init__.py index ed309ce3..5453993c 100644 --- a/pypots/utils/metrics/__init__.py +++ b/pypots/utils/metrics/__init__.py @@ -5,14 +5,16 @@ # Created by Wenjie Du # License: BSD-3-Clause -from .classification import ( + +from ..logging import logger +from ...nn.functional.classification import ( calc_binary_classification_metrics, calc_precision_recall_f1, calc_pr_auc, calc_roc_auc, calc_acc, ) -from .clustering import ( +from ...nn.functional.clustering import ( calc_rand_index, calc_adjusted_rand_index, calc_cluster_purity, @@ -23,7 +25,7 @@ calc_internal_cluster_validation_metrics, calc_external_cluster_validation_metrics, ) -from .error import ( +from ...nn.functional.error import ( calc_mae, calc_mse, calc_rmse, @@ -32,6 +34,11 @@ calc_quantile_crps_sum, ) +logger.warning( + "🚨 Importing metrics from pypots.utils.metrics is deprecated. " + "Please import from pypots.nn.functional instead." +) + __all__ = [ # error "calc_mae", From c7dc6beaf401fbdcf4175a3fe4c271b81cc53a03 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 9 May 2024 12:27:55 +0800 Subject: [PATCH 03/19] feat: add BaseLoss and BaseMetric; --- pypots/nn/modules/loss.py | 28 ++++++++++++++++++++++++++++ pypots/nn/modules/metric.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 pypots/nn/modules/loss.py create mode 100644 pypots/nn/modules/metric.py diff --git a/pypots/nn/modules/loss.py b/pypots/nn/modules/loss.py new file mode 100644 index 00000000..0868d2ca --- /dev/null +++ b/pypots/nn/modules/loss.py @@ -0,0 +1,28 @@ +""" + +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + + +from .metric import BaseMetric +from ..functional import calc_mae + + +class BaseLoss(BaseMetric): + def __init__( + self, + ): + super().__init__() + + def forward(self, prediction, target): + raise NotImplementedError + + +class MAE_Loss(BaseLoss): + def __init__(self): + super().__init__() + + def forward(self, prediction, target, mask=None): + return calc_mae(prediction, target, mask) diff --git a/pypots/nn/modules/metric.py b/pypots/nn/modules/metric.py new file mode 100644 index 00000000..faea3d78 --- /dev/null +++ b/pypots/nn/modules/metric.py @@ -0,0 +1,29 @@ +""" + +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + + +import torch.nn as nn + +from ..functional import calc_pr_auc + + +class BaseMetric(nn.Module): + def __init__(self, lower_better: bool = True): + super().__init__() + self.lower_better = lower_better + + def forward(self, prediction, target): + raise NotImplementedError + + +class PR_AUC(BaseMetric): + def __init__(self): + super().__init__(lower_better=False) + + def forward(self, prediction, target): + pr_auc, _, _, _ = calc_pr_auc(prediction, target) + return pr_auc From f674a3c25ea4d7affe251554dbe896b8a66d79eb Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 27 Sep 2024 14:30:49 +0800 Subject: [PATCH 04/19] refactor: make FITS able to apply customized loss func; --- pypots/imputation/fits/core.py | 4 +-- pypots/imputation/fits/model.py | 53 ++++++++++++++++----------------- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/pypots/imputation/fits/core.py b/pypots/imputation/fits/core.py index 701ec4ca..ba3f2661 100644 --- a/pypots/imputation/fits/core.py +++ b/pypots/imputation/fits/core.py @@ -46,7 +46,7 @@ def __init__( self.output_projection = nn.Linear(n_features, n_features)
self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] if self.apply_nonstationary_norm: @@ -75,7 +75,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func(reconstruction, X_ori, missing_mask, indicating_mask) results["ORT_loss"] = ORT_loss diff --git a/pypots/imputation/fits/model.py b/pypots/imputation/fits/model.py index 2664da26..b5c9dc7a 100644 --- a/pypots/imputation/fits/model.py +++ b/pypots/imputation/fits/model.py @@ -33,26 +33,11 @@ class FITS(BaseNNImputer): n_features : The number of features in the time-series data sample. - n_layers : - The number of layers in the FITS model. + cut_freq : + The cut-off frequency for the Fourier transformation. - d_model : - The dimension of the model. - - n_heads : - The number of heads in each layer of FITS. - - d_ffn : - The dimension of the feed-forward network. - - factor : - The factor of the auto correlation mechanism for the FITS model. - - moving_avg_window_size : - The window size of moving average. - - dropout : - The dropout rate for the model. + individual : + Whether to use individual Fourier transformation for each feature. ORT_weight : The weight for the ORT loss, the same as SAITS. @@ -71,6 +56,14 @@ class FITS(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
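With these two keyword arguments in place, a customized objective can be handed to FITS at construction time. A minimal sketch of the single-item-dict convention checked in `BaseNNModel.__init__` (the data dimensions and `cut_freq` value are placeholders, and FITS's remaining hyperparameters are assumed to keep their defaults):

    from pypots.imputation import FITS
    from pypots.nn.functional import calc_mae, calc_mse

    model = FITS(
        n_steps=48,      # placeholder sequence length
        n_features=37,   # placeholder feature count
        cut_freq=12,     # placeholder cut-off frequency
        train_loss_func={"masked_MAE": calc_mae},  # {display name: callable}
        val_metric_func={"masked_MSE": calc_mse},
    )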
@@ -115,6 +108,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: int = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -123,14 +118,16 @@ def __init__( verbose: bool = True, ): super().__init__( - batch_size, - epochs, - patience, - num_workers, - device, - saving_path, - model_saving_strategy, - verbose, + batch_size=batch_size, + epochs=epochs, + patience=patience, + train_loss_func=train_loss_func, + val_metric_func=val_metric_func, + num_workers=num_workers, + device=device, + saving_path=saving_path, + model_saving_strategy=model_saving_strategy, + verbose=verbose, ) self.n_steps = n_steps @@ -272,7 +269,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return From 40e9602f6de20dc75bcd5f65388fe9fb1ec02809 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 27 Sep 2024 15:42:52 +0800 Subject: [PATCH 05/19] refactor: replace arg training with self attribute in new added models; --- pypots/imputation/grud/core.py | 4 ++-- pypots/imputation/grud/model.py | 2 +- pypots/imputation/imputeformer/core.py | 4 ++-- pypots/imputation/imputeformer/model.py | 2 +- pypots/imputation/koopa/model.py | 2 +- pypots/imputation/micn/core.py | 4 ++-- pypots/imputation/micn/model.py | 2 +- pypots/imputation/moderntcn/core.py | 4 ++-- pypots/imputation/moderntcn/model.py | 2 +- pypots/imputation/reformer/core.py | 4 ++-- pypots/imputation/reformer/model.py | 2 +- pypots/imputation/revinscinet/core.py | 4 ++-- pypots/imputation/revinscinet/model.py | 2 +- pypots/imputation/scinet/core.py | 4 ++-- pypots/imputation/scinet/model.py | 2 +- pypots/imputation/stemgnn/core.py | 4 ++-- pypots/imputation/stemgnn/model.py | 2 +- pypots/imputation/tcn/core.py | 4 ++-- pypots/imputation/tcn/model.py | 2 +- pypots/imputation/tefn/core.py | 5 +++-- pypots/imputation/tefn/model.py | 2 +- pypots/imputation/tide/core.py | 4 ++-- pypots/imputation/tide/model.py | 2 +- pypots/imputation/timemixer/core.py | 5 +++-- pypots/imputation/timemixer/model.py | 2 +- 25 files changed, 39 insertions(+), 37 deletions(-) diff --git a/pypots/imputation/grud/core.py b/pypots/imputation/grud/core.py index 98f368e0..713259c2 100644 --- a/pypots/imputation/grud/core.py +++ b/pypots/imputation/grud/core.py @@ -33,7 +33,7 @@ def __init__( ) self.output_projection = nn.Linear(rnn_hidden_size, n_features) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: """Forward processing of GRU-D. 
Parameters @@ -66,7 +66,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: results["loss"] = calc_mse(reconstruction, X, missing_mask) return results diff --git a/pypots/imputation/grud/model.py b/pypots/imputation/grud/model.py index 008408a9..b2ea4a0d 100644 --- a/pypots/imputation/grud/model.py +++ b/pypots/imputation/grud/model.py @@ -225,7 +225,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputed_data = results["imputed_data"] imputation_collector.append(imputed_data) diff --git a/pypots/imputation/imputeformer/core.py b/pypots/imputation/imputeformer/core.py index ceb81630..46d49aed 100644 --- a/pypots/imputation/imputeformer/core.py +++ b/pypots/imputation/imputeformer/core.py @@ -92,7 +92,7 @@ def __init__( # apply SAITS loss function to Transformer on the imputation task self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: x, missing_mask = inputs["X"], inputs["missing_mask"] # x: (batch_size, in_steps, num_nodes) @@ -132,7 +132,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func(reconstruction, X_ori, missing_mask, indicating_mask) results["ORT_loss"] = ORT_loss diff --git a/pypots/imputation/imputeformer/model.py b/pypots/imputation/imputeformer/model.py index 2c1e229c..334badd9 100644 --- a/pypots/imputation/imputeformer/model.py +++ b/pypots/imputation/imputeformer/model.py @@ -283,7 +283,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputed_data = results["imputed_data"] imputation_collector.append(imputed_data) diff --git a/pypots/imputation/koopa/model.py b/pypots/imputation/koopa/model.py index 76c72c1f..ab780553 100644 --- a/pypots/imputation/koopa/model.py +++ b/pypots/imputation/koopa/model.py @@ -295,7 +295,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/micn/core.py b/pypots/imputation/micn/core.py index 11bfa394..153413e2 100644 --- a/pypots/imputation/micn/core.py +++ b/pypots/imputation/micn/core.py @@ -60,7 +60,7 @@ def __init__( # for the imputation task, the output dim is the same as input dim self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] seasonal_init, trend_init = self.decomp_multi(X) @@ -82,7 +82,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] 
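    # The same two-line change recurs in every model in this patch: the
    # `training` keyword is dropped and the branch keys off torch.nn.Module's
    # built-in flag, which model.train()/model.eval() toggle. A self-contained
    # illustration of that contract (the toy module below is hypothetical):
    import torch
    import torch.nn as nn

    class TinyImputer(nn.Module):
        def forward(self, inputs: dict) -> dict:
            reconstruction = inputs["X"] * 0.5  # stand-in for a real backbone
            results = {"imputed_data": reconstruction}
            # self.training is managed by Module.train()/Module.eval(),
            # so forward() no longer needs an explicit `training` argument
            if self.training:
                results["loss"] = torch.mean((reconstruction - inputs["X"]) ** 2)
            return results

    model = TinyImputer()
    model.eval()  # what an inference path would do before calling forward()
    with torch.no_grad():
        out = model({"X": torch.randn(8, 48, 37)})
    assert "loss" not in out  # no loss is computed outside training mode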
loss, ORT_loss, MIT_loss = self.saits_loss_func(reconstruction, X_ori, missing_mask, indicating_mask) results["ORT_loss"] = ORT_loss diff --git a/pypots/imputation/micn/model.py b/pypots/imputation/micn/model.py index 3456d539..59061d93 100644 --- a/pypots/imputation/micn/model.py +++ b/pypots/imputation/micn/model.py @@ -276,7 +276,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/moderntcn/core.py b/pypots/imputation/moderntcn/core.py index 3ca8e8f5..4c64d2b8 100644 --- a/pypots/imputation/moderntcn/core.py +++ b/pypots/imputation/moderntcn/core.py @@ -66,7 +66,7 @@ def __init__( individual, ) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] if self.apply_nonstationary_norm: @@ -88,7 +88,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: loss = calc_mse(reconstruction, inputs["X_ori"], inputs["indicating_mask"]) results["loss"] = loss diff --git a/pypots/imputation/moderntcn/model.py b/pypots/imputation/moderntcn/model.py index 68ed84ba..145a70d8 100644 --- a/pypots/imputation/moderntcn/model.py +++ b/pypots/imputation/moderntcn/model.py @@ -306,7 +306,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/reformer/core.py b/pypots/imputation/reformer/core.py index ec55c7ad..6c74f01d 100644 --- a/pypots/imputation/reformer/core.py +++ b/pypots/imputation/reformer/core.py @@ -54,7 +54,7 @@ def __init__( self.output_projection = nn.Linear(d_model, n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original Reformer paper isn't proposed for imputation task. 
Hence the model doesn't take @@ -75,7 +75,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func(reconstruction, X_ori, missing_mask, indicating_mask) results["ORT_loss"] = ORT_loss diff --git a/pypots/imputation/reformer/model.py b/pypots/imputation/reformer/model.py index 072c6894..c61f6a85 100644 --- a/pypots/imputation/reformer/model.py +++ b/pypots/imputation/reformer/model.py @@ -295,7 +295,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/revinscinet/core.py b/pypots/imputation/revinscinet/core.py index 16d199d3..a3188d5d 100644 --- a/pypots/imputation/revinscinet/core.py +++ b/pypots/imputation/revinscinet/core.py @@ -59,7 +59,7 @@ def __init__( # for the imputation task, the output dim is the same as input dim self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] X = self.revin(X, missing_mask, mode="norm") @@ -80,7 +80,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func(reconstruction, X_ori, missing_mask, indicating_mask) results["ORT_loss"] = ORT_loss diff --git a/pypots/imputation/revinscinet/model.py b/pypots/imputation/revinscinet/model.py index 9056f552..4d10967c 100644 --- a/pypots/imputation/revinscinet/model.py +++ b/pypots/imputation/revinscinet/model.py @@ -300,7 +300,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/scinet/core.py b/pypots/imputation/scinet/core.py index 4d2b02a1..1df706c7 100644 --- a/pypots/imputation/scinet/core.py +++ b/pypots/imputation/scinet/core.py @@ -57,7 +57,7 @@ def __init__( # for the imputation task, the output dim is the same as input dim self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original SCINet paper isn't proposed for imputation task. 
Hence the model doesn't take @@ -76,7 +76,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func(reconstruction, X_ori, missing_mask, indicating_mask) results["ORT_loss"] = ORT_loss diff --git a/pypots/imputation/scinet/model.py b/pypots/imputation/scinet/model.py index 4dd0fa27..29dac999 100644 --- a/pypots/imputation/scinet/model.py +++ b/pypots/imputation/scinet/model.py @@ -302,7 +302,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/stemgnn/core.py b/pypots/imputation/stemgnn/core.py index d8d51efb..650e7bb5 100644 --- a/pypots/imputation/stemgnn/core.py +++ b/pypots/imputation/stemgnn/core.py @@ -48,7 +48,7 @@ def __init__( self.output_projection = nn.Linear(d_model, n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original StemGNN paper isn't proposed for imputation task. Hence the model doesn't take @@ -69,7 +69,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func(reconstruction, X_ori, missing_mask, indicating_mask) results["ORT_loss"] = ORT_loss diff --git a/pypots/imputation/stemgnn/model.py b/pypots/imputation/stemgnn/model.py index ecee2c80..2f75f4d6 100644 --- a/pypots/imputation/stemgnn/model.py +++ b/pypots/imputation/stemgnn/model.py @@ -276,7 +276,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/tcn/core.py b/pypots/imputation/tcn/core.py index c38390b5..7274af14 100644 --- a/pypots/imputation/tcn/core.py +++ b/pypots/imputation/tcn/core.py @@ -45,7 +45,7 @@ def __init__( self.output_projection = nn.Linear(channel_sizes[-1], n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # WDU: the original TCN paper isn't proposed for imputation task. 
Hence the model doesn't take @@ -68,7 +68,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func(reconstruction, X_ori, missing_mask, indicating_mask) results["ORT_loss"] = ORT_loss diff --git a/pypots/imputation/tcn/model.py b/pypots/imputation/tcn/model.py index 2b33251a..28e987f3 100644 --- a/pypots/imputation/tcn/model.py +++ b/pypots/imputation/tcn/model.py @@ -270,7 +270,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/tefn/core.py b/pypots/imputation/tefn/core.py index f71927a6..ca11825e 100644 --- a/pypots/imputation/tefn/core.py +++ b/pypots/imputation/tefn/core.py @@ -32,7 +32,7 @@ def __init__( n_fod, ) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] if self.apply_nonstationary_norm: @@ -51,7 +51,8 @@ def forward(self, inputs: dict, training: bool = True) -> dict: "imputed_data": imputed_data, } - if training: + # if in training mode, return results with losses + if self.training: # `loss` is always the item for backward propagating to update the model loss = calc_mse(out, inputs["X_ori"], inputs["indicating_mask"]) results["loss"] = loss diff --git a/pypots/imputation/tefn/model.py b/pypots/imputation/tefn/model.py index 2925d8a6..6d55bd7c 100644 --- a/pypots/imputation/tefn/model.py +++ b/pypots/imputation/tefn/model.py @@ -250,7 +250,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/tide/core.py b/pypots/imputation/tide/core.py index e826cbeb..876b3ec4 100644 --- a/pypots/imputation/tide/core.py +++ b/pypots/imputation/tide/core.py @@ -82,7 +82,7 @@ def __init__( # self.output_projection = nn.Linear(d_model, n_features) self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] # # WDU: the original TiDE paper isn't proposed for imputation task. 
Hence the model doesn't take @@ -112,7 +112,7 @@ def forward(self, inputs: dict, training: bool = True) -> dict: } # if in training mode, return results with losses - if training: + if self.training: X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"] loss, ORT_loss, MIT_loss = self.saits_loss_func(reconstruction, X_ori, missing_mask, indicating_mask) results["ORT_loss"] = ORT_loss diff --git a/pypots/imputation/tide/model.py b/pypots/imputation/tide/model.py index 7b14a5a6..693e5c5d 100644 --- a/pypots/imputation/tide/model.py +++ b/pypots/imputation/tide/model.py @@ -282,7 +282,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return diff --git a/pypots/imputation/timemixer/core.py b/pypots/imputation/timemixer/core.py index c094d2ff..f988c6fc 100644 --- a/pypots/imputation/timemixer/core.py +++ b/pypots/imputation/timemixer/core.py @@ -56,7 +56,7 @@ def __init__( use_future_temporal_feature=False, ) - def forward(self, inputs: dict, training: bool = True) -> dict: + def forward(self, inputs: dict) -> dict: X, missing_mask = inputs["X"], inputs["missing_mask"] if self.apply_nonstationary_norm: @@ -75,7 +75,8 @@ def forward(self, inputs: dict, training: bool = True) -> dict: "imputed_data": imputed_data, } - if training: + # if in training mode, return results with losses + if self.training: # `loss` is always the item for backward propagating to update the model loss = calc_mse(dec_out, inputs["X_ori"], inputs["indicating_mask"]) results["loss"] = loss diff --git a/pypots/imputation/timemixer/model.py b/pypots/imputation/timemixer/model.py index 89b24011..7ebf10ca 100644 --- a/pypots/imputation/timemixer/model.py +++ b/pypots/imputation/timemixer/model.py @@ -307,7 +307,7 @@ def predict( with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self._assemble_input_for_testing(data) - results = self.model.forward(inputs, training=False) + results = self.model.forward(inputs) imputation_collector.append(results["imputed_data"]) # Step 3: output collection and return From cc286eea7461434c9516500dc9cf48a8c0a030c5 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 30 Sep 2024 17:39:12 +0800 Subject: [PATCH 06/19] fix: globally replace importing from with ; --- pypots/imputation/grud/core.py | 2 +- pypots/imputation/moderntcn/core.py | 2 +- pypots/imputation/tefn/core.py | 2 +- pypots/imputation/timemixer/core.py | 2 +- pypots/imputation/timesnet/core.py | 2 +- pypots/nn/modules/brits/backbone.py | 2 +- pypots/nn/modules/saits/loss.py | 2 +- pypots/nn/modules/usgan/backbone.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pypots/imputation/grud/core.py b/pypots/imputation/grud/core.py index 713259c2..44f85e44 100644 --- a/pypots/imputation/grud/core.py +++ b/pypots/imputation/grud/core.py @@ -9,8 +9,8 @@ import torch.nn as nn +from ...nn.functional import calc_mse from ...nn.modules.grud import BackboneGRUD -from ...utils.metrics import calc_mse class _GRUD(nn.Module): diff --git a/pypots/imputation/moderntcn/core.py b/pypots/imputation/moderntcn/core.py index 4c64d2b8..91a43495 100644 --- a/pypots/imputation/moderntcn/core.py +++ b/pypots/imputation/moderntcn/core.py @@ -9,9 +9,9 @@ import torch.nn as nn from ...nn.functional import nonstationary_norm, nonstationary_denorm +from 
...nn.functional import calc_mse from ...nn.modules.moderntcn import BackboneModernTCN from ...nn.modules.patchtst.layers import FlattenHead -from ...utils.metrics import calc_mse class _ModernTCN(nn.Module): diff --git a/pypots/imputation/tefn/core.py b/pypots/imputation/tefn/core.py index ca11825e..c5bdc6d7 100644 --- a/pypots/imputation/tefn/core.py +++ b/pypots/imputation/tefn/core.py @@ -7,9 +7,9 @@ import torch.nn as nn +from ...nn.functional import calc_mse from ...nn.functional import nonstationary_norm, nonstationary_denorm from ...nn.modules.tefn import BackboneTEFN -from ...utils.metrics import calc_mse class _TEFN(nn.Module): diff --git a/pypots/imputation/timemixer/core.py b/pypots/imputation/timemixer/core.py index f988c6fc..8127e04a 100644 --- a/pypots/imputation/timemixer/core.py +++ b/pypots/imputation/timemixer/core.py @@ -11,8 +11,8 @@ nonstationary_norm, nonstationary_denorm, ) +from ...nn.functional import calc_mse from ...nn.modules.timemixer import BackboneTimeMixer -from ...utils.metrics import calc_mse class _TimeMixer(nn.Module): diff --git a/pypots/imputation/timesnet/core.py b/pypots/imputation/timesnet/core.py index 26f8424a..c4b203db 100644 --- a/pypots/imputation/timesnet/core.py +++ b/pypots/imputation/timesnet/core.py @@ -7,8 +7,8 @@ import torch.nn as nn -from ...nn.functional import calc_mse from ...nn.functional import nonstationary_norm, nonstationary_denorm +from ...nn.functional import calc_mse from ...nn.modules.timesnet import BackboneTimesNet from ...nn.modules.transformer.embedding import DataEmbedding diff --git a/pypots/nn/modules/brits/backbone.py b/pypots/nn/modules/brits/backbone.py index eef07cc2..c779cbc9 100644 --- a/pypots/nn/modules/brits/backbone.py +++ b/pypots/nn/modules/brits/backbone.py @@ -12,7 +12,7 @@ from .layers import FeatureRegression from ..grud.layers import TemporalDecay -from ....utils.metrics import calc_mae +from ....nn.functional import calc_mae class BackboneRITS(nn.Module): diff --git a/pypots/nn/modules/saits/loss.py b/pypots/nn/modules/saits/loss.py index 0052dce2..dc19ad4a 100644 --- a/pypots/nn/modules/saits/loss.py +++ b/pypots/nn/modules/saits/loss.py @@ -10,7 +10,7 @@ import torch.nn as nn -from ....utils.metrics import calc_mae +from ....nn.functional import calc_mae class SaitsLoss(nn.Module): diff --git a/pypots/nn/modules/usgan/backbone.py b/pypots/nn/modules/usgan/backbone.py index 9b7fa079..4ecbfef5 100644 --- a/pypots/nn/modules/usgan/backbone.py +++ b/pypots/nn/modules/usgan/backbone.py @@ -13,7 +13,7 @@ from .layers import UsganDiscriminator from ..brits import BackboneBRITS -from ....utils.metrics import calc_mse +from ....nn.functional import calc_mse class BackboneUSGAN(nn.Module): From ba4588ce16c030f864b407fe963680e212da33e8 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 30 Sep 2024 20:51:26 +0800 Subject: [PATCH 07/19] refactor: still keep pypots.utils.metrics for future compatibility; --- pypots/utils/metrics/classification.py | 23 +++++++++++++++++++++++ pypots/utils/metrics/clustering.py | 24 ++++++++++++++++++++++++ pypots/utils/metrics/error.py | 21 +++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 pypots/utils/metrics/classification.py create mode 100644 pypots/utils/metrics/clustering.py create mode 100644 pypots/utils/metrics/error.py diff --git a/pypots/utils/metrics/classification.py b/pypots/utils/metrics/classification.py new file mode 100644 index 00000000..20481631 --- /dev/null +++ b/pypots/utils/metrics/classification.py @@ -0,0 +1,23 @@ +""" 
+Evaluation metrics related to classification. +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + + +from ..logging import logger +from ...nn.functional.classification import * + +# pypots.utils.metrics.classification is deprecated, and moved to pypots.nn.functional.classification +logger.warning( + "🚨 Please import from pypots.nn.functional.classification instead of pypots.utils.metrics.classification" +) + +__all__ = [ + "calc_binary_classification_metrics", + "calc_precision_recall_f1", + "calc_pr_auc", + "calc_roc_auc", + "calc_acc", +] diff --git a/pypots/utils/metrics/clustering.py b/pypots/utils/metrics/clustering.py new file mode 100644 index 00000000..eb6a668b --- /dev/null +++ b/pypots/utils/metrics/clustering.py @@ -0,0 +1,24 @@ +""" +Evaluation metrics related to clustering. +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +from ..logging import logger +from ...nn.functional.clustering import * + +# pypots.utils.metrics.clustering is deprecated, and moved to pypots.nn.functional.clustering +logger.warning("🚨 Please import from pypots.nn.functional.clustering instead of pypots.utils.metrics.clustering") + +__all__ = [ + "calc_rand_index", + "calc_adjusted_rand_index", + "calc_cluster_purity", + "calc_nmi", + "calc_chs", + "calc_dbs", + "calc_silhouette", + "calc_internal_cluster_validation_metrics", + "calc_external_cluster_validation_metrics", +] diff --git a/pypots/utils/metrics/error.py b/pypots/utils/metrics/error.py new file mode 100644 index 00000000..a25e7647 --- /dev/null +++ b/pypots/utils/metrics/error.py @@ -0,0 +1,21 @@ +""" +Evaluation metrics related to error calculation (like in tasks regression, imputation etc). +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +from ..logging import logger +from ...nn.functional.error import * + +# pypots.utils.metrics.error is deprecated, and moved to pypots.nn.functional.error +logger.warning("🚨 Please import from pypots.nn.functional.error instead of pypots.utils.metrics.error") + +__all__ = [ + "calc_mae", + "calc_mse", + "calc_rmse", + "calc_mre", + "calc_quantile_crps", + "calc_quantile_crps_sum", +] From 86340d0bb2fceebfe593bddd0a889f10297665c0 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 3 Oct 2024 23:10:33 +0800 Subject: [PATCH 08/19] refactor: do not expose by default; --- pypots/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypots/utils/__init__.py b/pypots/utils/__init__.py index d564bf31..24542ad4 100644 --- a/pypots/utils/__init__.py +++ b/pypots/utils/__init__.py @@ -10,7 +10,7 @@ # content files in this package "file", "logging", - "metrics", + # "metrics", # deprecated and everything is moved to nn.functional, hence do not import it by default "random", "visual", ] From b4b5b489c860894ccac7f07e3aa6e2842adb9a07 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 8 Oct 2024 21:07:05 +0800 Subject: [PATCH 09/19] refactor: remove lingting issues; --- pypots/utils/metrics/__init__.py | 5 +---- pypots/utils/metrics/classification.py | 8 +++++++- pypots/utils/metrics/clustering.py | 12 +++++++++++- pypots/utils/metrics/error.py | 9 ++++++++- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pypots/utils/metrics/__init__.py b/pypots/utils/metrics/__init__.py index 5453993c..e3627f9c 100644 --- a/pypots/utils/metrics/__init__.py +++ b/pypots/utils/metrics/__init__.py @@ -34,10 +34,7 @@ calc_quantile_crps_sum, ) -logger.warning( - "🚨 Importing metrics from pypots.utils.metrics is deprecated. 
" - "Please import from pypots.nn.functional instead." -) +logger.warning("‼️ `pypots.utils.metrics` is deprecated. Please import from `pypots.nn.functional` instead.") __all__ = [ # error diff --git a/pypots/utils/metrics/classification.py b/pypots/utils/metrics/classification.py index 20481631..eaef8e68 100644 --- a/pypots/utils/metrics/classification.py +++ b/pypots/utils/metrics/classification.py @@ -7,7 +7,13 @@ from ..logging import logger -from ...nn.functional.classification import * +from ...nn.functional.classification import ( + calc_binary_classification_metrics, + calc_precision_recall_f1, + calc_pr_auc, + calc_roc_auc, + calc_acc, +) # pypots.utils.metrics.classification is deprecated, and moved to pypots.nn.functional.classification logger.warning( diff --git a/pypots/utils/metrics/clustering.py b/pypots/utils/metrics/clustering.py index eb6a668b..a37dba58 100644 --- a/pypots/utils/metrics/clustering.py +++ b/pypots/utils/metrics/clustering.py @@ -6,7 +6,17 @@ # License: BSD-3-Clause from ..logging import logger -from ...nn.functional.clustering import * +from ...nn.functional.clustering import ( + calc_rand_index, + calc_adjusted_rand_index, + calc_cluster_purity, + calc_nmi, + calc_chs, + calc_dbs, + calc_silhouette, + calc_internal_cluster_validation_metrics, + calc_external_cluster_validation_metrics, +) # pypots.utils.metrics.clustering is deprecated, and moved to pypots.nn.functional.clustering logger.warning("🚨 Please import from pypots.nn.functional.clustering instead of pypots.utils.metrics.clustering") diff --git a/pypots/utils/metrics/error.py b/pypots/utils/metrics/error.py index a25e7647..0ff62e28 100644 --- a/pypots/utils/metrics/error.py +++ b/pypots/utils/metrics/error.py @@ -6,7 +6,14 @@ # License: BSD-3-Clause from ..logging import logger -from ...nn.functional.error import * +from ...nn.functional.error import ( + calc_mae, + calc_mse, + calc_rmse, + calc_mre, + calc_quantile_crps, + calc_quantile_crps_sum, +) # pypots.utils.metrics.error is deprecated, and moved to pypots.nn.functional.error logger.warning("🚨 Please import from pypots.nn.functional.error instead of pypots.utils.metrics.error") From af3bf85ee4854b81d2c5464db86a92ff08076676 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 2 Dec 2024 11:52:10 +0800 Subject: [PATCH 10/19] fix: rename labels in classification CSAI into y; --- pypots/classification/csai/core.py | 4 ++-- pypots/classification/csai/model.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pypots/classification/csai/core.py b/pypots/classification/csai/core.py index dbdd025b..9ff1883e 100644 --- a/pypots/classification/csai/core.py +++ b/pypots/classification/csai/core.py @@ -81,8 +81,8 @@ def forward(self, inputs: dict, training: bool = True) -> dict: results["consistency_loss"] = consistency_loss results["reconstruction_loss"] = reconstruction_loss # print(inputs["labels"].unsqueeze(1)) - f_classification_loss = F.nll_loss(torch.log(f_prediction), inputs["labels"]) - b_classification_loss = F.nll_loss(torch.log(b_prediction), inputs["labels"]) + f_classification_loss = F.nll_loss(torch.log(f_prediction), inputs["y"]) + b_classification_loss = F.nll_loss(torch.log(b_prediction), inputs["y"]) # f_classification_loss, _ = criterion(f_prediction, f_logits, inputs["labels"].unsqueeze(1).float()) # b_classification_loss, _ = criterion(b_prediction, b_logits, inputs["labels"].unsqueeze(1).float()) classification_loss = f_classification_loss + b_classification_loss diff --git a/pypots/classification/csai/model.py 
b/pypots/classification/csai/model.py index 3419c5bb..5ed3bc95 100644 --- a/pypots/classification/csai/model.py +++ b/pypots/classification/csai/model.py @@ -185,7 +185,7 @@ def _assemble_input_for_training(self, data: list, training=True) -> dict: inputs = { "indices": indices, - "labels": labels, + "y": labels, "forward": { "X": X, "missing_mask": missing_mask, From 054b3c9b60d9032fa17350d2fd00f1af68cddecd Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 2 Dec 2024 12:07:23 +0800 Subject: [PATCH 11/19] refactor: add customized loss and metric funcs for CSAI models; --- pypots/classification/csai/model.py | 30 ++++++++++++++++++++--------- pypots/imputation/csai/model.py | 28 +++++++++++++++++++-------- 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/pypots/classification/csai/model.py b/pypots/classification/csai/model.py index 5ed3bc95..34d95b1e 100644 --- a/pypots/classification/csai/model.py +++ b/pypots/classification/csai/model.py @@ -70,6 +70,14 @@ class CSAI(BaseNNClassifier): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default loss as metric as claimed in the original paper. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. @@ -120,6 +128,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optimizer = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -128,15 +138,17 @@ def __init__( verbose: bool = True, ): super().__init__( - n_classes, - batch_size, - epochs, - patience, - num_workers, - device, - saving_path, - model_saving_strategy, - verbose, + n_classes=n_classes, + batch_size=batch_size, + epochs=epochs, + patience=patience, + train_loss_func=train_loss_func, + val_metric_func=val_metric_func, + num_workers=num_workers, + device=device, + saving_path=saving_path, + model_saving_strategy=model_saving_strategy, + verbose=verbose, ) self.n_steps = n_steps diff --git a/pypots/imputation/csai/model.py b/pypots/imputation/csai/model.py index a579fd2c..ddc7a0bf 100644 --- a/pypots/imputation/csai/model.py +++ b/pypots/imputation/csai/model.py @@ -64,6 +64,14 @@ class CSAI(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func : + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
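The label rename from PATCH 10 above is mirrored in how training batches are assembled: class labels now travel under the key "y". Schematically, matching the updated loss computation in csai/core.py (batch size and class count here are illustrative):

    import torch
    import torch.nn.functional as F

    f_prediction = torch.softmax(torch.randn(16, 2), dim=1)  # forward-branch class probabilities
    inputs = {"y": torch.randint(0, 2, (16,))}               # labels keyed as "y", no longer "labels"

    f_classification_loss = F.nll_loss(torch.log(f_prediction), inputs["y"])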
@@ -119,6 +127,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Union[str, torch.device, list, None] = None, @@ -127,14 +137,16 @@ def __init__( verbose: bool = True, ): super().__init__( - batch_size, - epochs, - patience, - num_workers, - device, - saving_path, - model_saving_strategy, - verbose, + batch_size=batch_size, + epochs=epochs, + patience=patience, + train_loss_func=train_loss_func, + val_metric_func=val_metric_func, + num_workers=num_workers, + device=device, + saving_path=saving_path, + model_saving_strategy=model_saving_strategy, + verbose=verbose, ) self.n_steps = n_steps From 9bc40f4c000ca1fd24f4f91b48243ee54fa7bd19 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 2 Dec 2024 14:36:11 +0800 Subject: [PATCH 12/19] docs: update doc strings; --- pypots/classification/base.py | 2 +- pypots/classification/brits/model.py | 6 +----- pypots/classification/csai/model.py | 2 +- pypots/classification/grud/model.py | 2 +- pypots/classification/raindrop/model.py | 2 +- pypots/clustering/base.py | 2 +- pypots/clustering/crli/model.py | 2 +- pypots/clustering/vader/model.py | 2 +- pypots/forecasting/base.py | 2 +- pypots/forecasting/csdi/model.py | 2 +- pypots/imputation/csdi/model.py | 3 ++- 11 files changed, 12 insertions(+), 15 deletions(-) diff --git a/pypots/classification/base.py b/pypots/classification/base.py index a837d9c9..6c6ed763 100644 --- a/pypots/classification/base.py +++ b/pypots/classification/base.py @@ -166,7 +166,7 @@ class BaseNNClassifier(BaseNNModel): val_metric_func: The customized metric function designed by users for validating the model. - If not given, will use the default MSE metric. + If not given, will use the default loss from the original paper as the metric. num_workers : The number of subprocesses to use for data loading. diff --git a/pypots/classification/brits/model.py b/pypots/classification/brits/model.py index a08d46ee..6721641f 100644 --- a/pypots/classification/brits/model.py +++ b/pypots/classification/brits/model.py @@ -59,7 +59,7 @@ class BRITS(BaseNNClassifier): val_metric_func: The customized metric function designed by users for validating the model. - If not given, will use the default MSE metric. + If not given, will use the default loss from the original paper as the metric. optimizer : The optimizer for model training. @@ -133,10 +133,6 @@ def __init__( self.classification_weight = classification_weight self.reconstruction_weight = reconstruction_weight - # CSDI has its own defined loss function, so we set them as None here - self.train_loss_func = None - self.train_loss_func_name = "default" - # set up the model self.model = _BRITS( self.n_steps, diff --git a/pypots/classification/csai/model.py b/pypots/classification/csai/model.py index 34d95b1e..72e76522 100644 --- a/pypots/classification/csai/model.py +++ b/pypots/classification/csai/model.py @@ -76,7 +76,7 @@ class CSAI(BaseNNClassifier): val_metric_func: The customized metric function designed by users for validating the model. - If not given, will use the default loss as metric as claimed in the original paper. + If not given, will use the default loss from the original paper as the metric. optimizer : The optimizer for model training. 
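A quick aside on the metrics relocation handled in PATCH 07-09 of this series: the shim modules keep the old import path alive but log a warning, so both of the following work, and only the second is future-proof:

    # deprecated path: still importable, but logs
    # "`pypots.utils.metrics` is deprecated. Please import from `pypots.nn.functional` instead."
    from pypots.utils.metrics import calc_mae

    # canonical path after the relocation
    from pypots.nn.functional import calc_mae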
diff --git a/pypots/classification/grud/model.py b/pypots/classification/grud/model.py index 72c17d82..c60cea3d 100644 --- a/pypots/classification/grud/model.py +++ b/pypots/classification/grud/model.py @@ -54,7 +54,7 @@ class GRUD(BaseNNClassifier): val_metric_func: The customized metric function designed by users for validating the model. - If not given, will use the default MSE metric. + If not given, will use the default loss from the original paper as the metric. optimizer : The optimizer for model training. diff --git a/pypots/classification/raindrop/model.py b/pypots/classification/raindrop/model.py index c0ac2509..3250803d 100644 --- a/pypots/classification/raindrop/model.py +++ b/pypots/classification/raindrop/model.py @@ -79,7 +79,7 @@ class Raindrop(BaseNNClassifier): val_metric_func: The customized metric function designed by users for validating the model. - If not given, will use the default MSE metric. + If not given, will use the default loss from the original paper as the metric. optimizer : The optimizer for model training. diff --git a/pypots/clustering/base.py b/pypots/clustering/base.py index 9391e092..39dfad5b 100644 --- a/pypots/clustering/base.py +++ b/pypots/clustering/base.py @@ -164,7 +164,7 @@ class BaseNNClusterer(BaseNNModel): val_metric_func: The customized metric function designed by users for validating the model. - If not given, will use the default MSE metric. + If not given, will use the default loss from the original paper as the metric. num_workers : The number of subprocesses to use for data loading. diff --git a/pypots/clustering/crli/model.py b/pypots/clustering/crli/model.py index 1f0364bd..5cbf4be2 100644 --- a/pypots/clustering/crli/model.py +++ b/pypots/clustering/crli/model.py @@ -80,7 +80,7 @@ class CRLI(BaseNNClusterer): val_metric_func: The customized metric function designed by users for validating the model. - If not given, will use the default MSE metric. + If not given, will use the default loss from the original paper as the metric. G_optimizer : The optimizer for the generator training. diff --git a/pypots/clustering/vader/model.py b/pypots/clustering/vader/model.py index 1a21d727..28405367 100644 --- a/pypots/clustering/vader/model.py +++ b/pypots/clustering/vader/model.py @@ -69,7 +69,7 @@ class VaDER(BaseNNClusterer): val_metric_func: The customized metric function designed by users for validating the model. - If not given, will use the default MSE metric. + If not given, will use the default loss from the original paper as the metric. optimizer : The optimizer for model training. diff --git a/pypots/forecasting/base.py b/pypots/forecasting/base.py index f1168e69..25e9d9ee 100644 --- a/pypots/forecasting/base.py +++ b/pypots/forecasting/base.py @@ -155,7 +155,7 @@ class BaseNNForecaster(BaseNNModel): val_metric_func: The customized metric function designed by users for validating the model. - If not given, will use the default MSE metric. + If not given, will use the default loss from the original paper as the metric. num_workers : The number of subprocesses to use for data loading. 
diff --git a/pypots/forecasting/csdi/model.py b/pypots/forecasting/csdi/model.py index 8eac0e36..29480eea 100644 --- a/pypots/forecasting/csdi/model.py +++ b/pypots/forecasting/csdi/model.py @@ -178,7 +178,7 @@ def __init__( self.train_loss_func = None self.train_loss_func_name = "default" self.val_metric_func = None - self.val_metric_func_name = "loss (default)" + self.val_metric_func_name = "metric (default)" # set up the model self.model = _CSDI( diff --git a/pypots/imputation/csdi/model.py b/pypots/imputation/csdi/model.py index 6c3c0160..d34067e3 100644 --- a/pypots/imputation/csdi/model.py +++ b/pypots/imputation/csdi/model.py @@ -159,11 +159,12 @@ def __init__( assert schedule in ["quad", "linear"] self.n_steps = n_steps self.target_strategy = target_strategy + # CSDI has its own defined loss function and validation loss, so we set them as None here self.train_loss_func = None self.train_loss_func_name = "default" self.val_metric_func = None - self.val_metric_func_name = "loss (default)" + self.val_metric_func_name = "metric (default)" # set up the model self.model = _CSDI( From 2f5840a97b91505aef81da6fade726de07e8811c Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 3 Dec 2024 19:42:19 +0800 Subject: [PATCH 13/19] refactor: add customized loss and metric funcs for SegRNN; --- pypots/imputation/segrnn/model.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/pypots/imputation/segrnn/model.py b/pypots/imputation/segrnn/model.py index 6e687084..bde16599 100644 --- a/pypots/imputation/segrnn/model.py +++ b/pypots/imputation/segrnn/model.py @@ -59,6 +59,14 @@ class SegRNN(BaseNNImputer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. + train_loss_func: + The customized loss function designed by users for training the model. + If not given, will use the default loss as claimed in the original paper. + + val_metric_func: + The customized metric function designed by users for validating the model. + If not given, will use the default MSE metric. + optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. 
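SegRNN picks up the identical hook pair. The contract enforced by `BaseNNModel.__init__` (and tidied in PATCH 14 below) is a single-item dict whose value is callable; mirrored here in isolation rather than imported from pypots:

    from typing import Callable

    def unpack_hook(hook: dict):
        # mirrors the base-class checks: exactly one {name: callable} pair
        assert len(hook) == 1, f"expected a single-item dict, but got {len(hook)} items"
        name, fn = hook.popitem()
        assert isinstance(fn, Callable), f"{name} should map to a callable"
        return name, fn

    name, fn = unpack_hook({"masked_MAE": lambda pred, target, mask=None: 0.0})
    print(name)  # masked_MAE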
@@ -103,6 +111,8 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: int = None, + train_loss_func: Optional[dict] = None, + val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -111,14 +121,16 @@ def __init__( verbose: bool = True, ): super().__init__( - batch_size, - epochs, - patience, - num_workers, - device, - saving_path, - model_saving_strategy, - verbose, + batch_size=batch_size, + epochs=epochs, + patience=patience, + train_loss_func=train_loss_func, + val_metric_func=val_metric_func, + num_workers=num_workers, + device=device, + saving_path=saving_path, + model_saving_strategy=model_saving_strategy, + verbose=verbose, ) self.n_steps = n_steps From 566b859baf29d6c4f7d4fb4fa927e10018f906a6 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 3 Dec 2024 20:13:20 +0800 Subject: [PATCH 14/19] refactor: simplify some parts; --- pypots/__init__.py | 2 -- pypots/base.py | 24 ++++++------------------ pypots/nn/modules/timemixer/backbone.py | 6 ++++-- pypots/nn/modules/timesnet/backbone.py | 8 ++++---- 4 files changed, 14 insertions(+), 26 deletions(-) diff --git a/pypots/__init__.py b/pypots/__init__.py index 3f3e781f..f76e968d 100644 --- a/pypots/__init__.py +++ b/pypots/__init__.py @@ -15,7 +15,6 @@ data, utils, ) -from .gungnir import Gungnir from .version import __version__ __all__ = [ @@ -26,6 +25,5 @@ "optim", "data", "utils", - "Gungnir", "__version__", ] diff --git a/pypots/base.py b/pypots/base.py index f2835e07..6c52dde2 100644 --- a/pypots/base.py +++ b/pypots/base.py @@ -510,27 +510,15 @@ def __init__( # check train_loss_func and val_metric_func train_loss_func_name, val_metric_func_name = "default", "loss (default)" if train_loss_func is not None: - assert ( - len(train_loss_func) == 1 - ), f"train_loss_func should have only 1 item, but got {len(train_loss_func)}" + assert len(train_loss_func) == 1, f"train_loss_func should have only 1 item, but got {len(train_loss_func)}" train_loss_func_name, train_loss_func = train_loss_func.popitem() - assert isinstance( - train_loss_func, Callable - ), "train_loss_func should be a callable function" - logger.info( - f"Using customized {train_loss_func_name} as the training loss function." - ) + assert isinstance(train_loss_func, Callable), "train_loss_func should be a callable function" + logger.info(f"Using customized {train_loss_func_name} as the training loss function.") if val_metric_func is not None: - assert ( - len(val_metric_func) == 1 - ), f"val_metric_func should have only 1 item, but got {len(val_metric_func)}" + assert len(val_metric_func) == 1, f"val_metric_func should have only 1 item, but got {len(val_metric_func)}" val_metric_func_name, val_metric_func = val_metric_func.popitem() - assert isinstance( - val_metric_func, Callable - ), "val_metric_func should be a callable function" - logger.info( - f"Using customized {val_metric_func_name} as the validation metric function." 
- ) + assert isinstance(val_metric_func, Callable), "val_metric_func should be a callable function" + logger.info(f"Using customized {val_metric_func_name} as the validation metric function.") # set up the hype-parameters self.batch_size = batch_size diff --git a/pypots/nn/modules/timemixer/backbone.py b/pypots/nn/modules/timemixer/backbone.py index 1b134437..fe0647bc 100644 --- a/pypots/nn/modules/timemixer/backbone.py +++ b/pypots/nn/modules/timemixer/backbone.py @@ -114,15 +114,17 @@ def __init__( for i in range(downsampling_layers + 1) ] ) - if task_name == "imputation" or task_name == "anomaly_detection": + elif task_name == "imputation" or task_name == "anomaly_detection": if self.channel_independence == 1: self.projection_layer = nn.Linear(d_model, 1, bias=True) else: self.projection_layer = nn.Linear(d_model, n_pred_features, bias=True) - if task_name == "classification": + elif task_name == "classification": self.act = F.gelu self.dropout = nn.Dropout(dropout) self.projection = nn.Linear(d_model * n_steps, n_classes) + else: + raise NotImplementedError("Task not supported") def out_projection(self, dec_out, i, out_res): dec_out = self.projection_layer(dec_out) diff --git a/pypots/nn/modules/timesnet/backbone.py b/pypots/nn/modules/timesnet/backbone.py index 5eb6ec04..94e90f1b 100644 --- a/pypots/nn/modules/timesnet/backbone.py +++ b/pypots/nn/modules/timesnet/backbone.py @@ -2,16 +2,16 @@ """ +# Created by Wenjie Du +# License: BSD-3-Clause + + import torch import torch.nn as nn from .layers import TimesBlock -# Created by Wenjie Du -# License: BSD-3-Clause - - class BackboneTimesNet(nn.Module): def __init__( self, From 30efa4e31dacc3db2d37fb4aca6ff46b95be7027 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 3 Dec 2024 22:39:08 +0800 Subject: [PATCH 15/19] refactor: disable clustering algos to customize training loss and validation metric; --- pypots/clustering/base.py | 11 +++++++++++ pypots/clustering/crli/model.py | 14 ++------------ pypots/clustering/template/model.py | 6 ++---- pypots/clustering/vader/model.py | 15 +++------------ 4 files changed, 18 insertions(+), 28 deletions(-) diff --git a/pypots/clustering/base.py b/pypots/clustering/base.py index 39dfad5b..c5258fbb 100644 --- a/pypots/clustering/base.py +++ b/pypots/clustering/base.py @@ -231,6 +231,17 @@ def __init__( ) self.n_clusters = n_clusters + # training loss function and validation metric function are quite different in clustering models, + # hence we don't set default loss and metric functions here. So the below lines are commented out. + + # # set default training loss function and validation metric function if not given + # if train_loss_func is None: + # self.train_loss_func = + # self.train_loss_func_name = self.train_loss_func.__class__.__name__ + # if val_metric_func is None: + # self.val_metric_func = + # self.val_metric_func_name = + @abstractmethod def _assemble_input_for_training(self, data: list) -> dict: """Assemble the given data into a dictionary for training input. diff --git a/pypots/clustering/crli/model.py b/pypots/clustering/crli/model.py index 5cbf4be2..b004f8d8 100644 --- a/pypots/clustering/crli/model.py +++ b/pypots/clustering/crli/model.py @@ -74,14 +74,6 @@ class CRLI(BaseNNClusterer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. - train_loss_func: - The customized loss function designed by users for training the model. 
- If not given, will use the default loss as claimed in the original paper. - - val_metric_func: - The customized metric function designed by users for validating the model. - If not given, will use the default loss from the original paper as the metric. - G_optimizer : The optimizer for the generator training. If not given, will use a default Adam optimizer. @@ -133,8 +125,6 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, - train_loss_func: Optional[dict] = None, - val_metric_func: Optional[dict] = None, G_optimizer: Optional[Optimizer] = Adam(), D_optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, @@ -148,8 +138,8 @@ def __init__( batch_size=batch_size, epochs=epochs, patience=patience, - train_loss_func=train_loss_func, - val_metric_func=val_metric_func, + train_loss_func=None, + val_metric_func=None, num_workers=num_workers, device=device, saving_path=saving_path, diff --git a/pypots/clustering/template/model.py b/pypots/clustering/template/model.py index b242c605..e5fe8414 100644 --- a/pypots/clustering/template/model.py +++ b/pypots/clustering/template/model.py @@ -35,8 +35,6 @@ def __init__( batch_size: int = 32, epochs: int = 100, patience: Optional[int] = None, - train_loss_func: Optional[dict] = None, - val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -49,8 +47,8 @@ def __init__( batch_size=batch_size, epochs=epochs, patience=patience, - train_loss_func=train_loss_func, - val_metric_func=val_metric_func, + train_loss_func=None, + val_metric_func=None, num_workers=num_workers, device=device, saving_path=saving_path, diff --git a/pypots/clustering/vader/model.py b/pypots/clustering/vader/model.py index 28405367..d7e43a4d 100644 --- a/pypots/clustering/vader/model.py +++ b/pypots/clustering/vader/model.py @@ -63,17 +63,10 @@ class VaDER(BaseNNClusterer): stopped when the model does not perform better after that number of epochs. Leaving it default as None will disable the early-stopping. - train_loss_func: - The customized loss function designed by users for training the model. - If not given, will use the default loss as claimed in the original paper. - - val_metric_func: - The customized metric function designed by users for validating the model. - If not given, will use the default loss from the original paper as the metric. - optimizer : The optimizer for model training. If not given, will use a default Adam optimizer. + num_workers : The number of subprocesses to use for data loading. `0` means data loading will be in the main process, i.e. there won't be subprocesses. 
@@ -111,8 +104,6 @@ def __init__( epochs: int = 100, pretrain_epochs: int = 10, patience: Optional[int] = None, - train_loss_func: Optional[dict] = None, - val_metric_func: Optional[dict] = None, optimizer: Optional[Optimizer] = Adam(), num_workers: int = 0, device: Optional[Union[str, torch.device, list]] = None, @@ -125,8 +116,8 @@ def __init__( batch_size=batch_size, epochs=epochs, patience=patience, - train_loss_func=train_loss_func, - val_metric_func=val_metric_func, + train_loss_func=None, + val_metric_func=None, num_workers=num_workers, device=device, saving_path=saving_path, From 6f3b36eaf2375cc1613d784a1f7df3704b7a3bd9 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 3 Dec 2024 23:05:12 +0800 Subject: [PATCH 16/19] feat: add loss and metric classes; --- pypots/nn/modules/loss.py | 70 +++++++++++++++++++++++++++++++++++-- pypots/nn/modules/metric.py | 11 +++++- 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/pypots/nn/modules/loss.py b/pypots/nn/modules/loss.py index 0868d2ca..12d3ade3 100644 --- a/pypots/nn/modules/loss.py +++ b/pypots/nn/modules/loss.py @@ -6,8 +6,17 @@ # License: BSD-3-Clause +import torch + from .metric import BaseMetric -from ..functional import calc_mse +from ..functional import ( + calc_mae, + calc_mse, + calc_rmse, + calc_mre, + calc_quantile_crps, + calc_quantile_crps_sum, +) class BaseLoss(BaseMetric): @@ -20,9 +29,64 @@ def forward(self, prediction, target): raise NotImplementedError -class MAE_Loss(BaseLoss): +class MSE(BaseLoss): + def __init__(self): + super().__init__() + + def forward(self, prediction, target, mask=None): + value = calc_mse(prediction, target, mask) + return value + + +class MAE(BaseLoss): + def __init__(self): + super().__init__() + + def forward(self, prediction, target, mask=None): + value = calc_mae(prediction, target, mask) + return value + + +class RMSE(BaseLoss): + def __init__(self): + super().__init__() + + def forward(self, prediction, target, mask=None): + value = calc_rmse(prediction, target, mask) + return value + + +class MRE(BaseLoss): def __init__(self): super().__init__() def forward(self, prediction, target, mask=None): - return calc_mse(prediction, target, mask) + value = calc_mre(prediction, target, mask) + return value + + +class QuantileCRPS(BaseLoss): + def __init__(self): + super().__init__() + + def forward(self, prediction, target, mask=None): + value = calc_quantile_crps(prediction, target, mask) + return value + + +class QuantileCRPS_Sum(BaseLoss): + def __init__(self): + super().__init__() + + def forward(self, prediction, target, mask=None): + value = calc_quantile_crps_sum(prediction, target, mask) + return value + + +class CrossEntropy(BaseLoss): + def __init__(self): + super().__init__() + + def forward(self, prediction, target): + value = torch.nn.functional.cross_entropy(prediction, target) + return value diff --git a/pypots/nn/modules/metric.py b/pypots/nn/modules/metric.py index faea3d78..61f8c616 100644 --- a/pypots/nn/modules/metric.py +++ b/pypots/nn/modules/metric.py @@ -8,7 +8,7 @@ import torch.nn as nn -from ..functional import calc_pr_auc +from ..functional import calc_pr_auc, calc_acc class BaseMetric(nn.Module): @@ -27,3 +27,12 @@ def __init__(self): def forward(self, prediction, target): pr_auc, _, _, _ = calc_pr_auc(prediction, target) return pr_auc + + +class Accuracy(BaseMetric): + def __init__(self): + super().__init__(lower_better=False) + + def forward(self, prediction, target): + acc_score = calc_acc(prediction, target) + return acc_score From 
8705db6e571d77705f2280cc5225c8929bcf4e9e Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Wed, 4 Dec 2024 00:17:28 +0800 Subject: [PATCH 17/19] refactor: use classes to replace funcs in CLAS algos; --- pypots/classification/base.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pypots/classification/base.py b/pypots/classification/base.py index 6c6ed763..562e0137 100644 --- a/pypots/classification/base.py +++ b/pypots/classification/base.py @@ -15,7 +15,8 @@ from torch.utils.data import DataLoader from ..base import BaseModel, BaseNNModel -from ..nn.functional import calc_acc +from ..nn.modules.loss import CrossEntropy +from ..nn.modules.metric import Accuracy from ..utils.logging import logger try: @@ -235,11 +236,11 @@ def __init__( # set default training loss function and validation metric function if not given if train_loss_func is None: - self.train_loss_func = torch.nn.functional.cross_entropy - self.train_loss_func_name = "CrossEntropy" + self.train_loss_func = CrossEntropy() + self.train_loss_func_name = self.train_loss_func.__class__.__name__ if val_metric_func is None: - self.val_metric_func = calc_acc - self.val_metric_func_name = "Accuracy" + self.val_metric_func = Accuracy() + self.val_metric_func_name = self.val_metric_func.__class__.__name__ @abstractmethod def _assemble_input_for_training(self, data: list) -> dict: From 5b58313340ef7bdc0d031440e7b9c57bf3fe7106 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Wed, 4 Dec 2024 20:26:53 +0800 Subject: [PATCH 18/19] refactor: simplify some parts; --- pypots/base.py | 10 ++++------ pypots/imputation/base.py | 12 ++++++------ pypots/imputation/saits/model.py | 9 --------- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/pypots/base.py b/pypots/base.py index 6c52dde2..cba26c99 100644 --- a/pypots/base.py +++ b/pypots/base.py @@ -510,14 +510,12 @@ def __init__( # check train_loss_func and val_metric_func train_loss_func_name, val_metric_func_name = "default", "loss (default)" if train_loss_func is not None: - assert len(train_loss_func) == 1, f"train_loss_func should have only 1 item, but got {len(train_loss_func)}" - train_loss_func_name, train_loss_func = train_loss_func.popitem() - assert isinstance(train_loss_func, Callable), "train_loss_func should be a callable function" + train_loss_func_name = train_loss_func.__class__.__name__ + assert isinstance(train_loss_func, Callable), "train_loss_func should be a callable instance" logger.info(f"Using customized {train_loss_func_name} as the training loss function.") if val_metric_func is not None: - assert len(val_metric_func) == 1, f"val_metric_func should have only 1 item, but got {len(val_metric_func)}" - val_metric_func_name, val_metric_func = val_metric_func.popitem() - assert isinstance(val_metric_func, Callable), "val_metric_func should be a callable function" + val_metric_func_name = val_metric_func.__class__.__name__ + assert isinstance(val_metric_func, Callable), "val_metric_func should be a callable instance" logger.info(f"Using customized {val_metric_func_name} as the validation metric function.") # set up the hype-parameters diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index 009ae31f..eff1fac5 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -15,7 +15,7 @@ from torch.utils.data import DataLoader from ..base import BaseModel, BaseNNModel -from ..nn.functional import calc_mse +from ..nn.modules.loss import MSE from ..utils.logging import logger try: @@ -222,11 +222,11 @@ def __init__( 
# set default training loss function and validation metric function if not given if train_loss_func is None: - self.train_loss_func = calc_mse - self.train_loss_func_name = "MSE" + self.train_loss_func = MSE() + self.train_loss_func_name = self.train_loss_func.__class__.__name__ if val_metric_func is None: - self.val_metric_func = calc_mse - self.val_metric_func_name = "MSE" + self.val_metric_func = MSE() + self.val_metric_func_name = self.val_metric_func.__class__.__name__ @abstractmethod def _assemble_input_for_training(self, data: list) -> dict: @@ -323,7 +323,7 @@ def _train_model( inputs = self._assemble_input_for_validating(data) results = self.model.forward(inputs) imputation_error = ( - calc_mse( + self.val_metric_func( results["imputed_data"], inputs["X_ori"], inputs["indicating_mask"], diff --git a/pypots/imputation/saits/model.py b/pypots/imputation/saits/model.py index c779f4f4..85f64fa6 100644 --- a/pypots/imputation/saits/model.py +++ b/pypots/imputation/saits/model.py @@ -17,7 +17,6 @@ from ..base import BaseNNImputer from ...data.checking import key_in_data_set from ...data.dataset import BaseDataset -from ...nn.functional import calc_mae, calc_mse from ...optim.adam import Adam from ...optim.base import Optimizer from ...utils.logging import logger @@ -172,14 +171,6 @@ def __init__( d_model = n_heads * d_k logger.warning(f"⚠️ d_model is reset to {d_model} = n_heads ({n_heads}) * d_k ({d_k})") - # set default training loss function and validation metric function if not given - if train_loss_func is None: - self.train_loss_func = calc_mae - self.train_loss_func_name = "MAE" - if val_metric_func is None: - self.val_metric_func = calc_mse - self.val_metric_func_name = "MSE" - self.n_steps = n_steps self.n_features = n_features # model hype-parameters From 6ca6da82cf2794c4100392220dec07c610fd9299 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Wed, 19 Feb 2025 17:49:28 +0800 Subject: [PATCH 19/19] refactor: import from pypots.nn.functional instead of pypots.utils.metrics; --- README.md | 2 +- README_zh.md | 2 +- docs/examples.rst | 2 +- docs/pypots.utils.rst | 4 +- pypots/nn/functional/error.py | 15 +++-- pypots/utils/metrics/__init__.py | 2 +- pypots/utils/metrics/classification.py | 4 +- pypots/utils/metrics/clustering.py | 4 +- pypots/utils/metrics/error.py | 4 +- tests/classification/brits.py | 19 ++---- tests/classification/csai.py | 2 +- tests/classification/grud.py | 15 ++--- tests/classification/raindrop.py | 19 ++---- tests/clustering/crli.py | 52 ++++------------- tests/clustering/vader.py | 24 ++------ tests/forecasting/bttf.py | 2 +- tests/forecasting/csdi.py | 23 ++------ tests/imputation/autoformer.py | 16 ++--- tests/imputation/brits.py | 19 ++---- tests/imputation/crossformer.py | 16 ++--- tests/imputation/csai.py | 2 +- tests/imputation/csdi.py | 35 +++-------- tests/imputation/dlinear.py | 11 +--- tests/imputation/etsformer.py | 16 ++--- tests/imputation/fedformer.py | 16 ++--- tests/imputation/film.py | 11 +--- tests/imputation/fits.py | 2 +- tests/imputation/frets.py | 11 +--- tests/imputation/gpvae.py | 27 +++------ tests/imputation/grud.py | 19 ++---- tests/imputation/imputeformer.py | 20 ++----- tests/imputation/informer.py | 15 ++--- tests/imputation/itransformer.py | 28 +++------ tests/imputation/koopa.py | 11 +--- tests/imputation/lerp.py | 18 ++---- tests/imputation/locf.py | 34 +++-------- tests/imputation/mean.py | 18 ++---- tests/imputation/median.py | 18 ++---- tests/imputation/micn.py | 11 +--- tests/imputation/moderntcn.py | 16 ++--- 
tests/imputation/mrnn.py | 19 ++---- tests/imputation/nonstationary_transformer.py | 23 ++------ tests/imputation/patchtst.py | 15 ++--- tests/imputation/pyraformer.py | 16 ++--- tests/imputation/reformer.py | 15 ++--- tests/imputation/revin_scinet.py | 20 ++----- tests/imputation/saits.py | 15 ++--- tests/imputation/scinet.py | 11 +--- tests/imputation/segrnn.py | 2 +- tests/imputation/stemgnn.py | 11 +--- tests/imputation/tcn.py | 11 +--- tests/imputation/tefn.py | 15 ++--- tests/imputation/tide.py | 11 +--- tests/imputation/timemixer.py | 16 ++--- tests/imputation/timesnet.py | 15 ++--- tests/imputation/transformer.py | 24 ++------ tests/imputation/trmf.py | 2 +- tests/imputation/usgan.py | 19 ++---- tests/optim/adadelta.py | 10 +--- tests/optim/adagrad.py | 10 +--- tests/optim/adam.py | 10 +--- tests/optim/adamw.py | 10 +--- tests/optim/lr_schedulers.py | 58 +++++-------------- tests/optim/rmsprop.py | 10 +--- tests/optim/sgd.py | 10 +--- 65 files changed, 267 insertions(+), 696 deletions(-) diff --git a/README.md b/README.md index 5598e21c..f7d13e5a 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,7 @@ print(X.shape) # (11988, 48, 37), 11988 samples and each sample has 48 time ste # Model training. This is PyPOTS showtime. from pypots.imputation import SAITS -from pypots.utils.metrics import calc_mae +from pypots.nn.functional import calc_mae saits = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, n_heads=4, d_k=64, d_v=64, d_ffn=128, dropout=0.1, epochs=10) # Here I use the whole dataset as the training set because ground truth is not visible to the model, you can also split it into train/val/test sets saits.fit(dataset) # train the model on the dataset diff --git a/README_zh.md b/README_zh.md index b8b8d162..fd6a9b8a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -261,7 +261,7 @@ print(X.shape) # X的形状为(11988, 48, 37), 即11988个样本, 每个样本 # 模型训练. PyPOTS的好戏上演了! from pypots.imputation import SAITS -from pypots.utils.metrics import calc_mae +from pypots.nn.functional import calc_mae saits = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, n_heads=4, d_k=64, d_v=64, d_ffn=128, dropout=0.1, epochs=10) # 因为基准数据对模型不可知, 将整个数据集作为训练集, 也可以把数据集分为训练/验证/测试集 saits.fit(dataset) # 基于数据集训练模型 diff --git a/docs/examples.rst b/docs/examples.rst index 5101eba8..b1c4c84d 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -25,7 +25,7 @@ You can also find a simple and quick-start tutorial notebook on Google Colab from pygrinder import mcar from pypots.data import load_specific_dataset from pypots.imputation import SAITS - from pypots.utils.metrics import calc_mae + from pypots.nn.functional import calc_mae # Data preprocessing. Tedious, but PyPOTS can help. 🤓 data = load_specific_dataset('physionet_2012') # PyPOTS will automatically download and extract it. diff --git a/docs/pypots.utils.rst b/docs/pypots.utils.rst index e4154bbf..bfc02cc1 100644 --- a/docs/pypots.utils.rst +++ b/docs/pypots.utils.rst @@ -10,10 +10,10 @@ pypots.utils.file :show-inheritance: :inherited-members: -pypots.utils.metrics +pypots.nn.functional --------------------------- -.. automodule:: pypots.utils.metrics +.. 
automodule:: pypots.nn.functional
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/pypots/nn/functional/error.py b/pypots/nn/functional/error.py
index f5414845..6caf32cb 100644
--- a/pypots/nn/functional/error.py
+++ b/pypots/nn/functional/error.py
@@ -77,7 +77,7 @@ def calc_mae(
     --------
 
     >>> import numpy as np
-    >>> from pypots.utils.metrics import calc_mae
+    >>> from pypots.nn.functional import calc_mae
    >>> targets = np.array([1, 2, 3, 4, 5])
    >>> predictions = np.array([1, 2, 1, 4, 6])
    >>> mae = calc_mae(predictions, targets)
@@ -128,7 +128,7 @@ def calc_mse(
     --------
 
     >>> import numpy as np
-    >>> from pypots.utils.metrics import calc_mse
+    >>> from pypots.nn.functional import calc_mse
    >>> targets = np.array([1, 2, 3, 4, 5])
    >>> predictions = np.array([1, 2, 1, 4, 6])
    >>> mse = calc_mse(predictions, targets)
@@ -179,7 +179,7 @@ def calc_rmse(
     --------
 
     >>> import numpy as np
-    >>> from pypots.utils.metrics import calc_rmse
+    >>> from pypots.nn.functional import calc_rmse
    >>> targets = np.array([1, 2, 3, 4, 5])
    >>> predictions = np.array([1, 2, 1, 4, 6])
    >>> rmse = calc_rmse(predictions, targets)
@@ -227,7 +227,7 @@ def calc_mre(
     --------
 
     >>> import numpy as np
-    >>> from pypots.utils.metrics import calc_mre
+    >>> from pypots.nn.functional import calc_mre
    >>> targets = np.array([1, 2, 3, 4, 5])
    >>> predictions = np.array([1, 2, 1, 4, 6])
    >>> mre = calc_mre(predictions, targets)
@@ -254,7 +254,12 @@ def calc_mre(
     return lib.sum(lib.abs(predictions - targets)) / (lib.sum(lib.abs(targets)) + 1e-12)
 
 
-def calc_quantile_loss(predictions, targets, q: float, eval_points) -> float:
+def calc_quantile_loss(
+    predictions: Union[np.ndarray, torch.Tensor],
+    targets: Union[np.ndarray, torch.Tensor],
+    q: float,
+    eval_points: Union[np.ndarray, torch.Tensor],
+) -> Union[float, torch.Tensor]:
     quantile_loss = 2 * torch.sum(
         torch.abs((predictions - targets) * eval_points * ((targets <= predictions) * 1.0 - q))
     )
diff --git a/pypots/utils/metrics/__init__.py b/pypots/utils/metrics/__init__.py
index e3627f9c..f9d1413f 100644
--- a/pypots/utils/metrics/__init__.py
+++ b/pypots/utils/metrics/__init__.py
@@ -34,7 +34,7 @@
     calc_quantile_crps_sum,
 )
 
-logger.warning("‼️ `pypots.utils.metrics` is deprecated. Please import from `pypots.nn.functional` instead.")
+logger.warning("‼️ `pypots.utils.metrics` is deprecated. Please import from `pypots.nn.functional` instead.")
 __all__ = [
     # error
diff --git a/pypots/utils/metrics/classification.py b/pypots/utils/metrics/classification.py
index eaef8e68..663b5975 100644
--- a/pypots/utils/metrics/classification.py
+++ b/pypots/utils/metrics/classification.py
@@ -15,9 +15,9 @@
     calc_acc,
 )
 
-# pypots.utils.metrics.classification is deprecated, and moved to pypots.nn.functional.classification
+# pypots.utils.metrics.classification is deprecated, and moved to pypots.nn.functional.classification
 logger.warning(
-    "🚨 Please import from pypots.nn.functional.classification instead of pypots.utils.metrics.classification"
+    "🚨 Please import from pypots.nn.functional.classification instead of pypots.utils.metrics.classification"
 )
 
 __all__ = [
diff --git a/pypots/utils/metrics/clustering.py b/pypots/utils/metrics/clustering.py
index a37dba58..af57c4b5 100644
--- a/pypots/utils/metrics/clustering.py
+++ b/pypots/utils/metrics/clustering.py
@@ -18,8 +18,8 @@
     calc_external_cluster_validation_metrics,
 )
 
-# pypots.utils.metrics.clustering is deprecated, and moved to pypots.nn.functional.clustering
-logger.warning("🚨 Please import from pypots.nn.functional.clustering instead of pypots.utils.metrics.clustering")
+# pypots.utils.metrics.clustering is deprecated, and moved to pypots.nn.functional.clustering
+logger.warning("🚨 Please import from pypots.nn.functional.clustering instead of pypots.utils.metrics.clustering")
 
 __all__ = [
     "calc_rand_index",
diff --git a/pypots/utils/metrics/error.py b/pypots/utils/metrics/error.py
index 0ff62e28..ccb1fb15 100644
--- a/pypots/utils/metrics/error.py
+++ b/pypots/utils/metrics/error.py
@@ -15,8 +15,8 @@
     calc_quantile_crps_sum,
 )
 
-# pypots.utils.metrics.error is deprecated, and moved to pypots.nn.functional.error
-logger.warning("🚨 Please import from pypots.nn.functional.error instead of pypots.utils.metrics.error")
+# pypots.utils.metrics.error is deprecated, and moved to pypots.nn.functional.error
+logger.warning("🚨 Please import from pypots.nn.functional.error instead of pypots.utils.metrics.error")
 
 __all__ = [
     "calc_mae",
diff --git a/tests/classification/brits.py b/tests/classification/brits.py
index 7441e40e..8a1c0062 100644
--- a/tests/classification/brits.py
+++ b/tests/classification/brits.py
@@ -13,7 +13,7 @@
 from pypots.classification import BRITS
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_binary_classification_metrics
+from pypots.nn.functional import calc_binary_classification_metrics
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -59,9 +59,7 @@ def test_0_fit(self):
 
     @pytest.mark.xdist_group(name="classification-brits")
     def test_1_classify(self):
         results = self.brits.predict(TEST_SET)
-        metrics = calc_binary_classification_metrics(
-            results["classification"], DATA["test_y"]
-        )
+        metrics = calc_binary_classification_metrics(results["classification"], DATA["test_y"])
         logger.info(
             f'BRITS ROC_AUC: {metrics["roc_auc"]}, '
             f'PR_AUC: {metrics["pr_auc"]}, '
@@ -80,17 +78,12 @@ def test_2_parameters(self):
         assert hasattr(self.brits, "best_loss")
         self.assertNotEqual(self.brits.best_loss, float("inf"))
 
-        assert (
-            hasattr(self.brits, "best_model_dict")
-            and self.brits.best_model_dict is not None
-        )
+        assert hasattr(self.brits, "best_model_dict") and self.brits.best_model_dict is not None
 
     @pytest.mark.xdist_group(name="classification-brits")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        
assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.brits) @@ -106,9 +99,7 @@ def test_3_saving_path(self): def test_4_lazy_loading(self): self.brits.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) results = self.brits.predict(GENERAL_H5_TEST_SET_PATH) - metrics = calc_binary_classification_metrics( - results["classification"], DATA["test_y"] - ) + metrics = calc_binary_classification_metrics(results["classification"], DATA["test_y"]) logger.info( f'Lazy-loading BRITS ROC_AUC: {metrics["roc_auc"]}, ' f'PR_AUC: {metrics["pr_auc"]}, ' diff --git a/tests/classification/csai.py b/tests/classification/csai.py index 4a2bbf5f..74c0e360 100644 --- a/tests/classification/csai.py +++ b/tests/classification/csai.py @@ -13,7 +13,7 @@ from pypots.classification import CSAI from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_binary_classification_metrics +from pypots.nn.functional import calc_binary_classification_metrics from tests.global_test_config import ( DATA, EPOCHS, diff --git a/tests/classification/grud.py b/tests/classification/grud.py index 61f7d496..ef0fe840 100644 --- a/tests/classification/grud.py +++ b/tests/classification/grud.py @@ -13,7 +13,7 @@ from pypots.classification import GRUD from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_binary_classification_metrics +from pypots.nn.functional import calc_binary_classification_metrics from tests.global_test_config import ( DATA, EPOCHS, @@ -77,17 +77,12 @@ def test_2_parameters(self): assert hasattr(self.grud, "best_loss") self.assertNotEqual(self.grud.best_loss, float("inf")) - assert ( - hasattr(self.grud, "best_model_dict") - and self.grud.best_model_dict is not None - ) + assert hasattr(self.grud, "best_model_dict") and self.grud.best_model_dict is not None @pytest.mark.xdist_group(name="classification-grud") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.grud) @@ -103,9 +98,7 @@ def test_3_saving_path(self): def test_4_lazy_loading(self): self.grud.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) results = self.grud.predict(GENERAL_H5_TEST_SET_PATH) - metrics = calc_binary_classification_metrics( - results["classification"], DATA["test_y"] - ) + metrics = calc_binary_classification_metrics(results["classification"], DATA["test_y"]) logger.info( f'GRU-D ROC_AUC: {metrics["roc_auc"]}, ' f'PR_AUC: {metrics["pr_auc"]}, ' diff --git a/tests/classification/raindrop.py b/tests/classification/raindrop.py index 1909c076..c9c53ffc 100644 --- a/tests/classification/raindrop.py +++ b/tests/classification/raindrop.py @@ -12,7 +12,7 @@ from pypots.classification import Raindrop from pypots.utils.logging import logger -from pypots.utils.metrics import calc_binary_classification_metrics +from pypots.nn.functional import calc_binary_classification_metrics from tests.global_test_config import ( DATA, EPOCHS, @@ -75,24 +75,17 @@ def test_1_classify(self): 
def test_2_parameters(self): assert hasattr(self.raindrop, "model") and self.raindrop.model is not None - assert ( - hasattr(self.raindrop, "optimizer") and self.raindrop.optimizer is not None - ) + assert hasattr(self.raindrop, "optimizer") and self.raindrop.optimizer is not None assert hasattr(self.raindrop, "best_loss") self.assertNotEqual(self.raindrop.best_loss, float("inf")) - assert ( - hasattr(self.raindrop, "best_model_dict") - and self.raindrop.best_model_dict is not None - ) + assert hasattr(self.raindrop, "best_model_dict") and self.raindrop.best_model_dict is not None @pytest.mark.xdist_group(name="classification-raindrop") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.raindrop) @@ -108,9 +101,7 @@ def test_3_saving_path(self): def test_4_lazy_loading(self): self.raindrop.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) results = self.raindrop.predict(GENERAL_H5_TEST_SET_PATH) - metrics = calc_binary_classification_metrics( - results["classification"], DATA["test_y"] - ) + metrics = calc_binary_classification_metrics(results["classification"], DATA["test_y"]) logger.info( f'Lazy-loading Raindrop ROC_AUC: {metrics["roc_auc"]}, ' f'PR_AUC: {metrics["pr_auc"]}, ' diff --git a/tests/clustering/crli.py b/tests/clustering/crli.py index 6b3266ff..ab7eea07 100644 --- a/tests/clustering/crli.py +++ b/tests/clustering/crli.py @@ -14,7 +14,7 @@ from pypots.clustering import CRLI from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import ( +from pypots.nn.functional import ( calc_external_cluster_validation_metrics, calc_internal_cluster_validation_metrics, ) @@ -85,50 +85,30 @@ def test_1_parameters(self): # GRU cell assert hasattr(self.crli_gru, "model") and self.crli_gru.model is not None - assert ( - hasattr(self.crli_gru, "G_optimizer") - and self.crli_gru.G_optimizer is not None - ) - assert ( - hasattr(self.crli_gru, "D_optimizer") - and self.crli_gru.D_optimizer is not None - ) + assert hasattr(self.crli_gru, "G_optimizer") and self.crli_gru.G_optimizer is not None + assert hasattr(self.crli_gru, "D_optimizer") and self.crli_gru.D_optimizer is not None assert hasattr(self.crli_gru, "best_loss") self.assertNotEqual(self.crli_gru.best_loss, float("inf")) - assert ( - hasattr(self.crli_gru, "best_model_dict") - and self.crli_gru.best_model_dict is not None - ) + assert hasattr(self.crli_gru, "best_model_dict") and self.crli_gru.best_model_dict is not None # LSTM cell assert hasattr(self.crli_lstm, "model") and self.crli_lstm.model is not None - assert ( - hasattr(self.crli_lstm, "G_optimizer") - and self.crli_lstm.G_optimizer is not None - ) - assert ( - hasattr(self.crli_lstm, "D_optimizer") - and self.crli_lstm.D_optimizer is not None - ) + assert hasattr(self.crli_lstm, "G_optimizer") and self.crli_lstm.G_optimizer is not None + assert hasattr(self.crli_lstm, "D_optimizer") and self.crli_lstm.D_optimizer is not None assert hasattr(self.crli_lstm, "best_loss") self.assertNotEqual(self.crli_lstm.best_loss, float("inf")) - assert ( - hasattr(self.crli_lstm, "best_model_dict") - and self.crli_lstm.best_model_dict is not None - ) + assert hasattr(self.crli_lstm, "best_model_dict") and 
self.crli_lstm.best_model_dict is not None @pytest.mark.xdist_group(name="clustering-crli") def test_2_cluster(self): # GRU cell clustering_results = self.crli_gru.predict(TEST_SET, return_latent_vars=True) - external_metrics = calc_external_cluster_validation_metrics( - clustering_results["clustering"], DATA["test_y"] - ) + external_metrics = calc_external_cluster_validation_metrics(clustering_results["clustering"], DATA["test_y"]) internal_metrics = calc_internal_cluster_validation_metrics( clustering_results["latent_vars"]["clustering_latent"], DATA["test_y"] ) @@ -137,9 +117,7 @@ def test_2_cluster(self): # LSTM cell clustering_results = self.crli_lstm.predict(TEST_SET, return_latent_vars=True) - external_metrics = calc_external_cluster_validation_metrics( - clustering_results["clustering"], DATA["test_y"] - ) + external_metrics = calc_external_cluster_validation_metrics(clustering_results["clustering"], DATA["test_y"]) internal_metrics = calc_internal_cluster_validation_metrics( clustering_results["latent_vars"]["clustering_latent"], DATA["test_y"] ) @@ -149,9 +127,7 @@ def test_2_cluster(self): @pytest.mark.xdist_group(name="clustering-crli") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.crli_gru) @@ -166,12 +142,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="clustering-crli") def test_4_lazy_loading(self): self.crli_lstm.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) - clustering_results = self.crli_lstm.predict( - GENERAL_H5_TEST_SET_PATH, return_latent_vars=True - ) - external_metrics = calc_external_cluster_validation_metrics( - clustering_results["clustering"], DATA["test_y"] - ) + clustering_results = self.crli_lstm.predict(GENERAL_H5_TEST_SET_PATH, return_latent_vars=True) + external_metrics = calc_external_cluster_validation_metrics(clustering_results["clustering"], DATA["test_y"]) internal_metrics = calc_internal_cluster_validation_metrics( clustering_results["latent_vars"]["clustering_latent"], DATA["test_y"] ) diff --git a/tests/clustering/vader.py b/tests/clustering/vader.py index ba8a02de..e8500b92 100644 --- a/tests/clustering/vader.py +++ b/tests/clustering/vader.py @@ -15,7 +15,7 @@ from pypots.clustering import VaDER from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import ( +from pypots.nn.functional import ( calc_external_cluster_validation_metrics, calc_internal_cluster_validation_metrics, ) @@ -75,10 +75,7 @@ def test_1_cluster(self): logger.info(f"VaDER external_metrics: {external_metrics}") logger.info(f"VaDER internal_metrics: {internal_metrics}") except np.linalg.LinAlgError as e: - logger.error( - f"❌ Exception: {e}\n" - "Got singular matrix, please try to retrain the model to fix this" - ) + logger.error(f"❌ Exception: {e}\n" "Got singular matrix, please try to retrain the model to fix this") @pytest.mark.xdist_group(name="clustering-vader") def test_2_parameters(self): @@ -89,17 +86,12 @@ def test_2_parameters(self): assert hasattr(self.vader, "best_loss") self.assertNotEqual(self.vader.best_loss, float("inf")) - assert ( - hasattr(self.vader, "best_model_dict") - and self.vader.best_model_dict is not None - ) + assert 
hasattr(self.vader, "best_model_dict") and self.vader.best_model_dict is not None @pytest.mark.xdist_group(name="clustering-vader") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.vader) @@ -114,12 +106,8 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="clustering-vader") def test_4_lazy_loading(self): self.vader.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) - clustering_results = self.vader.predict( - GENERAL_H5_TEST_SET_PATH, return_latent_vars=True - ) - external_metrics = calc_external_cluster_validation_metrics( - clustering_results["clustering"], DATA["test_y"] - ) + clustering_results = self.vader.predict(GENERAL_H5_TEST_SET_PATH, return_latent_vars=True) + external_metrics = calc_external_cluster_validation_metrics(clustering_results["clustering"], DATA["test_y"]) internal_metrics = calc_internal_cluster_validation_metrics( clustering_results["latent_vars"]["z"], DATA["test_y"] ) diff --git a/tests/forecasting/bttf.py b/tests/forecasting/bttf.py index 5e87cac5..3f1612c9 100644 --- a/tests/forecasting/bttf.py +++ b/tests/forecasting/bttf.py @@ -11,7 +11,7 @@ from pypots.forecasting import BTTF from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import DATA, FORECASTING_TEST_SET, N_PRED_STEPS diff --git a/tests/forecasting/csdi.py b/tests/forecasting/csdi.py index 3df64ad8..d8bdaeca 100644 --- a/tests/forecasting/csdi.py +++ b/tests/forecasting/csdi.py @@ -15,7 +15,7 @@ from pypots.forecasting import CSDI from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse, calc_quantile_crps +from pypots.nn.functional import calc_mse, calc_quantile_crps from tests.global_test_config import ( DATA, @@ -68,9 +68,7 @@ def test_0_fit(self): @pytest.mark.xdist_group(name="forecasting-csdi") def test_1_forecasting(self): - forecasting_X = self.csdi.predict(FORECASTING_TEST_SET, n_sampling_times=2)[ - "forecasting" - ] + forecasting_X = self.csdi.predict(FORECASTING_TEST_SET, n_sampling_times=2)["forecasting"] test_CRPS = calc_quantile_crps( forecasting_X, FORECASTING_TEST_SET["X_pred"], @@ -79,9 +77,7 @@ def test_1_forecasting(self): forecasting_X = forecasting_X.mean(axis=1) # mean over sampling times assert not np.isnan( forecasting_X - ).any(), ( - "Output has missing values in the forecasting results that should not be." - ) + ).any(), "Output has missing values in the forecasting results that should not be." 
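# Shape convention behind the two metrics in this test (a sketch, assuming the
# usual PyPOTS output layout): probabilistic forecasters return an extra
# sampling dimension, (n_samples, n_sampling_times, n_pred_steps, n_features).
# CRPS consumes the full sample set, while MSE needs a single point estimate,
# hence the mean over axis 1 right above:
#
#     point_forecast = forecasting_X.mean(axis=1)  # collapse sampling times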
test_MSE = calc_mse( forecasting_X, FORECASTING_TEST_SET["X_pred"], @@ -98,17 +94,12 @@ def test_2_parameters(self): assert hasattr(self.csdi, "best_loss") self.assertNotEqual(self.csdi.best_loss, float("inf")) - assert ( - hasattr(self.csdi, "best_model_dict") - and self.csdi.best_model_dict is not None - ) + assert hasattr(self.csdi, "best_model_dict") and self.csdi.best_model_dict is not None @pytest.mark.xdist_group(name="forecasting-csdi") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.csdi) @@ -133,9 +124,7 @@ def test_4_lazy_loading(self): forecasting_X = forecasting_X.mean(axis=1) # mean over sampling times assert not np.isnan( forecasting_X - ).any(), ( - "Output has missing values in the forecasting results that should not be." - ) + ).any(), "Output has missing values in the forecasting results that should not be." test_MSE = calc_mse( forecasting_X, diff --git a/tests/imputation/autoformer.py b/tests/imputation/autoformer.py index 3050f0f2..63907a92 100644 --- a/tests/imputation/autoformer.py +++ b/tests/imputation/autoformer.py @@ -15,7 +15,7 @@ from pypots.imputation import Autoformer from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -80,25 +80,17 @@ def test_1_impute(self): def test_2_parameters(self): assert hasattr(self.autoformer, "model") and self.autoformer.model is not None - assert ( - hasattr(self.autoformer, "optimizer") - and self.autoformer.optimizer is not None - ) + assert hasattr(self.autoformer, "optimizer") and self.autoformer.optimizer is not None assert hasattr(self.autoformer, "best_loss") self.assertNotEqual(self.autoformer.best_loss, float("inf")) - assert ( - hasattr(self.autoformer, "best_model_dict") - and self.autoformer.best_model_dict is not None - ) + assert hasattr(self.autoformer, "best_model_dict") and self.autoformer.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-autoformer") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.autoformer) diff --git a/tests/imputation/brits.py b/tests/imputation/brits.py index d69e4b1d..7d44544e 100644 --- a/tests/imputation/brits.py +++ b/tests/imputation/brits.py @@ -15,7 +15,7 @@ from pypots.imputation import BRITS from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -59,12 +59,8 @@ def test_0_fit(self): @pytest.mark.xdist_group(name="imputation-brits") def test_1_impute(self): imputed_X = self.brits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
- test_MSE = calc_mse( - imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"] - ) + assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()." + test_MSE = calc_mse(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]) logger.info(f"BRITS test_MSE: {test_MSE}") @pytest.mark.xdist_group(name="imputation-brits") @@ -76,17 +72,12 @@ def test_2_parameters(self): assert hasattr(self.brits, "best_loss") self.assertNotEqual(self.brits.best_loss, float("inf")) - assert ( - hasattr(self.brits, "best_model_dict") - and self.brits.best_model_dict is not None - ) + assert hasattr(self.brits, "best_model_dict") and self.brits.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-brits") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.brits) diff --git a/tests/imputation/crossformer.py b/tests/imputation/crossformer.py index c11792c2..75ba064f 100644 --- a/tests/imputation/crossformer.py +++ b/tests/imputation/crossformer.py @@ -15,7 +15,7 @@ from pypots.imputation import Crossformer from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -81,25 +81,17 @@ def test_1_impute(self): def test_2_parameters(self): assert hasattr(self.crossformer, "model") and self.crossformer.model is not None - assert ( - hasattr(self.crossformer, "optimizer") - and self.crossformer.optimizer is not None - ) + assert hasattr(self.crossformer, "optimizer") and self.crossformer.optimizer is not None assert hasattr(self.crossformer, "best_loss") self.assertNotEqual(self.crossformer.best_loss, float("inf")) - assert ( - hasattr(self.crossformer, "best_model_dict") - and self.crossformer.best_model_dict is not None - ) + assert hasattr(self.crossformer, "best_model_dict") and self.crossformer.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-crossformer") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.crossformer) diff --git a/tests/imputation/csai.py b/tests/imputation/csai.py index f5c4873b..2545d254 100644 --- a/tests/imputation/csai.py +++ b/tests/imputation/csai.py @@ -15,7 +15,7 @@ from pypots.imputation import CSAI from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, diff --git a/tests/imputation/csdi.py b/tests/imputation/csdi.py index 3023cea2..66881edc 100644 --- a/tests/imputation/csdi.py +++ b/tests/imputation/csdi.py @@ -15,7 +15,7 @@ from pypots.imputation import CSDI from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse, calc_quantile_crps +from pypots.nn.functional import 
calc_mse, calc_quantile_crps from tests.global_test_config import ( DATA, EPOCHS, @@ -65,16 +65,10 @@ def test_0_fit(self): @pytest.mark.xdist_group(name="imputation-csdi") def test_1_impute(self): imputed_X = self.csdi.predict(TEST_SET, n_sampling_times=2)["imputation"] - test_CRPS = calc_quantile_crps( - imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"] - ) + test_CRPS = calc_quantile_crps(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]) imputed_X = imputed_X.mean(axis=1) # mean over sampling times - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MSE = calc_mse( - imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"] - ) + assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()." + test_MSE = calc_mse(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]) logger.info(f"CSDI test_MSE: {test_MSE}, test_CRPS: {test_CRPS}") @pytest.mark.xdist_group(name="imputation-csdi") @@ -86,17 +80,12 @@ def test_2_parameters(self): assert hasattr(self.csdi, "best_loss") self.assertNotEqual(self.csdi.best_loss, float("inf")) - assert ( - hasattr(self.csdi, "best_model_dict") - and self.csdi.best_model_dict is not None - ) + assert hasattr(self.csdi, "best_model_dict") and self.csdi.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-csdi") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.csdi) @@ -113,17 +102,11 @@ def test_4_lazy_loading(self): self.csdi.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) imputation_results = self.csdi.predict(GENERAL_H5_TEST_SET_PATH) imputed_X = imputation_results["imputation"] - test_CRPS = calc_quantile_crps( - imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"] - ) + test_CRPS = calc_quantile_crps(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]) imputed_X = imputed_X.mean(axis=1) # mean over sampling times - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." + assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()." 
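# Masked-error sketch (mirroring calc_mse's mask semantics; the epsilon guards
# division by zero): imputation is scored only at artificially-masked
# positions, i.e. where test_X_indicating_mask == 1, so values the model could
# see never count towards the error:
#
#     mse = ((pred - target) ** 2 * mask).sum() / (mask.sum() + 1e-12)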
- test_MSE = calc_mse( - imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"] - ) + test_MSE = calc_mse(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]) logger.info(f"Lazy-loading CSDI test_MSE: {test_MSE}, test_CRPS: {test_CRPS}") diff --git a/tests/imputation/dlinear.py b/tests/imputation/dlinear.py index d8cdf858..baba2164 100644 --- a/tests/imputation/dlinear.py +++ b/tests/imputation/dlinear.py @@ -15,7 +15,7 @@ from pypots.imputation import DLinear from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -103,17 +103,12 @@ def test_2_parameters(self): assert hasattr(self.dlinear, "best_loss") self.assertNotEqual(self.dlinear.best_loss, float("inf")) - assert ( - hasattr(self.dlinear, "best_model_dict") - and self.dlinear.best_model_dict is not None - ) + assert hasattr(self.dlinear, "best_model_dict") and self.dlinear.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-dlinear") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.dlinear) diff --git a/tests/imputation/etsformer.py b/tests/imputation/etsformer.py index 94bf57b7..fa2e4d39 100644 --- a/tests/imputation/etsformer.py +++ b/tests/imputation/etsformer.py @@ -15,7 +15,7 @@ from pypots.imputation import ETSformer from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -80,25 +80,17 @@ def test_1_impute(self): def test_2_parameters(self): assert hasattr(self.etsformer, "model") and self.etsformer.model is not None - assert ( - hasattr(self.etsformer, "optimizer") - and self.etsformer.optimizer is not None - ) + assert hasattr(self.etsformer, "optimizer") and self.etsformer.optimizer is not None assert hasattr(self.etsformer, "best_loss") self.assertNotEqual(self.etsformer.best_loss, float("inf")) - assert ( - hasattr(self.etsformer, "best_model_dict") - and self.etsformer.best_model_dict is not None - ) + assert hasattr(self.etsformer, "best_model_dict") and self.etsformer.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-etsformer") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.etsformer) diff --git a/tests/imputation/fedformer.py b/tests/imputation/fedformer.py index fe72721a..6d6d92f5 100644 --- a/tests/imputation/fedformer.py +++ b/tests/imputation/fedformer.py @@ -15,7 +15,7 @@ from pypots.imputation import FEDformer from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -82,25 +82,17 @@ def test_1_impute(self): def 
test_2_parameters(self): assert hasattr(self.fedformer, "model") and self.fedformer.model is not None - assert ( - hasattr(self.fedformer, "optimizer") - and self.fedformer.optimizer is not None - ) + assert hasattr(self.fedformer, "optimizer") and self.fedformer.optimizer is not None assert hasattr(self.fedformer, "best_loss") self.assertNotEqual(self.fedformer.best_loss, float("inf")) - assert ( - hasattr(self.fedformer, "best_model_dict") - and self.fedformer.best_model_dict is not None - ) + assert hasattr(self.fedformer, "best_model_dict") and self.fedformer.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-fedformer") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.fedformer) diff --git a/tests/imputation/film.py b/tests/imputation/film.py index e508856c..69d3cc3e 100644 --- a/tests/imputation/film.py +++ b/tests/imputation/film.py @@ -15,7 +15,7 @@ from pypots.imputation import FiLM from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -83,17 +83,12 @@ def test_2_parameters(self): assert hasattr(self.film, "best_loss") self.assertNotEqual(self.film.best_loss, float("inf")) - assert ( - hasattr(self.film, "best_model_dict") - and self.film.best_model_dict is not None - ) + assert hasattr(self.film, "best_model_dict") and self.film.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-film") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.film) diff --git a/tests/imputation/fits.py b/tests/imputation/fits.py index c35c942e..d654ff2c 100644 --- a/tests/imputation/fits.py +++ b/tests/imputation/fits.py @@ -15,7 +15,7 @@ from pypots.imputation.fits import FITS from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, diff --git a/tests/imputation/frets.py b/tests/imputation/frets.py index 8821b36a..0e920207 100644 --- a/tests/imputation/frets.py +++ b/tests/imputation/frets.py @@ -15,7 +15,7 @@ from pypots.imputation import FreTS from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -81,17 +81,12 @@ def test_2_parameters(self): assert hasattr(self.frets, "best_loss") self.assertNotEqual(self.frets.best_loss, float("inf")) - assert ( - hasattr(self.frets, "best_model_dict") - and self.frets.best_model_dict is not None - ) + assert hasattr(self.frets, "best_model_dict") and self.frets.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-frets") def test_3_saving_path(self): # whether the 
root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.frets) diff --git a/tests/imputation/gpvae.py b/tests/imputation/gpvae.py index c76170e8..e78c828d 100644 --- a/tests/imputation/gpvae.py +++ b/tests/imputation/gpvae.py @@ -15,7 +15,7 @@ from pypots.imputation import GPVAE from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -60,12 +60,8 @@ def test_0_fit(self): def test_1_impute(self): imputed_X = self.gp_vae.predict(TEST_SET, n_sampling_times=2)["imputation"] imputed_X = imputed_X.mean(axis=1) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MSE = calc_mse( - imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"] - ) + assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()." + test_MSE = calc_mse(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]) logger.info(f"GP-VAE test_MSE: {test_MSE}") @pytest.mark.xdist_group(name="imputation-gpvae") @@ -77,17 +73,12 @@ def test_2_parameters(self): assert hasattr(self.gp_vae, "best_loss") self.assertNotEqual(self.gp_vae.best_loss, float("inf")) - assert ( - hasattr(self.gp_vae, "best_model_dict") - and self.gp_vae.best_model_dict is not None - ) + assert hasattr(self.gp_vae, "best_model_dict") and self.gp_vae.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-gpvae") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.gp_vae) @@ -102,13 +93,9 @@ def test_3_saving_path(self): @pytest.mark.xdist_group(name="imputation-gpvae") def test_4_lazy_loading(self): self.gp_vae.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH) - imputed_X = self.gp_vae.predict(GENERAL_H5_TEST_SET_PATH, n_sampling_times=2)[ - "imputation" - ] + imputed_X = self.gp_vae.predict(GENERAL_H5_TEST_SET_PATH, n_sampling_times=2)["imputation"] imputed_X = imputed_X.mean(axis=1) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." + assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()." 
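# API note (a sketch; the wrapper body below is paraphrased, not copied from
# the library): these tests call either impute() or predict() on imputers.
# impute() is a thin convenience wrapper over predict(), roughly:
#
#     def impute(self, test_set):
#         return self.predict(test_set)["imputation"]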
test_MSE = calc_mse( imputed_X, diff --git a/tests/imputation/grud.py b/tests/imputation/grud.py index c2fa4ade..eda931b7 100644 --- a/tests/imputation/grud.py +++ b/tests/imputation/grud.py @@ -15,7 +15,7 @@ from pypots.imputation import GRUD from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -59,12 +59,8 @@ def test_0_fit(self): @pytest.mark.xdist_group(name="imputation-grud") def test_1_impute(self): imputed_X = self.grud.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MSE = calc_mse( - imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"] - ) + assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()." + test_MSE = calc_mse(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]) logger.info(f"GRUD test_MSE: {test_MSE}") @pytest.mark.xdist_group(name="imputation-grud") @@ -76,17 +72,12 @@ def test_2_parameters(self): assert hasattr(self.grud, "best_loss") self.assertNotEqual(self.grud.best_loss, float("inf")) - assert ( - hasattr(self.grud, "best_model_dict") - and self.grud.best_model_dict is not None - ) + assert hasattr(self.grud, "best_model_dict") and self.grud.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-grud") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.grud) diff --git a/tests/imputation/imputeformer.py b/tests/imputation/imputeformer.py index 0257805d..18498d3c 100644 --- a/tests/imputation/imputeformer.py +++ b/tests/imputation/imputeformer.py @@ -15,7 +15,7 @@ from pypots.imputation import ImputeFormer from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -78,29 +78,19 @@ def test_1_impute(self): @pytest.mark.xdist_group(name="imputation-imputeformer") def test_2_parameters(self): - assert ( - hasattr(self.imputeformer, "model") and self.imputeformer.model is not None - ) + assert hasattr(self.imputeformer, "model") and self.imputeformer.model is not None - assert ( - hasattr(self.imputeformer, "optimizer") - and self.imputeformer.optimizer is not None - ) + assert hasattr(self.imputeformer, "optimizer") and self.imputeformer.optimizer is not None assert hasattr(self.imputeformer, "best_loss") self.assertNotEqual(self.imputeformer.best_loss, float("inf")) - assert ( - hasattr(self.imputeformer, "best_model_dict") - and self.imputeformer.best_model_dict is not None - ) + assert hasattr(self.imputeformer, "best_model_dict") and self.imputeformer.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-imputeformer") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model 
checkpoints exist check_tb_and_model_checkpoints_existence(self.imputeformer) diff --git a/tests/imputation/informer.py b/tests/imputation/informer.py index 78dbbedf..a8ab4fc2 100644 --- a/tests/imputation/informer.py +++ b/tests/imputation/informer.py @@ -15,7 +15,7 @@ from pypots.imputation import Informer from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -79,24 +79,17 @@ def test_1_impute(self): def test_2_parameters(self): assert hasattr(self.informer, "model") and self.informer.model is not None - assert ( - hasattr(self.informer, "optimizer") and self.informer.optimizer is not None - ) + assert hasattr(self.informer, "optimizer") and self.informer.optimizer is not None assert hasattr(self.informer, "best_loss") self.assertNotEqual(self.informer.best_loss, float("inf")) - assert ( - hasattr(self.informer, "best_model_dict") - and self.informer.best_model_dict is not None - ) + assert hasattr(self.informer, "best_model_dict") and self.informer.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-informer") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.informer) diff --git a/tests/imputation/itransformer.py b/tests/imputation/itransformer.py index d47f1e1f..59a1d5ca 100644 --- a/tests/imputation/itransformer.py +++ b/tests/imputation/itransformer.py @@ -15,7 +15,7 @@ from pypots.imputation import iTransformer from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -65,39 +65,25 @@ def test_0_fit(self): @pytest.mark.xdist_group(name="imputation-itransformer") def test_1_impute(self): imputed_X = self.itransformer.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MSE = calc_mse( - imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"] - ) + assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()." 
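# Where test_X_indicating_mask comes from (a sketch; the exact call form may
# differ from the test config): values are knocked out artificially, e.g. with
# pygrinder's MCAR corruption, and the mask records exactly those positions so
# only truly held-out values are scored:
#
#     from pygrinder import mcar
#     X_corrupted = mcar(X, 0.1)  # randomly mask ~10% of observed values
#     indicating_mask = np.isnan(X_corrupted) & ~np.isnan(X)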
+ test_MSE = calc_mse(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]) logger.info(f"iTransformer test_MSE: {test_MSE}") @pytest.mark.xdist_group(name="imputation-itransformer") def test_2_parameters(self): - assert ( - hasattr(self.itransformer, "model") and self.itransformer.model is not None - ) + assert hasattr(self.itransformer, "model") and self.itransformer.model is not None - assert ( - hasattr(self.itransformer, "optimizer") - and self.itransformer.optimizer is not None - ) + assert hasattr(self.itransformer, "optimizer") and self.itransformer.optimizer is not None assert hasattr(self.itransformer, "best_loss") self.assertNotEqual(self.itransformer.best_loss, float("inf")) - assert ( - hasattr(self.itransformer, "best_model_dict") - and self.itransformer.best_model_dict is not None - ) + assert hasattr(self.itransformer, "best_model_dict") and self.itransformer.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-itransformer") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.itransformer) diff --git a/tests/imputation/koopa.py b/tests/imputation/koopa.py index 5f1848f2..ca64be1d 100644 --- a/tests/imputation/koopa.py +++ b/tests/imputation/koopa.py @@ -15,7 +15,7 @@ from pypots.imputation import Koopa from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -83,17 +83,12 @@ def test_2_parameters(self): assert hasattr(self.koopa, "best_loss") self.assertNotEqual(self.koopa.best_loss, float("inf")) - assert ( - hasattr(self.koopa, "best_model_dict") - and self.koopa.best_model_dict is not None - ) + assert hasattr(self.koopa, "best_model_dict") and self.koopa.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-koopa") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.koopa) diff --git a/tests/imputation/lerp.py b/tests/imputation/lerp.py index 41b396d6..b836cf8d 100644 --- a/tests/imputation/lerp.py +++ b/tests/imputation/lerp.py @@ -14,7 +14,7 @@ from pypots.imputation import Lerp from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, TEST_SET, @@ -32,25 +32,17 @@ class TestLerp(unittest.TestCase): def test_0_impute(self): # if input data is numpy ndarray test_X_imputed = self.lerp.predict(TEST_SET)["imputation"] - assert not np.isnan( - test_X_imputed - ).any(), "Output still has missing values after running impute()." - test_MSE = calc_mse( - test_X_imputed, DATA["test_X_ori"], DATA["test_X_indicating_mask"] - ) + assert not np.isnan(test_X_imputed).any(), "Output still has missing values after running impute()." 
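# Lerp needs no training, which is why this test skips fit() and calls
# predict() directly. Conceptually (a sketch), a gap at step t bounded by
# observations at steps a < t < b is filled by linear interpolation:
#
#     x_t = x_a + (x_b - x_a) * (t - a) / (b - a)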
diff --git a/tests/imputation/lerp.py b/tests/imputation/lerp.py
index 41b396d6..b836cf8d 100644
--- a/tests/imputation/lerp.py
+++ b/tests/imputation/lerp.py
@@ -14,7 +14,7 @@
 from pypots.imputation import Lerp
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     TEST_SET,
@@ -32,25 +32,17 @@ class TestLerp(unittest.TestCase):
     def test_0_impute(self):
         # if input data is numpy ndarray
         test_X_imputed = self.lerp.predict(TEST_SET)["imputation"]
-        assert not np.isnan(
-            test_X_imputed
-        ).any(), "Output still has missing values after running impute()."
-        test_MSE = calc_mse(
-            test_X_imputed, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(test_X_imputed).any(), "Output still has missing values after running impute()."
+        test_MSE = calc_mse(test_X_imputed, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"Lerp test_MSE: {test_MSE}")

         # if input data is torch tensor
         X = torch.from_numpy(np.copy(TEST_SET["X"]))
         test_X_ori = torch.from_numpy(np.copy(DATA["test_X_ori"]))
-        test_X_indicating_mask = torch.from_numpy(
-            np.copy(DATA["test_X_indicating_mask"])
-        )
+        test_X_indicating_mask = torch.from_numpy(np.copy(DATA["test_X_indicating_mask"]))

         test_X_imputed = self.lerp.predict({"X": X})["imputation"]
-        assert not torch.isnan(
-            test_X_imputed
-        ).any(), "Output still has missing values after running impute()."
+        assert not torch.isnan(test_X_imputed).any(), "Output still has missing values after running impute()."
         test_MSE = calc_mse(test_X_imputed, test_X_ori, test_X_indicating_mask)
         logger.info(f"Lerp test_MSE: {test_MSE}")
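The Lerp test above runs the same `predict()` call with both NumPy arrays and Torch tensors, since the metric helpers accept either. A sketch of that pattern, assuming `Lerp` needs no constructor arguments here:

```python
import numpy as np
import torch

from pypots.imputation import Lerp

# toy input: 2 samples, 4 time steps, 3 features; NaN marks missingness
X = np.random.randn(2, 4, 3)
X[0, 1, 2] = np.nan

lerp = Lerp()
imputation_np = lerp.predict({"X": X})["imputation"]  # ndarray in, ndarray out
imputation_pt = lerp.predict({"X": torch.from_numpy(np.copy(X))})["imputation"]  # tensor in
```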
diff --git a/tests/imputation/locf.py b/tests/imputation/locf.py
index b22f4b42..ded39004 100644
--- a/tests/imputation/locf.py
+++ b/tests/imputation/locf.py
@@ -14,7 +14,7 @@
 from pypots.imputation import LOCF
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     DEVICE,
@@ -36,18 +36,12 @@ class TestLOCF(unittest.TestCase):
     def test_0_impute(self):
         # if input data is numpy ndarray
         test_X_imputed_zero = self.locf_zero.predict(TEST_SET)["imputation"]
-        assert not np.isnan(
-            test_X_imputed_zero
-        ).any(), "Output still has missing values after running impute()."
-        test_MSE = calc_mse(
-            test_X_imputed_zero, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(test_X_imputed_zero).any(), "Output still has missing values after running impute()."
+        test_MSE = calc_mse(test_X_imputed_zero, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"LOCF (zero) test_MSE: {test_MSE}")

         test_X_imputed_backward = self.locf_backward.predict(TEST_SET)["imputation"]
-        assert not np.isnan(
-            test_X_imputed_backward
-        ).any(), "Output still has missing values after running impute()."
+        assert not np.isnan(test_X_imputed_backward).any(), "Output still has missing values after running impute()."
         test_MSE = calc_mse(
             test_X_imputed_backward,
             DATA["test_X_ori"],
@@ -56,9 +50,7 @@ def test_0_impute(self):
         logger.info(f"LOCF (backward) test_MSE: {test_MSE}")

         test_X_imputed_median = self.locf_median.predict(TEST_SET)["imputation"]
-        assert not np.isnan(
-            test_X_imputed_median
-        ).any(), "Output still has missing values after running impute()."
+        assert not np.isnan(test_X_imputed_median).any(), "Output still has missing values after running impute()."
         test_MSE = calc_mse(
             test_X_imputed_median,
             DATA["test_X_ori"],
@@ -74,21 +66,15 @@ def test_0_impute(self):
         # if input data is torch tensor
         X = torch.from_numpy(np.copy(TEST_SET["X"]))
         test_X_ori = torch.from_numpy(np.copy(DATA["test_X_ori"]))
-        test_X_indicating_mask = torch.from_numpy(
-            np.copy(DATA["test_X_indicating_mask"])
-        )
+        test_X_indicating_mask = torch.from_numpy(np.copy(DATA["test_X_indicating_mask"]))

         test_X_imputed_zero = self.locf_zero.predict({"X": X})["imputation"]
-        assert not torch.isnan(
-            test_X_imputed_zero
-        ).any(), "Output still has missing values after running impute()."
+        assert not torch.isnan(test_X_imputed_zero).any(), "Output still has missing values after running impute()."
         test_MSE = calc_mse(test_X_imputed_zero, test_X_ori, test_X_indicating_mask)
         logger.info(f"LOCF (zero) test_MSE: {test_MSE}")

         test_X_imputed_backward = self.locf_backward.predict({"X": X})["imputation"]
-        assert not torch.isnan(
-            test_X_imputed_backward
-        ).any(), "Output still has missing values after running impute()."
+        assert not torch.isnan(test_X_imputed_backward).any(), "Output still has missing values after running impute()."
         test_MSE = calc_mse(
             test_X_imputed_backward,
             test_X_ori,
@@ -97,9 +83,7 @@ def test_0_impute(self):
         logger.info(f"LOCF (backward) test_MSE: {test_MSE}")

         test_X_imputed_median = self.locf_median.predict({"X": X})["imputation"]
-        assert not torch.isnan(
-            test_X_imputed_median
-        ).any(), "Output still has missing values after running impute()."
+        assert not torch.isnan(test_X_imputed_median).any(), "Output still has missing values after running impute()."
         test_MSE = calc_mse(
             test_X_imputed_median,
             test_X_ori,

diff --git a/tests/imputation/mean.py b/tests/imputation/mean.py
index 04be2c9d..34320357 100644
--- a/tests/imputation/mean.py
+++ b/tests/imputation/mean.py
@@ -14,7 +14,7 @@
 from pypots.imputation import Mean
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     TEST_SET,
@@ -32,25 +32,17 @@ class TestMean(unittest.TestCase):
     def test_0_impute(self):
         # if input data is numpy ndarray
         test_X_imputed = self.mean.predict(TEST_SET)["imputation"]
-        assert not np.isnan(
-            test_X_imputed
-        ).any(), "Output still has missing values after running impute()."
-        test_MSE = calc_mse(
-            test_X_imputed, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(test_X_imputed).any(), "Output still has missing values after running impute()."
+        test_MSE = calc_mse(test_X_imputed, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"Mean test_MSE: {test_MSE}")

         # if input data is torch tensor
         X = torch.from_numpy(np.copy(TEST_SET["X"]))
         test_X_ori = torch.from_numpy(np.copy(DATA["test_X_ori"]))
-        test_X_indicating_mask = torch.from_numpy(
-            np.copy(DATA["test_X_indicating_mask"])
-        )
+        test_X_indicating_mask = torch.from_numpy(np.copy(DATA["test_X_indicating_mask"]))

         test_X_imputed = self.mean.predict({"X": X})["imputation"]
-        assert not torch.isnan(
-            test_X_imputed
-        ).any(), "Output still has missing values after running impute()."
+        assert not torch.isnan(test_X_imputed).any(), "Output still has missing values after running impute()."
         test_MSE = calc_mse(test_X_imputed, test_X_ori, test_X_indicating_mask)
         logger.info(f"Mean test_MSE: {test_MSE}")
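The three LOCF instances in this test differ only in how leading gaps are filled when there is no earlier observation to carry forward. A sketch, assuming the constructor parameter is spelled `first_step_imputation` (the fixture names `locf_zero` / `locf_backward` / `locf_median` suggest these strategies, but the exact keyword is an assumption here):

```python
import numpy as np

from pypots.imputation import LOCF

X = np.random.randn(2, 4, 3)
X[:, 0, 0] = np.nan  # missing at the very first step: nothing to carry forward

locf_zero = LOCF(first_step_imputation="zero")          # fall back to 0
locf_backward = LOCF(first_step_imputation="backward")  # borrow the next observed value
locf_median = LOCF(first_step_imputation="median")      # feature-wise median

imputation = locf_zero.predict({"X": X})["imputation"]
```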
diff --git a/tests/imputation/median.py b/tests/imputation/median.py
index d4960449..c6cd312e 100644
--- a/tests/imputation/median.py
+++ b/tests/imputation/median.py
@@ -14,7 +14,7 @@
 from pypots.imputation import Median
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     TEST_SET,
@@ -32,25 +32,17 @@ class TestMedian(unittest.TestCase):
     def test_0_impute(self):
         # if input data is numpy ndarray
         test_X_imputed = self.median.predict(TEST_SET)["imputation"]
-        assert not np.isnan(
-            test_X_imputed
-        ).any(), "Output still has missing values after running impute()."
-        test_MSE = calc_mse(
-            test_X_imputed, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(test_X_imputed).any(), "Output still has missing values after running impute()."
+        test_MSE = calc_mse(test_X_imputed, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"Median test_MSE: {test_MSE}")

         # if input data is torch tensor
         X = torch.from_numpy(np.copy(TEST_SET["X"]))
         test_X_ori = torch.from_numpy(np.copy(DATA["test_X_ori"]))
-        test_X_indicating_mask = torch.from_numpy(
-            np.copy(DATA["test_X_indicating_mask"])
-        )
+        test_X_indicating_mask = torch.from_numpy(np.copy(DATA["test_X_indicating_mask"]))

         test_X_imputed = self.median.predict({"X": X})["imputation"]
-        assert not torch.isnan(
-            test_X_imputed
-        ).any(), "Output still has missing values after running impute()."
+        assert not torch.isnan(test_X_imputed).any(), "Output still has missing values after running impute()."
         test_MSE = calc_mse(test_X_imputed, test_X_ori, test_X_indicating_mask)
         logger.info(f"Median test_MSE: {test_MSE}")

diff --git a/tests/imputation/micn.py b/tests/imputation/micn.py
index ea27fd95..ac179e12 100644
--- a/tests/imputation/micn.py
+++ b/tests/imputation/micn.py
@@ -15,7 +15,7 @@
 from pypots.imputation import MICN
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -82,17 +82,12 @@ def test_2_parameters(self):
         assert hasattr(self.micn, "best_loss")
         self.assertNotEqual(self.micn.best_loss, float("inf"))

-        assert (
-            hasattr(self.micn, "best_model_dict")
-            and self.micn.best_model_dict is not None
-        )
+        assert hasattr(self.micn, "best_model_dict") and self.micn.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-micn")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.micn)
diff --git a/tests/imputation/moderntcn.py b/tests/imputation/moderntcn.py
index 33b41269..23e644e8 100644
--- a/tests/imputation/moderntcn.py
+++ b/tests/imputation/moderntcn.py
@@ -15,7 +15,7 @@
 from pypots.imputation import ModernTCN
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -87,25 +87,17 @@ def test_1_impute(self):
     def test_2_parameters(self):
         assert hasattr(self.moderntcn, "model") and self.moderntcn.model is not None

-        assert (
-            hasattr(self.moderntcn, "optimizer")
-            and self.moderntcn.optimizer is not None
-        )
+        assert hasattr(self.moderntcn, "optimizer") and self.moderntcn.optimizer is not None

         assert hasattr(self.moderntcn, "best_loss")
         self.assertNotEqual(self.moderntcn.best_loss, float("inf"))

-        assert (
-            hasattr(self.moderntcn, "best_model_dict")
-            and self.moderntcn.best_model_dict is not None
-        )
+        assert hasattr(self.moderntcn, "best_model_dict") and self.moderntcn.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-moderntcn")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.moderntcn)

diff --git a/tests/imputation/mrnn.py b/tests/imputation/mrnn.py
index 5e42e256..a7a02b75 100644
--- a/tests/imputation/mrnn.py
+++ b/tests/imputation/mrnn.py
@@ -15,7 +15,7 @@
 from pypots.imputation import MRNN
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -59,12 +59,8 @@ def test_0_fit(self):
     @pytest.mark.xdist_group(name="imputation-mrnn")
     def test_1_impute(self):
         imputed_X = self.mrnn.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MSE = calc_mse(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MSE = calc_mse(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"MRNN test_MSE: {test_MSE}")

     @pytest.mark.xdist_group(name="imputation-mrnn")
@@ -76,17 +72,12 @@ def test_2_parameters(self):
         assert hasattr(self.mrnn, "best_loss")
         self.assertNotEqual(self.mrnn.best_loss, float("inf"))

-        assert (
-            hasattr(self.mrnn, "best_model_dict")
-            and self.mrnn.best_model_dict is not None
-        )
+        assert hasattr(self.mrnn, "best_model_dict") and self.mrnn.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-mrnn")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.mrnn)
diff --git a/tests/imputation/nonstationary_transformer.py b/tests/imputation/nonstationary_transformer.py
index 57dcf537..6a7c75f4 100644
--- a/tests/imputation/nonstationary_transformer.py
+++ b/tests/imputation/nonstationary_transformer.py
@@ -15,7 +15,7 @@
 from pypots.imputation import NonstationaryTransformer
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -35,9 +35,7 @@ class TestNonstationaryTransformer(unittest.TestCase):
     logger.info("Running tests for an imputation model NonstationaryTransformer...")

     # set the log and model saving path
-    saving_path = os.path.join(
-        RESULT_SAVING_DIR_FOR_IMPUTATION, "NonstationaryTransformer"
-    )
+    saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "NonstationaryTransformer")
     model_save_name = "saved_nonstationary_transformer_model.pypots"

     # initialize an Adam optimizer
@@ -80,10 +78,7 @@ def test_1_impute(self):

     @pytest.mark.xdist_group(name="imputation-nonstationary_transformer")
     def test_2_parameters(self):
-        assert (
-            hasattr(self.nonstationary_transformer, "model")
-            and self.nonstationary_transformer.model is not None
-        )
+        assert hasattr(self.nonstationary_transformer, "model") and self.nonstationary_transformer.model is not None

         assert (
             hasattr(self.nonstationary_transformer, "optimizer")
@@ -101,9 +96,7 @@ def test_2_parameters(self):
     @pytest.mark.xdist_group(name="imputation-nonstationary_transformer")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.nonstationary_transformer)
@@ -117,12 +110,8 @@ def test_3_saving_path(self):

     @pytest.mark.xdist_group(name="imputation-nonstationary_transformer")
     def test_4_lazy_loading(self):
-        self.nonstationary_transformer.fit(
-            GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH
-        )
-        imputation_results = self.nonstationary_transformer.predict(
-            GENERAL_H5_TEST_SET_PATH
-        )
+        self.nonstationary_transformer.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH)
+        imputation_results = self.nonstationary_transformer.predict(GENERAL_H5_TEST_SET_PATH)
         assert not np.isnan(
             imputation_results["imputation"]
         ).any(), "Output still has missing values after running impute()."

diff --git a/tests/imputation/patchtst.py b/tests/imputation/patchtst.py
index 5dd6fe21..fda3a01e 100644
--- a/tests/imputation/patchtst.py
+++ b/tests/imputation/patchtst.py
@@ -15,7 +15,7 @@
 from pypots.imputation import PatchTST
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -83,24 +83,17 @@ def test_1_impute(self):
     def test_2_parameters(self):
         assert hasattr(self.patchtst, "model") and self.patchtst.model is not None

-        assert (
-            hasattr(self.patchtst, "optimizer") and self.patchtst.optimizer is not None
-        )
+        assert hasattr(self.patchtst, "optimizer") and self.patchtst.optimizer is not None

         assert hasattr(self.patchtst, "best_loss")
         self.assertNotEqual(self.patchtst.best_loss, float("inf"))

-        assert (
-            hasattr(self.patchtst, "best_model_dict")
-            and self.patchtst.best_model_dict is not None
-        )
+        assert hasattr(self.patchtst, "best_model_dict") and self.patchtst.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-patchtst")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.patchtst)
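`test_4_lazy_loading` in the NonstationaryTransformer diff above is the one case that fits and predicts straight from HDF5 file paths rather than in-memory dictionaries, so batches are loaded lazily from disk. A sketch of the pattern (the paths are placeholders, and `model` stands for any PyPOTS neural imputer such as the NonstationaryTransformer configured in that test file):

```python
import numpy as np

# model = NonstationaryTransformer(...)  # assumed constructed earlier, any PyPOTS imputer works
model.fit("path/to/train_set.h5", "path/to/val_set.h5")  # lazy-loads batches from disk
results = model.predict("path/to/test_set.h5")
assert not np.isnan(results["imputation"]).any()
```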
"best_model_dict") and self.pyraformer.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-pyraformer") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.pyraformer) diff --git a/tests/imputation/reformer.py b/tests/imputation/reformer.py index 15b3d749..79f8ddc6 100644 --- a/tests/imputation/reformer.py +++ b/tests/imputation/reformer.py @@ -15,7 +15,7 @@ from pypots.imputation import Reformer from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -81,24 +81,17 @@ def test_1_impute(self): def test_2_parameters(self): assert hasattr(self.reformer, "model") and self.reformer.model is not None - assert ( - hasattr(self.reformer, "optimizer") and self.reformer.optimizer is not None - ) + assert hasattr(self.reformer, "optimizer") and self.reformer.optimizer is not None assert hasattr(self.reformer, "best_loss") self.assertNotEqual(self.reformer.best_loss, float("inf")) - assert ( - hasattr(self.reformer, "best_model_dict") - and self.reformer.best_model_dict is not None - ) + assert hasattr(self.reformer, "best_model_dict") and self.reformer.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-reformer") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" + assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist" # check if the tensorboard file and model checkpoints exist check_tb_and_model_checkpoints_existence(self.reformer) diff --git a/tests/imputation/revin_scinet.py b/tests/imputation/revin_scinet.py index ab7347f4..62547220 100644 --- a/tests/imputation/revin_scinet.py +++ b/tests/imputation/revin_scinet.py @@ -15,7 +15,7 @@ from pypots.imputation import RevIN_SCINet from pypots.optim import Adam from pypots.utils.logging import logger -from pypots.utils.metrics import calc_mse +from pypots.nn.functional import calc_mse from tests.global_test_config import ( DATA, EPOCHS, @@ -80,29 +80,19 @@ def test_1_impute(self): @pytest.mark.xdist_group(name="imputation-revin_scinet") def test_2_parameters(self): - assert ( - hasattr(self.revin_scinet, "model") and self.revin_scinet.model is not None - ) + assert hasattr(self.revin_scinet, "model") and self.revin_scinet.model is not None - assert ( - hasattr(self.revin_scinet, "optimizer") - and self.revin_scinet.optimizer is not None - ) + assert hasattr(self.revin_scinet, "optimizer") and self.revin_scinet.optimizer is not None assert hasattr(self.revin_scinet, "best_loss") self.assertNotEqual(self.revin_scinet.best_loss, float("inf")) - assert ( - hasattr(self.revin_scinet, "best_model_dict") - and self.revin_scinet.best_model_dict is not None - ) + assert hasattr(self.revin_scinet, "best_model_dict") and self.revin_scinet.best_model_dict is not None @pytest.mark.xdist_group(name="imputation-revin_scinet") def test_3_saving_path(self): # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert 
diff --git a/tests/imputation/revin_scinet.py b/tests/imputation/revin_scinet.py
index ab7347f4..62547220 100644
--- a/tests/imputation/revin_scinet.py
+++ b/tests/imputation/revin_scinet.py
@@ -15,7 +15,7 @@
 from pypots.imputation import RevIN_SCINet
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -80,29 +80,19 @@ def test_1_impute(self):
     @pytest.mark.xdist_group(name="imputation-revin_scinet")
     def test_2_parameters(self):
-        assert (
-            hasattr(self.revin_scinet, "model") and self.revin_scinet.model is not None
-        )
+        assert hasattr(self.revin_scinet, "model") and self.revin_scinet.model is not None

-        assert (
-            hasattr(self.revin_scinet, "optimizer")
-            and self.revin_scinet.optimizer is not None
-        )
+        assert hasattr(self.revin_scinet, "optimizer") and self.revin_scinet.optimizer is not None

         assert hasattr(self.revin_scinet, "best_loss")
         self.assertNotEqual(self.revin_scinet.best_loss, float("inf"))

-        assert (
-            hasattr(self.revin_scinet, "best_model_dict")
-            and self.revin_scinet.best_model_dict is not None
-        )
+        assert hasattr(self.revin_scinet, "best_model_dict") and self.revin_scinet.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-revin_scinet")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.revin_scinet)

diff --git a/tests/imputation/saits.py b/tests/imputation/saits.py
index 53e6d717..fccdf89f 100644
--- a/tests/imputation/saits.py
+++ b/tests/imputation/saits.py
@@ -15,7 +15,7 @@
 from pypots.imputation import SAITS
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from pypots.utils.visual.data import plot_data, plot_missingness
 from tests.global_test_config import (
     DATA,
@@ -81,9 +81,7 @@ def test_1_impute(self):
         logger.info(f"SAITS test_MSE: {test_MSE}")

         # plot the missingness and imputed data
-        plot_missingness(
-            ~np.isnan(TEST_SET["X"]), 0, imputation_results["imputation"].shape[1]
-        )
+        plot_missingness(~np.isnan(TEST_SET["X"]), 0, imputation_results["imputation"].shape[1])
         plot_data(TEST_SET["X"], TEST_SET["X_ori"], imputation_results["imputation"])

     @pytest.mark.xdist_group(name="imputation-saits")
@@ -95,17 +93,12 @@ def test_2_parameters(self):
         assert hasattr(self.saits, "best_loss")
         self.assertNotEqual(self.saits.best_loss, float("inf"))

-        assert (
-            hasattr(self.saits, "best_model_dict")
-            and self.saits.best_model_dict is not None
-        )
+        assert hasattr(self.saits, "best_model_dict") and self.saits.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-saits")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.saits)
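The SAITS test additionally smoke-tests the plotting helpers; the call signatures below mirror the test's own usage, with toy arrays standing in for the fixtures:

```python
import numpy as np

from pypots.utils.visual.data import plot_data, plot_missingness

# arrays shaped [n_samples, n_steps, n_features]; NaN marks missingness in the observed X
X = np.random.randn(2, 4, 3)
X[0, 1, :] = np.nan
X_ori = np.random.randn(2, 4, 3)       # complete ground truth
imputation = np.random.randn(2, 4, 3)  # stand-in for a model's output

plot_missingness(~np.isnan(X), 0, X.shape[1])  # binary missingness layout over time steps
plot_data(X, X_ori, imputation)                # observed vs. ground truth vs. imputed
```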
diff --git a/tests/imputation/scinet.py b/tests/imputation/scinet.py
index dcbe8fa6..cac6594c 100644
--- a/tests/imputation/scinet.py
+++ b/tests/imputation/scinet.py
@@ -15,7 +15,7 @@
 from pypots.imputation import SCINet
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -87,17 +87,12 @@ def test_2_parameters(self):
         assert hasattr(self.scinet, "best_loss")
         self.assertNotEqual(self.scinet.best_loss, float("inf"))

-        assert (
-            hasattr(self.scinet, "best_model_dict")
-            and self.scinet.best_model_dict is not None
-        )
+        assert hasattr(self.scinet, "best_model_dict") and self.scinet.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-scinet")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.scinet)

diff --git a/tests/imputation/segrnn.py b/tests/imputation/segrnn.py
index 17b877ad..c2488519 100644
--- a/tests/imputation/segrnn.py
+++ b/tests/imputation/segrnn.py
@@ -15,7 +15,7 @@
 from pypots.imputation import SegRNN
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,

diff --git a/tests/imputation/stemgnn.py b/tests/imputation/stemgnn.py
index 515c9107..1b6d5bf7 100644
--- a/tests/imputation/stemgnn.py
+++ b/tests/imputation/stemgnn.py
@@ -15,7 +15,7 @@
 from pypots.imputation import StemGNN
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -81,17 +81,12 @@ def test_2_parameters(self):
         assert hasattr(self.stemgnn, "best_loss")
         self.assertNotEqual(self.stemgnn.best_loss, float("inf"))

-        assert (
-            hasattr(self.stemgnn, "best_model_dict")
-            and self.stemgnn.best_model_dict is not None
-        )
+        assert hasattr(self.stemgnn, "best_model_dict") and self.stemgnn.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-stemgnn")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.stemgnn)

diff --git a/tests/imputation/tcn.py b/tests/imputation/tcn.py
index 400b66c4..78157369 100644
--- a/tests/imputation/tcn.py
+++ b/tests/imputation/tcn.py
@@ -15,7 +15,7 @@
 from pypots.imputation import TCN
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -82,17 +82,12 @@ def test_2_parameters(self):
         assert hasattr(self.tcn, "best_loss")
         self.assertNotEqual(self.tcn.best_loss, float("inf"))

-        assert (
-            hasattr(self.tcn, "best_model_dict")
-            and self.tcn.best_model_dict is not None
-        )
+        assert hasattr(self.tcn, "best_model_dict") and self.tcn.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-tcn")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.tcn)
diff --git a/tests/imputation/tefn.py b/tests/imputation/tefn.py
index e0edd0fe..059fefa1 100644
--- a/tests/imputation/tefn.py
+++ b/tests/imputation/tefn.py
@@ -15,7 +15,7 @@
 from pypots.imputation import TEFN
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -74,24 +74,17 @@ def test_1_impute(self):
     def test_2_parameters(self):
         assert hasattr(self.tefn, "model") and self.tefn.model is not None

-        assert (
-            hasattr(self.tefn, "optimizer") and self.tefn.optimizer is not None
-        )
+        assert hasattr(self.tefn, "optimizer") and self.tefn.optimizer is not None

         assert hasattr(self.tefn, "best_loss")
         self.assertNotEqual(self.tefn.best_loss, float("inf"))

-        assert (
-            hasattr(self.tefn, "best_model_dict")
-            and self.tefn.best_model_dict is not None
-        )
+        assert hasattr(self.tefn, "best_model_dict") and self.tefn.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-tefn")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.tefn)

diff --git a/tests/imputation/tide.py b/tests/imputation/tide.py
index 3f18e4d7..20354b5d 100644
--- a/tests/imputation/tide.py
+++ b/tests/imputation/tide.py
@@ -15,7 +15,7 @@
 from pypots.imputation import TiDE
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -84,17 +84,12 @@ def test_2_parameters(self):
         assert hasattr(self.tide, "best_loss")
         self.assertNotEqual(self.tide.best_loss, float("inf"))

-        assert (
-            hasattr(self.tide, "best_model_dict")
-            and self.tide.best_model_dict is not None
-        )
+        assert hasattr(self.tide, "best_model_dict") and self.tide.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-tide")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.tide)

diff --git a/tests/imputation/timemixer.py b/tests/imputation/timemixer.py
index a0735663..e972d055 100644
--- a/tests/imputation/timemixer.py
+++ b/tests/imputation/timemixer.py
@@ -15,7 +15,7 @@
 from pypots.imputation import TimeMixer
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -78,25 +78,17 @@ def test_1_impute(self):
     def test_2_parameters(self):
         assert hasattr(self.timemixer, "model") and self.timemixer.model is not None

-        assert (
-            hasattr(self.timemixer, "optimizer")
-            and self.timemixer.optimizer is not None
-        )
+        assert hasattr(self.timemixer, "optimizer") and self.timemixer.optimizer is not None

         assert hasattr(self.timemixer, "best_loss")
         self.assertNotEqual(self.timemixer.best_loss, float("inf"))

-        assert (
-            hasattr(self.timemixer, "best_model_dict")
-            and self.timemixer.best_model_dict is not None
-        )
+        assert hasattr(self.timemixer, "best_model_dict") and self.timemixer.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-timemixer")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.timemixer)
diff --git a/tests/imputation/timesnet.py b/tests/imputation/timesnet.py
index 8959cc9f..7f4e6ceb 100644
--- a/tests/imputation/timesnet.py
+++ b/tests/imputation/timesnet.py
@@ -15,7 +15,7 @@
 from pypots.imputation import TimesNet
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -79,24 +79,17 @@ def test_1_impute(self):
     def test_2_parameters(self):
         assert hasattr(self.timesnet, "model") and self.timesnet.model is not None

-        assert (
-            hasattr(self.timesnet, "optimizer") and self.timesnet.optimizer is not None
-        )
+        assert hasattr(self.timesnet, "optimizer") and self.timesnet.optimizer is not None

         assert hasattr(self.timesnet, "best_loss")
         self.assertNotEqual(self.timesnet.best_loss, float("inf"))

-        assert (
-            hasattr(self.timesnet, "best_model_dict")
-            and self.timesnet.best_model_dict is not None
-        )
+        assert hasattr(self.timesnet, "best_model_dict") and self.timesnet.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-timesnet")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.timesnet)
diff --git a/tests/imputation/transformer.py b/tests/imputation/transformer.py
index e509c899..b6e3977f 100644
--- a/tests/imputation/transformer.py
+++ b/tests/imputation/transformer.py
@@ -15,7 +15,7 @@
 from pypots.imputation import Transformer
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -65,37 +65,25 @@ def test_0_fit(self):
     @pytest.mark.xdist_group(name="imputation-transformer")
     def test_1_impute(self):
         imputed_X = self.transformer.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MSE = calc_mse(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MSE = calc_mse(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"Transformer test_MSE: {test_MSE}")

     @pytest.mark.xdist_group(name="imputation-transformer")
     def test_2_parameters(self):
         assert hasattr(self.transformer, "model") and self.transformer.model is not None

-        assert (
-            hasattr(self.transformer, "optimizer")
-            and self.transformer.optimizer is not None
-        )
+        assert hasattr(self.transformer, "optimizer") and self.transformer.optimizer is not None

         assert hasattr(self.transformer, "best_loss")
         self.assertNotEqual(self.transformer.best_loss, float("inf"))

-        assert (
-            hasattr(self.transformer, "best_model_dict")
-            and self.transformer.best_model_dict is not None
-        )
+        assert hasattr(self.transformer, "best_model_dict") and self.transformer.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-transformer")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.transformer)

diff --git a/tests/imputation/trmf.py b/tests/imputation/trmf.py
index b5509da3..d7b312da 100644
--- a/tests/imputation/trmf.py
+++ b/tests/imputation/trmf.py
@@ -14,7 +14,7 @@
 from pypots.imputation import TRMF
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from pypots.utils.visual.data import plot_data, plot_missingness
 from tests.global_test_config import (
     DATA,
diff --git a/tests/imputation/usgan.py b/tests/imputation/usgan.py
index c9da6696..57150ef6 100644
--- a/tests/imputation/usgan.py
+++ b/tests/imputation/usgan.py
@@ -15,7 +15,7 @@
 from pypots.imputation import USGAN
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mse
+from pypots.nn.functional import calc_mse
 from tests.global_test_config import (
     DATA,
     EPOCHS,
@@ -61,12 +61,8 @@ def test_0_fit(self):
     @pytest.mark.xdist_group(name="imputation-usgan")
     def test_1_impute(self):
         imputed_X = self.usgan.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MSE = calc_mse(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MSE = calc_mse(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"US-GAN test_MSE: {test_MSE}")

     @pytest.mark.xdist_group(name="imputation-usgan")
@@ -79,17 +75,12 @@ def test_2_parameters(self):
         assert hasattr(self.usgan, "best_loss")
         self.assertNotEqual(self.usgan.best_loss, float("inf"))

-        assert (
-            hasattr(self.usgan, "best_model_dict")
-            and self.usgan.best_model_dict is not None
-        )
+        assert hasattr(self.usgan, "best_model_dict") and self.usgan.best_model_dict is not None

     @pytest.mark.xdist_group(name="imputation-usgan")
     def test_3_saving_path(self):
         # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
+        assert os.path.exists(self.saving_path), f"file {self.saving_path} does not exist"

         # check if the tensorboard file and model checkpoints exist
         check_tb_and_model_checkpoints_existence(self.usgan)

diff --git a/tests/optim/adadelta.py b/tests/optim/adadelta.py
index 2b48cfb9..062cdf18 100644
--- a/tests/optim/adadelta.py
+++ b/tests/optim/adadelta.py
@@ -13,7 +13,7 @@
 from pypots.imputation import SAITS
 from pypots.optim import Adadelta
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mae
+from pypots.nn.functional import calc_mae
 from tests.global_test_config import DATA
 from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET
@@ -43,12 +43,8 @@ class TestAdadelta(unittest.TestCase):
     def test_0_fit(self):
         self.saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = self.saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")
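Each optimizer test wires a non-default optimizer into SAITS through the `optimizer` argument. A sketch of that wiring, with illustrative hyperparameters (the real values live in tests/optim/config.py, and the SAITS argument names here are a best-effort reading of its current signature):

```python
from pypots.imputation import SAITS
from pypots.optim import Adadelta

saits = SAITS(
    n_steps=48,   # illustrative sizes, not the test config
    n_features=37,
    n_layers=2,
    d_model=64,
    n_heads=4,
    d_k=16,
    d_v=16,
    d_ffn=128,
    epochs=2,
    optimizer=Adadelta(lr=0.001),  # PyPOTS wrapper, not torch.optim.Adadelta directly
)
# then saits.fit(TRAIN_SET, VAL_SET) and saits.impute(TEST_SET), as in test_0_fit
```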
diff --git a/tests/optim/adagrad.py b/tests/optim/adagrad.py
index a505a15e..c47e6e33 100644
--- a/tests/optim/adagrad.py
+++ b/tests/optim/adagrad.py
@@ -13,7 +13,7 @@
 from pypots.imputation import SAITS
 from pypots.optim import Adagrad
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mae
+from pypots.nn.functional import calc_mae
 from tests.global_test_config import DATA
 from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET
@@ -43,12 +43,8 @@ class TestAdagrad(unittest.TestCase):
     def test_0_fit(self):
         self.saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = self.saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")

diff --git a/tests/optim/adam.py b/tests/optim/adam.py
index 89a85d41..85e41e5e 100644
--- a/tests/optim/adam.py
+++ b/tests/optim/adam.py
@@ -13,7 +13,7 @@
 from pypots.imputation import SAITS
 from pypots.optim import Adam
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mae
+from pypots.nn.functional import calc_mae
 from tests.global_test_config import DATA
 from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET
@@ -43,12 +43,8 @@ class TestAdam(unittest.TestCase):
     def test_0_fit(self):
         self.saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = self.saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")

diff --git a/tests/optim/adamw.py b/tests/optim/adamw.py
index c2674f62..57f8b5eb 100644
--- a/tests/optim/adamw.py
+++ b/tests/optim/adamw.py
@@ -13,7 +13,7 @@
 from pypots.imputation import SAITS
 from pypots.optim import AdamW
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mae
+from pypots.nn.functional import calc_mae
 from tests.global_test_config import DATA
 from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET
@@ -43,12 +43,8 @@ class TestAdamW(unittest.TestCase):
     def test_0_fit(self):
         self.saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = self.saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")
diff --git a/tests/optim/lr_schedulers.py b/tests/optim/lr_schedulers.py
index 3c6140dc..b89eaaee 100644
--- a/tests/optim/lr_schedulers.py
+++ b/tests/optim/lr_schedulers.py
@@ -22,7 +22,7 @@
     MultiplicativeLR,
 )
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mae
+from pypots.nn.functional import calc_mae
 from tests.global_test_config import DATA
 from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET
@@ -71,12 +71,8 @@ def test_0_lambda_lrs(self):
         )
         saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")

     @pytest.mark.xdist_group(name="lrs-multiplicative")
@@ -99,12 +95,8 @@ def test_1_multiplicative_lrs(self):
         )
         saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")

     @pytest.mark.xdist_group(name="lrs-step")
@@ -127,12 +119,8 @@ def test_2_step_lrs(self):
         )
         saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")

     @pytest.mark.xdist_group(name="lrs-multistep")
@@ -155,12 +143,8 @@ def test_3_multistep_lrs(self):
         )
         saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")

     @pytest.mark.xdist_group(name="lrs-constant")
@@ -184,12 +168,8 @@ def test_4_constant_lrs(self):
         )
         saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")

     @pytest.mark.xdist_group(name="lrs-linear")
@@ -212,12 +192,8 @@ def test_5_linear_lrs(self):
         )
         saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")

     @pytest.mark.xdist_group(name="lrs-exponential")
@@ -240,10 +216,6 @@ def test_6_exponential_lrs(self):
         )
         saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")
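The scheduler tests attach a learning-rate scheduler to a PyPOTS optimizer before handing it to the model; the scheduler classes come from `pypots.optim.lr_scheduler`, as the import hunk above shows. A sketch of that wiring, with an illustrative `lr_lambda`:

```python
from pypots.optim import Adam
from pypots.optim.lr_scheduler import LambdaLR

# halve the base learning rate every epoch (the schedule itself is illustrative)
lambda_lrs = LambdaLR(lr_lambda=lambda epoch: 0.5**epoch)
optimizer = Adam(lr=0.001, lr_scheduler=lambda_lrs)
# then e.g. SAITS(..., optimizer=optimizer), exactly as these tests do
```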
diff --git a/tests/optim/rmsprop.py b/tests/optim/rmsprop.py
index 154805f1..a1bedfa1 100644
--- a/tests/optim/rmsprop.py
+++ b/tests/optim/rmsprop.py
@@ -13,7 +13,7 @@
 from pypots.imputation import SAITS
 from pypots.optim import RMSprop
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mae
+from pypots.nn.functional import calc_mae
 from tests.global_test_config import DATA
 from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET
@@ -43,12 +43,8 @@ class TestRMSprop(unittest.TestCase):
     def test_0_fit(self):
         self.saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = self.saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")

diff --git a/tests/optim/sgd.py b/tests/optim/sgd.py
index 2c5a2e22..447a151b 100644
--- a/tests/optim/sgd.py
+++ b/tests/optim/sgd.py
@@ -13,7 +13,7 @@
 from pypots.imputation import SAITS
 from pypots.optim import SGD
 from pypots.utils.logging import logger
-from pypots.utils.metrics import calc_mae
+from pypots.nn.functional import calc_mae
 from tests.global_test_config import DATA
 from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET
@@ -43,12 +43,8 @@ class TestSGD(unittest.TestCase):
     def test_0_fit(self):
         self.saits.fit(TRAIN_SET, VAL_SET)
         imputed_X = self.saits.impute(TEST_SET)
-        assert not np.isnan(
-            imputed_X
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = calc_mae(
-            imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"]
-        )
+        assert not np.isnan(imputed_X).any(), "Output still has missing values after running impute()."
+        test_MAE = calc_mae(imputed_X, DATA["test_X_ori"], DATA["test_X_indicating_mask"])
         logger.info(f"SAITS test_MAE: {test_MAE}")