baler-collaboration · neogyk · Oct 1, 2024
diff --git a/example/run.py b/example/run.py
@@ -30,6 +30,7 @@ def define_config():
     # Define config
     config.input_path = "input/exafel_1.npz"
     config.output_path = "output/"
+    config.experiment_name = "exafel"
 
     config = config_module.Config
     config.compression_ratio = 1000

diff --git a/requirements.txt b/requirements.txt
@@ -9,3 +9,4 @@ scipy==1.10.1
 setuptools==50.3.1
 torch==2.1.0
 tqdm==4.66.1
+mlflow
diff --git a/src/baler_compressor/trainer.py b/src/baler_compressor/trainer.py
@@ -8,6 +8,7 @@
 from torch.utils.data import DataLoader
 from tqdm.autonotebook import tqdm
 import math
+import mlflow
 
 import baler_compressor.helper as helper
 import baler_compressor.utils as utils
@@ -46,6 +47,17 @@ def run(data_path, config):
         verbose,
     )
 
+
+    experiment_name = config.experiment_name
+    # makedirs(exist_ok=True) creates missing parents and tolerates an existing directory
+    os.makedirs(config.output_path, exist_ok=True)
+
+    mlflow.set_tracking_uri(f"sqlite:///{os.path.abspath(config.output_path)}/mlruns.db")
+    tracking_uri = mlflow.get_tracking_uri()
+
+    print(f"Current tracking uri: {tracking_uri}")
+    mlflow.set_experiment(experiment_name)
+
     if verbose:
         print("Training and testing sets normalized")
 
@@ -109,15 +121,18 @@ def run(data_path, config):
     # if verbose:
     #    print(f"Training path: {training_path}")
 
-    trained_model, loss_data = train(
-        model,
-        number_of_columns,
-        train_set_norm,
-        test_set_norm,
-        # training_path,
-        config,
-    )
-
+    with mlflow.start_run():  # the context manager ends the run on exit
+        for key, value in vars(config).items():
+            mlflow.log_param(key, value)
+        trained_model, loss_data = train(
+            model,
+            number_of_columns,
+            train_set_norm,
+            test_set_norm,
+            # training_path,
+            config,
+        )
+
     if verbose:
         print("Training complete")
 
@@ -421,6 +436,11 @@ def train(model, variables, train_data, test_data, config):
         )
         train_loss.append(train_epoch_loss)
 
+        mlflow.log_metric("Train Loss", train_epoch_loss, step=epoch)
+        mlflow.log_metric("Train Loss MSE", mse_loss_fit, step=epoch)
+        # Pass step=epoch: without it MLflow records every learning-rate value at step 0.
+        mlflow.log_metric("Learning Rate", lr_scheduler.lr_scheduler.get_last_lr()[0], step=epoch)
+        mlflow.log_metric("Train Regularized Loss", regularizer_loss_fit, step=epoch)
         if test_size:
             val_epoch_loss = validate(
                 model=trained_model,
@@ -439,7 +459,8 @@ def train(model, variables, train_data, test_data, config):
             early_stopping(val_epoch_loss)
             if early_stopping.early_stop:
                 break
-
+
+        mlflow.log_metric("Test Loss", val_epoch_loss, step=epoch)
         ### Implementation to save models & values after every N epochs, where N is stored in 'intermittent_saving_patience':
         # if intermittent_model_saving:
         #    if epoch % intermittent_saving_patience == 0: