solving merge conflicts
mathysgrapotte committed Sep 17, 2024
2 parents 0d42e14 + 901b068 commit c6e5990
Showing 58 changed files with 2,150 additions and 4 deletions.
17 changes: 17 additions & 0 deletions .gitignore
@@ -3,6 +3,9 @@ __pycache__/
*.py[cod]
*$py.class

# Ignore all __pycache__ directories, including those in subdirectories
**/__pycache__/

# C extensions
*.so

@@ -25,6 +28,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
test_env_stimulus/

# PyInstaller
# Usually these files are written by a python script from a template
@@ -160,3 +164,16 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# VS Code
.vscode/
*.code-workspace

# VS Code Local History extension
.history/

# VS Code Settings Sync extension
.vscode-sync.json

# VS Code Python extension
.vscode-python/
4 changes: 2 additions & 2 deletions additional_tests/test_launch_interpret_json.py
@@ -1,8 +1,8 @@
import unittest
-from bin.launch_interpret_json import interpret_json
+from src.stimulus.cli.interpret_json import interpret_json

"""
-to run this test you need to put a relatibve inmport in the JsonSchema import line in launch_interpret_json.py.
+to run this test you need to put a relative import in the JsonSchema import line in launch_interpret_json.py.
To explain this further launch_interpret_json.py is meant to be launched as it is:
python3 launch_interpret_json.py
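As a hedged illustration of the docstring's note (the module paths here are assumptions inferred from the package layout in this commit, not taken from launch_interpret_json.py itself), the two import styles it contrasts look like this:

# Absolute import: works when the file is launched directly as a script,
# e.g. `python3 launch_interpret_json.py` from the repository root.
from src.stimulus.utils.json_schema import JsonSchema

# Relative import: resolves only when the module is imported as part of
# the package, which is what the unit test above requires.
from ..utils.json_schema import JsonSchema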
7 changes: 7 additions & 0 deletions pyproject.toml
@@ -22,6 +22,13 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]

[project.scripts]
stimulus-shuffle-csv = "stimulus.cli.shuffle_csv:main"
stimulus-transform-csv = "stimulus.cli.transform_csv:main"
stimulus-split-csv = "stimulus.cli.split_csv:main"
stimulus-analysis-default = "stimulus.cli.analysis_default:main"
stimulus-predict = "stimulus.cli.predict:main"

[project.urls]
Homepage = "https://github.com/mathysgrapotte/stimulus-py"
Issues = "https://github.com/mathysgrapotte/stimulus-py/issues"
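Each [project.scripts] entry maps a console command to a module-level main() function. After installation, running a command such as stimulus-predict is roughly equivalent to this sketch:

# rough equivalent of the generated `stimulus-predict` launcher:
# import the declared module and call the named function
from stimulus.cli.predict import main

if __name__ == "__main__":
    main()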
Binary file removed src/data/__pycache__/__init__.cpython-311.pyc
Binary file not shown.
Binary file removed src/data/__pycache__/__init__.cpython-38.pyc
Binary file not shown.
Binary file removed src/data/__pycache__/csv.cpython-311.pyc
Binary file not shown.
Binary file removed src/data/__pycache__/csv_parser.cpython-311.pyc
Binary file not shown.
Binary file removed src/data/__pycache__/experiments.cpython-311.pyc
Binary file not shown.
Binary file removed src/data/__pycache__/experiments.cpython-38.pyc
Binary file not shown.
Binary file removed src/data/__pycache__/handlertorch.cpython-311.pyc
Binary file not shown.
(seven additional removed binary files; paths not shown)
Binary file removed src/learner/__pycache__/predict.cpython-311.pyc
Binary file not shown.
(two additional removed binary files; paths not shown)
19 changes: 19 additions & 0 deletions src/stimulus/__init__.py
@@ -0,0 +1,19 @@
from .utils.launch_utils import get_experiment, import_class_from_file, memory_split_for_ray_init
from .data.csv import CsvProcessing
from .learner.raytune_learner import TuneWrapper
from .learner.raytune_parser import TuneParser
from .data.handlertorch import TorchDataset
from .learner.predict import PredictWrapper
from .utils.json_schema import JsonSchema

__all__ = [
    'get_experiment',
    'import_class_from_file',
    'memory_split_for_ray_init',
    'CsvProcessing',
    'TuneWrapper',
    'TuneParser',
    'TorchDataset',
    'PredictWrapper',
    'JsonSchema',
]
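With these re-exports, downstream code can import the public API from the package root instead of individual submodules. A minimal sketch (the experiment name and csv path are hypothetical placeholders; the call signatures follow the CLI code later in this diff):

# package-root imports enabled by the new __init__.py
from stimulus import CsvProcessing, get_experiment

# resolve an experiment class by name, then load a csv through it
experiment = get_experiment("MyExperiment")  # hypothetical experiment name
csv_obj = CsvProcessing(experiment, "data/example.csv")  # hypothetical path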
@@ -6,8 +6,8 @@
from matplotlib import pyplot as plt
from typing import Any, Tuple
from torch.utils.data import DataLoader
-from src.data.handlertorch import TorchDataset
-from src.learner.predict import PredictWrapper
+from src.stimulus.data.handlertorch import TorchDataset
+from src.stimulus.learner.predict import PredictWrapper

class Analysis:
"""
137 changes: 137 additions & 0 deletions src/stimulus/cli/analysis_default.py
@@ -0,0 +1,137 @@
#!/usr/bin/env python3

import argparse
import json
import os
import pandas as pd
import torch

from stimulus.utils.launch_utils import import_class_from_file, get_experiment
from stimulus.analysis.analysis_default import AnalysisPerformanceTune, AnalysisRobustness

def get_args():
    """Get the arguments when used from the command line."""
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("-m", "--model", type=str, required=True, metavar="FILE", help="The model .py file")
    parser.add_argument("-w", "--weight", type=str, required=True, nargs="+", metavar="FILE", help="Model weights .pt file")
    parser.add_argument("-me", "--metrics", type=str, required=True, nargs="+", metavar="FILE", help="The file path for the metrics file obtained during tuning")
    parser.add_argument("-ec", "--experiment_config", type=str, required=True, nargs="+", metavar="FILE", help="The experiment config used to modify the data.")
    parser.add_argument("-mc", "--model_config", type=str, required=True, nargs="+", metavar="FILE", help="The tune config file.")
    parser.add_argument("-d", "--data", type=str, required=True, nargs="+", metavar="FILE", help="List of data files to be used for the analysis.")
    # parser.add_argument("-o", "--output", type=str, required=True, metavar="FILE", help="output report")
    parser.add_argument("-o", "--outdir", type=str, required=True, help="output directory")

    args = parser.parse_args()
    return args

def main(model_path: str, weight_list: list, mconfig_list: list, metrics_list: list, econfig_list: list, data_list: list, outdir: str):

    metrics = ["rocauc", "prauc", "mcc", "f1score", "precision", "recall"]

    # plot the performance during tuning/training
    run_analysis_performance_tune(
        metrics_list,
        metrics + ["loss"],
        os.path.join(outdir, "performance_tune_train")
    )

    # run the robustness analysis:
    # first predict the output of each model on each test dataset,
    # then report metrics that evaluate each model's robustness
    run_analysis_performance_model(
        metrics,
        model_path,
        weight_list,
        mconfig_list,
        econfig_list,
        data_list,
        os.path.join(outdir, "performance_robustness")
    )

def run_analysis_performance_tune(metrics_list: list, metrics: list, outdir: str):
    """
    Each model has a metrics file obtained during tuning/training;
    read the performance recorded there and plot it.
    This tracks the model performance per training iteration.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    for metrics_path in metrics_list:
        AnalysisPerformanceTune(metrics_path).plot_metric_vs_iteration(
            metrics=metrics,
            output=os.path.join(outdir, metrics_path.replace("-metrics.csv", "") + "-metric_vs_iteration.png")
        )

def run_analysis_performance_model(metrics: list, model_path: str, weight_list: list, mconfig_list: list, econfig_list: list, data_list: list, outdir: str):
    """
    Report on model robustness: compute the predictions of each model
    on each dataset, parse that information, and generate plots that
    summarize how robust each model is.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # load all the model weights into a list
    model_names = []
    model_list = []
    model_class = import_class_from_file(model_path)
    for weight_path, mconfig_path in zip(weight_list, mconfig_list):
        model = load_model(model_class, weight_path, mconfig_path)
        model_names.append(mconfig_path.split("/")[-1].replace("-config.json", ""))
        model_list.append(model)

    # read the experiment config, retrieve the experiment name, and initialize the experiment class
    experiment_name = None
    with open(econfig_list[0], 'r') as in_json:
        d = json.load(in_json)
        experiment_name = d["experiment"]
    initialized_experiment_class = get_experiment(experiment_name)

    # initialize the analysis
    # TODO: the batch size for the prediction forward pass is hard-coded for now;
    # it could be made dynamic in the future (e.g. depending on the dataset size)
    analysis = AnalysisRobustness(metrics, initialized_experiment_class, batch_size=256)

    # compute the performance of each model on each dataset
    df = analysis.get_performance_table(model_names, model_list, data_list)
    df.to_csv(os.path.join(outdir, "performance_table.csv"), index=False)

    # get the average performance of each model across datasets
    tmp = analysis.get_average_performance_table(df)
    tmp.to_csv(os.path.join(outdir, "average_performance_table.csv"), index=False)

    # plot heatmap: models as rows and datasets as columns
    analysis.plot_performance_heatmap(df, output=os.path.join(outdir, "performance_heatmap.png"))

    # plot barplot: each model's delta performance between each dataset and the reference dataset
    outdir2 = os.path.join(outdir, "delta_performance_vs_data")
    if not os.path.exists(outdir2):
        os.makedirs(outdir2)
    for metric in metrics:
        analysis.plot_delta_performance(metric, df, output=os.path.join(outdir2, "delta_performance_" + metric + ".png"))

    # TODO: add more analyses as needed

def load_model(model_class: object, weight_path: str, mconfig_path: str) -> object:
    """
    Load the model with its config and weights.
    """
    # load the model config
    with open(mconfig_path, 'r') as in_json:
        mconfig = json.load(in_json)["model_params"]

    # load the model
    model = model_class(**mconfig)
    model.load_state_dict(torch.load(weight_path))

    return model

if __name__ == "__main__":
    args = get_args()
    main(args.model, args.weight, args.model_config, args.metrics, args.experiment_config, args.data, args.outdir)
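For reference, a sketch of how this script maps onto the stimulus-analysis-default entry point declared in pyproject.toml; every file path below is a hypothetical placeholder:

# equivalent to: stimulus-analysis-default -m model.py -w best.pt \
#   -mc best-config.json -me run-metrics.csv -ec experiment.json \
#   -d test.csv -o results/
from stimulus.cli.analysis_default import main

main(
    "model.py",            # --model: the model .py file
    ["best.pt"],           # --weight: model weights
    ["best-config.json"],  # --model_config: tune config(s)
    ["run-metrics.csv"],   # --metrics: metrics files from tuning
    ["experiment.json"],   # --experiment_config
    ["test.csv"],          # --data: datasets for the analysis
    "results/",            # --outdir
)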
152 changes: 152 additions & 0 deletions src/stimulus/cli/check_model.py
@@ -0,0 +1,152 @@
#!/usr/bin/env python3

import argparse
import os
import json
import yaml

from stimulus.utils.launch_utils import import_class_from_file, get_experiment, memory_split_for_ray_init
from stimulus.utils.json_schema import JsonSchema
from stimulus.learner.raytune_learner import TuneWrapper as StimulusTuneWrapper
from stimulus.data.csv import CsvProcessing


def get_args():
    """Get the arguments when used from the command line."""
    parser = argparse.ArgumentParser(description="Launch check_model")
    parser.add_argument("-d", "--data", type=str, required=True, metavar="FILE", help="Path to input csv file")
    parser.add_argument("-m", "--model", type=str, required=True, metavar="FILE", help="Path to model file")
    parser.add_argument("-e", "--experiment", type=str, required=True, metavar="FILE", help="Experiment config file. From this the experiment class name is extracted.")
    parser.add_argument("-c", "--config", type=str, required=True, metavar="FILE", help="Path to yaml config training file")
    parser.add_argument("-w", "--initial_weights", type=str, required=False, nargs='?', const=None, default=None, metavar="FILE", help="Path to the initial weights. The model can use these instead of random initialization.")
    parser.add_argument("--gpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_GPU", help="Limit the number of GPUs ray can use. Useful in many situations, especially on cluster systems. The default is None, meaning ray will use all available GPUs. Set to 0 to use only CPUs.")
    parser.add_argument("--cpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_CPU", help="Limit the number of CPUs ray can use. Useful in many situations, especially on cluster systems. The default is None, meaning ray will use all available CPUs. Set to 0 to use only GPUs.")
    parser.add_argument("--memory", type=str, required=False, nargs='?', const=None, default=None, metavar="MAX_MEMORY", help="Limit on the total memory ray can use. Useful in many situations, especially on cluster systems. The default is None, meaning ray will use all available memory.")
    parser.add_argument("-n", "--num_samples", type=int, required=False, nargs='?', const=3, default=3, metavar="NUM_SAMPLES", help="Overwrites the tune.tune_params.num_samples field of the given tune config, controlling how extensively the possible combinations of tuning choices are sampled. For each run inside tune, a snapshot of the config is taken and some params are chosen (e.g. loss function, optimizer, batch size); some combinations may not be compatible with the data or the model, so the higher this value, the more likely every value of a given param is tested. If the tune config does not offer many choices, there is no point in a high value. Default is 3.")
    parser.add_argument("--ray_results_dirpath", type=str, required=False, nargs='?', const=None, default=None, metavar="DIR_PATH", help="Location where the ray_results output dir should be written. If None (default), ray will place it in ~/ray_results.")
    parser.add_argument("--debug_mode", type=str, required=False, nargs='?', const=False, default=False, metavar="DEV", help="Activate debug mode for tuning. Default is False (no debug).")

    args = parser.parse_args()
    return args



def main(data_path: str,
         model_path: str,
         experiment_config: str,
         config_path: str,
         initial_weights_path: str = None,
         gpus: int = None,
         cpus: int = None,
         memory: str = None,
         num_samples: int = 3,
         ray_results_dirpath: str = None,
         _debug_mode: str = False) -> None:

    # TODO: update the experiment config to yaml
    # load the json into a dictionary
    exp_config = {}
    with open(experiment_config, 'r') as in_json:
        exp_config = json.load(in_json)

    # Initialize the json schema; it checks the correctness of the json structure and its fields/values, and already raises errors on its own.
    schema = JsonSchema(exp_config)

    # initialize the experiment class
    initialized_experiment_class = get_experiment(schema.experiment)

    # import the model class, but do not initialize it yet; ray_tune does that itself
    model_class = import_class_from_file(model_path)

    # Update the tune config file: there is no need to run the full tuning that was asked for, so downsample it.
    updated_tune_conf = "check_model_modified_tune_config.yaml"
    with open(config_path, 'r') as conf_file, open(updated_tune_conf, "w") as new_conf:
        user_tune_config = yaml.safe_load(conf_file)
        # make the tune run just once per num_samples
        user_tune_config["tune"]["tune_params"]["num_samples"] = num_samples
        # differentiate between schedulers, since they may take different params:
        # for ASHA, set all parameters controlling the length of the tune run to 1
        if user_tune_config["tune"]["scheduler"]["name"] == "ASHAScheduler":
            user_tune_config["tune"]["scheduler"]["params"]["max_t"] = 1
            user_tune_config["tune"]["scheduler"]["params"]["grace_period"] = 1
            user_tune_config["tune"]["step_size"] = 1
        # the FIFO scheduler is simpler: just set the stop criterion to 1 iteration
        elif user_tune_config["tune"]["scheduler"]["name"] == "FIFOScheduler":
            user_tune_config["tune"]["run_params"]["stop"]["training_iteration"] = 1

        # add initial weights to the config, when provided
        if initial_weights_path is not None:
            user_tune_config["model_params"]["initial_weights"] = os.path.abspath(initial_weights_path)

        # TODO: future scheduler-specific settings will go here as well; maybe find a cleaner way.

        # TODO: check whether, among the first two values of all splitter params, there is a percentage that makes the resulting split smaller than the biggest batch value

        # save the new dictionary to file, because StimulusTuneWrapper only takes paths
        yaml.dump(user_tune_config, new_conf)

    # initialize the csv processing class; it opens and reads the csv automatically
    csv_obj = CsvProcessing(initialized_experiment_class, data_path)
    downsampled_csv = "downsampled.csv"

    # TODO: downsample the data, taking care of the batch size; this is a good point to tell the user if the batch size is too big

    # add the split column if not present
    if "split" not in csv_obj.check_and_get_categories():
        # split values are set to half the given data so that the downsampled file has as few total lines as possible
        config_default = {"name": "RandomSplitter", "params": {"split": [0.5, 0.5, 0.0]}}
        csv_obj.add_split(config_default)

    # save the modified csv
    csv_obj.save(downsampled_csv)

    # Compute the memory requirements for ray init. Useful in case ray detects them wrongly.
    # Ray splits memory in two: object-store memory and the actual memory for tuning.
    # The following function takes the total usable/allocated memory as a string and returns,
    # in bytes, the values for object-store memory (30% by default in ray) and memory (70%).
    object_store_mem, mem = memory_split_for_ray_init(memory)

    # set the ray_results dir location. TODO: this version of pytorch does not support relative paths; in the future it may be good to remove abspath.
    ray_results_dirpath = None if ray_results_dirpath is None else os.path.abspath(ray_results_dirpath)

    # create the learner
    learner = StimulusTuneWrapper(updated_tune_conf,
                                  model_class,
                                  downsampled_csv,
                                  initialized_experiment_class,
                                  max_gpus=gpus,
                                  max_cpus=cpus,
                                  max_object_store_mem=object_store_mem,
                                  max_mem=mem,
                                  ray_results_dir=ray_results_dirpath,
                                  _debug=_debug_mode)

    # tune the model and get the tuning results
    grid_results = learner.tune()

    # check that there were no errors during tuning: Tune still exits with code 0 even on internal errors
    for result in grid_results:
        if not result.error:
            print(f"Trial finished successfully with metrics {result.metrics}.")
        else:
            raise RuntimeError(f"Trial failed with error {result.error}.")




if __name__ == "__main__":
    args = get_args()
    main(args.data,
         args.model,
         args.experiment,
         args.config,
         args.initial_weights,
         args.gpus,
         args.cpus,
         args.memory,
         args.num_samples,
         args.ray_results_dirpath,
         args.debug_mode)
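The memory_split_for_ray_init helper used above is described only in a comment. The following is a minimal sketch of what such a function might look like, assuming a human-readable size string such as "8GB" and ray's default 30/70 split between object-store memory and tuning memory; this is an illustration, not the actual stimulus implementation:

def memory_split_for_ray_init_sketch(total_memory):
    """Illustrative only: split a total-memory string into
    (object_store_memory, memory) in bytes, 30% / 70%."""
    if total_memory is None:
        # no limit given: let ray auto-detect memory
        return None, None
    units = {"KB": 1024, "MB": 1024 ** 2, "GB": 1024 ** 3}
    for unit, factor in units.items():
        if total_memory.upper().endswith(unit):
            total_bytes = float(total_memory[:-len(unit)]) * factor
            break
    else:
        total_bytes = float(total_memory)  # assume the value is raw bytes
    return int(total_bytes * 0.3), int(total_bytes * 0.7)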
(remaining changed files not shown)
