From 5919fe3f576008d1ae62348d6c2fe327d85f0c43 Mon Sep 17 00:00:00 2001 From: kotaNakm Date: Sat, 15 Apr 2023 00:37:58 +0900 Subject: [PATCH] a --- datasets/__init__.py | 2 +- datasets/alpi/__init__.py | 400 +++++++++++++++++++++++------------- datasets/alpi/dataset.py | 398 ++++++++++++++++++++++------------- datasets/cbm/__init__.py | 28 +-- datasets/cmapss/__init__.py | 169 +++++++++------ datasets/gdd/__init__.py | 250 ++++++++++++---------- datasets/gfd/__init__.py | 67 +++--- datasets/hydsys/__init__.py | 66 +++--- datasets/mapm/__init__.py | 161 +++++++++------ datasets/ppd/__init__.py | 91 ++++---- datasets/ufd/__init__.py | 64 +++--- datasets/utils.py | 18 +- 12 files changed, 1031 insertions(+), 683 deletions(-) diff --git a/datasets/__init__.py b/datasets/__init__.py index a43e300..6d7b9ca 100644 --- a/datasets/__init__.py +++ b/datasets/__init__.py @@ -7,4 +7,4 @@ from . import mapm from . import ppd from . import ufd -from . import utils \ No newline at end of file +from . import utils diff --git a/datasets/alpi/__init__.py b/datasets/alpi/__init__.py index f81cd72..15004ec 100644 --- a/datasets/alpi/__init__.py +++ b/datasets/alpi/__init__.py @@ -11,12 +11,12 @@ from scipy import sparse from sklearn.model_selection import train_test_split -FAMILY = 'MACHINE_TYPE_00' +FAMILY = "MACHINE_TYPE_00" # UTILS def zip_dir(dir_path, zip_path): - shutil.make_archive(zip_path, 'zip', dir_path) + shutil.make_archive(zip_path, "zip", dir_path) def drop_files(dir_path, extension): @@ -27,7 +27,9 @@ def drop_files(dir_path, extension): def prune(alarms): - return alarms.loc[alarms.alarm.shift() != alarms.alarm].alarm.values.astype('uint16') + return alarms.loc[alarms.alarm.shift() != alarms.alarm].alarm.values.astype( + "uint16" + ) def prune_series(seq): @@ -38,24 +40,24 @@ def prune_series(seq): def padding_sequence(seq, sequence_length): new_seq = np.zeros(sequence_length) - new_seq[sequence_length - len(seq):] = seq + new_seq[sequence_length - len(seq) :] = seq return new_seq def return_variables(params): - window_input = params['window_input'] - window_output = params['window_output'] - offset = params['offset'] - verbose = params['verbose'] - store_path = params['store_path'] # store_path is used to save data - min_count = params['min_count'] - sigma = params['sigma'] + window_input = params["window_input"] + window_output = params["window_output"] + offset = params["offset"] + verbose = params["verbose"] + store_path = params["store_path"] # store_path is used to save data + min_count = params["min_count"] + sigma = params["sigma"] return window_input, window_output, offset, verbose, store_path, min_count, sigma def find_serials_offsets(store_path): filepath = store_path + "/" + store_path.split("/")[-1] + ".config" - with open(filepath, 'rb') as f: + with open(filepath, "rb") as f: serials, offsets = pickle.load(f) return serials, offsets @@ -78,31 +80,40 @@ def return_index(date_range, target): def create_params_list(data_path, params, verbose=True): - windows_input = params['windows_input'] - windows_output = params['windows_output'] - min_counts = params['min_counts'] - sigmas = params['sigmas'] - offsets = params['offsets'] + windows_input = params["windows_input"] + windows_output = params["windows_output"] + min_counts = params["min_counts"] + sigmas = params["sigmas"] + offsets = params["offsets"] params_list = [] - dir_template = data_path + '/' + FAMILY + \ - 
'_alarms_window_input_{window_input}_window_output_{window_output}_offset_{offset}_min_count_{min_count}_sigma_{sigma}' + dir_template = ( + data_path + + "/" + + FAMILY + + "_alarms_window_input_{window_input}_window_output_{window_output}_offset_{offset}_min_count_{min_count}_sigma_{sigma}" + ) for window_input in windows_input: for window_output in windows_output: for min_count in min_counts: for sigma in sigmas: for offset in offsets: store_path = dir_template.format( - window_input=window_input, window_output=window_output, offset=offset, min_count=min_count, sigma=sigma) + window_input=window_input, + window_output=window_output, + offset=offset, + min_count=min_count, + sigma=sigma, + ) if not os.path.isdir(store_path): os.makedirs(store_path) params = { - 'window_input': window_input, - 'window_output': window_output, - 'offset': offset, - 'min_count': min_count, - 'sigma': sigma, - 'verbose': verbose, - 'store_path': store_path + "window_input": window_input, + "window_output": window_output, + "offset": offset, + "min_count": min_count, + "sigma": sigma, + "verbose": verbose, + "store_path": store_path, } params_list.append(params) return params_list @@ -114,8 +125,7 @@ def create_params_list(data_path, params, verbose=True): def generate_dataset_by_serial_offset(data, params, current_offset): data["current_offset"] = current_offset current_offset = datetime.timedelta(minutes=current_offset) - window_input, window_output, _, _, _, _, _ = return_variables( - params) + window_input, window_output, _, _, _, _, _ = return_variables(params) min_timestamp = data.index.min() min_timestamp += current_offset @@ -123,24 +133,28 @@ def generate_dataset_by_serial_offset(data, params, current_offset): # create date range for input date_range = pd.date_range( - start=min_timestamp, end=max_timestamp, freq=str(window_input) + "min") + start=min_timestamp, end=max_timestamp, freq=str(window_input) + "min" + ) date_range = [d for d in date_range] # create date range for output delta_in = datetime.timedelta(minutes=window_input) delta_out = datetime.timedelta(minutes=window_output) - date_range_output = [(d + delta_in, d + delta_in + delta_out) - for d in date_range] # ranges of outputs + date_range_output = [ + (d + delta_in, d + delta_in + delta_out) for d in date_range + ] # ranges of outputs date_range_output = np.asarray(date_range_output).reshape(1, -1)[0] # create samples series = pd.Series(data.index).apply( - lambda target: return_index(date_range, target)) + lambda target: return_index(date_range, target) + ) series.index = data.index data["bin_input"] = series series = pd.Series(data.index).apply( - lambda target: return_index_output(date_range_output, target)) + lambda target: return_index_output(date_range_output, target) + ) series.index = data.index data["bin_output"] = series @@ -149,14 +163,14 @@ def generate_dataset_by_serial_offset(data, params, current_offset): unique2 = data["bin_output"].unique() periods_id = list(set(unique1).intersection(set(unique2)) - set([-1])) - data = data[(data["bin_input"].isin(periods_id)) | - (data["bin_output"].isin(periods_id))] + data = data[ + (data["bin_input"].isin(periods_id)) | (data["bin_output"].isin(periods_id)) + ] return data def generate_dataset_by_serial(data, params): - window_input, _, offset, verbose, store_path, _, _ = return_variables( - params) + window_input, _, offset, verbose, store_path, _, _ = return_variables(params) serial = data["serial"][0] if verbose: print("serial: ", serial) @@ -164,9 +178,9 @@ def 
generate_dataset_by_serial(data, params): offsets = list(range(0, window_input, offset)) offset_data = [] for current_offset in offsets: - df_offset = generate_dataset_by_serial_offset(data.copy(), - params, - current_offset) + df_offset = generate_dataset_by_serial_offset( + data.copy(), params, current_offset + ) offset_data.append(df_offset) dataset = pd.concat(offset_data) if verbose: @@ -176,7 +190,7 @@ def generate_dataset_by_serial(data, params): def generate_dataset(data, params): - grouped_data = data.groupby('serial') + grouped_data = data.groupby("serial") for _, data_serial in grouped_data: generate_dataset_by_serial(data_serial, params) return @@ -184,6 +198,7 @@ def generate_dataset(data, params): # PHASE 2 + def prune_dataset(params): _, _, _, _, store_path, _, _ = return_variables(params) serials = [] @@ -198,8 +213,7 @@ def prune_dataset(params): def prune_df(df, serial_id, params): - _, _, offset, verbose, store_path, min_count, _ = return_variables( - params) + _, _, offset, verbose, store_path, min_count, _ = return_variables(params) offsets = df["current_offset"].unique() for offset in offsets: @@ -215,27 +229,37 @@ def prune_df(df, serial_id, params): periods_id = list(set(unique1).intersection(set(unique2)) - set([-1])) periods_id.sort() # temporal sorting - diz_input = {bin_id: group.alarm.sort_index( - ).values for bin_id, group in groups_input if bin_id in periods_id} - diz_output = {bin_id: group.alarm.sort_index( - ).values for bin_id, group in groups_output if bin_id in periods_id} + diz_input = { + bin_id: group.alarm.sort_index().values + for bin_id, group in groups_input + if bin_id in periods_id + } + diz_output = { + bin_id: group.alarm.sort_index().values + for bin_id, group in groups_output + if bin_id in periods_id + } # list of [seq_x,seq_y] - X_Y_offset = [[diz_input[bin_id], diz_output[bin_id]] - for bin_id in periods_id] + X_Y_offset = [[diz_input[bin_id], diz_output[bin_id]] for bin_id in periods_id] # apply min_count: remove sequences with count= min_count and len(seq[1]) >= min_count] + X_Y_offset = [ + seq + for seq in X_Y_offset + if len(seq[0]) >= min_count and len(seq[1]) >= min_count + ] # save list on file # for each (serial,offset) a different list is saved if verbose: - print("{} has length: {}".format(str(serial_id) + - "_offset_" + str(offset) + ".npz", len(X_Y_offset))) - filepath = store_path + "/" + \ - str(serial_id) + "_offset_" + str(offset) + ".npz" - with open(filepath, 'wb') as f: + print( + "{} has length: {}".format( + str(serial_id) + "_offset_" + str(offset) + ".npz", len(X_Y_offset) + ) + ) + filepath = store_path + "/" + str(serial_id) + "_offset_" + str(offset) + ".npz" + with open(filepath, "wb") as f: pickle.dump(X_Y_offset, f) return offsets @@ -243,9 +267,18 @@ def prune_df(df, serial_id, params): # PHASE 3 -def create_final_dataset(params, serials, offsets, sequence_input_length, sequence_output_length, removal_alarms=None, relevance_alarms=None, file_tag='all_alarms'): - _, _, offset, verbose, store_path, _, sigma = return_variables( - params) + +def create_final_dataset( + params, + serials, + offsets, + sequence_input_length, + sequence_output_length, + removal_alarms=None, + relevance_alarms=None, + file_tag="all_alarms", +): + _, _, offset, verbose, store_path, _, sigma = return_variables(params) padding_mode = "after" if "padding_mode" in params: padding_mode = params["padding_mode"] @@ -266,24 +299,37 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen for offset in offsets: 
sentinel += 1 if verbose: - print("{}/{}: serial: {} offset: {}".format(sentinel, - tot_combo, - serial, - offset)) + print( + "{}/{}: serial: {} offset: {}".format( + sentinel, tot_combo, serial, offset + ) + ) filename = str(serial) + "_offset_" + str(offset) + ".npz" if filename in os.listdir(store_path): file_path = os.path.join(store_path, filename) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: seqences = pickle.load(f) if removal_alarms != None and len(removal_alarms) > 0: - X = [[alarm for alarm in seq_x if ( - alarm not in removal_alarms and alarm != 0)] for seq_x, seq_y in seqences] + X = [ + [ + alarm + for alarm in seq_x + if (alarm not in removal_alarms and alarm != 0) + ] + for seq_x, seq_y in seqences + ] else: X = [seq_x for seq_x, seq_y in seqences] if relevance_alarms != None and len(relevance_alarms) > 0: - Y = [[alarm for alarm in seq_y if ( - alarm in relevance_alarms and alarm != 0)] for seq_x, seq_y in seqences] + Y = [ + [ + alarm + for alarm in seq_y + if (alarm in relevance_alarms and alarm != 0) + ] + for seq_x, seq_y in seqences + ] else: Y = [seq_y for seq_x, seq_y in seqences] @@ -301,8 +347,9 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen lengths_Y = np.asarray(lengths_Y) mu_sequence_input_length = lengths_X.mean() std_sequence_input_length = lengths_X.std() - sequence_input_length = int( - mu_sequence_input_length + sigma * std_sequence_input_length) + 1 + sequence_input_length = ( + int(mu_sequence_input_length + sigma * std_sequence_input_length) + 1 + ) sequence_output_length = np.max(lengths_Y) sentinel = 0 # it is iterated on all combinations of serials and offsets @@ -314,28 +361,29 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen # PADDING if padding_mode == "after": - X = [np.pad(seq_x, (0, sequence_input_length - len(seq_x))) - for seq_x in X] - Y = [np.pad(seq_y, (0, sequence_output_length - len(seq_y))) - for seq_y in Y] + X = [np.pad(seq_x, (0, sequence_input_length - len(seq_x))) for seq_x in X] + Y = [np.pad(seq_y, (0, sequence_output_length - len(seq_y))) for seq_y in Y] elif padding_mode == "before": X = [padding_sequence(seq_x, sequence_input_length) for seq_x in X] - Y = [padding_sequence(seq_y, sequence_output_length) - for seq_y in Y] + Y = [padding_sequence(seq_y, sequence_output_length) for seq_y in Y] if len(X) > 1: X_train, X_test, Y_train, Y_test = train_test_split( - X, Y, test_size=0.3, shuffle=False) + X, Y, test_size=0.3, shuffle=False + ) else: X_train, X_test, Y_train, Y_test = [], [], [], [] - stratify.append({ - "serial": combos[sentinel][0], - "offset": combos[sentinel][1], - "X_train": len(X_train), - "X_test": len(X_test), - "Y_train": len(Y_train), - "Y_test": len(Y_test)}) + stratify.append( + { + "serial": combos[sentinel][0], + "offset": combos[sentinel][1], + "X_train": len(X_train), + "X_test": len(X_test), + "Y_train": len(Y_train), + "Y_test": len(Y_test), + } + ) cont += len(X_train) + len(X_test) X_train_tot += X_train @@ -366,10 +414,19 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen stratify = stratify_new # Sparse Matrix - X_train, X_test, Y_train, Y_test = sparse.csr_matrix(X_train), sparse.csr_matrix( - X_test), sparse.csr_matrix(Y_train), sparse.csr_matrix(Y_test) - - x_train_segments, x_test_segments, y_train_segments, y_test_segments = X_train, X_test, Y_train, Y_test + X_train, X_test, Y_train, Y_test = ( + sparse.csr_matrix(X_train), + sparse.csr_matrix(X_test), + 
sparse.csr_matrix(Y_train), + sparse.csr_matrix(Y_test), + ) + + x_train_segments, x_test_segments, y_train_segments, y_test_segments = ( + X_train, + X_test, + Y_train, + Y_test, + ) if verbose: print(x_train_segments.shape) @@ -378,19 +435,27 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen print(y_test_segments.shape) file_path = os.path.join(store_path, f"{file_tag}.pickle") - with open(file_path, 'wb') as f: - pickle.dump([x_train_segments, x_test_segments, - y_train_segments, y_test_segments, stratify], f) - - -def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): + with open(file_path, "wb") as f: + pickle.dump( + [ + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ], + f, + ) + + +def create_datasets(data, params_list, start_point=0, file_tag="all_alarms"): """ Parameters: data : csv file of raw data - params_list: list of dictionaries i.e. params + params_list: list of dictionaries i.e. params each params is a dictionary with all parameters necessary to create the dataset such as window_input, window_output, sigma, min_count,ecc. - start_point: it tells from which phase to start to create dataset(i.e. if start_point=2 then it is assumed that phase1 is executed) + start_point: it tells from which phase to start to create dataset(i.e. if start_point=2 then it is assumed that phase1 is executed) file_tag: name of dataset There are 3 phases(start_point:1,2,3): @@ -399,7 +464,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): each file contains a list of x,y with pruning of consecutive alarms -PHASE3: some alarms in input are removed based on list associated with key 'removal_alarms' to variable params. only a subset of alarms in output is keeped based on a list associated with key 'relevance_alarms' to variable params. 
- Padding is performed based on values defined by keys 'sequence_input_length' and 'sequence_output_length' in params + Padding is performed based on values defined by keys 'sequence_input_length' and 'sequence_output_length' in params Example: @@ -415,7 +480,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): #relevance_alarms = [] <- only if you want to keep a subset of alrams in output #additional parameters for phase3, comment the section if you don't want to use it - for params in params_list: + for params in params_list: params["removal_alarms"] = [] params["relevance_alarms"] = [] @@ -424,7 +489,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): """ for params in params_list: - print('-- run ', params) + print("-- run ", params) verbose = params["verbose"] if verbose: print(params) @@ -447,7 +512,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): # save serials and offsets store_path = params["store_path"] filepath = store_path + "/" + store_path.split("/")[-1] + ".config" - with open(filepath, 'wb') as f: + with open(filepath, "wb") as f: pickle.dump((serials, offsets), f) end = time.time() @@ -474,9 +539,17 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): if "removal_alarms" in params and "relevance_alarms" in params: removal_alarms = params["removal_alarms"] relevance_alarms = params["relevance_alarms"] - create_final_dataset(params, serials, offsets, sequence_input_length, - sequence_output_length, removal_alarms, relevance_alarms, file_tag) - drop_files(params["store_path"], '.csv') + create_final_dataset( + params, + serials, + offsets, + sequence_input_length, + sequence_output_length, + removal_alarms, + relevance_alarms, + file_tag, + ) + drop_files(params["store_path"], ".csv") end = time.time() elapsed_time = end - start print("phase 3/3 elapsed Time: {}".format(elapsed_time)) @@ -484,10 +557,11 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): # POST BUILD + def clean(data_path): for filename_dir in os.listdir(data_path): print("directory: ", filename_dir) - filepath_dir = data_path + '/' + filename_dir + filepath_dir = data_path + "/" + filename_dir if os.path.isdir(filepath_dir): cont_csv, cont_npz, cont_pickle = 0, 0, 0 for filename in os.listdir(filepath_dir): @@ -498,11 +572,14 @@ def clean(data_path): cont_npz += os.stat(filepath).st_size elif filename.endswith(".pickle"): cont_pickle += os.stat(filepath).st_size - print("cont_csv: {}MB cont_npz: {}MB cont_pickle: {}MB ".format( - int(cont_csv / 1e6), int(cont_npz / 1e6), int(cont_pickle / 1e6))) + print( + "cont_csv: {}MB cont_npz: {}MB cont_pickle: {}MB ".format( + int(cont_csv / 1e6), int(cont_npz / 1e6), int(cont_pickle / 1e6) + ) + ) for filename_dir in os.listdir(data_path): print("prune directory ", filename_dir) - filepath_dir = data_path + '/' + filename_dir + filepath_dir = data_path + "/" + filename_dir if os.path.isdir(filepath_dir): for filename in os.listdir(filepath_dir): filepath = filepath_dir + "/" + filename @@ -515,7 +592,7 @@ def clean(data_path): def convert_to_json(store_path, filename, verbose=0): - """ + """ function to convert pickle dataset in json format which will have same name of original dataset but with json extension and stored in the same path of the original dataset @@ -531,9 +608,14 @@ def convert_to_json(store_path, filename, verbose=0): """ # load dataset filepath = os.path.join(store_path, filename) - with open(filepath, 
'rb') as f: - x_train_segments, x_test_segments, y_train_segments, y_test_segments, stratify = pickle.load( - f) + with open(filepath, "rb") as f: + ( + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ) = pickle.load(f) x_train_segments = x_train_segments.toarray() x_test_segments = x_test_segments.toarray() @@ -559,38 +641,42 @@ def convert_to_json(store_path, filename, verbose=0): print("serials_test.shape: ", len(stratify["test"])) diz = {} - for i, (seq_x, seq_y, serial_id) in enumerate(zip(x_train_segments, y_train_segments, stratify["train"]), start=1): - key = "sample"+str(i) + for i, (seq_x, seq_y, serial_id) in enumerate( + zip(x_train_segments, y_train_segments, stratify["train"]), start=1 + ): + key = "sample" + str(i) diz[key] = {} diz[key]["x_train"] = seq_x diz[key]["y_train"] = seq_y diz[key]["serial"] = serial_id - for i, (seq_x, seq_y, serial_id) in enumerate(zip(x_test_segments, y_test_segments, stratify["test"]), start=i+1): - key = "sample"+str(i) + for i, (seq_x, seq_y, serial_id) in enumerate( + zip(x_test_segments, y_test_segments, stratify["test"]), start=i + 1 + ): + key = "sample" + str(i) diz[key] = {} diz[key]["x_test"] = seq_x diz[key]["y_test"] = seq_y diz[key]["serial"] = serial_id text = json.dumps(diz) - filename_json = os.path.splitext(filename)[0]+".json" + filename_json = os.path.splitext(filename)[0] + ".json" filepath_json = os.path.join(store_path, filename_json) if verbose: print("filepath_json: ", filepath_json) - with open(filepath_json, 'w') as f: + with open(filepath_json, "w") as f: f.write(text) def load_json_dataset(store_path, filename_json, verbose=0): - """ + """ function to load dataset from json format Example: $#load data in json format $store_path = "../data/MACHINE_TYPE_00_alarms_window_input_1720_window_output_480_offset_60_min_count_20_sigma_3" $filename_json = "all_alarms.json" - $filepath = os.path.join(store_path, filename_json) + $filepath = os.path.join(store_path, filename_json) $x_train,y_train,serials_train,x_test,y_test,serials_test = load_json_dataset(store_path, filename_json, verbose=1) @@ -608,22 +694,32 @@ def load_json_dataset(store_path, filename_json, verbose=0): """ filepath = os.path.join(store_path, filename_json) - with open(filepath, 'r') as f: + with open(filepath, "r") as f: dataset = json.load(f) - x_train = [sample["x_train"] - for sample_id, sample in dataset.items() if "x_train" in sample] - y_train = [sample["x_train"] - for sample_id, sample in dataset.items() if "x_train" in sample] - stratify_train = [sample["serial"] for sample_id, - sample in dataset.items() if "x_train" in sample] - - x_test = [sample["x_test"] - for sample_id, sample in dataset.items() if "x_test" in sample] - y_test = [sample["x_test"] - for sample_id, sample in dataset.items() if "x_test" in sample] - stratify_test = [sample["serial"] - for sample_id, sample in dataset.items() if "x_test" in sample] + x_train = [ + sample["x_train"] + for sample_id, sample in dataset.items() + if "x_train" in sample + ] + y_train = [ + sample["x_train"] + for sample_id, sample in dataset.items() + if "x_train" in sample + ] + stratify_train = [ + sample["serial"] for sample_id, sample in dataset.items() if "x_train" in sample + ] + + x_test = [ + sample["x_test"] for sample_id, sample in dataset.items() if "x_test" in sample + ] + y_test = [ + sample["x_test"] for sample_id, sample in dataset.items() if "x_test" in sample + ] + stratify_test = [ + sample["serial"] for sample_id, sample in dataset.items() 
if "x_test" in sample + ] x_train = np.asarray(x_train) y_train = np.asarray(y_train) @@ -642,14 +738,14 @@ def load_json_dataset(store_path, filename_json, verbose=0): def convert_to_npz(store_path, filename, verbose=0): - """ + """ function to convert pickle dataset in npz format which will have same name of original dataset but with npz extension and stored in the same path of the original dataset Example: $#save data in npz format $store_path = "../data/MACHINE_TYPE_00_alarms_window_input_1720_window_output_480_offset_60_min_count_20_sigma_3" - $filename= "all_alarms.pickle" + $filename= "all_alarms.pickle" $convert_to_npz(store_path, filename, verbose=1) Args: @@ -658,9 +754,14 @@ def convert_to_npz(store_path, filename, verbose=0): """ # load dataset filepath = os.path.join(store_path, filename) - with open(filepath, 'rb') as f: - x_train_segments, x_test_segments, y_train_segments, y_test_segments, stratify = pickle.load( - f) + with open(filepath, "rb") as f: + ( + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ) = pickle.load(f) x_train_segments = x_train_segments.toarray() x_test_segments = x_test_segments.toarray() @@ -683,17 +784,23 @@ def convert_to_npz(store_path, filename, verbose=0): print("y_test.shape: ", y_test_segments.shape) print("serials_test.shape: ", len(stratify["test"])) - filename_npz = os.path.splitext(filename)[0]+".npz" + filename_npz = os.path.splitext(filename)[0] + ".npz" filepath_npz = os.path.join(store_path, filename_npz) if verbose: print("filepath_npz: ", filepath_npz) - np.savez_compressed(filepath_npz, - x_train=x_train_segments, y_train=y_train_segments, stratify_train=stratify_train, - x_test=x_test_segments, y_test=y_test_segments, stratify_test=stratify_test) + np.savez_compressed( + filepath_npz, + x_train=x_train_segments, + y_train=y_train_segments, + stratify_train=stratify_train, + x_test=x_test_segments, + y_test=y_test_segments, + stratify_test=stratify_test, + ) def load_from_npz(): - """ + """ function to load dataset from json format Example: @@ -714,8 +821,7 @@ def load_from_npz(): serials_test: for each test sample is indicated the serial that produced that sample """ # filepath = os.path.join(store_path, filename) - filepath = os.path.dirname(__file__) + '/alarms_log_data/processed/all_alarms.npz' + filepath = os.path.dirname(__file__) + "/alarms_log_data/processed/all_alarms.npz" loaded = np.load(filepath, allow_pickle=True) - keys = ["x_train", "y_train", "stratify_train", - "x_test", "y_test", "stratify_test"] + keys = ["x_train", "y_train", "stratify_train", "x_test", "y_test", "stratify_test"] return [loaded[k] for k in keys] diff --git a/datasets/alpi/dataset.py b/datasets/alpi/dataset.py index ba5aa29..f41a718 100644 --- a/datasets/alpi/dataset.py +++ b/datasets/alpi/dataset.py @@ -11,12 +11,12 @@ from scipy import sparse from sklearn.model_selection import train_test_split -FAMILY = 'MACHINE_TYPE_00' +FAMILY = "MACHINE_TYPE_00" # UTILS def zip_dir(dir_path, zip_path): - shutil.make_archive(zip_path, 'zip', dir_path) + shutil.make_archive(zip_path, "zip", dir_path) def drop_files(dir_path, extension): @@ -27,7 +27,9 @@ def drop_files(dir_path, extension): def prune(alarms): - return alarms.loc[alarms.alarm.shift() != alarms.alarm].alarm.values.astype('uint16') + return alarms.loc[alarms.alarm.shift() != alarms.alarm].alarm.values.astype( + "uint16" + ) def prune_series(seq): @@ -38,24 +40,24 @@ def prune_series(seq): def padding_sequence(seq, sequence_length): new_seq = 
np.zeros(sequence_length) - new_seq[sequence_length - len(seq):] = seq + new_seq[sequence_length - len(seq) :] = seq return new_seq def return_variables(params): - window_input = params['window_input'] - window_output = params['window_output'] - offset = params['offset'] - verbose = params['verbose'] - store_path = params['store_path'] # store_path is used to save data - min_count = params['min_count'] - sigma = params['sigma'] + window_input = params["window_input"] + window_output = params["window_output"] + offset = params["offset"] + verbose = params["verbose"] + store_path = params["store_path"] # store_path is used to save data + min_count = params["min_count"] + sigma = params["sigma"] return window_input, window_output, offset, verbose, store_path, min_count, sigma def find_serials_offsets(store_path): filepath = store_path + "/" + store_path.split("/")[-1] + ".config" - with open(filepath, 'rb') as f: + with open(filepath, "rb") as f: serials, offsets = pickle.load(f) return serials, offsets @@ -78,31 +80,40 @@ def return_index(date_range, target): def create_params_list(data_path, params, verbose=True): - windows_input = params['windows_input'] - windows_output = params['windows_output'] - min_counts = params['min_counts'] - sigmas = params['sigmas'] - offsets = params['offsets'] + windows_input = params["windows_input"] + windows_output = params["windows_output"] + min_counts = params["min_counts"] + sigmas = params["sigmas"] + offsets = params["offsets"] params_list = [] - dir_template = data_path + '/' + FAMILY + \ - '_alarms_window_input_{window_input}_window_output_{window_output}_offset_{offset}_min_count_{min_count}_sigma_{sigma}' + dir_template = ( + data_path + + "/" + + FAMILY + + "_alarms_window_input_{window_input}_window_output_{window_output}_offset_{offset}_min_count_{min_count}_sigma_{sigma}" + ) for window_input in windows_input: for window_output in windows_output: for min_count in min_counts: for sigma in sigmas: for offset in offsets: store_path = dir_template.format( - window_input=window_input, window_output=window_output, offset=offset, min_count=min_count, sigma=sigma) + window_input=window_input, + window_output=window_output, + offset=offset, + min_count=min_count, + sigma=sigma, + ) if not os.path.isdir(store_path): os.makedirs(store_path) params = { - 'window_input': window_input, - 'window_output': window_output, - 'offset': offset, - 'min_count': min_count, - 'sigma': sigma, - 'verbose': verbose, - 'store_path': store_path + "window_input": window_input, + "window_output": window_output, + "offset": offset, + "min_count": min_count, + "sigma": sigma, + "verbose": verbose, + "store_path": store_path, } params_list.append(params) return params_list @@ -114,8 +125,7 @@ def create_params_list(data_path, params, verbose=True): def generate_dataset_by_serial_offset(data, params, current_offset): data["current_offset"] = current_offset current_offset = datetime.timedelta(minutes=current_offset) - window_input, window_output, _, _, _, _, _ = return_variables( - params) + window_input, window_output, _, _, _, _, _ = return_variables(params) min_timestamp = data.index.min() min_timestamp += current_offset @@ -123,24 +133,28 @@ def generate_dataset_by_serial_offset(data, params, current_offset): # create date range for input date_range = pd.date_range( - start=min_timestamp, end=max_timestamp, freq=str(window_input) + "min") + start=min_timestamp, end=max_timestamp, freq=str(window_input) + "min" + ) date_range = [d for d in date_range] # create date range for 
output delta_in = datetime.timedelta(minutes=window_input) delta_out = datetime.timedelta(minutes=window_output) - date_range_output = [(d + delta_in, d + delta_in + delta_out) - for d in date_range] # ranges of outputs + date_range_output = [ + (d + delta_in, d + delta_in + delta_out) for d in date_range + ] # ranges of outputs date_range_output = np.asarray(date_range_output).reshape(1, -1)[0] # create samples series = pd.Series(data.index).apply( - lambda target: return_index(date_range, target)) + lambda target: return_index(date_range, target) + ) series.index = data.index data["bin_input"] = series series = pd.Series(data.index).apply( - lambda target: return_index_output(date_range_output, target)) + lambda target: return_index_output(date_range_output, target) + ) series.index = data.index data["bin_output"] = series @@ -149,14 +163,14 @@ def generate_dataset_by_serial_offset(data, params, current_offset): unique2 = data["bin_output"].unique() periods_id = list(set(unique1).intersection(set(unique2)) - set([-1])) - data = data[(data["bin_input"].isin(periods_id)) | - (data["bin_output"].isin(periods_id))] + data = data[ + (data["bin_input"].isin(periods_id)) | (data["bin_output"].isin(periods_id)) + ] return data def generate_dataset_by_serial(data, params): - window_input, _, offset, verbose, store_path, _, _ = return_variables( - params) + window_input, _, offset, verbose, store_path, _, _ = return_variables(params) serial = data["serial"][0] if verbose: print("serial: ", serial) @@ -164,9 +178,9 @@ def generate_dataset_by_serial(data, params): offsets = list(range(0, window_input, offset)) offset_data = [] for current_offset in offsets: - df_offset = generate_dataset_by_serial_offset(data.copy(), - params, - current_offset) + df_offset = generate_dataset_by_serial_offset( + data.copy(), params, current_offset + ) offset_data.append(df_offset) dataset = pd.concat(offset_data) if verbose: @@ -176,7 +190,7 @@ def generate_dataset_by_serial(data, params): def generate_dataset(data, params): - grouped_data = data.groupby('serial') + grouped_data = data.groupby("serial") for _, data_serial in grouped_data: generate_dataset_by_serial(data_serial, params) return @@ -184,6 +198,7 @@ def generate_dataset(data, params): # PHASE 2 + def prune_dataset(params): _, _, _, _, store_path, _, _ = return_variables(params) serials = [] @@ -198,8 +213,7 @@ def prune_dataset(params): def prune_df(df, serial_id, params): - _, _, offset, verbose, store_path, min_count, _ = return_variables( - params) + _, _, offset, verbose, store_path, min_count, _ = return_variables(params) offsets = df["current_offset"].unique() for offset in offsets: @@ -215,27 +229,37 @@ def prune_df(df, serial_id, params): periods_id = list(set(unique1).intersection(set(unique2)) - set([-1])) periods_id.sort() # temporal sorting - diz_input = {bin_id: group.alarm.sort_index( - ).values for bin_id, group in groups_input if bin_id in periods_id} - diz_output = {bin_id: group.alarm.sort_index( - ).values for bin_id, group in groups_output if bin_id in periods_id} + diz_input = { + bin_id: group.alarm.sort_index().values + for bin_id, group in groups_input + if bin_id in periods_id + } + diz_output = { + bin_id: group.alarm.sort_index().values + for bin_id, group in groups_output + if bin_id in periods_id + } # list of [seq_x,seq_y] - X_Y_offset = [[diz_input[bin_id], diz_output[bin_id]] - for bin_id in periods_id] + X_Y_offset = [[diz_input[bin_id], diz_output[bin_id]] for bin_id in periods_id] # apply min_count: remove sequences 
with count= min_count and len(seq[1]) >= min_count] + X_Y_offset = [ + seq + for seq in X_Y_offset + if len(seq[0]) >= min_count and len(seq[1]) >= min_count + ] # save list on file # for each (serial,offset) a different list is saved if verbose: - print("{} has length: {}".format(str(serial_id) + - "_offset_" + str(offset) + ".npz", len(X_Y_offset))) - filepath = store_path + "/" + \ - str(serial_id) + "_offset_" + str(offset) + ".npz" - with open(filepath, 'wb') as f: + print( + "{} has length: {}".format( + str(serial_id) + "_offset_" + str(offset) + ".npz", len(X_Y_offset) + ) + ) + filepath = store_path + "/" + str(serial_id) + "_offset_" + str(offset) + ".npz" + with open(filepath, "wb") as f: pickle.dump(X_Y_offset, f) return offsets @@ -243,9 +267,18 @@ def prune_df(df, serial_id, params): # PHASE 3 -def create_final_dataset(params, serials, offsets, sequence_input_length, sequence_output_length, removal_alarms=None, relevance_alarms=None, file_tag='all_alarms'): - _, _, offset, verbose, store_path, _, sigma = return_variables( - params) + +def create_final_dataset( + params, + serials, + offsets, + sequence_input_length, + sequence_output_length, + removal_alarms=None, + relevance_alarms=None, + file_tag="all_alarms", +): + _, _, offset, verbose, store_path, _, sigma = return_variables(params) padding_mode = "after" if "padding_mode" in params: padding_mode = params["padding_mode"] @@ -266,24 +299,37 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen for offset in offsets: sentinel += 1 if verbose: - print("{}/{}: serial: {} offset: {}".format(sentinel, - tot_combo, - serial, - offset)) + print( + "{}/{}: serial: {} offset: {}".format( + sentinel, tot_combo, serial, offset + ) + ) filename = str(serial) + "_offset_" + str(offset) + ".npz" if filename in os.listdir(store_path): file_path = os.path.join(store_path, filename) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: seqences = pickle.load(f) if removal_alarms != None and len(removal_alarms) > 0: - X = [[alarm for alarm in seq_x if ( - alarm not in removal_alarms and alarm != 0)] for seq_x, seq_y in seqences] + X = [ + [ + alarm + for alarm in seq_x + if (alarm not in removal_alarms and alarm != 0) + ] + for seq_x, seq_y in seqences + ] else: X = [seq_x for seq_x, seq_y in seqences] if relevance_alarms != None and len(relevance_alarms) > 0: - Y = [[alarm for alarm in seq_y if ( - alarm in relevance_alarms and alarm != 0)] for seq_x, seq_y in seqences] + Y = [ + [ + alarm + for alarm in seq_y + if (alarm in relevance_alarms and alarm != 0) + ] + for seq_x, seq_y in seqences + ] else: Y = [seq_y for seq_x, seq_y in seqences] @@ -301,8 +347,9 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen lengths_Y = np.asarray(lengths_Y) mu_sequence_input_length = lengths_X.mean() std_sequence_input_length = lengths_X.std() - sequence_input_length = int( - mu_sequence_input_length + sigma * std_sequence_input_length) + 1 + sequence_input_length = ( + int(mu_sequence_input_length + sigma * std_sequence_input_length) + 1 + ) sequence_output_length = np.max(lengths_Y) sentinel = 0 # it is iterated on all combinations of serials and offsets @@ -314,28 +361,29 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen # PADDING if padding_mode == "after": - X = [np.pad(seq_x, (0, sequence_input_length - len(seq_x))) - for seq_x in X] - Y = [np.pad(seq_y, (0, sequence_output_length - len(seq_y))) - for seq_y in Y] + X = [np.pad(seq_x, 
(0, sequence_input_length - len(seq_x))) for seq_x in X] + Y = [np.pad(seq_y, (0, sequence_output_length - len(seq_y))) for seq_y in Y] elif padding_mode == "before": X = [padding_sequence(seq_x, sequence_input_length) for seq_x in X] - Y = [padding_sequence(seq_y, sequence_output_length) - for seq_y in Y] + Y = [padding_sequence(seq_y, sequence_output_length) for seq_y in Y] if len(X) > 1: X_train, X_test, Y_train, Y_test = train_test_split( - X, Y, test_size=0.3, shuffle=False) + X, Y, test_size=0.3, shuffle=False + ) else: X_train, X_test, Y_train, Y_test = [], [], [], [] - stratify.append({ - "serial": combos[sentinel][0], - "offset": combos[sentinel][1], - "X_train": len(X_train), - "X_test": len(X_test), - "Y_train": len(Y_train), - "Y_test": len(Y_test)}) + stratify.append( + { + "serial": combos[sentinel][0], + "offset": combos[sentinel][1], + "X_train": len(X_train), + "X_test": len(X_test), + "Y_train": len(Y_train), + "Y_test": len(Y_test), + } + ) cont += len(X_train) + len(X_test) X_train_tot += X_train @@ -366,10 +414,19 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen stratify = stratify_new # Sparse Matrix - X_train, X_test, Y_train, Y_test = sparse.csr_matrix(X_train), sparse.csr_matrix( - X_test), sparse.csr_matrix(Y_train), sparse.csr_matrix(Y_test) - - x_train_segments, x_test_segments, y_train_segments, y_test_segments = X_train, X_test, Y_train, Y_test + X_train, X_test, Y_train, Y_test = ( + sparse.csr_matrix(X_train), + sparse.csr_matrix(X_test), + sparse.csr_matrix(Y_train), + sparse.csr_matrix(Y_test), + ) + + x_train_segments, x_test_segments, y_train_segments, y_test_segments = ( + X_train, + X_test, + Y_train, + Y_test, + ) if verbose: print(x_train_segments.shape) @@ -378,19 +435,27 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen print(y_test_segments.shape) file_path = os.path.join(store_path, f"{file_tag}.pickle") - with open(file_path, 'wb') as f: - pickle.dump([x_train_segments, x_test_segments, - y_train_segments, y_test_segments, stratify], f) - - -def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): + with open(file_path, "wb") as f: + pickle.dump( + [ + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ], + f, + ) + + +def create_datasets(data, params_list, start_point=0, file_tag="all_alarms"): """ Parameters: data : csv file of raw data - params_list: list of dictionaries i.e. params + params_list: list of dictionaries i.e. params each params is a dictionary with all parameters necessary to create the dataset such as window_input, window_output, sigma, min_count,ecc. - start_point: it tells from which phase to start to create dataset(i.e. if start_point=2 then it is assumed that phase1 is executed) + start_point: it tells from which phase to start to create dataset(i.e. if start_point=2 then it is assumed that phase1 is executed) file_tag: name of dataset There are 3 phases(start_point:1,2,3): @@ -399,7 +464,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): each file contains a list of x,y with pruning of consecutive alarms -PHASE3: some alarms in input are removed based on list associated with key 'removal_alarms' to variable params. only a subset of alarms in output is keeped based on a list associated with key 'relevance_alarms' to variable params. 
- Padding is performed based on values defined by keys 'sequence_input_length' and 'sequence_output_length' in params + Padding is performed based on values defined by keys 'sequence_input_length' and 'sequence_output_length' in params Example: @@ -415,7 +480,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): #relevance_alarms = [] <- only if you want to keep a subset of alrams in output #additional parameters for phase3, comment the section if you don't want to use it - for params in params_list: + for params in params_list: params["removal_alarms"] = [] params["relevance_alarms"] = [] @@ -424,7 +489,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): """ for params in params_list: - print('-- run ', params) + print("-- run ", params) verbose = params["verbose"] if verbose: print(params) @@ -447,7 +512,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): # save serials and offsets store_path = params["store_path"] filepath = store_path + "/" + store_path.split("/")[-1] + ".config" - with open(filepath, 'wb') as f: + with open(filepath, "wb") as f: pickle.dump((serials, offsets), f) end = time.time() @@ -474,9 +539,17 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): if "removal_alarms" in params and "relevance_alarms" in params: removal_alarms = params["removal_alarms"] relevance_alarms = params["relevance_alarms"] - create_final_dataset(params, serials, offsets, sequence_input_length, - sequence_output_length, removal_alarms, relevance_alarms, file_tag) - drop_files(params["store_path"], '.csv') + create_final_dataset( + params, + serials, + offsets, + sequence_input_length, + sequence_output_length, + removal_alarms, + relevance_alarms, + file_tag, + ) + drop_files(params["store_path"], ".csv") end = time.time() elapsed_time = end - start print("phase 3/3 elapsed Time: {}".format(elapsed_time)) @@ -484,10 +557,11 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): # POST BUILD + def clean(data_path): for filename_dir in os.listdir(data_path): print("directory: ", filename_dir) - filepath_dir = data_path + '/' + filename_dir + filepath_dir = data_path + "/" + filename_dir if os.path.isdir(filepath_dir): cont_csv, cont_npz, cont_pickle = 0, 0, 0 for filename in os.listdir(filepath_dir): @@ -498,11 +572,14 @@ def clean(data_path): cont_npz += os.stat(filepath).st_size elif filename.endswith(".pickle"): cont_pickle += os.stat(filepath).st_size - print("cont_csv: {}MB cont_npz: {}MB cont_pickle: {}MB ".format( - int(cont_csv / 1e6), int(cont_npz / 1e6), int(cont_pickle / 1e6))) + print( + "cont_csv: {}MB cont_npz: {}MB cont_pickle: {}MB ".format( + int(cont_csv / 1e6), int(cont_npz / 1e6), int(cont_pickle / 1e6) + ) + ) for filename_dir in os.listdir(data_path): print("prune directory ", filename_dir) - filepath_dir = data_path + '/' + filename_dir + filepath_dir = data_path + "/" + filename_dir if os.path.isdir(filepath_dir): for filename in os.listdir(filepath_dir): filepath = filepath_dir + "/" + filename @@ -515,7 +592,7 @@ def clean(data_path): def convert_to_json(store_path, filename, verbose=0): - """ + """ function to convert pickle dataset in json format which will have same name of original dataset but with json extension and stored in the same path of the original dataset @@ -531,9 +608,14 @@ def convert_to_json(store_path, filename, verbose=0): """ # load dataset filepath = os.path.join(store_path, filename) - with open(filepath, 
'rb') as f: - x_train_segments, x_test_segments, y_train_segments, y_test_segments, stratify = pickle.load( - f) + with open(filepath, "rb") as f: + ( + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ) = pickle.load(f) x_train_segments = x_train_segments.toarray() x_test_segments = x_test_segments.toarray() @@ -559,38 +641,42 @@ def convert_to_json(store_path, filename, verbose=0): print("serials_test.shape: ", len(stratify["test"])) diz = {} - for i, (seq_x, seq_y, serial_id) in enumerate(zip(x_train_segments, y_train_segments, stratify["train"]), start=1): - key = "sample"+str(i) + for i, (seq_x, seq_y, serial_id) in enumerate( + zip(x_train_segments, y_train_segments, stratify["train"]), start=1 + ): + key = "sample" + str(i) diz[key] = {} diz[key]["x_train"] = seq_x diz[key]["y_train"] = seq_y diz[key]["serial"] = serial_id - for i, (seq_x, seq_y, serial_id) in enumerate(zip(x_test_segments, y_test_segments, stratify["test"]), start=i+1): - key = "sample"+str(i) + for i, (seq_x, seq_y, serial_id) in enumerate( + zip(x_test_segments, y_test_segments, stratify["test"]), start=i + 1 + ): + key = "sample" + str(i) diz[key] = {} diz[key]["x_test"] = seq_x diz[key]["y_test"] = seq_y diz[key]["serial"] = serial_id text = json.dumps(diz) - filename_json = os.path.splitext(filename)[0]+".json" + filename_json = os.path.splitext(filename)[0] + ".json" filepath_json = os.path.join(store_path, filename_json) if verbose: print("filepath_json: ", filepath_json) - with open(filepath_json, 'w') as f: + with open(filepath_json, "w") as f: f.write(text) def load_json_dataset(store_path, filename_json, verbose=0): - """ + """ function to load dataset from json format Example: $#load data in json format $store_path = "../data/MACHINE_TYPE_00_alarms_window_input_1720_window_output_480_offset_60_min_count_20_sigma_3" $filename_json = "all_alarms.json" - $filepath = os.path.join(store_path, filename_json) + $filepath = os.path.join(store_path, filename_json) $x_train,y_train,serials_train,x_test,y_test,serials_test = load_json_dataset(store_path, filename_json, verbose=1) @@ -608,22 +694,32 @@ def load_json_dataset(store_path, filename_json, verbose=0): """ filepath = os.path.join(store_path, filename_json) - with open(filepath, 'r') as f: + with open(filepath, "r") as f: dataset = json.load(f) - x_train = [sample["x_train"] - for sample_id, sample in dataset.items() if "x_train" in sample] - y_train = [sample["x_train"] - for sample_id, sample in dataset.items() if "x_train" in sample] - stratify_train = [sample["serial"] for sample_id, - sample in dataset.items() if "x_train" in sample] - - x_test = [sample["x_test"] - for sample_id, sample in dataset.items() if "x_test" in sample] - y_test = [sample["x_test"] - for sample_id, sample in dataset.items() if "x_test" in sample] - stratify_test = [sample["serial"] - for sample_id, sample in dataset.items() if "x_test" in sample] + x_train = [ + sample["x_train"] + for sample_id, sample in dataset.items() + if "x_train" in sample + ] + y_train = [ + sample["x_train"] + for sample_id, sample in dataset.items() + if "x_train" in sample + ] + stratify_train = [ + sample["serial"] for sample_id, sample in dataset.items() if "x_train" in sample + ] + + x_test = [ + sample["x_test"] for sample_id, sample in dataset.items() if "x_test" in sample + ] + y_test = [ + sample["x_test"] for sample_id, sample in dataset.items() if "x_test" in sample + ] + stratify_test = [ + sample["serial"] for sample_id, sample in dataset.items() 
if "x_test" in sample + ] x_train = np.asarray(x_train) y_train = np.asarray(y_train) @@ -642,14 +738,14 @@ def load_json_dataset(store_path, filename_json, verbose=0): def convert_to_npz(store_path, filename, verbose=0): - """ + """ function to convert pickle dataset in npz format which will have same name of original dataset but with npz extension and stored in the same path of the original dataset Example: $#save data in npz format $store_path = "../data/MACHINE_TYPE_00_alarms_window_input_1720_window_output_480_offset_60_min_count_20_sigma_3" - $filename= "all_alarms.pickle" + $filename= "all_alarms.pickle" $convert_to_npz(store_path, filename, verbose=1) Args: @@ -658,9 +754,14 @@ def convert_to_npz(store_path, filename, verbose=0): """ # load dataset filepath = os.path.join(store_path, filename) - with open(filepath, 'rb') as f: - x_train_segments, x_test_segments, y_train_segments, y_test_segments, stratify = pickle.load( - f) + with open(filepath, "rb") as f: + ( + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ) = pickle.load(f) x_train_segments = x_train_segments.toarray() x_test_segments = x_test_segments.toarray() @@ -683,17 +784,23 @@ def convert_to_npz(store_path, filename, verbose=0): print("y_test.shape: ", y_test_segments.shape) print("serials_test.shape: ", len(stratify["test"])) - filename_npz = os.path.splitext(filename)[0]+".npz" + filename_npz = os.path.splitext(filename)[0] + ".npz" filepath_npz = os.path.join(store_path, filename_npz) if verbose: print("filepath_npz: ", filepath_npz) - np.savez_compressed(filepath_npz, - x_train=x_train_segments, y_train=y_train_segments, stratify_train=stratify_train, - x_test=x_test_segments, y_test=y_test_segments, stratify_test=stratify_test) + np.savez_compressed( + filepath_npz, + x_train=x_train_segments, + y_train=y_train_segments, + stratify_train=stratify_train, + x_test=x_test_segments, + y_test=y_test_segments, + stratify_test=stratify_test, + ) def load_from_npz(store_path, filename, verbose=0): - """ + """ function to load dataset from json format Example: @@ -715,6 +822,5 @@ def load_from_npz(store_path, filename, verbose=0): """ filepath = os.path.join(store_path, filename) loaded = np.load(filepath, allow_pickle=True) - keys = ["x_train", "y_train", "stratify_train", - "x_test", "y_test", "stratify_test"] + keys = ["x_train", "y_train", "stratify_train", "x_test", "y_test", "stratify_test"] return [loaded[k] for k in keys] diff --git a/datasets/cbm/__init__.py b/datasets/cbm/__init__.py index acee59f..52e8974 100644 --- a/datasets/cbm/__init__.py +++ b/datasets/cbm/__init__.py @@ -11,7 +11,7 @@ def parse_feature_names(fn): with open(fn) as f: names, lines = [], f.readlines() for line in lines: - names.append(line.split('-')[-1].lstrip().rstrip()) + names.append(line.split("-")[-1].lstrip().rstrip()) return names @@ -19,15 +19,15 @@ def parse_feature_names(fn): def load_data(shorten_feature_names=True): fp = os.path.dirname(__file__) - raw_data = np.loadtxt(fp + '/data.txt.gz') - features = parse_feature_names(fp + '/Features.txt') + raw_data = np.loadtxt(fp + "/data.txt.gz") + features = parse_feature_names(fp + "/Features.txt") if shorten_feature_names == True: for i in range(len(features) - 2): - features[i] = features[i].split('(')[-1].split(')')[0].upper() + features[i] = features[i].split("(")[-1].split(")")[0].upper() - features[-2] = 'comp_decay_state' - features[-1] = 'turb_decay_state' + features[-2] = "comp_decay_state" + features[-1] = "turb_decay_state" return 
pd.DataFrame(raw_data, columns=features) @@ -41,7 +41,7 @@ def normalize(df): minv = df_norm.iloc[:, i].min() df_norm.iloc[:, i] = (df_norm.iloc[:, i] - minv) / (maxv - minv) else: - df_norm.iloc[:, i] = 0. + df_norm.iloc[:, i] = 0.0 return df_norm @@ -57,8 +57,8 @@ def gen_summary(wd=400, outdir=None): os.makedirs(outdir, exist_ok=True) data = normalize(load_data(shorten_feature_names=False)) - - with PdfPages(outdir + '/cbm_summary.pdf') as pp: + + with PdfPages(outdir + "/cbm_summary.pdf") as pp: for st in tqdm.trange(0, data.shape[0], wd): ed = st + wd fig, ax = plt.subplots(9, figsize=(24, 15)) @@ -73,11 +73,11 @@ def gen_summary(wd=400, outdir=None): data.iloc[st:ed, 15].plot(legend=True, ax=ax[7]) data.iloc[st:ed, [0, -2, -1]].plot(legend=True, ax=ax[8]) - ax[0].set_title('Normalized Sensor/Label data') - ax[-1].set_xlabel('Time') - for axi in ax: axi.set_ylabel('Value') + ax[0].set_title("Normalized Sensor/Label data") + ax[-1].set_xlabel("Time") + for axi in ax: + axi.set_ylabel("Value") - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - \ No newline at end of file diff --git a/datasets/cmapss/__init__.py b/datasets/cmapss/__init__.py index 3b48edb..f89d274 100644 --- a/datasets/cmapss/__init__.py +++ b/datasets/cmapss/__init__.py @@ -13,7 +13,7 @@ def load_data(index="FD004", features=None): assert index in ["FD001", "FD002", "FD003", "FD004"] elif type(index) == int: assert index in [0, 1, 2, 3] - index = f'FD00{index+1}' + index = f"FD00{index+1}" filepath = os.path.dirname(__file__) @@ -47,30 +47,30 @@ def load_data(index="FD004", features=None): # Original data train_set = np.loadtxt(filepath + f"/train_{index}.txt.gz") - test_set = np.loadtxt(filepath + f"/test_{index}.txt.gz") - labels = np.loadtxt(filepath + f"/RUL_{index}.txt.gz") # RUL: Remaining Useful Life + test_set = np.loadtxt(filepath + f"/test_{index}.txt.gz") + labels = np.loadtxt(filepath + f"/RUL_{index}.txt.gz") # RUL: Remaining Useful Life # Convert to pandas.DataFrame objects col_names = ["unit_number", "time"] col_names += [f"operation{i}" for i in range(1, 4)] col_names += [f"sensor{i}" for i in range(1, 22)] train_set = pd.DataFrame(train_set, columns=col_names) - test_set = pd.DataFrame(test_set, columns=col_names) + test_set = pd.DataFrame(test_set, columns=col_names) def set_dtype(df): return df.astype({"unit_number": np.int64, "time": np.int64}) def extract_features(df, features): - columns = ['unit_number', 'time'] - columns += [f'sensor{i}' for i in features] + columns = ["unit_number", "time"] + columns += [f"sensor{i}" for i in features] return df.loc[:, columns] train_set = set_dtype(train_set) - test_set = set_dtype(test_set) + test_set = set_dtype(test_set) if features is not None: train_set = extract_features(train_set, features) - test_set = extract_features(test_set, features) + test_set = extract_features(test_set, features) return train_set, test_set, labels @@ -79,34 +79,61 @@ def cleaning(df): raise NotImplementedError -def load_clean_data(): - raise NotImplementedError +def load_clean_data_rul( + index="FD004", + features=[2, 3, 4, 7, 11, 12, 15], +): + train, test, labels = load_mesurement_list(index=index, features=features) + label = "RUL" + + # train + train_df_list = [] + for tt in train: + max_rul = len(tt) + rul_array = np.array(range(max_rul))[::-1] + tt[label] = rul_array + train_df_list.append(tt) + + # test + test_df_list = [] + for tt, la in zip(test, labels): + max_rul = len(tt) + rul_array = 
np.array(range(max_rul))[::-1] + rul_array += int(la) + tt[label] = rul_array + test_df_list.append(tt) + + return train_df_list, test_df_list def load_mesurement_list( - index="FD004", - features=[2, 3, 4, 7, 11, 12 ,15], - ): + index="FD004", + features=[2, 3, 4, 7, 11, 12, 15], +): """ * transform train_set and test_set into the lists of multivariate senser mesurements according to unit numbers. - * features: the default features were applied in the previous research, + * features: the default features were applied in the previous research, "A Similarity-Based Prognostics Approach for Remaining Useful Life Estimation of Engineered Systems". """ assert index in ["FD001", "FD002", "FD003", "FD004"] train_set, test_set, labels = load_data(index=index) - refined_train_set =[] + refined_train_set = [] for _, seq_df in train_set.groupby("unit_number"): - seq_df = seq_df.sort_values("time") - ex_seq_df = seq_df[[f"sensor{f_id}" for f_id in features]].reset_index(drop=True) + seq_df = seq_df.sort_values("time") + ex_seq_df = seq_df[[f"sensor{f_id}" for f_id in features]].reset_index( + drop=True + ) refined_train_set.append(ex_seq_df) - refined_test_set =[] + refined_test_set = [] for _, seq_df in test_set.groupby("unit_number"): - seq_df = seq_df.sort_values("time") - ex_seq_df = seq_df[[f"sensor{f_id}" for f_id in features]].reset_index(drop=True) + seq_df = seq_df.sort_values("time") + ex_seq_df = seq_df[[f"sensor{f_id}" for f_id in features]].reset_index( + drop=True + ) refined_test_set.append(ex_seq_df) return refined_train_set, refined_test_set, labels @@ -117,14 +144,16 @@ def run_to_failure_aux(df, lifetime, unit_number): assert lifetime <= df.shape[0] broken = 0 if lifetime < df.shape[0] else 1 sample = pd.DataFrame( - {'lifetime': lifetime, 'broken': broken, 'unit_number': unit_number}, index=[0]) + {"lifetime": lifetime, "broken": broken, "unit_number": unit_number}, index=[0] + ) - sensors = df.loc[:, df.columns.str.contains('sensor')] - num_features = sensors.iloc[:lifetime].agg(['min', 'max', 'mean', 'std']) + sensors = df.loc[:, df.columns.str.contains("sensor")] + num_features = sensors.iloc[:lifetime].agg(["min", "max", "mean", "std"]) num_features = num_features.unstack().reset_index() - num_features['feature'] = num_features.level_0.str.cat( - num_features.level_1, sep='_') - num_features = num_features.pivot_table(columns='feature', values=0) + num_features["feature"] = num_features.level_0.str.cat( + num_features.level_1, sep="_" + ) + num_features = num_features.pivot_table(columns="feature", values=0) return pd.concat([sample, num_features], axis=1) @@ -132,17 +161,16 @@ def run_to_failure_aux(df, lifetime, unit_number): def censoring_augmentation(raw_data, n_samples=10, seed=123): np.random.seed(seed) - datasets = [g for _, g in raw_data.groupby('unit_number')] - timeseries = raw_data.groupby('unit_number').size() + datasets = [g for _, g in raw_data.groupby("unit_number")] + timeseries = raw_data.groupby("unit_number").size() samples = [] - pbar = tqdm.tqdm(total=n_samples, desc='augmentation') + pbar = tqdm.tqdm(total=n_samples, desc="augmentation") while len(samples) < n_samples: # draw a machine unit_number = np.random.randint(timeseries.shape[0]) censor_timing = np.random.randint(timeseries.iloc[unit_number]) - sample = run_to_failure_aux( - datasets[unit_number], censor_timing, unit_number) + sample = run_to_failure_aux(datasets[unit_number], censor_timing, unit_number) samples.append(sample) pbar.update(1) @@ -152,23 +180,25 @@ def 
censoring_augmentation(raw_data, n_samples=10, seed=123): def generate_run_to_failure(df, health_censor_aug=0, seed=123): samples = [] - for unit_id, timeseries in tqdm.tqdm(df.groupby('unit_number'), desc='RUL'): - samples.append(run_to_failure_aux( - timeseries, timeseries.shape[0], unit_id)) + for unit_id, timeseries in tqdm.tqdm(df.groupby("unit_number"), desc="RUL"): + samples.append(run_to_failure_aux(timeseries, timeseries.shape[0], unit_id)) samples = pd.concat(samples) if health_censor_aug > 0: - aug_samples = censoring_augmentation( - df, n_samples=health_censor_aug, seed=seed) + aug_samples = censoring_augmentation(df, n_samples=health_censor_aug, seed=seed) return pd.concat([samples, aug_samples]).reset_index(drop=True) else: return samples.reset_index(drop=True) -def leave_one_out(target='run-to-failure', - health_censor_aug=1000, seed=123, - input_fn=None, output_fn=None): +def leave_one_out( + target="run-to-failure", + health_censor_aug=1000, + seed=123, + input_fn=None, + output_fn=None, +): if input_fn is not None: subsets = pd.read_csv(input_fn) @@ -179,13 +209,12 @@ def leave_one_out(target='run-to-failure', raw_data = load_data(index=index)[0] raw_data = raw_data.assign(machine_id=index) - if target == 'run-to-failure': - subset = generate_run_to_failure( - raw_data, health_censor_aug, seed) + if target == "run-to-failure": + subset = generate_run_to_failure(raw_data, health_censor_aug, seed) subset = subset.assign(fold=index) subsets.append(subset) - elif target == 'time-to-failure': + elif target == "time-to-failure": raise NotImplementedError else: @@ -197,39 +226,51 @@ def leave_one_out(target='run-to-failure', subsets.to_csv(output_fn, index=False) # List of tuples: (train_data, test_data) - train_test_sets = [( - subsets[subsets.fold != i].reset_index(drop=True), - subsets[subsets.fold == i].reset_index(drop=True)) for i in range(4)] + train_test_sets = [ + ( + subsets[subsets.fold != i].reset_index(drop=True), + subsets[subsets.fold == i].reset_index(drop=True), + ) + for i in range(4) + ] return train_test_sets -def generate_validation_sets(method='leave-one-out', n_splits=5, seed=123, outdir=None): +def generate_validation_sets(method="leave-one-out", n_splits=5, seed=123, outdir=None): validation_sets = [] - if method == 'kfold': + if method == "kfold": raise NotImplementedError - elif method == 'leave-one-out': - validation_sets = leave_one_out(target='run-to-failure', - health_censor_aug=1000, - seed=seed) + elif method == "leave-one-out": + validation_sets = leave_one_out( + target="run-to-failure", health_censor_aug=1000, seed=seed + ) if outdir is not None: for i, (train_data, test_data) in enumerate(validation_sets): - train_data.to_csv(outdir + f'/train_{i}.csv.gz', index=False) - test_data.to_csv(outdir + f'/test_{i}.csv.gz', index=False) + train_data.to_csv(outdir + f"/train_{i}.csv.gz", index=False) + test_data.to_csv(outdir + f"/test_{i}.csv.gz", index=False) return validation_sets -def load_validation_sets(filepath, method='leave-one-out', n_splits=5): - if method == 'kfold': - return [(pd.read_csv(filepath + f'/train_{i}.csv.gz'), - pd.read_csv(filepath + f'/test_{i}.csv.gz')) - for i in range(n_splits)] - - elif method == 'leave-one-out': - return [(pd.read_csv(filepath + f'/train_{i}.csv.gz'), - pd.read_csv(filepath + f'/test_{i}.csv.gz')) - for i in range(4)] +def load_validation_sets(filepath, method="leave-one-out", n_splits=5): + if method == "kfold": + return [ + ( + pd.read_csv(filepath + f"/train_{i}.csv.gz"), + pd.read_csv(filepath + 
f"/test_{i}.csv.gz"), + ) + for i in range(n_splits) + ] + + elif method == "leave-one-out": + return [ + ( + pd.read_csv(filepath + f"/train_{i}.csv.gz"), + pd.read_csv(filepath + f"/test_{i}.csv.gz"), + ) + for i in range(4) + ] diff --git a/datasets/gdd/__init__.py b/datasets/gdd/__init__.py index eee85fe..a1f8237 100644 --- a/datasets/gdd/__init__.py +++ b/datasets/gdd/__init__.py @@ -6,125 +6,157 @@ import seaborn as sns -def load_data(index='state'): +def load_data(index="state"): - assert index in ['state', 'anomaly', 'normal', 'linear', 'pressure'] + assert index in ["state", "anomaly", "normal", "linear", "pressure"] fp = os.path.dirname(__file__) - if index == 'state': - df = pd.read_csv(fp + '/Genesis_StateMachineLabel.csv.gz') - elif index == 'anomaly': - df = pd.read_csv(fp + '/Genesis_AnomalyLabels.csv.gz') - elif index == 'normal': - df = pd.read_csv(fp + '/Genesis_normal.csv.gz') + if index == "state": + df = pd.read_csv(fp + "/Genesis_StateMachineLabel.csv.gz") + elif index == "anomaly": + df = pd.read_csv(fp + "/Genesis_AnomalyLabels.csv.gz") + elif index == "normal": + df = pd.read_csv(fp + "/Genesis_normal.csv.gz") df.Timestamp = df.Timestamp / 1000 - elif index == 'linear': - df = pd.read_csv(fp + '/Genesis_lineardrive.csv.gz') + elif index == "linear": + df = pd.read_csv(fp + "/Genesis_lineardrive.csv.gz") df.Timestamp = df.Timestamp / 1000 - elif index == 'pressure': - df = pd.read_csv(fp + '/Genesis_pressure.csv.gz') + elif index == "pressure": + df = pd.read_csv(fp + "/Genesis_pressure.csv.gz") df.Timestamp = df.Timestamp / 1000 df.Timestamp = df.Timestamp.apply(datetime.datetime.fromtimestamp) - + return df -def plot_genesis_labels(df, figsize=(15, 20), cmap='tab10'): - """ Call this for machine states and anomaly labels """ +def plot_genesis_labels(df, figsize=(15, 20), cmap="tab10"): + """Call this for machine states and anomaly labels""" fig, ax = plt.subplots(10, figsize=figsize) - df['MotorData.ActCurrent'].plot(ax=ax[0], legend=True, cmap=cmap) - df['MotorData.ActPosition'].plot(ax=ax[1], legend=True, cmap=cmap) - df['MotorData.ActSpeed'].plot(ax=ax[2], legend=True, cmap=cmap) - - df['MotorData.IsAcceleration'].plot(ax=ax[3], legend=True, cmap=cmap) - df['MotorData.IsForce'].plot(ax=ax[4], legend=True, cmap=cmap) - - df[['MotorData.Motor_Pos1reached', # binary - 'MotorData.Motor_Pos2reached', # binary - 'MotorData.Motor_Pos3reached', # binary - 'MotorData.Motor_Pos4reached', # binary - ]].plot(ax=ax[5], legend=True, cmap=cmap) - - df[['NVL_Recv_Ind.GL_Metall', # binary - 'NVL_Recv_Ind.GL_NonMetall', # binary - ]].plot(ax=ax[6], legend=True, cmap=cmap) - - df[['NVL_Recv_Storage.GL_I_ProcessStarted', # binary - 'NVL_Recv_Storage.GL_I_Slider_IN', # binary - 'NVL_Recv_Storage.GL_I_Slider_OUT', # binary - 'NVL_Recv_Storage.GL_LightBarrier', # binary - 'NVL_Send_Storage.ActivateStorage', # binary - ]].plot(ax=ax[7], legend=True, cmap=cmap) - - df[['PLC_PRG.Gripper', # binary - 'PLC_PRG.MaterialIsMetal', # binary - ]].plot(ax=ax[8], legend=True, cmap=cmap) - - df['Label'].plot(ax=ax[9], legend=True, cmap=cmap) + df["MotorData.ActCurrent"].plot(ax=ax[0], legend=True, cmap=cmap) + df["MotorData.ActPosition"].plot(ax=ax[1], legend=True, cmap=cmap) + df["MotorData.ActSpeed"].plot(ax=ax[2], legend=True, cmap=cmap) + + df["MotorData.IsAcceleration"].plot(ax=ax[3], legend=True, cmap=cmap) + df["MotorData.IsForce"].plot(ax=ax[4], legend=True, cmap=cmap) + + df[ + [ + "MotorData.Motor_Pos1reached", # binary + "MotorData.Motor_Pos2reached", # binary + 
"MotorData.Motor_Pos3reached", # binary + "MotorData.Motor_Pos4reached", # binary + ] + ].plot(ax=ax[5], legend=True, cmap=cmap) + + df[ + [ + "NVL_Recv_Ind.GL_Metall", # binary + "NVL_Recv_Ind.GL_NonMetall", # binary + ] + ].plot(ax=ax[6], legend=True, cmap=cmap) + + df[ + [ + "NVL_Recv_Storage.GL_I_ProcessStarted", # binary + "NVL_Recv_Storage.GL_I_Slider_IN", # binary + "NVL_Recv_Storage.GL_I_Slider_OUT", # binary + "NVL_Recv_Storage.GL_LightBarrier", # binary + "NVL_Send_Storage.ActivateStorage", # binary + ] + ].plot(ax=ax[7], legend=True, cmap=cmap) + + df[ + [ + "PLC_PRG.Gripper", # binary + "PLC_PRG.MaterialIsMetal", # binary + ] + ].plot(ax=ax[8], legend=True, cmap=cmap) + + df["Label"].plot(ax=ax[9], legend=True, cmap=cmap) for axi in ax: axi.set_xlim(0, df.shape[0]) - axi.set_ylabel('Value') + axi.set_ylabel("Value") - ax[0].set_title('Date: {} to {}'.format( - df.Timestamp.min(), df.Timestamp.max())) - ax[-1].set_xlabel('Time') + ax[0].set_title("Date: {} to {}".format(df.Timestamp.min(), df.Timestamp.max())) + ax[-1].set_xlabel("Time") fig.tight_layout() return fig, ax - - -def plot_genesis_nonlabels(df, figsize=(15, 20), cmap='tab10'): - """ Call this for non-labeled data """ - fig, ax = plt.subplots(8, figsize=figsize) - df[['MotorData.SetCurrent', - 'MotorData.ActCurrent', - ]].plot(ax=ax[0], legend=True, cmap=cmap) +def plot_genesis_nonlabels(df, figsize=(15, 20), cmap="tab10"): + """Call this for non-labeled data""" - df[['MotorData.SetSpeed', - 'MotorData.ActSpeed', - ]].plot(ax=ax[1], legend=True, cmap=cmap) - - df[['MotorData.SetAcceleration', - 'MotorData.IsAcceleration', - ]].plot(ax=ax[2], legend=True, cmap=cmap) - - df[['MotorData.SetForce', - 'MotorData.IsForce' - ]].plot(ax=ax[3], legend=True, cmap=cmap) - - df[['MotorData.Motor_Pos1reached', # binary - 'MotorData.Motor_Pos2reached', # binary - 'MotorData.Motor_Pos3reached', # binary - 'MotorData.Motor_Pos4reached', # binary - ]].plot(ax=ax[4], legend=True, cmap=cmap) - - df[['NVL_Recv_Ind.GL_Metall', # binary - 'NVL_Recv_Ind.GL_NonMetall', # binary - ]].plot(ax=ax[5], legend=True, cmap=cmap) - - df[['NVL_Recv_Storage.GL_I_ProcessStarted', # binary - 'NVL_Recv_Storage.GL_I_Slider_IN', # binary - 'NVL_Recv_Storage.GL_I_Slider_OUT', # binary - 'NVL_Recv_Storage.GL_LightBarrier', # binary - 'NVL_Send_Storage.ActivateStorage', # binary - ]].plot(ax=ax[6], legend=True, cmap=cmap) + fig, ax = plt.subplots(8, figsize=figsize) - df[['PLC_PRG.Gripper', # binary - 'PLC_PRG.MaterialIsMetal', # binary - ]].plot(ax=ax[7], legend=True, cmap=cmap) + df[ + [ + "MotorData.SetCurrent", + "MotorData.ActCurrent", + ] + ].plot(ax=ax[0], legend=True, cmap=cmap) + + df[ + [ + "MotorData.SetSpeed", + "MotorData.ActSpeed", + ] + ].plot(ax=ax[1], legend=True, cmap=cmap) + + df[ + [ + "MotorData.SetAcceleration", + "MotorData.IsAcceleration", + ] + ].plot(ax=ax[2], legend=True, cmap=cmap) + + df[["MotorData.SetForce", "MotorData.IsForce"]].plot( + ax=ax[3], legend=True, cmap=cmap + ) + + df[ + [ + "MotorData.Motor_Pos1reached", # binary + "MotorData.Motor_Pos2reached", # binary + "MotorData.Motor_Pos3reached", # binary + "MotorData.Motor_Pos4reached", # binary + ] + ].plot(ax=ax[4], legend=True, cmap=cmap) + + df[ + [ + "NVL_Recv_Ind.GL_Metall", # binary + "NVL_Recv_Ind.GL_NonMetall", # binary + ] + ].plot(ax=ax[5], legend=True, cmap=cmap) + + df[ + [ + "NVL_Recv_Storage.GL_I_ProcessStarted", # binary + "NVL_Recv_Storage.GL_I_Slider_IN", # binary + "NVL_Recv_Storage.GL_I_Slider_OUT", # binary + "NVL_Recv_Storage.GL_LightBarrier", # binary 
+ "NVL_Send_Storage.ActivateStorage", # binary + ] + ].plot(ax=ax[6], legend=True, cmap=cmap) + + df[ + [ + "PLC_PRG.Gripper", # binary + "PLC_PRG.MaterialIsMetal", # binary + ] + ].plot(ax=ax[7], legend=True, cmap=cmap) for axi in ax: axi.set_xlim(0, df.shape[0]) - axi.set_ylabel('Value') + axi.set_ylabel("Value") - ax[0].set_title('Date: {} to {}'.format(df.Timestamp.min(), df.Timestamp.max())) - ax[-1].set_xlabel('Time') + ax[0].set_title("Date: {} to {}".format(df.Timestamp.min(), df.Timestamp.max())) + ax[-1].set_xlabel("Time") fig.tight_layout() return fig, ax @@ -136,43 +168,43 @@ def gen_summary(outdir=None): outdir = os.path.dirname(__file__) os.makedirs(outdir, exist_ok=True) - sns.set(font_scale=1.1, style='whitegrid') + sns.set(font_scale=1.1, style="whitegrid") + + with PdfPages(outdir + "/gdd_summary.pdf") as pp: - with PdfPages(outdir + '/gdd_summary.pdf') as pp: - - print('Plotting Genesis_StateMachineLabel...') - df = load_data(index='state') + print("Plotting Genesis_StateMachineLabel...") + df = load_data(index="state") fig, _ = plot_genesis_labels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print('Plotting Genesis_AnomalyLabels...') - df = load_data(index='anomaly') + print("Plotting Genesis_AnomalyLabels...") + df = load_data(index="anomaly") fig, _ = plot_genesis_labels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print('Plotting Genesis_normal...') - df = load_data(index='normal') + print("Plotting Genesis_normal...") + df = load_data(index="normal") fig, _ = plot_genesis_nonlabels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print('Plotting Genesis_lineardrive...') - df = load_data(index='linear') + print("Plotting Genesis_lineardrive...") + df = load_data(index="linear") fig, _ = plot_genesis_nonlabels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print('Plotting Genesis_pressure...') - df = load_data(index='pressure') + print("Plotting Genesis_pressure...") + df = load_data(index="pressure") fig, _ = plot_genesis_nonlabels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print("done!") \ No newline at end of file + print("done!") diff --git a/datasets/gfd/__init__.py b/datasets/gfd/__init__.py index a3a4700..cb23013 100644 --- a/datasets/gfd/__init__.py +++ b/datasets/gfd/__init__.py @@ -5,21 +5,24 @@ from matplotlib.backends.backend_pdf import PdfPages -__name__ = 'gfd' +__name__ = "gfd" -def load_data(label='h', load=0): - assert label in ['h', 'b'] +def load_data(label="h", load=0): + assert label in ["h", "b"] assert load in [i for i in range(0, 100, 10)] - return pd.read_csv(os.path.dirname(__file__) + f'/{label}30hz{load}.csv.gz') + return pd.read_csv(os.path.dirname(__file__) + f"/{label}30hz{load}.csv.gz") def load_all_data_dict(): all_data_dict = {} - for label in ['h', 'b']: - all_data_dict[label] = {load : load_data(label=label,load=load) for load in range(0,100,10)} + for label in ["h", "b"]: + all_data_dict[label] = { + load: load_data(label=label, load=load) for load in range(0, 100, 10) + } return all_data_dict + def plot_sequence(df, st=0, ed=None, ax=None, figsize=(10, 3), individual=True): if ed is None: ed = 
df.shape[0] @@ -39,6 +42,7 @@ def plot_sequence(df, st=0, ed=None, ax=None, figsize=(10, 3), individual=True): df.iloc[st:ed].plot(ax=ax, figsize=figsize, legend=True) + def gen_summary(outdir=None, st=0, ed=500, wd=20, hg=8): if outdir is None: @@ -46,31 +50,48 @@ def gen_summary(outdir=None, st=0, ed=500, wd=20, hg=8): os.makedirs(outdir, exist_ok=True) - with PdfPages(outdir + '/gfd_summary.pdf') as pp: - for label in ['h', 'b']: + with PdfPages(outdir + "/gfd_summary.pdf") as pp: + for label in ["h", "b"]: for load in tqdm.trange(0, 100, 10, desc=label): fig, ax = plt.subplots(5) - plot_sequence(load_data(label=label, load=load), st=st, ed=ed, ax=ax[:4], figsize=(wd, hg), individual=True) - plot_sequence(load_data(label=label, load=load), st=st, ed=ed, ax=ax[-1], figsize=(wd, hg), individual=False) - - ax[-1].set_xlabel('Time') - for axi in ax: axi.set_ylabel('Value') - name = 'Healty' if label == 'h' else 'BrokenTooth' - ax[0].set_title(f'{name}: Load= {load}') - - fig.savefig(pp, bbox_inches='tight', format='pdf') + plot_sequence( + load_data(label=label, load=load), + st=st, + ed=ed, + ax=ax[:4], + figsize=(wd, hg), + individual=True, + ) + plot_sequence( + load_data(label=label, load=load), + st=st, + ed=ed, + ax=ax[-1], + figsize=(wd, hg), + individual=False, + ) + + ax[-1].set_xlabel("Time") + for axi in ax: + axi.set_ylabel("Value") + name = "Healty" if label == "h" else "BrokenTooth" + ax[0].set_title(f"{name}: Load= {load}") + + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() + def plot_sequences_under_h_and_b_conditions(): all_data_dict = load_all_data_dict() - for load in range(0,100,10): - data = pd.concat([all_data_dict["h"][load], all_data_dict["b"][load]],axis=0).reset_index(drop=True) - septime=len(all_data_dict["h"][load]) - fig, axes = plt.subplots(4, figsize=(12,9)) + for load in range(0, 100, 10): + data = pd.concat( + [all_data_dict["h"][load], all_data_dict["b"][load]], axis=0 + ).reset_index(drop=True) + septime = len(all_data_dict["h"][load]) + fig, axes = plt.subplots(4, figsize=(12, 9)) axes[0].set_title(f"healthy and broken tooth conditions load: {load}") for i, col in enumerate(data.columns): data[col].reset_index(drop=True).iloc[::10].plot(y=col, ax=axes[i]) - axes[i].axvline(x=septime,color="red") - + axes[i].axvline(x=septime, color="red") diff --git a/datasets/hydsys/__init__.py b/datasets/hydsys/__init__.py index c13acd7..2d9458a 100644 --- a/datasets/hydsys/__init__.py +++ b/datasets/hydsys/__init__.py @@ -5,20 +5,20 @@ def load_data(sensor=None, rw=0): - + if sensor is None: # load full data # rw is ignored to concatenate all sensor data df = [] - df.append(load_sensor_data('PS', rw=10)) # default length: 6000 - df.append(load_sensor_data('EPS', rw=10)) # default length: 6000 - df.append(load_sensor_data('FS', rw=0)) # default length: 600 + df.append(load_sensor_data("PS", rw=10)) # default length: 6000 + df.append(load_sensor_data("EPS", rw=10)) # default length: 6000 + df.append(load_sensor_data("FS", rw=0)) # default length: 600 # df.append(load_sensor_data('TS', rw=0)) # default length: 60 return pd.concat(df) else: return load_sensor_data(sensor, rw) - + def load_sensor_data(sensor, rw=0): @@ -27,50 +27,54 @@ def load_sensor_data(sensor, rw=0): fp = os.path.dirname(__file__) for name in tqdm.tqdm(sensor_list, desc=sensor): - df = pd.DataFrame(np.loadtxt(fp + f'/{name}.txt.gz')) + df = pd.DataFrame(np.loadtxt(fp + f"/{name}.txt.gz")) df = resample(df, rw) - df['sensor'] = name - df['cycle'] = df.index.values + df["sensor"] 
= name + df["cycle"] = df.index.values data.append(df) - return pd.concat(data).set_index(['cycle', 'sensor']).reset_index() + return pd.concat(data).set_index(["cycle", "sensor"]).reset_index() def get_sensor_list(name): - if name == 'PS': - return [f'PS{i+1}' for i in range(6)] - elif name == 'EPS': - return ['EPS1'] - elif name == 'FS': - return [f'FS{i+1}' for i in range(2)] - elif name == 'TS': - return [f'TS{i+1}' for i in range(4)] - elif name == 'VS': - return ['VS1'] + if name == "PS": + return [f"PS{i+1}" for i in range(6)] + elif name == "EPS": + return ["EPS1"] + elif name == "FS": + return [f"FS{i+1}" for i in range(2)] + elif name == "TS": + return [f"TS{i+1}" for i in range(4)] + elif name == "VS": + return ["VS1"] else: raise ValueError def load_labels(): fp = os.path.dirname(__file__) - return pd.DataFrame(np.loadtxt(fp + '/profile.txt'), - columns=[ - 'cooler_condition', - 'valve_condition', - 'internal_pump_leakage', - 'hydraulic_accumulator', - 'stable_flag']).reset_index().rename( - columns={'index': 'cycle'}) + return ( + pd.DataFrame( + np.loadtxt(fp + "/profile.txt"), + columns=[ + "cooler_condition", + "valve_condition", + "internal_pump_leakage", + "hydraulic_accumulator", + "stable_flag", + ], + ) + .reset_index() + .rename(columns={"index": "cycle"}) + ) def resample(df, rw=0): if rw > 0: # Resampling df = df.T - df['index'] = df.index.values // rw - df = df.groupby('index').mean() + df["index"] = df.index.values // rw + df = df.groupby("index").mean() df = df.T return df - - diff --git a/datasets/mapm/__init__.py b/datasets/mapm/__init__.py index 9ad727f..b44553a 100644 --- a/datasets/mapm/__init__.py +++ b/datasets/mapm/__init__.py @@ -12,17 +12,19 @@ def load_data(): fp = os.path.dirname(__file__) # Sensor data - data = pd.read_csv(fp + '/PdM_telemetry.csv.gz') + data = pd.read_csv(fp + "/PdM_telemetry.csv.gz") # Error alarm logs data = data.merge( - pd.read_csv(fp + '/PdM_errors.csv.gz'), - how='left', on=['datetime', 'machineID']) + pd.read_csv(fp + "/PdM_errors.csv.gz"), how="left", on=["datetime", "machineID"] + ) # Failure logs data = data.merge( - pd.read_csv(fp + '/PdM_failures.csv.gz'), - how='left', on=['datetime', 'machineID']) + pd.read_csv(fp + "/PdM_failures.csv.gz"), + how="left", + on=["datetime", "machineID"], + ) # Formatting data.datetime = pd.to_datetime(data.datetime) @@ -33,19 +35,19 @@ def load_data(): def cleaning(df): # NaN values are encoded to -1 - df = df.sort_values('errorID') + df = df.sort_values("errorID") df.errorID = df.errorID.factorize()[0] - df = df.sort_values('failure') + df = df.sort_values("failure") df.failure = df.failure.factorize()[0] - df = df.sort_values(['machineID', 'datetime']) + df = df.sort_values(["machineID", "datetime"]) - df.errorID = df.errorID.astype('category') - df.failure = df.failure.astype('category') + df.errorID = df.errorID.astype("category") + df.failure = df.failure.astype("category") - df.volt = df.volt.astype('float32') - df.rotate = df.rotate.astype('float32') - df.pressure = df.pressure.astype('float32') - df.vibration = df.vibration.astype('float32') + df.volt = df.volt.astype("float32") + df.rotate = df.rotate.astype("float32") + df.pressure = df.pressure.astype("float32") + df.vibration = df.vibration.astype("float32") df.datetime = pd.to_datetime(df.datetime) return df @@ -55,73 +57,94 @@ def load_clean_data(): return cleaning(load_data()) -def generate_run_to_failure(raw_data, health_censor_aug=1000, - min_lifetime=10, max_lifetime=300, - seed=123, outfn=None): +def 
generate_run_to_failure( + raw_data, + health_censor_aug=1000, + min_lifetime=10, + max_lifetime=300, + seed=123, + outfn=None, +): run_to_failure = [] error_ids = raw_data.errorID.dropna().sort_values().unique().tolist() - for machine_id, g in tqdm.tqdm(raw_data.groupby('machineID'), desc='run-to-failure'): - g = g.set_index('datetime').sort_index() + for machine_id, g in tqdm.tqdm( + raw_data.groupby("machineID"), desc="run-to-failure" + ): + g = g.set_index("datetime").sort_index() start_date = g.index.values[0] failures = g.loc[~g.failure.isnull()] for event_time, event in failures.iterrows(): # Extracting a single cycle/process - cycle = g[start_date:event_time].drop('machineID', axis=1) + cycle = g[start_date:event_time].drop("machineID", axis=1) lifetime = (event_time - start_date).days if lifetime < 1: start_date = event_time continue - numerical_features = cycle.agg(['min', 'max', 'mean']).unstack().reset_index() - numerical_features['feature'] = numerical_features.level_0.str.cat(numerical_features.level_1, sep='_') - numerical_features = numerical_features.pivot_table(columns='feature', values=0) + numerical_features = ( + cycle.agg(["min", "max", "mean"]).unstack().reset_index() + ) + numerical_features["feature"] = numerical_features.level_0.str.cat( + numerical_features.level_1, sep="_" + ) + numerical_features = numerical_features.pivot_table( + columns="feature", values=0 + ) - categorical_features = pd.DataFrame(Counter(cycle.errorID), columns=error_ids, index=[0]) + categorical_features = pd.DataFrame( + Counter(cycle.errorID), columns=error_ids, index=[0] + ) sample = pd.concat([numerical_features, categorical_features], axis=1) - sample[['machine_id', 'lifetime', 'broken']] = machine_id, lifetime, 1 + sample[["machine_id", "lifetime", "broken"]] = machine_id, lifetime, 1 run_to_failure.append(sample) start_date = event_time run_to_failure = pd.concat(run_to_failure, axis=0).reset_index(drop=True) - health_censors = censoring_augmentation(raw_data, + health_censors = censoring_augmentation( + raw_data, n_samples=health_censor_aug, min_lifetime=min_lifetime, max_lifetime=max_lifetime, - seed=seed) + seed=seed, + ) run_to_failure = pd.concat([run_to_failure, health_censors]) # Shuffle - run_to_failure = run_to_failure.sample(frac=1, random_state=seed).reset_index(drop=True) - run_to_failure = run_to_failure.fillna(0.) 
- + run_to_failure = run_to_failure.sample(frac=1, random_state=seed).reset_index( + drop=True + ) + run_to_failure = run_to_failure.fillna(0.0) + if outfn is not None: run_to_failure.to_csv(outfn, index=False) return run_to_failure -def censoring_augmentation(raw_data, n_samples=10, max_lifetime=150, min_lifetime=2, seed=123): +def censoring_augmentation( + raw_data, n_samples=10, max_lifetime=150, min_lifetime=2, seed=123 +): error_ids = raw_data.errorID.dropna().sort_values().unique().tolist() np.random.seed(seed) samples = [] - pbar = tqdm.tqdm(total=n_samples, desc='augmentation') + pbar = tqdm.tqdm(total=n_samples, desc="augmentation") while len(samples) < n_samples: - + censor_timing = np.random.randint(min_lifetime, max_lifetime) machine_id = np.random.randint(100) + 1 tmp = raw_data[raw_data.machineID == machine_id] - tmp = tmp.drop('machineID', axis=1).set_index('datetime').sort_index() + tmp = tmp.drop("machineID", axis=1).set_index("datetime").sort_index() failures = tmp[~tmp.failure.isnull()] if failures.shape[0] < 2: @@ -130,7 +153,11 @@ def censoring_augmentation(raw_data, n_samples=10, max_lifetime=150, min_lifetim failure_id = np.random.randint(failures.shape[0]) failure = failures.iloc[failure_id] event_time = failure.name - start_date = tmp.index.values[0] if failure_id == 0 else failures.iloc[failure_id - 1].name + start_date = ( + tmp.index.values[0] + if failure_id == 0 + else failures.iloc[failure_id - 1].name + ) # censoring cycle = tmp[start_date:event_time] @@ -139,14 +166,20 @@ def censoring_augmentation(raw_data, n_samples=10, max_lifetime=150, min_lifetim if not cycle.shape[0] == censor_timing: continue - numerical_features = cycle.agg(['min', 'max', 'mean', 'std']).unstack().reset_index() - numerical_features['feature'] = numerical_features.level_0.str.cat(numerical_features.level_1, sep='_') - numerical_features = numerical_features.pivot_table(columns='feature', values=0) + numerical_features = ( + cycle.agg(["min", "max", "mean", "std"]).unstack().reset_index() + ) + numerical_features["feature"] = numerical_features.level_0.str.cat( + numerical_features.level_1, sep="_" + ) + numerical_features = numerical_features.pivot_table(columns="feature", values=0) - categorical_features = pd.DataFrame(Counter(cycle.errorID), columns=error_ids, index=[0]) + categorical_features = pd.DataFrame( + Counter(cycle.errorID), columns=error_ids, index=[0] + ) sample = pd.concat([numerical_features, categorical_features], axis=1) - sample[['machine_id', 'lifetime', 'broken']] = machine_id, censor_timing, 0 + sample[["machine_id", "lifetime", "broken"]] = machine_id, censor_timing, 0 samples.append(sample) pbar.update(1) @@ -154,21 +187,23 @@ def censoring_augmentation(raw_data, n_samples=10, max_lifetime=150, min_lifetim return pd.concat(samples).reset_index(drop=True).fillna(0) -def generate_validation_sets(method='kfold', n_splits=5, seed=123, outdir=None): +def generate_validation_sets(method="kfold", n_splits=5, seed=123, outdir=None): validation_sets = [] - if method == 'kfold': + if method == "kfold": # K-fold cross validation assert type(n_splits) == int assert n_splits > 2 raw_data = load_data() - kfold = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=seed) + kfold = model_selection.KFold( + n_splits=n_splits, shuffle=True, random_state=seed + ) for i, (train_index, test_index) in enumerate(kfold.split(np.arange(100))): - - print('K-fold {}/{}'.format(i+1, n_splits)) + + print("K-fold {}/{}".format(i + 1, n_splits)) # train/test split by machine 
ID train_machines = raw_data[raw_data.machineID.isin(train_index)] test_machines = raw_data[raw_data.machineID.isin(test_index)] @@ -177,9 +212,11 @@ def generate_validation_sets(method='kfold', n_splits=5, seed=123, outdir=None): # convert the two sets into run-to-failure data train_censored_data = generate_run_to_failure( - train_machines, health_censor_aug=len(train_index)*10, seed=seed) + train_machines, health_censor_aug=len(train_index) * 10, seed=seed + ) test_consored_data = generate_run_to_failure( - test_machines, health_censor_aug=len(test_index)*10, seed=seed) + test_machines, health_censor_aug=len(test_index) * 10, seed=seed + ) # print('train:', train_censored_data.shape) # print('test:', test_consored_data.shape) @@ -187,19 +224,23 @@ def generate_validation_sets(method='kfold', n_splits=5, seed=123, outdir=None): validation_sets.append((train_censored_data, test_consored_data)) if outdir is not None: - train_censored_data.to_csv(outdir + f'/train_{i}.csv.gz', index=False) - test_consored_data.to_csv(outdir + f'/test_{i}.csv.gz', index=False) + train_censored_data.to_csv(outdir + f"/train_{i}.csv.gz", index=False) + test_consored_data.to_csv(outdir + f"/test_{i}.csv.gz", index=False) - elif method == 'leave-one-out': + elif method == "leave-one-out": raise NotImplementedError return validation_sets def load_validation_sets(filepath, n_splits=5): - return [(pd.read_csv(filepath + f'/train_{i}.csv.gz'), - pd.read_csv(filepath + f'/test_{i}.csv.gz')) - for i in range(n_splits)] + return [ + ( + pd.read_csv(filepath + f"/train_{i}.csv.gz"), + pd.read_csv(filepath + f"/test_{i}.csv.gz"), + ) + for i in range(n_splits) + ] def plot_sequence_and_events(data, machine_id=1): @@ -207,17 +248,17 @@ def plot_sequence_and_events(data, machine_id=1): data = data[data.machineID == machine_id] fig, ax = plt.subplots(4 + 2, figsize=(8, 8)) - data.plot(y='volt', legend=True, ax=ax[0]) - data.plot(y='rotate', legend=True, ax=ax[1]) - data.plot(y='pressure', legend=True, ax=ax[2]) - data.plot(y='vibration', legend=True, ax=ax[3]) + data.plot(y="volt", legend=True, ax=ax[0]) + data.plot(y="rotate", legend=True, ax=ax[1]) + data.plot(y="pressure", legend=True, ax=ax[2]) + data.plot(y="vibration", legend=True, ax=ax[3]) if data.errorID.isnull().sum() < data.errorID.shape[0]: pd.get_dummies(data.errorID).plot(ax=ax[4]) if data.failure.isnull().sum() < data.failure.shape[0]: pd.get_dummies(data.failure).plot(ax=ax[5]) - ax[0].set_title('Machine #{}'.format(machine_id)) + ax[0].set_title("Machine #{}".format(machine_id)) for i in range(5): ax[i].set_xlabel(None) @@ -236,9 +277,9 @@ def gen_summary(outdir=None): os.makedirs(outdir, exist_ok=True) df = load_data() - with PdfPages(outdir + '/mapm_summary.pdf') as pp: + with PdfPages(outdir + "/mapm_summary.pdf") as pp: for i in tqdm.trange(1, 101): fig, _ = plot_sequence_and_events(df, machine_id=i) - fig.savefig(pp, format='pdf') + fig.savefig(pp, format="pdf") plt.clf() plt.close() diff --git a/datasets/ppd/__init__.py b/datasets/ppd/__init__.py index 06b0d2a..0d4daa8 100644 --- a/datasets/ppd/__init__.py +++ b/datasets/ppd/__init__.py @@ -8,62 +8,61 @@ def load_data(index=0): - """ 0: C7 - 1: C8 - 2: C9 - 3: C11 - 4: C13 - 5: C14 - 6: C15 - 7: C16 - - Note that C7 and C13 included a short break - (for about 100 timestamps long) - between the two procedure. + """0: C7 + 1: C8 + 2: C9 + 3: C11 + 4: C13 + 5: C14 + 6: C15 + 7: C16 + + Note that C7 and C13 included a short break + (for about 100 timestamps long) + between the two procedure. 
""" fp = os.path.dirname(__file__) if index == 0: - df = pd.read_csv(fp + '/C7-1.csv.gz') - df = pd.concat([df, pd.read_csv(fp + '/C7-2.csv.gz')]) + df = pd.read_csv(fp + "/C7-1.csv.gz") + df = pd.concat([df, pd.read_csv(fp + "/C7-2.csv.gz")]) df = df.reset_index(drop=True) df.Timestamp = df.index.values return df elif index == 1: - return pd.read_csv(fp + '/C8.csv.gz') + return pd.read_csv(fp + "/C8.csv.gz") elif index == 2: - return pd.read_csv(fp + '/C9.csv.gz') + return pd.read_csv(fp + "/C9.csv.gz") elif index == 3: - return pd.read_csv(fp + '/C11.csv.gz') + return pd.read_csv(fp + "/C11.csv.gz") elif index == 4: - df = pd.read_csv(fp + '/C13-1.csv.gz') - df = pd.concat([df, pd.read_csv(fp + '/C13-2.csv.gz')]) + df = pd.read_csv(fp + "/C13-1.csv.gz") + df = pd.concat([df, pd.read_csv(fp + "/C13-2.csv.gz")]) df = df.reset_index(drop=True) df.Timestamp = df.index.values return df elif index == 5: - return pd.read_csv(fp + '/C14.csv.gz') + return pd.read_csv(fp + "/C14.csv.gz") elif index == 6: - return pd.read_csv(fp + '/C15.csv.gz') + return pd.read_csv(fp + "/C15.csv.gz") elif index == 7: - return pd.read_csv(fp + '/C16.csv.gz') + return pd.read_csv(fp + "/C16.csv.gz") else: raise ValueError def rename_components(df): - """ current and speed - """ + """current and speed""" # Rename L - L_curr = ['L_1', 'L_3', 'L_4', 'L_7', 'L_9'] - L_speed = ['L_2', 'L_6', 'L_5', 'L_8', 'L_10'] - df = df.rename(columns={k: f'c{i}_curr' for i, k in enumerate(L_curr)}) - df = df.rename(columns={k: f'c{i}_speed' for i, k in enumerate(L_speed)}) + L_curr = ["L_1", "L_3", "L_4", "L_7", "L_9"] + L_speed = ["L_2", "L_6", "L_5", "L_8", "L_10"] + df = df.rename(columns={k: f"c{i}_curr" for i, k in enumerate(L_curr)}) + df = df.rename(columns={k: f"c{i}_speed" for i, k in enumerate(L_speed)}) # Rename A, B, and C - df = df.rename(columns={f'A_{i}': f'c5_val{i}' for i in range(1, 6)}) - df = df.rename(columns={f'B_{i}': f'c6_val{i}' for i in range(1, 6)}) - df = df.rename(columns={f'C_{i}': f'c7_val{i}' for i in range(1, 6)}) + df = df.rename(columns={f"A_{i}": f"c5_val{i}" for i in range(1, 6)}) + df = df.rename(columns={f"B_{i}": f"c6_val{i}" for i in range(1, 6)}) + df = df.rename(columns={f"C_{i}": f"c7_val{i}" for i in range(1, 6)}) return df[df.columns.sort_values()] @@ -75,11 +74,11 @@ def load_clean_data(index=0): def set_broken_labels(df, size): labels = np.zeros(df.shape[0]) labels[-size:] = 1 - df['broken'] = labels + df["broken"] = labels return df -def run_to_failure_aux(df, n_sample, desc=''): +def run_to_failure_aux(df, n_sample, desc=""): seq_len = df.shape[0] samples = [] @@ -88,7 +87,7 @@ def run_to_failure_aux(df, n_sample, desc=''): while len(samples) < n_sample: # random censoring t = np.random.randint(2, seq_len) - sample = {'lifetime': t, 'broken': df.loc[t, 'broken']} + sample = {"lifetime": t, "broken": df.loc[t, "broken"]} sample = pd.DataFrame(sample, index=[0]) features = df.iloc[:t].mean(axis=0)[:-1] sample[features.keys()] = features.values @@ -102,14 +101,13 @@ def run_to_failure_aux(df, n_sample, desc=''): def generate_run_to_failure(n_sample=1000, bronken_holdout_steps=2000): samples = [] - print('Generating run-to-failure data:') + print("Generating run-to-failure data:") for index in range(8): - raw_df = load_clean_data(index=index).set_index('Timestamp') + raw_df = load_clean_data(index=index).set_index("Timestamp") raw_df = set_broken_labels(raw_df, size=bronken_holdout_steps) - sample = run_to_failure_aux( - raw_df, n_sample, desc=f'component {index+1}/8') - 
sample['trial_id'] = index + sample = run_to_failure_aux(raw_df, n_sample, desc=f"component {index+1}/8") + sample["trial_id"] = index samples.append(sample) return pd.concat(samples, axis=0).reset_index(drop=True) @@ -121,21 +119,22 @@ def gen_summary(outdir=None): outdir = os.path.dirname(__file__) os.makedirs(outdir, exist_ok=True) - sns.set(font_scale=1.1, style='whitegrid') + sns.set(font_scale=1.1, style="whitegrid") - with PdfPages(outdir + '/ppd_summary.pdf') as pp: + with PdfPages(outdir + "/ppd_summary.pdf") as pp: for i in tqdm.trange(8): df = load_data(index=i) df = rename_components(df) fig, ax = plt.subplots(8, figsize=(20, 20)) for i in range(8): - df.loc[:, df.columns.str.contains(f'c{i}')].plot( - ax=ax[i], legend=True, cmap='tab10') + df.loc[:, df.columns.str.contains(f"c{i}")].plot( + ax=ax[i], legend=True, cmap="tab10" + ) ax[i].set_ylabel("Value") ax[i].set_xlim(0, df.shape[0]) - ax[-1].set_xlabel('Time') - fig.savefig(pp, bbox_inches='tight', format='pdf') + ax[-1].set_xlabel("Time") + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() - plt.close() \ No newline at end of file + plt.close() diff --git a/datasets/ufd/__init__.py b/datasets/ufd/__init__.py index 88922d1..ba8f565 100644 --- a/datasets/ufd/__init__.py +++ b/datasets/ufd/__init__.py @@ -3,40 +3,40 @@ import numpy as np -def load_data(meter_id='A'): +def load_data(meter_id="A"): fp = os.path.dirname(__file__) - data = np.loadtxt(fp + '/Meter{}.txt'.format(meter_id)) + data = np.loadtxt(fp + "/Meter{}.txt".format(meter_id)) - if meter_id == 'A': - columns = ['flatness_ratio', 'symmetry', 'crossflow'] - columns += ['flow_velocity_{}'.format(i+1) for i in range(8)] - columns += ['sound_speed_{}'.format(i+1) for i in range(8)] - columns += ['average_speed'] - columns += ['gain_{}'.format(i+1) for i in range(16)] - columns += ['health_state'] + if meter_id == "A": + columns = ["flatness_ratio", "symmetry", "crossflow"] + columns += ["flow_velocity_{}".format(i + 1) for i in range(8)] + columns += ["sound_speed_{}".format(i + 1) for i in range(8)] + columns += ["average_speed"] + columns += ["gain_{}".format(i + 1) for i in range(16)] + columns += ["health_state"] - if meter_id == 'B': - columns = ['profile_factor', 'symmetry', 'crossflow', 'swirl_angle'] - columns += ['flow_velocity_{}'.format(i+1) for i in range(4)] - columns += ['average_flow'] - columns += ['sound_speed_{}'.format(i+1) for i in range(4)] - columns += ['average_speed'] - columns += ['signal_strength_{}'.format(i+1) for i in range(8)] - columns += ['turbulence_{}'.format(i+1) for i in range(4)] - columns += ['meter_performance'] - columns += ['signal_quality_{}'.format(i+1) for i in range(8)] - columns += ['gain_{}'.format(i+1) for i in range(8)] - columns += ['transit_time_{}'.format(i+1) for i in range(8)] - columns += ['health_state'] + if meter_id == "B": + columns = ["profile_factor", "symmetry", "crossflow", "swirl_angle"] + columns += ["flow_velocity_{}".format(i + 1) for i in range(4)] + columns += ["average_flow"] + columns += ["sound_speed_{}".format(i + 1) for i in range(4)] + columns += ["average_speed"] + columns += ["signal_strength_{}".format(i + 1) for i in range(8)] + columns += ["turbulence_{}".format(i + 1) for i in range(4)] + columns += ["meter_performance"] + columns += ["signal_quality_{}".format(i + 1) for i in range(8)] + columns += ["gain_{}".format(i + 1) for i in range(8)] + columns += ["transit_time_{}".format(i + 1) for i in range(8)] + columns += ["health_state"] - if meter_id == 'C' or meter_id == 'D': 
- columns = ['profile_factor', 'symmetry', 'crossflow'] - columns += ['flow_velocity_{}'.format(i+1) for i in range(4)] - columns += ['sound_speed_{}'.format(i+1) for i in range(4)] - columns += ['signal_strength_{}'.format(i+1) for i in range(8)] - columns += ['signal_quality_{}'.format(i+1) for i in range(8)] - columns += ['gain_{}'.format(i+1) for i in range(8)] - columns += ['transit_time_{}'.format(i+1) for i in range(8)] - columns += ['health_state'] + if meter_id == "C" or meter_id == "D": + columns = ["profile_factor", "symmetry", "crossflow"] + columns += ["flow_velocity_{}".format(i + 1) for i in range(4)] + columns += ["sound_speed_{}".format(i + 1) for i in range(4)] + columns += ["signal_strength_{}".format(i + 1) for i in range(8)] + columns += ["signal_quality_{}".format(i + 1) for i in range(8)] + columns += ["gain_{}".format(i + 1) for i in range(8)] + columns += ["transit_time_{}".format(i + 1) for i in range(8)] + columns += ["health_state"] - return pd.DataFrame(data, columns=columns) \ No newline at end of file + return pd.DataFrame(data, columns=columns) diff --git a/datasets/utils.py b/datasets/utils.py index 9153e9f..190a89b 100644 --- a/datasets/utils.py +++ b/datasets/utils.py @@ -4,10 +4,8 @@ def train_test_split(arrays, test_size=0.2, random_state=None, shuffle=False): return model_selection.train_test_split( - arrays=arrays, - test_size=test_size, - random_state=random_state, - shuffle=shuffle) + arrays=arrays, test_size=test_size, random_state=random_state, shuffle=shuffle + ) def hist_survival_time(data, ax=None, figsize=(6, 4), bins_0=20, bins_1=30): @@ -15,15 +13,15 @@ def hist_survival_time(data, ax=None, figsize=(6, 4), bins_0=20, bins_1=30): if ax is None: fig, ax = plt.subplots(figsize=figsize) - time_0 = data.loc[data['broken'] == 0, 'lifetime'] - ax.hist(time_0, bins=bins_0, alpha=0.3, color='blue', label='not broken yet') + time_0 = data.loc[data["broken"] == 0, "lifetime"] + ax.hist(time_0, bins=bins_0, alpha=0.3, color="blue", label="not broken yet") - time_1 = data.loc[data['broken'] == 1, 'lifetime'] - ax.hist(time_1, bins=bins_1, alpha=0.7, color='black', label='broken') + time_1 = data.loc[data["broken"] == 1, "lifetime"] + ax.hist(time_1, bins=bins_1, alpha=0.7, color="black", label="broken") - ax.set_title( 'Histogram - survival time', fontsize=15) + ax.set_title("Histogram - survival time", fontsize=15) if ax is None: return fig, ax else: - return ax \ No newline at end of file + return ax
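
The hunks above are mostly mechanical reformatting, but datasets/cmapss gains a genuinely new entry point, load_clean_data_rul. A minimal usage sketch follows, assuming the repository root is on the Python path and the train_/test_/RUL_*.txt.gz files ship next to datasets/cmapss/__init__.py; the function names and the "RUL" column come from the patched code, while the FD001 choice and the printed fields are only illustrative:

    from datasets import cmapss

    # Per-unit sensor sequences with a "RUL" column appended by load_clean_data_rul:
    # training units count down to 0, test units count down to the per-unit RUL label.
    train_list, test_list = cmapss.load_clean_data_rul(index="FD001")

    first = train_list[0]
    print(first.columns.tolist())             # ['sensor2', ..., 'sensor15', 'RUL']
    print(int(first["RUL"].iloc[-1]))         # 0 -> training units run to failure
    print(int(test_list[0]["RUL"].iloc[-1]))  # remaining life of the first test unit

    # The flat frames are still available when the raw columns are needed.
    train_set, test_set, labels = cmapss.load_data(index="FD001", features=[2, 3, 4, 7])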
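
The Genesis (gdd) loaders keep their signatures, so a short sketch of the plotting path exercised by gen_summary may help; it assumes the Genesis_*.csv.gz files are bundled with datasets/gdd, and the output filename is only an example:

    from datasets import gdd

    df = gdd.load_data(index="anomaly")    # timestamped sensor columns plus "Label"
    print(df.Timestamp.min(), df.Timestamp.max())

    fig, ax = gdd.plot_genesis_labels(df)  # ten stacked panels, one per signal group
    fig.savefig("gdd_anomaly_overview.pdf", bbox_inches="tight")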
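
Likewise for the Azure PdM (mapm) loader, whose run-to-failure generator pairs naturally with the survival-time histogram in datasets/utils.py. The lifetime/broken columns come from the patch; health_censor_aug=200 and seed=0 are arbitrary illustrative values (the default augmentation of 1000 censored samples takes noticeably longer):

    import matplotlib.pyplot as plt
    from datasets import mapm, utils

    raw = mapm.load_data()  # telemetry merged with error and failure logs
    r2f = mapm.generate_run_to_failure(raw, health_censor_aug=200, seed=0)
    print(r2f[["machine_id", "lifetime", "broken"]].head())

    # hist_survival_time splits lifetimes on the binary "broken" flag.
    fig, ax = plt.subplots(figsize=(6, 4))
    utils.hist_survival_time(r2f, ax=ax)
    fig.savefig("mapm_survival_hist.png", bbox_inches="tight")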
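
One behavioural note on the datasets/utils.py hunk: the reformatting keeps the arrays= keyword, but sklearn's model_selection.train_test_split takes its input arrays positionally and does not accept an arrays= keyword, so the wrapper fails at call time both before and after this patch. A corrected sketch, assuming arrays is meant to be a list or tuple of equally sized arrays:

    from sklearn import model_selection

    def train_test_split(arrays, test_size=0.2, random_state=None, shuffle=False):
        # Unpack the sequence positionally; sklearn exposes no `arrays=` keyword.
        return model_selection.train_test_split(
            *arrays, test_size=test_size, random_state=random_state, shuffle=shuffle
        )

    # e.g. X_train, X_test, y_train, y_test = train_test_split([X, y], test_size=0.3)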