From 5919fe3f576008d1ae62348d6c2fe327d85f0c43 Mon Sep 17 00:00:00 2001 From: kotaNakm Date: Sat, 15 Apr 2023 00:37:58 +0900 Subject: [PATCH] a --- datasets/__init__.py | 2 +- datasets/alpi/__init__.py | 400 +++++++++++++++++++++++------------- datasets/alpi/dataset.py | 398 ++++++++++++++++++++++------------- datasets/cbm/__init__.py | 28 +-- datasets/cmapss/__init__.py | 169 +++++++++------ datasets/gdd/__init__.py | 250 ++++++++++++---------- datasets/gfd/__init__.py | 67 +++--- datasets/hydsys/__init__.py | 66 +++--- datasets/mapm/__init__.py | 161 +++++++++------ datasets/ppd/__init__.py | 91 ++++---- datasets/ufd/__init__.py | 64 +++--- datasets/utils.py | 18 +- 12 files changed, 1031 insertions(+), 683 deletions(-) diff --git a/datasets/__init__.py b/datasets/__init__.py index a43e300..6d7b9ca 100644 --- a/datasets/__init__.py +++ b/datasets/__init__.py @@ -7,4 +7,4 @@ from . import mapm from . import ppd from . import ufd -from . import utils \ No newline at end of file +from . import utils diff --git a/datasets/alpi/__init__.py b/datasets/alpi/__init__.py index f81cd72..15004ec 100644 --- a/datasets/alpi/__init__.py +++ b/datasets/alpi/__init__.py @@ -11,12 +11,12 @@ from scipy import sparse from sklearn.model_selection import train_test_split -FAMILY = 'MACHINE_TYPE_00' +FAMILY = "MACHINE_TYPE_00" # UTILS def zip_dir(dir_path, zip_path): - shutil.make_archive(zip_path, 'zip', dir_path) + shutil.make_archive(zip_path, "zip", dir_path) def drop_files(dir_path, extension): @@ -27,7 +27,9 @@ def drop_files(dir_path, extension): def prune(alarms): - return alarms.loc[alarms.alarm.shift() != alarms.alarm].alarm.values.astype('uint16') + return alarms.loc[alarms.alarm.shift() != alarms.alarm].alarm.values.astype( + "uint16" + ) def prune_series(seq): @@ -38,24 +40,24 @@ def prune_series(seq): def padding_sequence(seq, sequence_length): new_seq = np.zeros(sequence_length) - new_seq[sequence_length - len(seq):] = seq + new_seq[sequence_length - len(seq) :] = seq return new_seq def return_variables(params): - window_input = params['window_input'] - window_output = params['window_output'] - offset = params['offset'] - verbose = params['verbose'] - store_path = params['store_path'] # store_path is used to save data - min_count = params['min_count'] - sigma = params['sigma'] + window_input = params["window_input"] + window_output = params["window_output"] + offset = params["offset"] + verbose = params["verbose"] + store_path = params["store_path"] # store_path is used to save data + min_count = params["min_count"] + sigma = params["sigma"] return window_input, window_output, offset, verbose, store_path, min_count, sigma def find_serials_offsets(store_path): filepath = store_path + "/" + store_path.split("/")[-1] + ".config" - with open(filepath, 'rb') as f: + with open(filepath, "rb") as f: serials, offsets = pickle.load(f) return serials, offsets @@ -78,31 +80,40 @@ def return_index(date_range, target): def create_params_list(data_path, params, verbose=True): - windows_input = params['windows_input'] - windows_output = params['windows_output'] - min_counts = params['min_counts'] - sigmas = params['sigmas'] - offsets = params['offsets'] + windows_input = params["windows_input"] + windows_output = params["windows_output"] + min_counts = params["min_counts"] + sigmas = params["sigmas"] + offsets = params["offsets"] params_list = [] - dir_template = data_path + '/' + FAMILY + \ - 
'_alarms_window_input_{window_input}_window_output_{window_output}_offset_{offset}_min_count_{min_count}_sigma_{sigma}' + dir_template = ( + data_path + + "/" + + FAMILY + + "_alarms_window_input_{window_input}_window_output_{window_output}_offset_{offset}_min_count_{min_count}_sigma_{sigma}" + ) for window_input in windows_input: for window_output in windows_output: for min_count in min_counts: for sigma in sigmas: for offset in offsets: store_path = dir_template.format( - window_input=window_input, window_output=window_output, offset=offset, min_count=min_count, sigma=sigma) + window_input=window_input, + window_output=window_output, + offset=offset, + min_count=min_count, + sigma=sigma, + ) if not os.path.isdir(store_path): os.makedirs(store_path) params = { - 'window_input': window_input, - 'window_output': window_output, - 'offset': offset, - 'min_count': min_count, - 'sigma': sigma, - 'verbose': verbose, - 'store_path': store_path + "window_input": window_input, + "window_output": window_output, + "offset": offset, + "min_count": min_count, + "sigma": sigma, + "verbose": verbose, + "store_path": store_path, } params_list.append(params) return params_list @@ -114,8 +125,7 @@ def create_params_list(data_path, params, verbose=True): def generate_dataset_by_serial_offset(data, params, current_offset): data["current_offset"] = current_offset current_offset = datetime.timedelta(minutes=current_offset) - window_input, window_output, _, _, _, _, _ = return_variables( - params) + window_input, window_output, _, _, _, _, _ = return_variables(params) min_timestamp = data.index.min() min_timestamp += current_offset @@ -123,24 +133,28 @@ def generate_dataset_by_serial_offset(data, params, current_offset): # create date range for input date_range = pd.date_range( - start=min_timestamp, end=max_timestamp, freq=str(window_input) + "min") + start=min_timestamp, end=max_timestamp, freq=str(window_input) + "min" + ) date_range = [d for d in date_range] # create date range for output delta_in = datetime.timedelta(minutes=window_input) delta_out = datetime.timedelta(minutes=window_output) - date_range_output = [(d + delta_in, d + delta_in + delta_out) - for d in date_range] # ranges of outputs + date_range_output = [ + (d + delta_in, d + delta_in + delta_out) for d in date_range + ] # ranges of outputs date_range_output = np.asarray(date_range_output).reshape(1, -1)[0] # create samples series = pd.Series(data.index).apply( - lambda target: return_index(date_range, target)) + lambda target: return_index(date_range, target) + ) series.index = data.index data["bin_input"] = series series = pd.Series(data.index).apply( - lambda target: return_index_output(date_range_output, target)) + lambda target: return_index_output(date_range_output, target) + ) series.index = data.index data["bin_output"] = series @@ -149,14 +163,14 @@ def generate_dataset_by_serial_offset(data, params, current_offset): unique2 = data["bin_output"].unique() periods_id = list(set(unique1).intersection(set(unique2)) - set([-1])) - data = data[(data["bin_input"].isin(periods_id)) | - (data["bin_output"].isin(periods_id))] + data = data[ + (data["bin_input"].isin(periods_id)) | (data["bin_output"].isin(periods_id)) + ] return data def generate_dataset_by_serial(data, params): - window_input, _, offset, verbose, store_path, _, _ = return_variables( - params) + window_input, _, offset, verbose, store_path, _, _ = return_variables(params) serial = data["serial"][0] if verbose: print("serial: ", serial) @@ -164,9 +178,9 @@ def 
generate_dataset_by_serial(data, params): offsets = list(range(0, window_input, offset)) offset_data = [] for current_offset in offsets: - df_offset = generate_dataset_by_serial_offset(data.copy(), - params, - current_offset) + df_offset = generate_dataset_by_serial_offset( + data.copy(), params, current_offset + ) offset_data.append(df_offset) dataset = pd.concat(offset_data) if verbose: @@ -176,7 +190,7 @@ def generate_dataset_by_serial(data, params): def generate_dataset(data, params): - grouped_data = data.groupby('serial') + grouped_data = data.groupby("serial") for _, data_serial in grouped_data: generate_dataset_by_serial(data_serial, params) return @@ -184,6 +198,7 @@ def generate_dataset(data, params): # PHASE 2 + def prune_dataset(params): _, _, _, _, store_path, _, _ = return_variables(params) serials = [] @@ -198,8 +213,7 @@ def prune_dataset(params): def prune_df(df, serial_id, params): - _, _, offset, verbose, store_path, min_count, _ = return_variables( - params) + _, _, offset, verbose, store_path, min_count, _ = return_variables(params) offsets = df["current_offset"].unique() for offset in offsets: @@ -215,27 +229,37 @@ def prune_df(df, serial_id, params): periods_id = list(set(unique1).intersection(set(unique2)) - set([-1])) periods_id.sort() # temporal sorting - diz_input = {bin_id: group.alarm.sort_index( - ).values for bin_id, group in groups_input if bin_id in periods_id} - diz_output = {bin_id: group.alarm.sort_index( - ).values for bin_id, group in groups_output if bin_id in periods_id} + diz_input = { + bin_id: group.alarm.sort_index().values + for bin_id, group in groups_input + if bin_id in periods_id + } + diz_output = { + bin_id: group.alarm.sort_index().values + for bin_id, group in groups_output + if bin_id in periods_id + } # list of [seq_x,seq_y] - X_Y_offset = [[diz_input[bin_id], diz_output[bin_id]] - for bin_id in periods_id] + X_Y_offset = [[diz_input[bin_id], diz_output[bin_id]] for bin_id in periods_id] # apply min_count: remove sequences with count= min_count and len(seq[1]) >= min_count] + X_Y_offset = [ + seq + for seq in X_Y_offset + if len(seq[0]) >= min_count and len(seq[1]) >= min_count + ] # save list on file # for each (serial,offset) a different list is saved if verbose: - print("{} has length: {}".format(str(serial_id) + - "_offset_" + str(offset) + ".npz", len(X_Y_offset))) - filepath = store_path + "/" + \ - str(serial_id) + "_offset_" + str(offset) + ".npz" - with open(filepath, 'wb') as f: + print( + "{} has length: {}".format( + str(serial_id) + "_offset_" + str(offset) + ".npz", len(X_Y_offset) + ) + ) + filepath = store_path + "/" + str(serial_id) + "_offset_" + str(offset) + ".npz" + with open(filepath, "wb") as f: pickle.dump(X_Y_offset, f) return offsets @@ -243,9 +267,18 @@ def prune_df(df, serial_id, params): # PHASE 3 -def create_final_dataset(params, serials, offsets, sequence_input_length, sequence_output_length, removal_alarms=None, relevance_alarms=None, file_tag='all_alarms'): - _, _, offset, verbose, store_path, _, sigma = return_variables( - params) + +def create_final_dataset( + params, + serials, + offsets, + sequence_input_length, + sequence_output_length, + removal_alarms=None, + relevance_alarms=None, + file_tag="all_alarms", +): + _, _, offset, verbose, store_path, _, sigma = return_variables(params) padding_mode = "after" if "padding_mode" in params: padding_mode = params["padding_mode"] @@ -266,24 +299,37 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen for offset in offsets: 
sentinel += 1 if verbose: - print("{}/{}: serial: {} offset: {}".format(sentinel, - tot_combo, - serial, - offset)) + print( + "{}/{}: serial: {} offset: {}".format( + sentinel, tot_combo, serial, offset + ) + ) filename = str(serial) + "_offset_" + str(offset) + ".npz" if filename in os.listdir(store_path): file_path = os.path.join(store_path, filename) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: seqences = pickle.load(f) if removal_alarms != None and len(removal_alarms) > 0: - X = [[alarm for alarm in seq_x if ( - alarm not in removal_alarms and alarm != 0)] for seq_x, seq_y in seqences] + X = [ + [ + alarm + for alarm in seq_x + if (alarm not in removal_alarms and alarm != 0) + ] + for seq_x, seq_y in seqences + ] else: X = [seq_x for seq_x, seq_y in seqences] if relevance_alarms != None and len(relevance_alarms) > 0: - Y = [[alarm for alarm in seq_y if ( - alarm in relevance_alarms and alarm != 0)] for seq_x, seq_y in seqences] + Y = [ + [ + alarm + for alarm in seq_y + if (alarm in relevance_alarms and alarm != 0) + ] + for seq_x, seq_y in seqences + ] else: Y = [seq_y for seq_x, seq_y in seqences] @@ -301,8 +347,9 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen lengths_Y = np.asarray(lengths_Y) mu_sequence_input_length = lengths_X.mean() std_sequence_input_length = lengths_X.std() - sequence_input_length = int( - mu_sequence_input_length + sigma * std_sequence_input_length) + 1 + sequence_input_length = ( + int(mu_sequence_input_length + sigma * std_sequence_input_length) + 1 + ) sequence_output_length = np.max(lengths_Y) sentinel = 0 # it is iterated on all combinations of serials and offsets @@ -314,28 +361,29 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen # PADDING if padding_mode == "after": - X = [np.pad(seq_x, (0, sequence_input_length - len(seq_x))) - for seq_x in X] - Y = [np.pad(seq_y, (0, sequence_output_length - len(seq_y))) - for seq_y in Y] + X = [np.pad(seq_x, (0, sequence_input_length - len(seq_x))) for seq_x in X] + Y = [np.pad(seq_y, (0, sequence_output_length - len(seq_y))) for seq_y in Y] elif padding_mode == "before": X = [padding_sequence(seq_x, sequence_input_length) for seq_x in X] - Y = [padding_sequence(seq_y, sequence_output_length) - for seq_y in Y] + Y = [padding_sequence(seq_y, sequence_output_length) for seq_y in Y] if len(X) > 1: X_train, X_test, Y_train, Y_test = train_test_split( - X, Y, test_size=0.3, shuffle=False) + X, Y, test_size=0.3, shuffle=False + ) else: X_train, X_test, Y_train, Y_test = [], [], [], [] - stratify.append({ - "serial": combos[sentinel][0], - "offset": combos[sentinel][1], - "X_train": len(X_train), - "X_test": len(X_test), - "Y_train": len(Y_train), - "Y_test": len(Y_test)}) + stratify.append( + { + "serial": combos[sentinel][0], + "offset": combos[sentinel][1], + "X_train": len(X_train), + "X_test": len(X_test), + "Y_train": len(Y_train), + "Y_test": len(Y_test), + } + ) cont += len(X_train) + len(X_test) X_train_tot += X_train @@ -366,10 +414,19 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen stratify = stratify_new # Sparse Matrix - X_train, X_test, Y_train, Y_test = sparse.csr_matrix(X_train), sparse.csr_matrix( - X_test), sparse.csr_matrix(Y_train), sparse.csr_matrix(Y_test) - - x_train_segments, x_test_segments, y_train_segments, y_test_segments = X_train, X_test, Y_train, Y_test + X_train, X_test, Y_train, Y_test = ( + sparse.csr_matrix(X_train), + sparse.csr_matrix(X_test), + 
sparse.csr_matrix(Y_train), + sparse.csr_matrix(Y_test), + ) + + x_train_segments, x_test_segments, y_train_segments, y_test_segments = ( + X_train, + X_test, + Y_train, + Y_test, + ) if verbose: print(x_train_segments.shape) @@ -378,19 +435,27 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen print(y_test_segments.shape) file_path = os.path.join(store_path, f"{file_tag}.pickle") - with open(file_path, 'wb') as f: - pickle.dump([x_train_segments, x_test_segments, - y_train_segments, y_test_segments, stratify], f) - - -def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): + with open(file_path, "wb") as f: + pickle.dump( + [ + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ], + f, + ) + + +def create_datasets(data, params_list, start_point=0, file_tag="all_alarms"): """ Parameters: data : csv file of raw data - params_list: list of dictionaries i.e. params + params_list: list of dictionaries i.e. params each params is a dictionary with all parameters necessary to create the dataset such as window_input, window_output, sigma, min_count,ecc. - start_point: it tells from which phase to start to create dataset(i.e. if start_point=2 then it is assumed that phase1 is executed) + start_point: it tells from which phase to start to create dataset(i.e. if start_point=2 then it is assumed that phase1 is executed) file_tag: name of dataset There are 3 phases(start_point:1,2,3): @@ -399,7 +464,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): each file contains a list of x,y with pruning of consecutive alarms -PHASE3: some alarms in input are removed based on list associated with key 'removal_alarms' to variable params. only a subset of alarms in output is keeped based on a list associated with key 'relevance_alarms' to variable params. 
- Padding is performed based on values defined by keys 'sequence_input_length' and 'sequence_output_length' in params + Padding is performed based on values defined by keys 'sequence_input_length' and 'sequence_output_length' in params Example: @@ -415,7 +480,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): #relevance_alarms = [] <- only if you want to keep a subset of alrams in output #additional parameters for phase3, comment the section if you don't want to use it - for params in params_list: + for params in params_list: params["removal_alarms"] = [] params["relevance_alarms"] = [] @@ -424,7 +489,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): """ for params in params_list: - print('-- run ', params) + print("-- run ", params) verbose = params["verbose"] if verbose: print(params) @@ -447,7 +512,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): # save serials and offsets store_path = params["store_path"] filepath = store_path + "/" + store_path.split("/")[-1] + ".config" - with open(filepath, 'wb') as f: + with open(filepath, "wb") as f: pickle.dump((serials, offsets), f) end = time.time() @@ -474,9 +539,17 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): if "removal_alarms" in params and "relevance_alarms" in params: removal_alarms = params["removal_alarms"] relevance_alarms = params["relevance_alarms"] - create_final_dataset(params, serials, offsets, sequence_input_length, - sequence_output_length, removal_alarms, relevance_alarms, file_tag) - drop_files(params["store_path"], '.csv') + create_final_dataset( + params, + serials, + offsets, + sequence_input_length, + sequence_output_length, + removal_alarms, + relevance_alarms, + file_tag, + ) + drop_files(params["store_path"], ".csv") end = time.time() elapsed_time = end - start print("phase 3/3 elapsed Time: {}".format(elapsed_time)) @@ -484,10 +557,11 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): # POST BUILD + def clean(data_path): for filename_dir in os.listdir(data_path): print("directory: ", filename_dir) - filepath_dir = data_path + '/' + filename_dir + filepath_dir = data_path + "/" + filename_dir if os.path.isdir(filepath_dir): cont_csv, cont_npz, cont_pickle = 0, 0, 0 for filename in os.listdir(filepath_dir): @@ -498,11 +572,14 @@ def clean(data_path): cont_npz += os.stat(filepath).st_size elif filename.endswith(".pickle"): cont_pickle += os.stat(filepath).st_size - print("cont_csv: {}MB cont_npz: {}MB cont_pickle: {}MB ".format( - int(cont_csv / 1e6), int(cont_npz / 1e6), int(cont_pickle / 1e6))) + print( + "cont_csv: {}MB cont_npz: {}MB cont_pickle: {}MB ".format( + int(cont_csv / 1e6), int(cont_npz / 1e6), int(cont_pickle / 1e6) + ) + ) for filename_dir in os.listdir(data_path): print("prune directory ", filename_dir) - filepath_dir = data_path + '/' + filename_dir + filepath_dir = data_path + "/" + filename_dir if os.path.isdir(filepath_dir): for filename in os.listdir(filepath_dir): filepath = filepath_dir + "/" + filename @@ -515,7 +592,7 @@ def clean(data_path): def convert_to_json(store_path, filename, verbose=0): - """ + """ function to convert pickle dataset in json format which will have same name of original dataset but with json extension and stored in the same path of the original dataset @@ -531,9 +608,14 @@ def convert_to_json(store_path, filename, verbose=0): """ # load dataset filepath = os.path.join(store_path, filename) - with open(filepath, 
'rb') as f: - x_train_segments, x_test_segments, y_train_segments, y_test_segments, stratify = pickle.load( - f) + with open(filepath, "rb") as f: + ( + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ) = pickle.load(f) x_train_segments = x_train_segments.toarray() x_test_segments = x_test_segments.toarray() @@ -559,38 +641,42 @@ def convert_to_json(store_path, filename, verbose=0): print("serials_test.shape: ", len(stratify["test"])) diz = {} - for i, (seq_x, seq_y, serial_id) in enumerate(zip(x_train_segments, y_train_segments, stratify["train"]), start=1): - key = "sample"+str(i) + for i, (seq_x, seq_y, serial_id) in enumerate( + zip(x_train_segments, y_train_segments, stratify["train"]), start=1 + ): + key = "sample" + str(i) diz[key] = {} diz[key]["x_train"] = seq_x diz[key]["y_train"] = seq_y diz[key]["serial"] = serial_id - for i, (seq_x, seq_y, serial_id) in enumerate(zip(x_test_segments, y_test_segments, stratify["test"]), start=i+1): - key = "sample"+str(i) + for i, (seq_x, seq_y, serial_id) in enumerate( + zip(x_test_segments, y_test_segments, stratify["test"]), start=i + 1 + ): + key = "sample" + str(i) diz[key] = {} diz[key]["x_test"] = seq_x diz[key]["y_test"] = seq_y diz[key]["serial"] = serial_id text = json.dumps(diz) - filename_json = os.path.splitext(filename)[0]+".json" + filename_json = os.path.splitext(filename)[0] + ".json" filepath_json = os.path.join(store_path, filename_json) if verbose: print("filepath_json: ", filepath_json) - with open(filepath_json, 'w') as f: + with open(filepath_json, "w") as f: f.write(text) def load_json_dataset(store_path, filename_json, verbose=0): - """ + """ function to load dataset from json format Example: $#load data in json format $store_path = "../data/MACHINE_TYPE_00_alarms_window_input_1720_window_output_480_offset_60_min_count_20_sigma_3" $filename_json = "all_alarms.json" - $filepath = os.path.join(store_path, filename_json) + $filepath = os.path.join(store_path, filename_json) $x_train,y_train,serials_train,x_test,y_test,serials_test = load_json_dataset(store_path, filename_json, verbose=1) @@ -608,22 +694,32 @@ def load_json_dataset(store_path, filename_json, verbose=0): """ filepath = os.path.join(store_path, filename_json) - with open(filepath, 'r') as f: + with open(filepath, "r") as f: dataset = json.load(f) - x_train = [sample["x_train"] - for sample_id, sample in dataset.items() if "x_train" in sample] - y_train = [sample["x_train"] - for sample_id, sample in dataset.items() if "x_train" in sample] - stratify_train = [sample["serial"] for sample_id, - sample in dataset.items() if "x_train" in sample] - - x_test = [sample["x_test"] - for sample_id, sample in dataset.items() if "x_test" in sample] - y_test = [sample["x_test"] - for sample_id, sample in dataset.items() if "x_test" in sample] - stratify_test = [sample["serial"] - for sample_id, sample in dataset.items() if "x_test" in sample] + x_train = [ + sample["x_train"] + for sample_id, sample in dataset.items() + if "x_train" in sample + ] + y_train = [ + sample["x_train"] + for sample_id, sample in dataset.items() + if "x_train" in sample + ] + stratify_train = [ + sample["serial"] for sample_id, sample in dataset.items() if "x_train" in sample + ] + + x_test = [ + sample["x_test"] for sample_id, sample in dataset.items() if "x_test" in sample + ] + y_test = [ + sample["x_test"] for sample_id, sample in dataset.items() if "x_test" in sample + ] + stratify_test = [ + sample["serial"] for sample_id, sample in dataset.items() 
if "x_test" in sample + ] x_train = np.asarray(x_train) y_train = np.asarray(y_train) @@ -642,14 +738,14 @@ def load_json_dataset(store_path, filename_json, verbose=0): def convert_to_npz(store_path, filename, verbose=0): - """ + """ function to convert pickle dataset in npz format which will have same name of original dataset but with npz extension and stored in the same path of the original dataset Example: $#save data in npz format $store_path = "../data/MACHINE_TYPE_00_alarms_window_input_1720_window_output_480_offset_60_min_count_20_sigma_3" - $filename= "all_alarms.pickle" + $filename= "all_alarms.pickle" $convert_to_npz(store_path, filename, verbose=1) Args: @@ -658,9 +754,14 @@ def convert_to_npz(store_path, filename, verbose=0): """ # load dataset filepath = os.path.join(store_path, filename) - with open(filepath, 'rb') as f: - x_train_segments, x_test_segments, y_train_segments, y_test_segments, stratify = pickle.load( - f) + with open(filepath, "rb") as f: + ( + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ) = pickle.load(f) x_train_segments = x_train_segments.toarray() x_test_segments = x_test_segments.toarray() @@ -683,17 +784,23 @@ def convert_to_npz(store_path, filename, verbose=0): print("y_test.shape: ", y_test_segments.shape) print("serials_test.shape: ", len(stratify["test"])) - filename_npz = os.path.splitext(filename)[0]+".npz" + filename_npz = os.path.splitext(filename)[0] + ".npz" filepath_npz = os.path.join(store_path, filename_npz) if verbose: print("filepath_npz: ", filepath_npz) - np.savez_compressed(filepath_npz, - x_train=x_train_segments, y_train=y_train_segments, stratify_train=stratify_train, - x_test=x_test_segments, y_test=y_test_segments, stratify_test=stratify_test) + np.savez_compressed( + filepath_npz, + x_train=x_train_segments, + y_train=y_train_segments, + stratify_train=stratify_train, + x_test=x_test_segments, + y_test=y_test_segments, + stratify_test=stratify_test, + ) def load_from_npz(): - """ + """ function to load dataset from json format Example: @@ -714,8 +821,7 @@ def load_from_npz(): serials_test: for each test sample is indicated the serial that produced that sample """ # filepath = os.path.join(store_path, filename) - filepath = os.path.dirname(__file__) + '/alarms_log_data/processed/all_alarms.npz' + filepath = os.path.dirname(__file__) + "/alarms_log_data/processed/all_alarms.npz" loaded = np.load(filepath, allow_pickle=True) - keys = ["x_train", "y_train", "stratify_train", - "x_test", "y_test", "stratify_test"] + keys = ["x_train", "y_train", "stratify_train", "x_test", "y_test", "stratify_test"] return [loaded[k] for k in keys] diff --git a/datasets/alpi/dataset.py b/datasets/alpi/dataset.py index ba5aa29..f41a718 100644 --- a/datasets/alpi/dataset.py +++ b/datasets/alpi/dataset.py @@ -11,12 +11,12 @@ from scipy import sparse from sklearn.model_selection import train_test_split -FAMILY = 'MACHINE_TYPE_00' +FAMILY = "MACHINE_TYPE_00" # UTILS def zip_dir(dir_path, zip_path): - shutil.make_archive(zip_path, 'zip', dir_path) + shutil.make_archive(zip_path, "zip", dir_path) def drop_files(dir_path, extension): @@ -27,7 +27,9 @@ def drop_files(dir_path, extension): def prune(alarms): - return alarms.loc[alarms.alarm.shift() != alarms.alarm].alarm.values.astype('uint16') + return alarms.loc[alarms.alarm.shift() != alarms.alarm].alarm.values.astype( + "uint16" + ) def prune_series(seq): @@ -38,24 +40,24 @@ def prune_series(seq): def padding_sequence(seq, sequence_length): new_seq = 
np.zeros(sequence_length) - new_seq[sequence_length - len(seq):] = seq + new_seq[sequence_length - len(seq) :] = seq return new_seq def return_variables(params): - window_input = params['window_input'] - window_output = params['window_output'] - offset = params['offset'] - verbose = params['verbose'] - store_path = params['store_path'] # store_path is used to save data - min_count = params['min_count'] - sigma = params['sigma'] + window_input = params["window_input"] + window_output = params["window_output"] + offset = params["offset"] + verbose = params["verbose"] + store_path = params["store_path"] # store_path is used to save data + min_count = params["min_count"] + sigma = params["sigma"] return window_input, window_output, offset, verbose, store_path, min_count, sigma def find_serials_offsets(store_path): filepath = store_path + "/" + store_path.split("/")[-1] + ".config" - with open(filepath, 'rb') as f: + with open(filepath, "rb") as f: serials, offsets = pickle.load(f) return serials, offsets @@ -78,31 +80,40 @@ def return_index(date_range, target): def create_params_list(data_path, params, verbose=True): - windows_input = params['windows_input'] - windows_output = params['windows_output'] - min_counts = params['min_counts'] - sigmas = params['sigmas'] - offsets = params['offsets'] + windows_input = params["windows_input"] + windows_output = params["windows_output"] + min_counts = params["min_counts"] + sigmas = params["sigmas"] + offsets = params["offsets"] params_list = [] - dir_template = data_path + '/' + FAMILY + \ - '_alarms_window_input_{window_input}_window_output_{window_output}_offset_{offset}_min_count_{min_count}_sigma_{sigma}' + dir_template = ( + data_path + + "/" + + FAMILY + + "_alarms_window_input_{window_input}_window_output_{window_output}_offset_{offset}_min_count_{min_count}_sigma_{sigma}" + ) for window_input in windows_input: for window_output in windows_output: for min_count in min_counts: for sigma in sigmas: for offset in offsets: store_path = dir_template.format( - window_input=window_input, window_output=window_output, offset=offset, min_count=min_count, sigma=sigma) + window_input=window_input, + window_output=window_output, + offset=offset, + min_count=min_count, + sigma=sigma, + ) if not os.path.isdir(store_path): os.makedirs(store_path) params = { - 'window_input': window_input, - 'window_output': window_output, - 'offset': offset, - 'min_count': min_count, - 'sigma': sigma, - 'verbose': verbose, - 'store_path': store_path + "window_input": window_input, + "window_output": window_output, + "offset": offset, + "min_count": min_count, + "sigma": sigma, + "verbose": verbose, + "store_path": store_path, } params_list.append(params) return params_list @@ -114,8 +125,7 @@ def create_params_list(data_path, params, verbose=True): def generate_dataset_by_serial_offset(data, params, current_offset): data["current_offset"] = current_offset current_offset = datetime.timedelta(minutes=current_offset) - window_input, window_output, _, _, _, _, _ = return_variables( - params) + window_input, window_output, _, _, _, _, _ = return_variables(params) min_timestamp = data.index.min() min_timestamp += current_offset @@ -123,24 +133,28 @@ def generate_dataset_by_serial_offset(data, params, current_offset): # create date range for input date_range = pd.date_range( - start=min_timestamp, end=max_timestamp, freq=str(window_input) + "min") + start=min_timestamp, end=max_timestamp, freq=str(window_input) + "min" + ) date_range = [d for d in date_range] # create date range for 
output delta_in = datetime.timedelta(minutes=window_input) delta_out = datetime.timedelta(minutes=window_output) - date_range_output = [(d + delta_in, d + delta_in + delta_out) - for d in date_range] # ranges of outputs + date_range_output = [ + (d + delta_in, d + delta_in + delta_out) for d in date_range + ] # ranges of outputs date_range_output = np.asarray(date_range_output).reshape(1, -1)[0] # create samples series = pd.Series(data.index).apply( - lambda target: return_index(date_range, target)) + lambda target: return_index(date_range, target) + ) series.index = data.index data["bin_input"] = series series = pd.Series(data.index).apply( - lambda target: return_index_output(date_range_output, target)) + lambda target: return_index_output(date_range_output, target) + ) series.index = data.index data["bin_output"] = series @@ -149,14 +163,14 @@ def generate_dataset_by_serial_offset(data, params, current_offset): unique2 = data["bin_output"].unique() periods_id = list(set(unique1).intersection(set(unique2)) - set([-1])) - data = data[(data["bin_input"].isin(periods_id)) | - (data["bin_output"].isin(periods_id))] + data = data[ + (data["bin_input"].isin(periods_id)) | (data["bin_output"].isin(periods_id)) + ] return data def generate_dataset_by_serial(data, params): - window_input, _, offset, verbose, store_path, _, _ = return_variables( - params) + window_input, _, offset, verbose, store_path, _, _ = return_variables(params) serial = data["serial"][0] if verbose: print("serial: ", serial) @@ -164,9 +178,9 @@ def generate_dataset_by_serial(data, params): offsets = list(range(0, window_input, offset)) offset_data = [] for current_offset in offsets: - df_offset = generate_dataset_by_serial_offset(data.copy(), - params, - current_offset) + df_offset = generate_dataset_by_serial_offset( + data.copy(), params, current_offset + ) offset_data.append(df_offset) dataset = pd.concat(offset_data) if verbose: @@ -176,7 +190,7 @@ def generate_dataset_by_serial(data, params): def generate_dataset(data, params): - grouped_data = data.groupby('serial') + grouped_data = data.groupby("serial") for _, data_serial in grouped_data: generate_dataset_by_serial(data_serial, params) return @@ -184,6 +198,7 @@ def generate_dataset(data, params): # PHASE 2 + def prune_dataset(params): _, _, _, _, store_path, _, _ = return_variables(params) serials = [] @@ -198,8 +213,7 @@ def prune_dataset(params): def prune_df(df, serial_id, params): - _, _, offset, verbose, store_path, min_count, _ = return_variables( - params) + _, _, offset, verbose, store_path, min_count, _ = return_variables(params) offsets = df["current_offset"].unique() for offset in offsets: @@ -215,27 +229,37 @@ def prune_df(df, serial_id, params): periods_id = list(set(unique1).intersection(set(unique2)) - set([-1])) periods_id.sort() # temporal sorting - diz_input = {bin_id: group.alarm.sort_index( - ).values for bin_id, group in groups_input if bin_id in periods_id} - diz_output = {bin_id: group.alarm.sort_index( - ).values for bin_id, group in groups_output if bin_id in periods_id} + diz_input = { + bin_id: group.alarm.sort_index().values + for bin_id, group in groups_input + if bin_id in periods_id + } + diz_output = { + bin_id: group.alarm.sort_index().values + for bin_id, group in groups_output + if bin_id in periods_id + } # list of [seq_x,seq_y] - X_Y_offset = [[diz_input[bin_id], diz_output[bin_id]] - for bin_id in periods_id] + X_Y_offset = [[diz_input[bin_id], diz_output[bin_id]] for bin_id in periods_id] # apply min_count: remove sequences 
with count= min_count and len(seq[1]) >= min_count] + X_Y_offset = [ + seq + for seq in X_Y_offset + if len(seq[0]) >= min_count and len(seq[1]) >= min_count + ] # save list on file # for each (serial,offset) a different list is saved if verbose: - print("{} has length: {}".format(str(serial_id) + - "_offset_" + str(offset) + ".npz", len(X_Y_offset))) - filepath = store_path + "/" + \ - str(serial_id) + "_offset_" + str(offset) + ".npz" - with open(filepath, 'wb') as f: + print( + "{} has length: {}".format( + str(serial_id) + "_offset_" + str(offset) + ".npz", len(X_Y_offset) + ) + ) + filepath = store_path + "/" + str(serial_id) + "_offset_" + str(offset) + ".npz" + with open(filepath, "wb") as f: pickle.dump(X_Y_offset, f) return offsets @@ -243,9 +267,18 @@ def prune_df(df, serial_id, params): # PHASE 3 -def create_final_dataset(params, serials, offsets, sequence_input_length, sequence_output_length, removal_alarms=None, relevance_alarms=None, file_tag='all_alarms'): - _, _, offset, verbose, store_path, _, sigma = return_variables( - params) + +def create_final_dataset( + params, + serials, + offsets, + sequence_input_length, + sequence_output_length, + removal_alarms=None, + relevance_alarms=None, + file_tag="all_alarms", +): + _, _, offset, verbose, store_path, _, sigma = return_variables(params) padding_mode = "after" if "padding_mode" in params: padding_mode = params["padding_mode"] @@ -266,24 +299,37 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen for offset in offsets: sentinel += 1 if verbose: - print("{}/{}: serial: {} offset: {}".format(sentinel, - tot_combo, - serial, - offset)) + print( + "{}/{}: serial: {} offset: {}".format( + sentinel, tot_combo, serial, offset + ) + ) filename = str(serial) + "_offset_" + str(offset) + ".npz" if filename in os.listdir(store_path): file_path = os.path.join(store_path, filename) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: seqences = pickle.load(f) if removal_alarms != None and len(removal_alarms) > 0: - X = [[alarm for alarm in seq_x if ( - alarm not in removal_alarms and alarm != 0)] for seq_x, seq_y in seqences] + X = [ + [ + alarm + for alarm in seq_x + if (alarm not in removal_alarms and alarm != 0) + ] + for seq_x, seq_y in seqences + ] else: X = [seq_x for seq_x, seq_y in seqences] if relevance_alarms != None and len(relevance_alarms) > 0: - Y = [[alarm for alarm in seq_y if ( - alarm in relevance_alarms and alarm != 0)] for seq_x, seq_y in seqences] + Y = [ + [ + alarm + for alarm in seq_y + if (alarm in relevance_alarms and alarm != 0) + ] + for seq_x, seq_y in seqences + ] else: Y = [seq_y for seq_x, seq_y in seqences] @@ -301,8 +347,9 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen lengths_Y = np.asarray(lengths_Y) mu_sequence_input_length = lengths_X.mean() std_sequence_input_length = lengths_X.std() - sequence_input_length = int( - mu_sequence_input_length + sigma * std_sequence_input_length) + 1 + sequence_input_length = ( + int(mu_sequence_input_length + sigma * std_sequence_input_length) + 1 + ) sequence_output_length = np.max(lengths_Y) sentinel = 0 # it is iterated on all combinations of serials and offsets @@ -314,28 +361,29 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen # PADDING if padding_mode == "after": - X = [np.pad(seq_x, (0, sequence_input_length - len(seq_x))) - for seq_x in X] - Y = [np.pad(seq_y, (0, sequence_output_length - len(seq_y))) - for seq_y in Y] + X = [np.pad(seq_x, 
(0, sequence_input_length - len(seq_x))) for seq_x in X] + Y = [np.pad(seq_y, (0, sequence_output_length - len(seq_y))) for seq_y in Y] elif padding_mode == "before": X = [padding_sequence(seq_x, sequence_input_length) for seq_x in X] - Y = [padding_sequence(seq_y, sequence_output_length) - for seq_y in Y] + Y = [padding_sequence(seq_y, sequence_output_length) for seq_y in Y] if len(X) > 1: X_train, X_test, Y_train, Y_test = train_test_split( - X, Y, test_size=0.3, shuffle=False) + X, Y, test_size=0.3, shuffle=False + ) else: X_train, X_test, Y_train, Y_test = [], [], [], [] - stratify.append({ - "serial": combos[sentinel][0], - "offset": combos[sentinel][1], - "X_train": len(X_train), - "X_test": len(X_test), - "Y_train": len(Y_train), - "Y_test": len(Y_test)}) + stratify.append( + { + "serial": combos[sentinel][0], + "offset": combos[sentinel][1], + "X_train": len(X_train), + "X_test": len(X_test), + "Y_train": len(Y_train), + "Y_test": len(Y_test), + } + ) cont += len(X_train) + len(X_test) X_train_tot += X_train @@ -366,10 +414,19 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen stratify = stratify_new # Sparse Matrix - X_train, X_test, Y_train, Y_test = sparse.csr_matrix(X_train), sparse.csr_matrix( - X_test), sparse.csr_matrix(Y_train), sparse.csr_matrix(Y_test) - - x_train_segments, x_test_segments, y_train_segments, y_test_segments = X_train, X_test, Y_train, Y_test + X_train, X_test, Y_train, Y_test = ( + sparse.csr_matrix(X_train), + sparse.csr_matrix(X_test), + sparse.csr_matrix(Y_train), + sparse.csr_matrix(Y_test), + ) + + x_train_segments, x_test_segments, y_train_segments, y_test_segments = ( + X_train, + X_test, + Y_train, + Y_test, + ) if verbose: print(x_train_segments.shape) @@ -378,19 +435,27 @@ def create_final_dataset(params, serials, offsets, sequence_input_length, sequen print(y_test_segments.shape) file_path = os.path.join(store_path, f"{file_tag}.pickle") - with open(file_path, 'wb') as f: - pickle.dump([x_train_segments, x_test_segments, - y_train_segments, y_test_segments, stratify], f) - - -def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): + with open(file_path, "wb") as f: + pickle.dump( + [ + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ], + f, + ) + + +def create_datasets(data, params_list, start_point=0, file_tag="all_alarms"): """ Parameters: data : csv file of raw data - params_list: list of dictionaries i.e. params + params_list: list of dictionaries i.e. params each params is a dictionary with all parameters necessary to create the dataset such as window_input, window_output, sigma, min_count,ecc. - start_point: it tells from which phase to start to create dataset(i.e. if start_point=2 then it is assumed that phase1 is executed) + start_point: it tells from which phase to start to create dataset(i.e. if start_point=2 then it is assumed that phase1 is executed) file_tag: name of dataset There are 3 phases(start_point:1,2,3): @@ -399,7 +464,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): each file contains a list of x,y with pruning of consecutive alarms -PHASE3: some alarms in input are removed based on list associated with key 'removal_alarms' to variable params. only a subset of alarms in output is keeped based on a list associated with key 'relevance_alarms' to variable params. 
- Padding is performed based on values defined by keys 'sequence_input_length' and 'sequence_output_length' in params + Padding is performed based on values defined by keys 'sequence_input_length' and 'sequence_output_length' in params Example: @@ -415,7 +480,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): #relevance_alarms = [] <- only if you want to keep a subset of alrams in output #additional parameters for phase3, comment the section if you don't want to use it - for params in params_list: + for params in params_list: params["removal_alarms"] = [] params["relevance_alarms"] = [] @@ -424,7 +489,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): """ for params in params_list: - print('-- run ', params) + print("-- run ", params) verbose = params["verbose"] if verbose: print(params) @@ -447,7 +512,7 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): # save serials and offsets store_path = params["store_path"] filepath = store_path + "/" + store_path.split("/")[-1] + ".config" - with open(filepath, 'wb') as f: + with open(filepath, "wb") as f: pickle.dump((serials, offsets), f) end = time.time() @@ -474,9 +539,17 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): if "removal_alarms" in params and "relevance_alarms" in params: removal_alarms = params["removal_alarms"] relevance_alarms = params["relevance_alarms"] - create_final_dataset(params, serials, offsets, sequence_input_length, - sequence_output_length, removal_alarms, relevance_alarms, file_tag) - drop_files(params["store_path"], '.csv') + create_final_dataset( + params, + serials, + offsets, + sequence_input_length, + sequence_output_length, + removal_alarms, + relevance_alarms, + file_tag, + ) + drop_files(params["store_path"], ".csv") end = time.time() elapsed_time = end - start print("phase 3/3 elapsed Time: {}".format(elapsed_time)) @@ -484,10 +557,11 @@ def create_datasets(data, params_list, start_point=0, file_tag='all_alarms'): # POST BUILD + def clean(data_path): for filename_dir in os.listdir(data_path): print("directory: ", filename_dir) - filepath_dir = data_path + '/' + filename_dir + filepath_dir = data_path + "/" + filename_dir if os.path.isdir(filepath_dir): cont_csv, cont_npz, cont_pickle = 0, 0, 0 for filename in os.listdir(filepath_dir): @@ -498,11 +572,14 @@ def clean(data_path): cont_npz += os.stat(filepath).st_size elif filename.endswith(".pickle"): cont_pickle += os.stat(filepath).st_size - print("cont_csv: {}MB cont_npz: {}MB cont_pickle: {}MB ".format( - int(cont_csv / 1e6), int(cont_npz / 1e6), int(cont_pickle / 1e6))) + print( + "cont_csv: {}MB cont_npz: {}MB cont_pickle: {}MB ".format( + int(cont_csv / 1e6), int(cont_npz / 1e6), int(cont_pickle / 1e6) + ) + ) for filename_dir in os.listdir(data_path): print("prune directory ", filename_dir) - filepath_dir = data_path + '/' + filename_dir + filepath_dir = data_path + "/" + filename_dir if os.path.isdir(filepath_dir): for filename in os.listdir(filepath_dir): filepath = filepath_dir + "/" + filename @@ -515,7 +592,7 @@ def clean(data_path): def convert_to_json(store_path, filename, verbose=0): - """ + """ function to convert pickle dataset in json format which will have same name of original dataset but with json extension and stored in the same path of the original dataset @@ -531,9 +608,14 @@ def convert_to_json(store_path, filename, verbose=0): """ # load dataset filepath = os.path.join(store_path, filename) - with open(filepath, 
'rb') as f: - x_train_segments, x_test_segments, y_train_segments, y_test_segments, stratify = pickle.load( - f) + with open(filepath, "rb") as f: + ( + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ) = pickle.load(f) x_train_segments = x_train_segments.toarray() x_test_segments = x_test_segments.toarray() @@ -559,38 +641,42 @@ def convert_to_json(store_path, filename, verbose=0): print("serials_test.shape: ", len(stratify["test"])) diz = {} - for i, (seq_x, seq_y, serial_id) in enumerate(zip(x_train_segments, y_train_segments, stratify["train"]), start=1): - key = "sample"+str(i) + for i, (seq_x, seq_y, serial_id) in enumerate( + zip(x_train_segments, y_train_segments, stratify["train"]), start=1 + ): + key = "sample" + str(i) diz[key] = {} diz[key]["x_train"] = seq_x diz[key]["y_train"] = seq_y diz[key]["serial"] = serial_id - for i, (seq_x, seq_y, serial_id) in enumerate(zip(x_test_segments, y_test_segments, stratify["test"]), start=i+1): - key = "sample"+str(i) + for i, (seq_x, seq_y, serial_id) in enumerate( + zip(x_test_segments, y_test_segments, stratify["test"]), start=i + 1 + ): + key = "sample" + str(i) diz[key] = {} diz[key]["x_test"] = seq_x diz[key]["y_test"] = seq_y diz[key]["serial"] = serial_id text = json.dumps(diz) - filename_json = os.path.splitext(filename)[0]+".json" + filename_json = os.path.splitext(filename)[0] + ".json" filepath_json = os.path.join(store_path, filename_json) if verbose: print("filepath_json: ", filepath_json) - with open(filepath_json, 'w') as f: + with open(filepath_json, "w") as f: f.write(text) def load_json_dataset(store_path, filename_json, verbose=0): - """ + """ function to load dataset from json format Example: $#load data in json format $store_path = "../data/MACHINE_TYPE_00_alarms_window_input_1720_window_output_480_offset_60_min_count_20_sigma_3" $filename_json = "all_alarms.json" - $filepath = os.path.join(store_path, filename_json) + $filepath = os.path.join(store_path, filename_json) $x_train,y_train,serials_train,x_test,y_test,serials_test = load_json_dataset(store_path, filename_json, verbose=1) @@ -608,22 +694,32 @@ def load_json_dataset(store_path, filename_json, verbose=0): """ filepath = os.path.join(store_path, filename_json) - with open(filepath, 'r') as f: + with open(filepath, "r") as f: dataset = json.load(f) - x_train = [sample["x_train"] - for sample_id, sample in dataset.items() if "x_train" in sample] - y_train = [sample["x_train"] - for sample_id, sample in dataset.items() if "x_train" in sample] - stratify_train = [sample["serial"] for sample_id, - sample in dataset.items() if "x_train" in sample] - - x_test = [sample["x_test"] - for sample_id, sample in dataset.items() if "x_test" in sample] - y_test = [sample["x_test"] - for sample_id, sample in dataset.items() if "x_test" in sample] - stratify_test = [sample["serial"] - for sample_id, sample in dataset.items() if "x_test" in sample] + x_train = [ + sample["x_train"] + for sample_id, sample in dataset.items() + if "x_train" in sample + ] + y_train = [ + sample["x_train"] + for sample_id, sample in dataset.items() + if "x_train" in sample + ] + stratify_train = [ + sample["serial"] for sample_id, sample in dataset.items() if "x_train" in sample + ] + + x_test = [ + sample["x_test"] for sample_id, sample in dataset.items() if "x_test" in sample + ] + y_test = [ + sample["x_test"] for sample_id, sample in dataset.items() if "x_test" in sample + ] + stratify_test = [ + sample["serial"] for sample_id, sample in dataset.items() 
if "x_test" in sample + ] x_train = np.asarray(x_train) y_train = np.asarray(y_train) @@ -642,14 +738,14 @@ def load_json_dataset(store_path, filename_json, verbose=0): def convert_to_npz(store_path, filename, verbose=0): - """ + """ function to convert pickle dataset in npz format which will have same name of original dataset but with npz extension and stored in the same path of the original dataset Example: $#save data in npz format $store_path = "../data/MACHINE_TYPE_00_alarms_window_input_1720_window_output_480_offset_60_min_count_20_sigma_3" - $filename= "all_alarms.pickle" + $filename= "all_alarms.pickle" $convert_to_npz(store_path, filename, verbose=1) Args: @@ -658,9 +754,14 @@ def convert_to_npz(store_path, filename, verbose=0): """ # load dataset filepath = os.path.join(store_path, filename) - with open(filepath, 'rb') as f: - x_train_segments, x_test_segments, y_train_segments, y_test_segments, stratify = pickle.load( - f) + with open(filepath, "rb") as f: + ( + x_train_segments, + x_test_segments, + y_train_segments, + y_test_segments, + stratify, + ) = pickle.load(f) x_train_segments = x_train_segments.toarray() x_test_segments = x_test_segments.toarray() @@ -683,17 +784,23 @@ def convert_to_npz(store_path, filename, verbose=0): print("y_test.shape: ", y_test_segments.shape) print("serials_test.shape: ", len(stratify["test"])) - filename_npz = os.path.splitext(filename)[0]+".npz" + filename_npz = os.path.splitext(filename)[0] + ".npz" filepath_npz = os.path.join(store_path, filename_npz) if verbose: print("filepath_npz: ", filepath_npz) - np.savez_compressed(filepath_npz, - x_train=x_train_segments, y_train=y_train_segments, stratify_train=stratify_train, - x_test=x_test_segments, y_test=y_test_segments, stratify_test=stratify_test) + np.savez_compressed( + filepath_npz, + x_train=x_train_segments, + y_train=y_train_segments, + stratify_train=stratify_train, + x_test=x_test_segments, + y_test=y_test_segments, + stratify_test=stratify_test, + ) def load_from_npz(store_path, filename, verbose=0): - """ + """ function to load dataset from json format Example: @@ -715,6 +822,5 @@ def load_from_npz(store_path, filename, verbose=0): """ filepath = os.path.join(store_path, filename) loaded = np.load(filepath, allow_pickle=True) - keys = ["x_train", "y_train", "stratify_train", - "x_test", "y_test", "stratify_test"] + keys = ["x_train", "y_train", "stratify_train", "x_test", "y_test", "stratify_test"] return [loaded[k] for k in keys] diff --git a/datasets/cbm/__init__.py b/datasets/cbm/__init__.py index acee59f..52e8974 100644 --- a/datasets/cbm/__init__.py +++ b/datasets/cbm/__init__.py @@ -11,7 +11,7 @@ def parse_feature_names(fn): with open(fn) as f: names, lines = [], f.readlines() for line in lines: - names.append(line.split('-')[-1].lstrip().rstrip()) + names.append(line.split("-")[-1].lstrip().rstrip()) return names @@ -19,15 +19,15 @@ def parse_feature_names(fn): def load_data(shorten_feature_names=True): fp = os.path.dirname(__file__) - raw_data = np.loadtxt(fp + '/data.txt.gz') - features = parse_feature_names(fp + '/Features.txt') + raw_data = np.loadtxt(fp + "/data.txt.gz") + features = parse_feature_names(fp + "/Features.txt") if shorten_feature_names == True: for i in range(len(features) - 2): - features[i] = features[i].split('(')[-1].split(')')[0].upper() + features[i] = features[i].split("(")[-1].split(")")[0].upper() - features[-2] = 'comp_decay_state' - features[-1] = 'turb_decay_state' + features[-2] = "comp_decay_state" + features[-1] = "turb_decay_state" return 
pd.DataFrame(raw_data, columns=features) @@ -41,7 +41,7 @@ def normalize(df): minv = df_norm.iloc[:, i].min() df_norm.iloc[:, i] = (df_norm.iloc[:, i] - minv) / (maxv - minv) else: - df_norm.iloc[:, i] = 0. + df_norm.iloc[:, i] = 0.0 return df_norm @@ -57,8 +57,8 @@ def gen_summary(wd=400, outdir=None): os.makedirs(outdir, exist_ok=True) data = normalize(load_data(shorten_feature_names=False)) - - with PdfPages(outdir + '/cbm_summary.pdf') as pp: + + with PdfPages(outdir + "/cbm_summary.pdf") as pp: for st in tqdm.trange(0, data.shape[0], wd): ed = st + wd fig, ax = plt.subplots(9, figsize=(24, 15)) @@ -73,11 +73,11 @@ def gen_summary(wd=400, outdir=None): data.iloc[st:ed, 15].plot(legend=True, ax=ax[7]) data.iloc[st:ed, [0, -2, -1]].plot(legend=True, ax=ax[8]) - ax[0].set_title('Normalized Sensor/Label data') - ax[-1].set_xlabel('Time') - for axi in ax: axi.set_ylabel('Value') + ax[0].set_title("Normalized Sensor/Label data") + ax[-1].set_xlabel("Time") + for axi in ax: + axi.set_ylabel("Value") - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - \ No newline at end of file diff --git a/datasets/cmapss/__init__.py b/datasets/cmapss/__init__.py index 3b48edb..f89d274 100644 --- a/datasets/cmapss/__init__.py +++ b/datasets/cmapss/__init__.py @@ -13,7 +13,7 @@ def load_data(index="FD004", features=None): assert index in ["FD001", "FD002", "FD003", "FD004"] elif type(index) == int: assert index in [0, 1, 2, 3] - index = f'FD00{index+1}' + index = f"FD00{index+1}" filepath = os.path.dirname(__file__) @@ -47,30 +47,30 @@ def load_data(index="FD004", features=None): # Original data train_set = np.loadtxt(filepath + f"/train_{index}.txt.gz") - test_set = np.loadtxt(filepath + f"/test_{index}.txt.gz") - labels = np.loadtxt(filepath + f"/RUL_{index}.txt.gz") # RUL: Remaining Useful Life + test_set = np.loadtxt(filepath + f"/test_{index}.txt.gz") + labels = np.loadtxt(filepath + f"/RUL_{index}.txt.gz") # RUL: Remaining Useful Life # Convert to pandas.DataFrame objects col_names = ["unit_number", "time"] col_names += [f"operation{i}" for i in range(1, 4)] col_names += [f"sensor{i}" for i in range(1, 22)] train_set = pd.DataFrame(train_set, columns=col_names) - test_set = pd.DataFrame(test_set, columns=col_names) + test_set = pd.DataFrame(test_set, columns=col_names) def set_dtype(df): return df.astype({"unit_number": np.int64, "time": np.int64}) def extract_features(df, features): - columns = ['unit_number', 'time'] - columns += [f'sensor{i}' for i in features] + columns = ["unit_number", "time"] + columns += [f"sensor{i}" for i in features] return df.loc[:, columns] train_set = set_dtype(train_set) - test_set = set_dtype(test_set) + test_set = set_dtype(test_set) if features is not None: train_set = extract_features(train_set, features) - test_set = extract_features(test_set, features) + test_set = extract_features(test_set, features) return train_set, test_set, labels @@ -79,34 +79,61 @@ def cleaning(df): raise NotImplementedError -def load_clean_data(): - raise NotImplementedError +def load_clean_data_rul( + index="FD004", + features=[2, 3, 4, 7, 11, 12, 15], +): + train, test, labels = load_mesurement_list(index=index, features=features) + label = "RUL" + + # train + train_df_list = [] + for tt in train: + max_rul = len(tt) + rul_array = np.array(range(max_rul))[::-1] + tt[label] = rul_array + train_df_list.append(tt) + + # test + test_df_list = [] + for tt, la in zip(test, labels): + max_rul = len(tt) + rul_array = 
np.array(range(max_rul))[::-1] + rul_array += int(la) + tt[label] = rul_array + test_df_list.append(tt) + + return train_df_list, test_df_list def load_mesurement_list( - index="FD004", - features=[2, 3, 4, 7, 11, 12 ,15], - ): + index="FD004", + features=[2, 3, 4, 7, 11, 12, 15], +): """ * transform train_set and test_set into the lists of multivariate senser mesurements according to unit numbers. - * features: the default features were applied in the previous research, + * features: the default features were applied in the previous research, "A Similarity-Based Prognostics Approach for Remaining Useful Life Estimation of Engineered Systems". """ assert index in ["FD001", "FD002", "FD003", "FD004"] train_set, test_set, labels = load_data(index=index) - refined_train_set =[] + refined_train_set = [] for _, seq_df in train_set.groupby("unit_number"): - seq_df = seq_df.sort_values("time") - ex_seq_df = seq_df[[f"sensor{f_id}" for f_id in features]].reset_index(drop=True) + seq_df = seq_df.sort_values("time") + ex_seq_df = seq_df[[f"sensor{f_id}" for f_id in features]].reset_index( + drop=True + ) refined_train_set.append(ex_seq_df) - refined_test_set =[] + refined_test_set = [] for _, seq_df in test_set.groupby("unit_number"): - seq_df = seq_df.sort_values("time") - ex_seq_df = seq_df[[f"sensor{f_id}" for f_id in features]].reset_index(drop=True) + seq_df = seq_df.sort_values("time") + ex_seq_df = seq_df[[f"sensor{f_id}" for f_id in features]].reset_index( + drop=True + ) refined_test_set.append(ex_seq_df) return refined_train_set, refined_test_set, labels @@ -117,14 +144,16 @@ def run_to_failure_aux(df, lifetime, unit_number): assert lifetime <= df.shape[0] broken = 0 if lifetime < df.shape[0] else 1 sample = pd.DataFrame( - {'lifetime': lifetime, 'broken': broken, 'unit_number': unit_number}, index=[0]) + {"lifetime": lifetime, "broken": broken, "unit_number": unit_number}, index=[0] + ) - sensors = df.loc[:, df.columns.str.contains('sensor')] - num_features = sensors.iloc[:lifetime].agg(['min', 'max', 'mean', 'std']) + sensors = df.loc[:, df.columns.str.contains("sensor")] + num_features = sensors.iloc[:lifetime].agg(["min", "max", "mean", "std"]) num_features = num_features.unstack().reset_index() - num_features['feature'] = num_features.level_0.str.cat( - num_features.level_1, sep='_') - num_features = num_features.pivot_table(columns='feature', values=0) + num_features["feature"] = num_features.level_0.str.cat( + num_features.level_1, sep="_" + ) + num_features = num_features.pivot_table(columns="feature", values=0) return pd.concat([sample, num_features], axis=1) @@ -132,17 +161,16 @@ def run_to_failure_aux(df, lifetime, unit_number): def censoring_augmentation(raw_data, n_samples=10, seed=123): np.random.seed(seed) - datasets = [g for _, g in raw_data.groupby('unit_number')] - timeseries = raw_data.groupby('unit_number').size() + datasets = [g for _, g in raw_data.groupby("unit_number")] + timeseries = raw_data.groupby("unit_number").size() samples = [] - pbar = tqdm.tqdm(total=n_samples, desc='augmentation') + pbar = tqdm.tqdm(total=n_samples, desc="augmentation") while len(samples) < n_samples: # draw a machine unit_number = np.random.randint(timeseries.shape[0]) censor_timing = np.random.randint(timeseries.iloc[unit_number]) - sample = run_to_failure_aux( - datasets[unit_number], censor_timing, unit_number) + sample = run_to_failure_aux(datasets[unit_number], censor_timing, unit_number) samples.append(sample) pbar.update(1) @@ -152,23 +180,25 @@ def 
censoring_augmentation(raw_data, n_samples=10, seed=123): def generate_run_to_failure(df, health_censor_aug=0, seed=123): samples = [] - for unit_id, timeseries in tqdm.tqdm(df.groupby('unit_number'), desc='RUL'): - samples.append(run_to_failure_aux( - timeseries, timeseries.shape[0], unit_id)) + for unit_id, timeseries in tqdm.tqdm(df.groupby("unit_number"), desc="RUL"): + samples.append(run_to_failure_aux(timeseries, timeseries.shape[0], unit_id)) samples = pd.concat(samples) if health_censor_aug > 0: - aug_samples = censoring_augmentation( - df, n_samples=health_censor_aug, seed=seed) + aug_samples = censoring_augmentation(df, n_samples=health_censor_aug, seed=seed) return pd.concat([samples, aug_samples]).reset_index(drop=True) else: return samples.reset_index(drop=True) -def leave_one_out(target='run-to-failure', - health_censor_aug=1000, seed=123, - input_fn=None, output_fn=None): +def leave_one_out( + target="run-to-failure", + health_censor_aug=1000, + seed=123, + input_fn=None, + output_fn=None, +): if input_fn is not None: subsets = pd.read_csv(input_fn) @@ -179,13 +209,12 @@ def leave_one_out(target='run-to-failure', raw_data = load_data(index=index)[0] raw_data = raw_data.assign(machine_id=index) - if target == 'run-to-failure': - subset = generate_run_to_failure( - raw_data, health_censor_aug, seed) + if target == "run-to-failure": + subset = generate_run_to_failure(raw_data, health_censor_aug, seed) subset = subset.assign(fold=index) subsets.append(subset) - elif target == 'time-to-failure': + elif target == "time-to-failure": raise NotImplementedError else: @@ -197,39 +226,51 @@ def leave_one_out(target='run-to-failure', subsets.to_csv(output_fn, index=False) # List of tuples: (train_data, test_data) - train_test_sets = [( - subsets[subsets.fold != i].reset_index(drop=True), - subsets[subsets.fold == i].reset_index(drop=True)) for i in range(4)] + train_test_sets = [ + ( + subsets[subsets.fold != i].reset_index(drop=True), + subsets[subsets.fold == i].reset_index(drop=True), + ) + for i in range(4) + ] return train_test_sets -def generate_validation_sets(method='leave-one-out', n_splits=5, seed=123, outdir=None): +def generate_validation_sets(method="leave-one-out", n_splits=5, seed=123, outdir=None): validation_sets = [] - if method == 'kfold': + if method == "kfold": raise NotImplementedError - elif method == 'leave-one-out': - validation_sets = leave_one_out(target='run-to-failure', - health_censor_aug=1000, - seed=seed) + elif method == "leave-one-out": + validation_sets = leave_one_out( + target="run-to-failure", health_censor_aug=1000, seed=seed + ) if outdir is not None: for i, (train_data, test_data) in enumerate(validation_sets): - train_data.to_csv(outdir + f'/train_{i}.csv.gz', index=False) - test_data.to_csv(outdir + f'/test_{i}.csv.gz', index=False) + train_data.to_csv(outdir + f"/train_{i}.csv.gz", index=False) + test_data.to_csv(outdir + f"/test_{i}.csv.gz", index=False) return validation_sets -def load_validation_sets(filepath, method='leave-one-out', n_splits=5): - if method == 'kfold': - return [(pd.read_csv(filepath + f'/train_{i}.csv.gz'), - pd.read_csv(filepath + f'/test_{i}.csv.gz')) - for i in range(n_splits)] - - elif method == 'leave-one-out': - return [(pd.read_csv(filepath + f'/train_{i}.csv.gz'), - pd.read_csv(filepath + f'/test_{i}.csv.gz')) - for i in range(4)] +def load_validation_sets(filepath, method="leave-one-out", n_splits=5): + if method == "kfold": + return [ + ( + pd.read_csv(filepath + f"/train_{i}.csv.gz"), + pd.read_csv(filepath + 
f"/test_{i}.csv.gz"), + ) + for i in range(n_splits) + ] + + elif method == "leave-one-out": + return [ + ( + pd.read_csv(filepath + f"/train_{i}.csv.gz"), + pd.read_csv(filepath + f"/test_{i}.csv.gz"), + ) + for i in range(4) + ] diff --git a/datasets/gdd/__init__.py b/datasets/gdd/__init__.py index eee85fe..a1f8237 100644 --- a/datasets/gdd/__init__.py +++ b/datasets/gdd/__init__.py @@ -6,125 +6,157 @@ import seaborn as sns -def load_data(index='state'): +def load_data(index="state"): - assert index in ['state', 'anomaly', 'normal', 'linear', 'pressure'] + assert index in ["state", "anomaly", "normal", "linear", "pressure"] fp = os.path.dirname(__file__) - if index == 'state': - df = pd.read_csv(fp + '/Genesis_StateMachineLabel.csv.gz') - elif index == 'anomaly': - df = pd.read_csv(fp + '/Genesis_AnomalyLabels.csv.gz') - elif index == 'normal': - df = pd.read_csv(fp + '/Genesis_normal.csv.gz') + if index == "state": + df = pd.read_csv(fp + "/Genesis_StateMachineLabel.csv.gz") + elif index == "anomaly": + df = pd.read_csv(fp + "/Genesis_AnomalyLabels.csv.gz") + elif index == "normal": + df = pd.read_csv(fp + "/Genesis_normal.csv.gz") df.Timestamp = df.Timestamp / 1000 - elif index == 'linear': - df = pd.read_csv(fp + '/Genesis_lineardrive.csv.gz') + elif index == "linear": + df = pd.read_csv(fp + "/Genesis_lineardrive.csv.gz") df.Timestamp = df.Timestamp / 1000 - elif index == 'pressure': - df = pd.read_csv(fp + '/Genesis_pressure.csv.gz') + elif index == "pressure": + df = pd.read_csv(fp + "/Genesis_pressure.csv.gz") df.Timestamp = df.Timestamp / 1000 df.Timestamp = df.Timestamp.apply(datetime.datetime.fromtimestamp) - + return df -def plot_genesis_labels(df, figsize=(15, 20), cmap='tab10'): - """ Call this for machine states and anomaly labels """ +def plot_genesis_labels(df, figsize=(15, 20), cmap="tab10"): + """Call this for machine states and anomaly labels""" fig, ax = plt.subplots(10, figsize=figsize) - df['MotorData.ActCurrent'].plot(ax=ax[0], legend=True, cmap=cmap) - df['MotorData.ActPosition'].plot(ax=ax[1], legend=True, cmap=cmap) - df['MotorData.ActSpeed'].plot(ax=ax[2], legend=True, cmap=cmap) - - df['MotorData.IsAcceleration'].plot(ax=ax[3], legend=True, cmap=cmap) - df['MotorData.IsForce'].plot(ax=ax[4], legend=True, cmap=cmap) - - df[['MotorData.Motor_Pos1reached', # binary - 'MotorData.Motor_Pos2reached', # binary - 'MotorData.Motor_Pos3reached', # binary - 'MotorData.Motor_Pos4reached', # binary - ]].plot(ax=ax[5], legend=True, cmap=cmap) - - df[['NVL_Recv_Ind.GL_Metall', # binary - 'NVL_Recv_Ind.GL_NonMetall', # binary - ]].plot(ax=ax[6], legend=True, cmap=cmap) - - df[['NVL_Recv_Storage.GL_I_ProcessStarted', # binary - 'NVL_Recv_Storage.GL_I_Slider_IN', # binary - 'NVL_Recv_Storage.GL_I_Slider_OUT', # binary - 'NVL_Recv_Storage.GL_LightBarrier', # binary - 'NVL_Send_Storage.ActivateStorage', # binary - ]].plot(ax=ax[7], legend=True, cmap=cmap) - - df[['PLC_PRG.Gripper', # binary - 'PLC_PRG.MaterialIsMetal', # binary - ]].plot(ax=ax[8], legend=True, cmap=cmap) - - df['Label'].plot(ax=ax[9], legend=True, cmap=cmap) + df["MotorData.ActCurrent"].plot(ax=ax[0], legend=True, cmap=cmap) + df["MotorData.ActPosition"].plot(ax=ax[1], legend=True, cmap=cmap) + df["MotorData.ActSpeed"].plot(ax=ax[2], legend=True, cmap=cmap) + + df["MotorData.IsAcceleration"].plot(ax=ax[3], legend=True, cmap=cmap) + df["MotorData.IsForce"].plot(ax=ax[4], legend=True, cmap=cmap) + + df[ + [ + "MotorData.Motor_Pos1reached", # binary + "MotorData.Motor_Pos2reached", # binary + 
"MotorData.Motor_Pos3reached", # binary + "MotorData.Motor_Pos4reached", # binary + ] + ].plot(ax=ax[5], legend=True, cmap=cmap) + + df[ + [ + "NVL_Recv_Ind.GL_Metall", # binary + "NVL_Recv_Ind.GL_NonMetall", # binary + ] + ].plot(ax=ax[6], legend=True, cmap=cmap) + + df[ + [ + "NVL_Recv_Storage.GL_I_ProcessStarted", # binary + "NVL_Recv_Storage.GL_I_Slider_IN", # binary + "NVL_Recv_Storage.GL_I_Slider_OUT", # binary + "NVL_Recv_Storage.GL_LightBarrier", # binary + "NVL_Send_Storage.ActivateStorage", # binary + ] + ].plot(ax=ax[7], legend=True, cmap=cmap) + + df[ + [ + "PLC_PRG.Gripper", # binary + "PLC_PRG.MaterialIsMetal", # binary + ] + ].plot(ax=ax[8], legend=True, cmap=cmap) + + df["Label"].plot(ax=ax[9], legend=True, cmap=cmap) for axi in ax: axi.set_xlim(0, df.shape[0]) - axi.set_ylabel('Value') + axi.set_ylabel("Value") - ax[0].set_title('Date: {} to {}'.format( - df.Timestamp.min(), df.Timestamp.max())) - ax[-1].set_xlabel('Time') + ax[0].set_title("Date: {} to {}".format(df.Timestamp.min(), df.Timestamp.max())) + ax[-1].set_xlabel("Time") fig.tight_layout() return fig, ax - - -def plot_genesis_nonlabels(df, figsize=(15, 20), cmap='tab10'): - """ Call this for non-labeled data """ - fig, ax = plt.subplots(8, figsize=figsize) - df[['MotorData.SetCurrent', - 'MotorData.ActCurrent', - ]].plot(ax=ax[0], legend=True, cmap=cmap) +def plot_genesis_nonlabels(df, figsize=(15, 20), cmap="tab10"): + """Call this for non-labeled data""" - df[['MotorData.SetSpeed', - 'MotorData.ActSpeed', - ]].plot(ax=ax[1], legend=True, cmap=cmap) - - df[['MotorData.SetAcceleration', - 'MotorData.IsAcceleration', - ]].plot(ax=ax[2], legend=True, cmap=cmap) - - df[['MotorData.SetForce', - 'MotorData.IsForce' - ]].plot(ax=ax[3], legend=True, cmap=cmap) - - df[['MotorData.Motor_Pos1reached', # binary - 'MotorData.Motor_Pos2reached', # binary - 'MotorData.Motor_Pos3reached', # binary - 'MotorData.Motor_Pos4reached', # binary - ]].plot(ax=ax[4], legend=True, cmap=cmap) - - df[['NVL_Recv_Ind.GL_Metall', # binary - 'NVL_Recv_Ind.GL_NonMetall', # binary - ]].plot(ax=ax[5], legend=True, cmap=cmap) - - df[['NVL_Recv_Storage.GL_I_ProcessStarted', # binary - 'NVL_Recv_Storage.GL_I_Slider_IN', # binary - 'NVL_Recv_Storage.GL_I_Slider_OUT', # binary - 'NVL_Recv_Storage.GL_LightBarrier', # binary - 'NVL_Send_Storage.ActivateStorage', # binary - ]].plot(ax=ax[6], legend=True, cmap=cmap) + fig, ax = plt.subplots(8, figsize=figsize) - df[['PLC_PRG.Gripper', # binary - 'PLC_PRG.MaterialIsMetal', # binary - ]].plot(ax=ax[7], legend=True, cmap=cmap) + df[ + [ + "MotorData.SetCurrent", + "MotorData.ActCurrent", + ] + ].plot(ax=ax[0], legend=True, cmap=cmap) + + df[ + [ + "MotorData.SetSpeed", + "MotorData.ActSpeed", + ] + ].plot(ax=ax[1], legend=True, cmap=cmap) + + df[ + [ + "MotorData.SetAcceleration", + "MotorData.IsAcceleration", + ] + ].plot(ax=ax[2], legend=True, cmap=cmap) + + df[["MotorData.SetForce", "MotorData.IsForce"]].plot( + ax=ax[3], legend=True, cmap=cmap + ) + + df[ + [ + "MotorData.Motor_Pos1reached", # binary + "MotorData.Motor_Pos2reached", # binary + "MotorData.Motor_Pos3reached", # binary + "MotorData.Motor_Pos4reached", # binary + ] + ].plot(ax=ax[4], legend=True, cmap=cmap) + + df[ + [ + "NVL_Recv_Ind.GL_Metall", # binary + "NVL_Recv_Ind.GL_NonMetall", # binary + ] + ].plot(ax=ax[5], legend=True, cmap=cmap) + + df[ + [ + "NVL_Recv_Storage.GL_I_ProcessStarted", # binary + "NVL_Recv_Storage.GL_I_Slider_IN", # binary + "NVL_Recv_Storage.GL_I_Slider_OUT", # binary + "NVL_Recv_Storage.GL_LightBarrier", # binary 
+ "NVL_Send_Storage.ActivateStorage", # binary + ] + ].plot(ax=ax[6], legend=True, cmap=cmap) + + df[ + [ + "PLC_PRG.Gripper", # binary + "PLC_PRG.MaterialIsMetal", # binary + ] + ].plot(ax=ax[7], legend=True, cmap=cmap) for axi in ax: axi.set_xlim(0, df.shape[0]) - axi.set_ylabel('Value') + axi.set_ylabel("Value") - ax[0].set_title('Date: {} to {}'.format(df.Timestamp.min(), df.Timestamp.max())) - ax[-1].set_xlabel('Time') + ax[0].set_title("Date: {} to {}".format(df.Timestamp.min(), df.Timestamp.max())) + ax[-1].set_xlabel("Time") fig.tight_layout() return fig, ax @@ -136,43 +168,43 @@ def gen_summary(outdir=None): outdir = os.path.dirname(__file__) os.makedirs(outdir, exist_ok=True) - sns.set(font_scale=1.1, style='whitegrid') + sns.set(font_scale=1.1, style="whitegrid") + + with PdfPages(outdir + "/gdd_summary.pdf") as pp: - with PdfPages(outdir + '/gdd_summary.pdf') as pp: - - print('Plotting Genesis_StateMachineLabel...') - df = load_data(index='state') + print("Plotting Genesis_StateMachineLabel...") + df = load_data(index="state") fig, _ = plot_genesis_labels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print('Plotting Genesis_AnomalyLabels...') - df = load_data(index='anomaly') + print("Plotting Genesis_AnomalyLabels...") + df = load_data(index="anomaly") fig, _ = plot_genesis_labels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print('Plotting Genesis_normal...') - df = load_data(index='normal') + print("Plotting Genesis_normal...") + df = load_data(index="normal") fig, _ = plot_genesis_nonlabels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print('Plotting Genesis_lineardrive...') - df = load_data(index='linear') + print("Plotting Genesis_lineardrive...") + df = load_data(index="linear") fig, _ = plot_genesis_nonlabels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print('Plotting Genesis_pressure...') - df = load_data(index='pressure') + print("Plotting Genesis_pressure...") + df = load_data(index="pressure") fig, _ = plot_genesis_nonlabels(df) - fig.savefig(pp, bbox_inches='tight', format='pdf') + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() - print("done!") \ No newline at end of file + print("done!") diff --git a/datasets/gfd/__init__.py b/datasets/gfd/__init__.py index a3a4700..cb23013 100644 --- a/datasets/gfd/__init__.py +++ b/datasets/gfd/__init__.py @@ -5,21 +5,24 @@ from matplotlib.backends.backend_pdf import PdfPages -__name__ = 'gfd' +__name__ = "gfd" -def load_data(label='h', load=0): - assert label in ['h', 'b'] +def load_data(label="h", load=0): + assert label in ["h", "b"] assert load in [i for i in range(0, 100, 10)] - return pd.read_csv(os.path.dirname(__file__) + f'/{label}30hz{load}.csv.gz') + return pd.read_csv(os.path.dirname(__file__) + f"/{label}30hz{load}.csv.gz") def load_all_data_dict(): all_data_dict = {} - for label in ['h', 'b']: - all_data_dict[label] = {load : load_data(label=label,load=load) for load in range(0,100,10)} + for label in ["h", "b"]: + all_data_dict[label] = { + load: load_data(label=label, load=load) for load in range(0, 100, 10) + } return all_data_dict + def plot_sequence(df, st=0, ed=None, ax=None, figsize=(10, 3), individual=True): if ed is None: ed = 
df.shape[0] @@ -39,6 +42,7 @@ def plot_sequence(df, st=0, ed=None, ax=None, figsize=(10, 3), individual=True): df.iloc[st:ed].plot(ax=ax, figsize=figsize, legend=True) + def gen_summary(outdir=None, st=0, ed=500, wd=20, hg=8): if outdir is None: @@ -46,31 +50,48 @@ def gen_summary(outdir=None, st=0, ed=500, wd=20, hg=8): os.makedirs(outdir, exist_ok=True) - with PdfPages(outdir + '/gfd_summary.pdf') as pp: - for label in ['h', 'b']: + with PdfPages(outdir + "/gfd_summary.pdf") as pp: + for label in ["h", "b"]: for load in tqdm.trange(0, 100, 10, desc=label): fig, ax = plt.subplots(5) - plot_sequence(load_data(label=label, load=load), st=st, ed=ed, ax=ax[:4], figsize=(wd, hg), individual=True) - plot_sequence(load_data(label=label, load=load), st=st, ed=ed, ax=ax[-1], figsize=(wd, hg), individual=False) - - ax[-1].set_xlabel('Time') - for axi in ax: axi.set_ylabel('Value') - name = 'Healty' if label == 'h' else 'BrokenTooth' - ax[0].set_title(f'{name}: Load= {load}') - - fig.savefig(pp, bbox_inches='tight', format='pdf') + plot_sequence( + load_data(label=label, load=load), + st=st, + ed=ed, + ax=ax[:4], + figsize=(wd, hg), + individual=True, + ) + plot_sequence( + load_data(label=label, load=load), + st=st, + ed=ed, + ax=ax[-1], + figsize=(wd, hg), + individual=False, + ) + + ax[-1].set_xlabel("Time") + for axi in ax: + axi.set_ylabel("Value") + name = "Healty" if label == "h" else "BrokenTooth" + ax[0].set_title(f"{name}: Load= {load}") + + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() plt.close() + def plot_sequences_under_h_and_b_conditions(): all_data_dict = load_all_data_dict() - for load in range(0,100,10): - data = pd.concat([all_data_dict["h"][load], all_data_dict["b"][load]],axis=0).reset_index(drop=True) - septime=len(all_data_dict["h"][load]) - fig, axes = plt.subplots(4, figsize=(12,9)) + for load in range(0, 100, 10): + data = pd.concat( + [all_data_dict["h"][load], all_data_dict["b"][load]], axis=0 + ).reset_index(drop=True) + septime = len(all_data_dict["h"][load]) + fig, axes = plt.subplots(4, figsize=(12, 9)) axes[0].set_title(f"healthy and broken tooth conditions load: {load}") for i, col in enumerate(data.columns): data[col].reset_index(drop=True).iloc[::10].plot(y=col, ax=axes[i]) - axes[i].axvline(x=septime,color="red") - + axes[i].axvline(x=septime, color="red") diff --git a/datasets/hydsys/__init__.py b/datasets/hydsys/__init__.py index c13acd7..2d9458a 100644 --- a/datasets/hydsys/__init__.py +++ b/datasets/hydsys/__init__.py @@ -5,20 +5,20 @@ def load_data(sensor=None, rw=0): - + if sensor is None: # load full data # rw is ignored to concatenate all sensor data df = [] - df.append(load_sensor_data('PS', rw=10)) # default length: 6000 - df.append(load_sensor_data('EPS', rw=10)) # default length: 6000 - df.append(load_sensor_data('FS', rw=0)) # default length: 600 + df.append(load_sensor_data("PS", rw=10)) # default length: 6000 + df.append(load_sensor_data("EPS", rw=10)) # default length: 6000 + df.append(load_sensor_data("FS", rw=0)) # default length: 600 # df.append(load_sensor_data('TS', rw=0)) # default length: 60 return pd.concat(df) else: return load_sensor_data(sensor, rw) - + def load_sensor_data(sensor, rw=0): @@ -27,50 +27,54 @@ def load_sensor_data(sensor, rw=0): fp = os.path.dirname(__file__) for name in tqdm.tqdm(sensor_list, desc=sensor): - df = pd.DataFrame(np.loadtxt(fp + f'/{name}.txt.gz')) + df = pd.DataFrame(np.loadtxt(fp + f"/{name}.txt.gz")) df = resample(df, rw) - df['sensor'] = name - df['cycle'] = df.index.values + df["sensor"] 
= name + df["cycle"] = df.index.values data.append(df) - return pd.concat(data).set_index(['cycle', 'sensor']).reset_index() + return pd.concat(data).set_index(["cycle", "sensor"]).reset_index() def get_sensor_list(name): - if name == 'PS': - return [f'PS{i+1}' for i in range(6)] - elif name == 'EPS': - return ['EPS1'] - elif name == 'FS': - return [f'FS{i+1}' for i in range(2)] - elif name == 'TS': - return [f'TS{i+1}' for i in range(4)] - elif name == 'VS': - return ['VS1'] + if name == "PS": + return [f"PS{i+1}" for i in range(6)] + elif name == "EPS": + return ["EPS1"] + elif name == "FS": + return [f"FS{i+1}" for i in range(2)] + elif name == "TS": + return [f"TS{i+1}" for i in range(4)] + elif name == "VS": + return ["VS1"] else: raise ValueError def load_labels(): fp = os.path.dirname(__file__) - return pd.DataFrame(np.loadtxt(fp + '/profile.txt'), - columns=[ - 'cooler_condition', - 'valve_condition', - 'internal_pump_leakage', - 'hydraulic_accumulator', - 'stable_flag']).reset_index().rename( - columns={'index': 'cycle'}) + return ( + pd.DataFrame( + np.loadtxt(fp + "/profile.txt"), + columns=[ + "cooler_condition", + "valve_condition", + "internal_pump_leakage", + "hydraulic_accumulator", + "stable_flag", + ], + ) + .reset_index() + .rename(columns={"index": "cycle"}) + ) def resample(df, rw=0): if rw > 0: # Resampling df = df.T - df['index'] = df.index.values // rw - df = df.groupby('index').mean() + df["index"] = df.index.values // rw + df = df.groupby("index").mean() df = df.T return df - - diff --git a/datasets/mapm/__init__.py b/datasets/mapm/__init__.py index 9ad727f..b44553a 100644 --- a/datasets/mapm/__init__.py +++ b/datasets/mapm/__init__.py @@ -12,17 +12,19 @@ def load_data(): fp = os.path.dirname(__file__) # Sensor data - data = pd.read_csv(fp + '/PdM_telemetry.csv.gz') + data = pd.read_csv(fp + "/PdM_telemetry.csv.gz") # Error alarm logs data = data.merge( - pd.read_csv(fp + '/PdM_errors.csv.gz'), - how='left', on=['datetime', 'machineID']) + pd.read_csv(fp + "/PdM_errors.csv.gz"), how="left", on=["datetime", "machineID"] + ) # Failure logs data = data.merge( - pd.read_csv(fp + '/PdM_failures.csv.gz'), - how='left', on=['datetime', 'machineID']) + pd.read_csv(fp + "/PdM_failures.csv.gz"), + how="left", + on=["datetime", "machineID"], + ) # Formatting data.datetime = pd.to_datetime(data.datetime) @@ -33,19 +35,19 @@ def load_data(): def cleaning(df): # NaN values are encoded to -1 - df = df.sort_values('errorID') + df = df.sort_values("errorID") df.errorID = df.errorID.factorize()[0] - df = df.sort_values('failure') + df = df.sort_values("failure") df.failure = df.failure.factorize()[0] - df = df.sort_values(['machineID', 'datetime']) + df = df.sort_values(["machineID", "datetime"]) - df.errorID = df.errorID.astype('category') - df.failure = df.failure.astype('category') + df.errorID = df.errorID.astype("category") + df.failure = df.failure.astype("category") - df.volt = df.volt.astype('float32') - df.rotate = df.rotate.astype('float32') - df.pressure = df.pressure.astype('float32') - df.vibration = df.vibration.astype('float32') + df.volt = df.volt.astype("float32") + df.rotate = df.rotate.astype("float32") + df.pressure = df.pressure.astype("float32") + df.vibration = df.vibration.astype("float32") df.datetime = pd.to_datetime(df.datetime) return df @@ -55,73 +57,94 @@ def load_clean_data(): return cleaning(load_data()) -def generate_run_to_failure(raw_data, health_censor_aug=1000, - min_lifetime=10, max_lifetime=300, - seed=123, outfn=None): +def 
generate_run_to_failure( + raw_data, + health_censor_aug=1000, + min_lifetime=10, + max_lifetime=300, + seed=123, + outfn=None, +): run_to_failure = [] error_ids = raw_data.errorID.dropna().sort_values().unique().tolist() - for machine_id, g in tqdm.tqdm(raw_data.groupby('machineID'), desc='run-to-failure'): - g = g.set_index('datetime').sort_index() + for machine_id, g in tqdm.tqdm( + raw_data.groupby("machineID"), desc="run-to-failure" + ): + g = g.set_index("datetime").sort_index() start_date = g.index.values[0] failures = g.loc[~g.failure.isnull()] for event_time, event in failures.iterrows(): # Extracting a single cycle/process - cycle = g[start_date:event_time].drop('machineID', axis=1) + cycle = g[start_date:event_time].drop("machineID", axis=1) lifetime = (event_time - start_date).days if lifetime < 1: start_date = event_time continue - numerical_features = cycle.agg(['min', 'max', 'mean']).unstack().reset_index() - numerical_features['feature'] = numerical_features.level_0.str.cat(numerical_features.level_1, sep='_') - numerical_features = numerical_features.pivot_table(columns='feature', values=0) + numerical_features = ( + cycle.agg(["min", "max", "mean"]).unstack().reset_index() + ) + numerical_features["feature"] = numerical_features.level_0.str.cat( + numerical_features.level_1, sep="_" + ) + numerical_features = numerical_features.pivot_table( + columns="feature", values=0 + ) - categorical_features = pd.DataFrame(Counter(cycle.errorID), columns=error_ids, index=[0]) + categorical_features = pd.DataFrame( + Counter(cycle.errorID), columns=error_ids, index=[0] + ) sample = pd.concat([numerical_features, categorical_features], axis=1) - sample[['machine_id', 'lifetime', 'broken']] = machine_id, lifetime, 1 + sample[["machine_id", "lifetime", "broken"]] = machine_id, lifetime, 1 run_to_failure.append(sample) start_date = event_time run_to_failure = pd.concat(run_to_failure, axis=0).reset_index(drop=True) - health_censors = censoring_augmentation(raw_data, + health_censors = censoring_augmentation( + raw_data, n_samples=health_censor_aug, min_lifetime=min_lifetime, max_lifetime=max_lifetime, - seed=seed) + seed=seed, + ) run_to_failure = pd.concat([run_to_failure, health_censors]) # Shuffle - run_to_failure = run_to_failure.sample(frac=1, random_state=seed).reset_index(drop=True) - run_to_failure = run_to_failure.fillna(0.) 
- + run_to_failure = run_to_failure.sample(frac=1, random_state=seed).reset_index( + drop=True + ) + run_to_failure = run_to_failure.fillna(0.0) + if outfn is not None: run_to_failure.to_csv(outfn, index=False) return run_to_failure -def censoring_augmentation(raw_data, n_samples=10, max_lifetime=150, min_lifetime=2, seed=123): +def censoring_augmentation( + raw_data, n_samples=10, max_lifetime=150, min_lifetime=2, seed=123 +): error_ids = raw_data.errorID.dropna().sort_values().unique().tolist() np.random.seed(seed) samples = [] - pbar = tqdm.tqdm(total=n_samples, desc='augmentation') + pbar = tqdm.tqdm(total=n_samples, desc="augmentation") while len(samples) < n_samples: - + censor_timing = np.random.randint(min_lifetime, max_lifetime) machine_id = np.random.randint(100) + 1 tmp = raw_data[raw_data.machineID == machine_id] - tmp = tmp.drop('machineID', axis=1).set_index('datetime').sort_index() + tmp = tmp.drop("machineID", axis=1).set_index("datetime").sort_index() failures = tmp[~tmp.failure.isnull()] if failures.shape[0] < 2: @@ -130,7 +153,11 @@ def censoring_augmentation(raw_data, n_samples=10, max_lifetime=150, min_lifetim failure_id = np.random.randint(failures.shape[0]) failure = failures.iloc[failure_id] event_time = failure.name - start_date = tmp.index.values[0] if failure_id == 0 else failures.iloc[failure_id - 1].name + start_date = ( + tmp.index.values[0] + if failure_id == 0 + else failures.iloc[failure_id - 1].name + ) # censoring cycle = tmp[start_date:event_time] @@ -139,14 +166,20 @@ def censoring_augmentation(raw_data, n_samples=10, max_lifetime=150, min_lifetim if not cycle.shape[0] == censor_timing: continue - numerical_features = cycle.agg(['min', 'max', 'mean', 'std']).unstack().reset_index() - numerical_features['feature'] = numerical_features.level_0.str.cat(numerical_features.level_1, sep='_') - numerical_features = numerical_features.pivot_table(columns='feature', values=0) + numerical_features = ( + cycle.agg(["min", "max", "mean", "std"]).unstack().reset_index() + ) + numerical_features["feature"] = numerical_features.level_0.str.cat( + numerical_features.level_1, sep="_" + ) + numerical_features = numerical_features.pivot_table(columns="feature", values=0) - categorical_features = pd.DataFrame(Counter(cycle.errorID), columns=error_ids, index=[0]) + categorical_features = pd.DataFrame( + Counter(cycle.errorID), columns=error_ids, index=[0] + ) sample = pd.concat([numerical_features, categorical_features], axis=1) - sample[['machine_id', 'lifetime', 'broken']] = machine_id, censor_timing, 0 + sample[["machine_id", "lifetime", "broken"]] = machine_id, censor_timing, 0 samples.append(sample) pbar.update(1) @@ -154,21 +187,23 @@ def censoring_augmentation(raw_data, n_samples=10, max_lifetime=150, min_lifetim return pd.concat(samples).reset_index(drop=True).fillna(0) -def generate_validation_sets(method='kfold', n_splits=5, seed=123, outdir=None): +def generate_validation_sets(method="kfold", n_splits=5, seed=123, outdir=None): validation_sets = [] - if method == 'kfold': + if method == "kfold": # K-fold cross validation assert type(n_splits) == int assert n_splits > 2 raw_data = load_data() - kfold = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=seed) + kfold = model_selection.KFold( + n_splits=n_splits, shuffle=True, random_state=seed + ) for i, (train_index, test_index) in enumerate(kfold.split(np.arange(100))): - - print('K-fold {}/{}'.format(i+1, n_splits)) + + print("K-fold {}/{}".format(i + 1, n_splits)) # train/test split by machine 
ID train_machines = raw_data[raw_data.machineID.isin(train_index)] test_machines = raw_data[raw_data.machineID.isin(test_index)] @@ -177,9 +212,11 @@ def generate_validation_sets(method='kfold', n_splits=5, seed=123, outdir=None): # convert the two sets into run-to-failure data train_censored_data = generate_run_to_failure( - train_machines, health_censor_aug=len(train_index)*10, seed=seed) + train_machines, health_censor_aug=len(train_index) * 10, seed=seed + ) test_consored_data = generate_run_to_failure( - test_machines, health_censor_aug=len(test_index)*10, seed=seed) + test_machines, health_censor_aug=len(test_index) * 10, seed=seed + ) # print('train:', train_censored_data.shape) # print('test:', test_consored_data.shape) @@ -187,19 +224,23 @@ def generate_validation_sets(method='kfold', n_splits=5, seed=123, outdir=None): validation_sets.append((train_censored_data, test_consored_data)) if outdir is not None: - train_censored_data.to_csv(outdir + f'/train_{i}.csv.gz', index=False) - test_consored_data.to_csv(outdir + f'/test_{i}.csv.gz', index=False) + train_censored_data.to_csv(outdir + f"/train_{i}.csv.gz", index=False) + test_consored_data.to_csv(outdir + f"/test_{i}.csv.gz", index=False) - elif method == 'leave-one-out': + elif method == "leave-one-out": raise NotImplementedError return validation_sets def load_validation_sets(filepath, n_splits=5): - return [(pd.read_csv(filepath + f'/train_{i}.csv.gz'), - pd.read_csv(filepath + f'/test_{i}.csv.gz')) - for i in range(n_splits)] + return [ + ( + pd.read_csv(filepath + f"/train_{i}.csv.gz"), + pd.read_csv(filepath + f"/test_{i}.csv.gz"), + ) + for i in range(n_splits) + ] def plot_sequence_and_events(data, machine_id=1): @@ -207,17 +248,17 @@ def plot_sequence_and_events(data, machine_id=1): data = data[data.machineID == machine_id] fig, ax = plt.subplots(4 + 2, figsize=(8, 8)) - data.plot(y='volt', legend=True, ax=ax[0]) - data.plot(y='rotate', legend=True, ax=ax[1]) - data.plot(y='pressure', legend=True, ax=ax[2]) - data.plot(y='vibration', legend=True, ax=ax[3]) + data.plot(y="volt", legend=True, ax=ax[0]) + data.plot(y="rotate", legend=True, ax=ax[1]) + data.plot(y="pressure", legend=True, ax=ax[2]) + data.plot(y="vibration", legend=True, ax=ax[3]) if data.errorID.isnull().sum() < data.errorID.shape[0]: pd.get_dummies(data.errorID).plot(ax=ax[4]) if data.failure.isnull().sum() < data.failure.shape[0]: pd.get_dummies(data.failure).plot(ax=ax[5]) - ax[0].set_title('Machine #{}'.format(machine_id)) + ax[0].set_title("Machine #{}".format(machine_id)) for i in range(5): ax[i].set_xlabel(None) @@ -236,9 +277,9 @@ def gen_summary(outdir=None): os.makedirs(outdir, exist_ok=True) df = load_data() - with PdfPages(outdir + '/mapm_summary.pdf') as pp: + with PdfPages(outdir + "/mapm_summary.pdf") as pp: for i in tqdm.trange(1, 101): fig, _ = plot_sequence_and_events(df, machine_id=i) - fig.savefig(pp, format='pdf') + fig.savefig(pp, format="pdf") plt.clf() plt.close() diff --git a/datasets/ppd/__init__.py b/datasets/ppd/__init__.py index 06b0d2a..0d4daa8 100644 --- a/datasets/ppd/__init__.py +++ b/datasets/ppd/__init__.py @@ -8,62 +8,61 @@ def load_data(index=0): - """ 0: C7 - 1: C8 - 2: C9 - 3: C11 - 4: C13 - 5: C14 - 6: C15 - 7: C16 - - Note that C7 and C13 included a short break - (for about 100 timestamps long) - between the two procedure. + """0: C7 + 1: C8 + 2: C9 + 3: C11 + 4: C13 + 5: C14 + 6: C15 + 7: C16 + + Note that C7 and C13 included a short break + (for about 100 timestamps long) + between the two procedure. 
""" fp = os.path.dirname(__file__) if index == 0: - df = pd.read_csv(fp + '/C7-1.csv.gz') - df = pd.concat([df, pd.read_csv(fp + '/C7-2.csv.gz')]) + df = pd.read_csv(fp + "/C7-1.csv.gz") + df = pd.concat([df, pd.read_csv(fp + "/C7-2.csv.gz")]) df = df.reset_index(drop=True) df.Timestamp = df.index.values return df elif index == 1: - return pd.read_csv(fp + '/C8.csv.gz') + return pd.read_csv(fp + "/C8.csv.gz") elif index == 2: - return pd.read_csv(fp + '/C9.csv.gz') + return pd.read_csv(fp + "/C9.csv.gz") elif index == 3: - return pd.read_csv(fp + '/C11.csv.gz') + return pd.read_csv(fp + "/C11.csv.gz") elif index == 4: - df = pd.read_csv(fp + '/C13-1.csv.gz') - df = pd.concat([df, pd.read_csv(fp + '/C13-2.csv.gz')]) + df = pd.read_csv(fp + "/C13-1.csv.gz") + df = pd.concat([df, pd.read_csv(fp + "/C13-2.csv.gz")]) df = df.reset_index(drop=True) df.Timestamp = df.index.values return df elif index == 5: - return pd.read_csv(fp + '/C14.csv.gz') + return pd.read_csv(fp + "/C14.csv.gz") elif index == 6: - return pd.read_csv(fp + '/C15.csv.gz') + return pd.read_csv(fp + "/C15.csv.gz") elif index == 7: - return pd.read_csv(fp + '/C16.csv.gz') + return pd.read_csv(fp + "/C16.csv.gz") else: raise ValueError def rename_components(df): - """ current and speed - """ + """current and speed""" # Rename L - L_curr = ['L_1', 'L_3', 'L_4', 'L_7', 'L_9'] - L_speed = ['L_2', 'L_6', 'L_5', 'L_8', 'L_10'] - df = df.rename(columns={k: f'c{i}_curr' for i, k in enumerate(L_curr)}) - df = df.rename(columns={k: f'c{i}_speed' for i, k in enumerate(L_speed)}) + L_curr = ["L_1", "L_3", "L_4", "L_7", "L_9"] + L_speed = ["L_2", "L_6", "L_5", "L_8", "L_10"] + df = df.rename(columns={k: f"c{i}_curr" for i, k in enumerate(L_curr)}) + df = df.rename(columns={k: f"c{i}_speed" for i, k in enumerate(L_speed)}) # Rename A, B, and C - df = df.rename(columns={f'A_{i}': f'c5_val{i}' for i in range(1, 6)}) - df = df.rename(columns={f'B_{i}': f'c6_val{i}' for i in range(1, 6)}) - df = df.rename(columns={f'C_{i}': f'c7_val{i}' for i in range(1, 6)}) + df = df.rename(columns={f"A_{i}": f"c5_val{i}" for i in range(1, 6)}) + df = df.rename(columns={f"B_{i}": f"c6_val{i}" for i in range(1, 6)}) + df = df.rename(columns={f"C_{i}": f"c7_val{i}" for i in range(1, 6)}) return df[df.columns.sort_values()] @@ -75,11 +74,11 @@ def load_clean_data(index=0): def set_broken_labels(df, size): labels = np.zeros(df.shape[0]) labels[-size:] = 1 - df['broken'] = labels + df["broken"] = labels return df -def run_to_failure_aux(df, n_sample, desc=''): +def run_to_failure_aux(df, n_sample, desc=""): seq_len = df.shape[0] samples = [] @@ -88,7 +87,7 @@ def run_to_failure_aux(df, n_sample, desc=''): while len(samples) < n_sample: # random censoring t = np.random.randint(2, seq_len) - sample = {'lifetime': t, 'broken': df.loc[t, 'broken']} + sample = {"lifetime": t, "broken": df.loc[t, "broken"]} sample = pd.DataFrame(sample, index=[0]) features = df.iloc[:t].mean(axis=0)[:-1] sample[features.keys()] = features.values @@ -102,14 +101,13 @@ def run_to_failure_aux(df, n_sample, desc=''): def generate_run_to_failure(n_sample=1000, bronken_holdout_steps=2000): samples = [] - print('Generating run-to-failure data:') + print("Generating run-to-failure data:") for index in range(8): - raw_df = load_clean_data(index=index).set_index('Timestamp') + raw_df = load_clean_data(index=index).set_index("Timestamp") raw_df = set_broken_labels(raw_df, size=bronken_holdout_steps) - sample = run_to_failure_aux( - raw_df, n_sample, desc=f'component {index+1}/8') - 
sample['trial_id'] = index + sample = run_to_failure_aux(raw_df, n_sample, desc=f"component {index+1}/8") + sample["trial_id"] = index samples.append(sample) return pd.concat(samples, axis=0).reset_index(drop=True) @@ -121,21 +119,22 @@ def gen_summary(outdir=None): outdir = os.path.dirname(__file__) os.makedirs(outdir, exist_ok=True) - sns.set(font_scale=1.1, style='whitegrid') + sns.set(font_scale=1.1, style="whitegrid") - with PdfPages(outdir + '/ppd_summary.pdf') as pp: + with PdfPages(outdir + "/ppd_summary.pdf") as pp: for i in tqdm.trange(8): df = load_data(index=i) df = rename_components(df) fig, ax = plt.subplots(8, figsize=(20, 20)) for i in range(8): - df.loc[:, df.columns.str.contains(f'c{i}')].plot( - ax=ax[i], legend=True, cmap='tab10') + df.loc[:, df.columns.str.contains(f"c{i}")].plot( + ax=ax[i], legend=True, cmap="tab10" + ) ax[i].set_ylabel("Value") ax[i].set_xlim(0, df.shape[0]) - ax[-1].set_xlabel('Time') - fig.savefig(pp, bbox_inches='tight', format='pdf') + ax[-1].set_xlabel("Time") + fig.savefig(pp, bbox_inches="tight", format="pdf") plt.clf() - plt.close() \ No newline at end of file + plt.close() diff --git a/datasets/ufd/__init__.py b/datasets/ufd/__init__.py index 88922d1..ba8f565 100644 --- a/datasets/ufd/__init__.py +++ b/datasets/ufd/__init__.py @@ -3,40 +3,40 @@ import numpy as np -def load_data(meter_id='A'): +def load_data(meter_id="A"): fp = os.path.dirname(__file__) - data = np.loadtxt(fp + '/Meter{}.txt'.format(meter_id)) + data = np.loadtxt(fp + "/Meter{}.txt".format(meter_id)) - if meter_id == 'A': - columns = ['flatness_ratio', 'symmetry', 'crossflow'] - columns += ['flow_velocity_{}'.format(i+1) for i in range(8)] - columns += ['sound_speed_{}'.format(i+1) for i in range(8)] - columns += ['average_speed'] - columns += ['gain_{}'.format(i+1) for i in range(16)] - columns += ['health_state'] + if meter_id == "A": + columns = ["flatness_ratio", "symmetry", "crossflow"] + columns += ["flow_velocity_{}".format(i + 1) for i in range(8)] + columns += ["sound_speed_{}".format(i + 1) for i in range(8)] + columns += ["average_speed"] + columns += ["gain_{}".format(i + 1) for i in range(16)] + columns += ["health_state"] - if meter_id == 'B': - columns = ['profile_factor', 'symmetry', 'crossflow', 'swirl_angle'] - columns += ['flow_velocity_{}'.format(i+1) for i in range(4)] - columns += ['average_flow'] - columns += ['sound_speed_{}'.format(i+1) for i in range(4)] - columns += ['average_speed'] - columns += ['signal_strength_{}'.format(i+1) for i in range(8)] - columns += ['turbulence_{}'.format(i+1) for i in range(4)] - columns += ['meter_performance'] - columns += ['signal_quality_{}'.format(i+1) for i in range(8)] - columns += ['gain_{}'.format(i+1) for i in range(8)] - columns += ['transit_time_{}'.format(i+1) for i in range(8)] - columns += ['health_state'] + if meter_id == "B": + columns = ["profile_factor", "symmetry", "crossflow", "swirl_angle"] + columns += ["flow_velocity_{}".format(i + 1) for i in range(4)] + columns += ["average_flow"] + columns += ["sound_speed_{}".format(i + 1) for i in range(4)] + columns += ["average_speed"] + columns += ["signal_strength_{}".format(i + 1) for i in range(8)] + columns += ["turbulence_{}".format(i + 1) for i in range(4)] + columns += ["meter_performance"] + columns += ["signal_quality_{}".format(i + 1) for i in range(8)] + columns += ["gain_{}".format(i + 1) for i in range(8)] + columns += ["transit_time_{}".format(i + 1) for i in range(8)] + columns += ["health_state"] - if meter_id == 'C' or meter_id == 'D': 
- columns = ['profile_factor', 'symmetry', 'crossflow'] - columns += ['flow_velocity_{}'.format(i+1) for i in range(4)] - columns += ['sound_speed_{}'.format(i+1) for i in range(4)] - columns += ['signal_strength_{}'.format(i+1) for i in range(8)] - columns += ['signal_quality_{}'.format(i+1) for i in range(8)] - columns += ['gain_{}'.format(i+1) for i in range(8)] - columns += ['transit_time_{}'.format(i+1) for i in range(8)] - columns += ['health_state'] + if meter_id == "C" or meter_id == "D": + columns = ["profile_factor", "symmetry", "crossflow"] + columns += ["flow_velocity_{}".format(i + 1) for i in range(4)] + columns += ["sound_speed_{}".format(i + 1) for i in range(4)] + columns += ["signal_strength_{}".format(i + 1) for i in range(8)] + columns += ["signal_quality_{}".format(i + 1) for i in range(8)] + columns += ["gain_{}".format(i + 1) for i in range(8)] + columns += ["transit_time_{}".format(i + 1) for i in range(8)] + columns += ["health_state"] - return pd.DataFrame(data, columns=columns) \ No newline at end of file + return pd.DataFrame(data, columns=columns) diff --git a/datasets/utils.py b/datasets/utils.py index 9153e9f..190a89b 100644 --- a/datasets/utils.py +++ b/datasets/utils.py @@ -4,10 +4,8 @@ def train_test_split(arrays, test_size=0.2, random_state=None, shuffle=False): return model_selection.train_test_split( - arrays=arrays, - test_size=test_size, - random_state=random_state, - shuffle=shuffle) + arrays=arrays, test_size=test_size, random_state=random_state, shuffle=shuffle + ) def hist_survival_time(data, ax=None, figsize=(6, 4), bins_0=20, bins_1=30): @@ -15,15 +13,15 @@ def hist_survival_time(data, ax=None, figsize=(6, 4), bins_0=20, bins_1=30): if ax is None: fig, ax = plt.subplots(figsize=figsize) - time_0 = data.loc[data['broken'] == 0, 'lifetime'] - ax.hist(time_0, bins=bins_0, alpha=0.3, color='blue', label='not broken yet') + time_0 = data.loc[data["broken"] == 0, "lifetime"] + ax.hist(time_0, bins=bins_0, alpha=0.3, color="blue", label="not broken yet") - time_1 = data.loc[data['broken'] == 1, 'lifetime'] - ax.hist(time_1, bins=bins_1, alpha=0.7, color='black', label='broken') + time_1 = data.loc[data["broken"] == 1, "lifetime"] + ax.hist(time_1, bins=bins_1, alpha=0.7, color="black", label="broken") - ax.set_title( 'Histogram - survival time', fontsize=15) + ax.set_title("Histogram - survival time", fontsize=15) if ax is None: return fig, ax else: - return ax \ No newline at end of file + return ax
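
The hunks above are mostly mechanical reformatting, but datasets/cmapss gains a genuinely new entry point, load_clean_data_rul. A minimal usage sketch follows, assuming the repository root is on the Python path and the train_/test_/RUL_*.txt.gz files ship next to datasets/cmapss/__init__.py; the function names and the "RUL" column come from the patched code, while the FD001 choice and the printed fields are only illustrative:

    from datasets import cmapss

    # Per-unit sensor sequences with a "RUL" column appended by load_clean_data_rul:
    # training units count down to 0, test units count down to the per-unit RUL label.
    train_list, test_list = cmapss.load_clean_data_rul(index="FD001")

    first = train_list[0]
    print(first.columns.tolist())             # ['sensor2', ..., 'sensor15', 'RUL']
    print(int(first["RUL"].iloc[-1]))         # 0 -> training units run to failure
    print(int(test_list[0]["RUL"].iloc[-1]))  # remaining life of the first test unit

    # The flat frames are still available when the raw columns are needed.
    train_set, test_set, labels = cmapss.load_data(index="FD001", features=[2, 3, 4, 7])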
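
The Genesis (gdd) loaders keep their signatures, so a short sketch of the plotting path exercised by gen_summary may help; it assumes the Genesis_*.csv.gz files are bundled with datasets/gdd, and the output filename is only an example:

    from datasets import gdd

    df = gdd.load_data(index="anomaly")    # timestamped sensor columns plus "Label"
    print(df.Timestamp.min(), df.Timestamp.max())

    fig, ax = gdd.plot_genesis_labels(df)  # ten stacked panels, one per signal group
    fig.savefig("gdd_anomaly_overview.pdf", bbox_inches="tight")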
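
Likewise for the Azure PdM (mapm) loader, whose run-to-failure generator pairs naturally with the survival-time histogram in datasets/utils.py. The lifetime/broken columns come from the patch; health_censor_aug=200 and seed=0 are arbitrary illustrative values (the default augmentation of 1000 censored samples takes noticeably longer):

    import matplotlib.pyplot as plt
    from datasets import mapm, utils

    raw = mapm.load_data()  # telemetry merged with error and failure logs
    r2f = mapm.generate_run_to_failure(raw, health_censor_aug=200, seed=0)
    print(r2f[["machine_id", "lifetime", "broken"]].head())

    # hist_survival_time splits lifetimes on the binary "broken" flag.
    fig, ax = plt.subplots(figsize=(6, 4))
    utils.hist_survival_time(r2f, ax=ax)
    fig.savefig("mapm_survival_hist.png", bbox_inches="tight")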
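
One behavioural note on the datasets/utils.py hunk: the reformatting keeps the arrays= keyword, but sklearn's model_selection.train_test_split takes its input arrays positionally and does not accept an arrays= keyword, so the wrapper fails at call time both before and after this patch. A corrected sketch, assuming arrays is meant to be a list or tuple of equally sized arrays:

    from sklearn import model_selection

    def train_test_split(arrays, test_size=0.2, random_state=None, shuffle=False):
        # Unpack the sequence positionally; sklearn exposes no `arrays=` keyword.
        return model_selection.train_test_split(
            *arrays, test_size=test_size, random_state=random_state, shuffle=shuffle
        )

    # e.g. X_train, X_test, y_train, y_test = train_test_split([X, y], test_size=0.3)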