run.py
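"""
Entry point for running a root-cause analysis algorithm on a single file or on every
file under a directory tree, and for evaluating the predictions against the labels.
"""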
import os
import time
import pandas as pd
from multiprocessing import Pool
from utils.argument_parser import get_input_arguments
from utils.run_utils import get_instances, read_dataframe, run_method, result_post_processing, get_label
from utils.evaluation import root_cause_postprocessing, score_root_causes


def run_directory(data_root, run_path, algorithm, algorithm_args, derived, n_threads, csv_suffix, debug):
    """
    Run all files in all subdirectories of run_path.
    """
    parallel_run_results = []

    def parallel_callback(result):
        parallel_run_results.append(result)

    instances = get_instances(data_root, run_path)
    pool = Pool(n_threads)
    for dataset, sub_directory, file in instances:
        dataset_name = os.path.basename(dataset)
        # Infer `derived` per instance rather than overwriting the argument; otherwise the
        # value inferred for the first dataset would leak into every later instance.
        instance_derived = derived if derived is not None else dataset_name in ('D', 'RS')
        rs_data = dataset_name == 'RS'
        pool.apply_async(run_instance,
                         args=(data_root, dataset, sub_directory, file, algorithm, algorithm_args,
                               instance_derived, rs_data, debug),
                         callback=parallel_callback)
    pool.close()
    pool.join()
    result_post_processing(parallel_run_results, algorithm, csv_suffix)


def run_single_file(data_root, run_path, algorithm, algorithm_args, derived):
    """
    Run a single file.
    """
    # Interpret run_path as <dataset>/<sub_directories...>/<file> relative to data_root.
    directory_structure = list(filter(None, run_path.split(os.sep)))
    dataset_name = directory_structure[0] if len(directory_structure) > 1 else ''
    sub_directory = os.path.join(*directory_structure[1:-1]) if len(directory_structure) > 2 else ''
    # Strip the file extension (assumed to be .csv).
    file = directory_structure[-1].split('.')[0]
    if derived is None:
        derived = dataset_name in ('D', 'RS')
    rs_data = dataset_name == 'RS'
    run_instance(data_root, dataset_name, sub_directory, file, algorithm, algorithm_args,
                 derived, rs_data, debug=True)


def run_instance(data_root, dataset_name, sub_directory, file, algorithm, algorithm_args, derived=False,
                 rs_data=False, debug=False):
    """
    Runs a single instance (file) and evaluates the result.

    :param data_root: str, the root directory for all datasets.
    :param dataset_name: str, the name of the dataset to run (must be located within data_root).
    :param sub_directory: str, subdirectory of the dataset (can be an empty string or have a depth >= 1).
    :param file: str, the file to run. Should not have any file extension (assumed to be csv).
    :param algorithm: str, the name of the algorithm that should be run.
    :param algorithm_args: dict, any algorithm-specific arguments.
    :param derived: boolean, if the dataset is derived.
        In this case, the two files `file`.a.csv and `file`.b.csv must exist.
    :param rs_data: boolean, if the RobustSpot data (RS) is used, which has a different input format.
    :param debug: boolean, if debug mode should be used.
    :return: (str, str, str, float, float, float, float, float), the dataset name, subdirectory and file name
        are all returned for collecting the results when using multiple threads. Moreover, the F1-score,
        true positive count, false positive count, false negative count and the run time are also returned.
    """
    # `run_dir` avoids shadowing the module-level run_directory function.
    run_dir = os.path.join(data_root, dataset_name, sub_directory)
    print(f'Running file: {os.path.join(run_dir, file)}, derived: {derived}')
    df, attributes, df_a, df_b = read_dataframe(run_dir, file, derived, rs_data)
    start_time = time.time()
    root_causes = run_method(df, [df_a, df_b], attributes, algorithm, algorithm_args, derived, debug)
    root_cause_predictions = root_cause_postprocessing(root_causes, algorithm)
    run_time = time.time() - start_time
    # Get the label.
    label = get_label(run_dir, file, rs_data)
    # Evaluate the root causes; guard the F1-score against an all-zero denominator,
    # which would otherwise raise a ZeroDivisionError.
    TP, FP, FN, true_labels = score_root_causes(root_cause_predictions, label)
    denominator = 2 * TP + FP + FN
    F1 = 2 * TP / denominator if denominator > 0 else 0.0
    print('dataset:', dataset_name, 'sub_directory:', sub_directory, 'file:', file, 'label:', label)
    print('Run time:', run_time)
    print('TP:', TP, 'FP:', FP, 'FN:', FN)
    print('True labels: ', true_labels)
    print('Predicted labels:', root_cause_predictions)
    return dataset_name, sub_directory, file, F1, TP, FP, FN, run_time
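
# Example of calling run_instance directly (hypothetical dataset layout, file and
# algorithm names, for illustration only; valid values depend on the data under data_root):
#   run_instance('data', 'A', '', 'some_file', 'riskloc', {}, derived=False, rs_data=False, debug=True)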


if __name__ == "__main__":
    # Get the parsed input arguments.
    args, data_root, run_path, algorithm_args, is_single_file = get_input_arguments()
    print('Running', args.algorithm, 'with arguments:', algorithm_args)
    if is_single_file:
        run_single_file(data_root, run_path, args.algorithm, algorithm_args, args.derived)
    else:
        # Add the algorithm-specific arguments to the given csv suffix, e.g. {'alpha': 0.5} -> 'alpha-05'.
        argument_list = [k + '-' + str(v).replace('.', '') for k, v in algorithm_args.items()]
        csv_suffix = '-'.join(['', args.output_suffix, *argument_list])
        # Without an output suffix the join produces a doubled separator; drop one of them.
        csv_suffix = csv_suffix if args.output_suffix != '' else csv_suffix[1:]
        run_directory(data_root, run_path, args.algorithm, algorithm_args, args.derived, args.n_threads,
                      csv_suffix, args.debug)
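
# Example invocation (hypothetical flag names, shown for illustration; the actual CLI
# options are defined in utils.argument_parser.get_input_arguments):
#   python run.py --algorithm <name> --n_threads 4 --output_suffix test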