2
2
import numpy as np
3
3
import matplotlib .pyplot as plt
4
4
import matplotlib
5
- from tqdm import tqdm
6
5
import algorithms
7
6
import datasets
8
7
import pandas as pd
9
8
import seaborn as sns
9
+ import kernel_methods
10
+ import sobol_sphere
11
+ from concurrent import futures
12
+ from itertools import repeat
13
+ import joblib
14
+
15
# On-disk memoization for the expensive experiment runs; cache lives in ./tmp
# and is keyed by the decorated functions' arguments.
mem = joblib.Memory(location='./tmp', verbose=1)


# Non-interactive backend: figures are only written to disk, never displayed.
matplotlib.use('agg')
# NOTE(review): the "seaborn" style name was renamed to "seaborn-v0_8" in
# matplotlib 3.6 — confirm the pinned matplotlib version accepts this.
plt.style.use("seaborn")
@@ -17,7 +23,74 @@ def rmse(a, b):
17
23
return np .sqrt (((a - b ) ** 2 ).mean ())
18
24
19
25
20
- def plot_experiments ():
26
def get_eval_schedule(min_samples, max_evals):
    """Build a logarithmically spaced schedule of evaluation budgets.

    Generates roughly five points per decade over 10**(1/5)..10**(19/5),
    snaps each to the nearest multiple of ``min_samples``, then keeps the
    unique values inside [min_samples, max_evals], preserving order.
    """
    raw = np.power(10.0, np.arange(1, 20) / 5.0)
    snapped = (np.round(raw / min_samples) * min_samples).astype(int)
    in_range = snapped[(snapped >= min_samples) & (snapped <= max_evals)]
    # dict keys keep first-seen order, so duplicates drop without re-sorting
    return list(dict.fromkeys(in_range))
34
+
35
+
36
def get_partial_results(alg, alg_name, num_evals, required_repeats, data, data_name):
    """Run one algorithm at one evaluation budget for several trials.

    Parameters
    ----------
    alg : approximation algorithm exposing max_evals() and shap_values()
    alg_name : label recorded in the output table
    num_evals : model-evaluation budget for this run
    required_repeats : number of independent trials
    data : tuple (model, X_background, X_foreground, exact_shap_values)
    data_name : dataset label recorded in the output table

    Returns
    -------
    pd.DataFrame
        One row per trial; empty when the budget exceeds the algorithm's
        maximum for this feature count.
    """
    # Fixed: the original declared a "Function evals" column here but every
    # row was appended under "marginal_evals" (which is also what
    # plot_experiments reads), leaving "Function evals" permanently NaN.
    columns = ["Dataset", "Algorithm", "marginal_evals", "Trial", "rmse"]
    model, X_background, X_foreground, exact_shap_values = data
    if num_evals > alg.max_evals(X_background.shape[1]):
        return pd.DataFrame(columns=columns)
    # Raw-margin predictions; presumably an xgboost model — TODO confirm.
    model_predict = lambda X: model.get_booster().inplace_predict(X, predict_type='margin')

    # Collect rows in a list: DataFrame.append was removed in pandas 2.0 and
    # re-copied the frame on every call anyway.
    rows = []
    for trial_i in range(required_repeats):
        shap_values = alg.shap_values(X_background, X_foreground,
                                      model_predict,
                                      num_evals)
        rows.append(
            {"Dataset": data_name, "Algorithm": alg_name, "marginal_evals": num_evals,
             "Trial": trial_i,
             "rmse": rmse(shap_values, exact_shap_values)})
    return pd.DataFrame(rows, columns=columns)
54
+
55
+
56
@mem.cache  # joblib: results cached on disk, keyed by the arguments
def run_experiments(datasets_set, algorithms_set, repeats,
                    max_evals):
    """Sweep every (dataset, algorithm, budget) combination and collect rmse.

    For each dataset and algorithm, builds an evaluation schedule and runs
    get_partial_results for each budget in a thread pool, concatenating the
    per-budget frames into one results table.
    """
    # Algorithms whose output is identical across trials; run them once.
    deterministic_algorithms = ["Fibonacci"]

    seed = 33
    np.random.seed(seed)
    # NOTE(review): `cp` (presumably cupy) is not imported anywhere in the
    # visible part of this file — confirm it is imported on the first line.
    cp.random.seed(seed)
    # NOTE(review): "Function evals" is declared but rows arrive keyed as
    # "marginal_evals" from get_partial_results, so this column stays NaN.
    df = pd.DataFrame(columns=["Dataset", "Algorithm", "Function evals", "Trial", "rmse"])
    for data_name, data in datasets_set.items():
        model, X_background, X_foreground, exact_shap_values = data
        n_features = X_background.shape[1]
        for alg_name, alg in algorithms_set.items():
            eval_schedule = get_eval_schedule(alg.min_samples(n_features), max_evals)
            print("Dataset - " + data_name + ", Alg - " + alg_name)
            required_repeats = repeats
            if alg_name in deterministic_algorithms:
                required_repeats = 1
            # One thread per budget; repeat(...) broadcasts the fixed args.
            with futures.ThreadPoolExecutor() as executor:
                for result in executor.map(get_partial_results, repeat(alg), repeat(alg_name),
                                           eval_schedule, repeat(required_repeats), repeat(data),
                                           repeat(data_name)):
                    # NOTE(review): DataFrame.append was removed in pandas 2.0.
                    df = df.append(result)
    return df
80
+
81
+
82
def plot_experiments(name, df):
    """Write one log-log rmse-vs-evaluations figure per dataset under figures/."""
    for dataset in df["Dataset"].unique():
        subset = df.loc[df["Dataset"] == dataset]
        plt.figure(figsize=(4 * 1.3, 3 * 1.3))
        sns.lineplot(data=subset, x="marginal_evals", y="rmse", hue="Algorithm")
        plt.xscale('log')
        plt.yscale('log')
        plt.tight_layout()
        plt.savefig('figures/' + name + '_' + dataset + '_shap.png')
        plt.clf()
91
+
92
+
93
+ def kernel_experiments ():
21
94
repeats = 25
22
95
foreground_examples = 10
23
96
background_examples = 100
@@ -29,65 +102,138 @@ def plot_experiments():
29
102
"breast_cancer" : datasets .get_breast_cancer (foreground_examples , background_examples ),
30
103
}
31
104
algorithms_set = {
32
- # "MC": algorithms.monte_carlo,
33
- # "Bayesian-MC": algorithms.monte_carlo_weighted,
34
- "SBQ" : algorithms .sbq ,
35
- # "MC-Orthogonal-Bayesian": algorithms.orthogonal_weighted,
36
- "MC-Orthogonal" : algorithms .orthogonal ,
37
- # "Castro-Complement": algorithms.monte_carlo_antithetic,
38
- # "Fibonacci": algorithms.fibonacci,
39
- # "Castro-ControlVariate": algorithms.castro_control_variate,
40
- # "Castro-QMC": algorithms.castro_qmc,
41
- # "KT-Herding": algorithms.kt_herding,
42
- # "Spearman-Herding": algorithms.spearman_herding,
43
- # "Spearman-Herding-Exact": algorithms.spearman_herding_exact,
105
+ "Mallows-Herding-0.5" : algorithms .KernelHerding (kernel_methods .MallowsKernel (l = 0.5 )),
106
+ "Mallows-Herding-5" : algorithms .KernelHerding (kernel_methods .MallowsKernel (l = 5 )),
107
+ "Mallows-Herding-50" : algorithms .KernelHerding (kernel_methods .MallowsKernel (l = 50 )),
108
+ "KT-Herding" : algorithms .KernelHerding (kernel_methods .KTKernel ()),
109
+ "Spearman-Herding" : algorithms .KernelHerding (kernel_methods .SpearmanKernel ()),
44
110
}
45
111
46
- deterministic_algorithms = ["Castro-QMC" ,"Fibonacci" ]
112
+ df = run_experiments (datasets_set , algorithms_set , repeats , max_evals )
113
+ plot_experiments ("kernel/kernel" , df )
47
114
48
- seed = 32
49
- np .random .seed (seed )
50
- cp .random .seed (seed )
51
- for data_name , data in datasets_set .items ():
52
- model , X_background , X_foreground , exact_shap_values = data
53
- model_predict = lambda X : model .get_booster ().inplace_predict (X , predict_type = 'margin' )
54
- n_features = X_background .shape [1 ]
55
- df = pd .DataFrame (columns = ["Algorithm" , "Function evals" , "Trial" , "rmse" ])
56
- for alg_name , alg in algorithms_set .items ():
57
- min_samples = algorithms .min_sample_size (alg , n_features )
58
- eval_schedule = [10 ** (x / 5 ) for x in range (1 , 20 )]
59
- eval_schedule = (np .round (np .divide (eval_schedule , min_samples )) * min_samples ).astype (
60
- int )
61
- eval_schedule = eval_schedule [eval_schedule >= min_samples ]
62
- eval_schedule = eval_schedule [eval_schedule <= max_evals ]
63
-
64
- for evals in tqdm (eval_schedule , desc = "Dataset - " + data_name + ", Alg - " + alg_name ):
65
- required_repeats = repeats
66
- if alg_name in deterministic_algorithms :
67
- required_repeats = 1
68
- for i in range (required_repeats ):
69
- shap_values = alg (X_background , X_foreground ,
70
- model_predict ,
71
- evals )
72
- df = df .append ({"Algorithm" : alg_name , "n_permutations" : evals / (n_features + 1 ), "Trial" : i ,
73
- "rmse" : rmse (shap_values , exact_shap_values )},
74
- ignore_index = True )
75
- sns .lineplot (data = df , x = "n_permutations" , y = "rmse" , hue = "Algorithm" )
76
- plt .xscale ('log' )
77
- plt .yscale ('log' )
78
- plt .savefig ('figures/' + data_name + '_shap.png' )
79
- plt .clf ()
80
115
116
def kernel_argmax_experiments():
    """Compare Mallows-kernel herding under different greedy argmax budgets."""
    repeats = 25
    foreground_examples = 10
    background_examples = 100
    max_evals = 5000
    datasets_set = {
        "cal_housing": datasets.get_cal_housing(foreground_examples, background_examples),
    }
    # One herding variant per argmax trial budget; the label encodes the count.
    trial_budgets = [5, 10, 25, 50]
    algorithms_set = {
        "Mallows-{}-trials".format(budget): algorithms.KernelHerding(
            kernel_methods.MallowsKernel(), max_trials=budget)
        for budget in trial_budgets
    }

    df = run_experiments(datasets_set, algorithms_set, repeats, max_evals)
    plot_experiments("kernel/kernel_trials", df)
137
+
138
+
139
def incumbent_experiments():
    """Benchmark the established (incumbent) SHAP sampling algorithms."""
    repeats = 25
    foreground_examples = 10
    background_examples = 100
    max_evals = 5000
    # (label, loader) pairs; every loader takes the same example counts.
    dataset_loaders = [
        ("make_regression", datasets.get_regression),
        ("cal_housing", datasets.get_cal_housing),
        ("adult", datasets.get_adult),
        ("breast_cancer", datasets.get_breast_cancer),
    ]
    datasets_set = {label: load(foreground_examples, background_examples)
                    for label, load in dataset_loaders}
    algorithms_set = {
        "MC": algorithms.MonteCarlo(),
        "MC-antithetic": algorithms.MonteCarloAntithetic(),
        "Stratified": algorithms.Stratified(),
        "Owen": algorithms.Owen(),
        "Owen-Halved": algorithms.OwenHalved(),
    }

    df = run_experiments(datasets_set, algorithms_set, repeats, max_evals)
    plot_experiments("incumbent/incumbent", df)
160
+
161
+
162
def new_experiments():
    """Benchmark the newly proposed samplers against the antithetic baseline."""
    repeats, max_evals = 25, 5000
    foreground_examples, background_examples = 10, 100
    datasets_set = {
        "make_regression": datasets.get_regression(foreground_examples, background_examples),
        "cal_housing": datasets.get_cal_housing(foreground_examples, background_examples),
        "adult": datasets.get_adult(foreground_examples, background_examples),
        "breast_cancer": datasets.get_breast_cancer(foreground_examples, background_examples),
    }
    # Herding and SBQ both use a (fresh) Mallows kernel instance.
    algorithms_set = {
        "MC-antithetic": algorithms.MonteCarloAntithetic(),
        "Herding": algorithms.KernelHerding(kernel_methods.MallowsKernel()),
        "SBQ": algorithms.SequentialBayesianQuadrature(kernel_methods.MallowsKernel()),
        "Orthogonal": algorithms.OrthogonalSphericalCodes(),
        "Sobol": algorithms.Sobol(),
    }

    results = run_experiments(datasets_set, algorithms_set, repeats, max_evals)
    plot_experiments("new/new", results)
183
+
184
+
185
def get_discrepancy(n, d, alg, kernel):
    """Draw one sample set from `alg` and score it with the kernel discrepancy.

    `alg(n, d)` returns a tuple (typically permutations plus optional weights)
    that is forwarded unpacked to kernel_methods.discrepancy.
    """
    sample = alg(n, d)
    return kernel_methods.discrepancy(*sample, kernel)
187
+
188
+
189
@mem.cache  # joblib: cached on disk under ./tmp, keyed by the arguments
def run_discrepancy_experiments(lengths, sizes, repeats):
    """Tabulate kernel discrepancy of each sampler's permutation sets.

    For every permutation length d in `lengths` and sample size n in `sizes`,
    draws `repeats` independent sample sets per algorithm and records the
    mean and std of the Mallows-kernel discrepancy. Prints the table as
    LaTeX and returns it.
    """
    algs = {
        # Each entry returns (permutations, weights); weights are None for
        # the unweighted samplers.
        "MC-Antithetic": lambda n, d: (algorithms.get_antithetic_permutations(n, d), None),
        "Herding": lambda n, d: (
            kernel_methods.kernel_herding(n, d, kernel_methods.MallowsKernel(), 25), None),
        # NOTE(review): closes over `kernel`, which is assigned *below* this
        # dict — works because the lambda runs after the assignment (late
        # binding), but is fragile to reordering.
        "SBQ": lambda n, d: kernel_methods.sequential_bayesian_quadrature(n, d, kernel, 25),
        "Orthogonal": lambda n, d: (algorithms._orthogonal_permutations(n, d), None),
        "Sobol": lambda n, d: (sobol_sphere.sobol_permutations(n, d), None),
    }
    df = pd.DataFrame(columns=["Algorithm", "d", "n", "Discrepancy", "std"])
    kernel = kernel_methods.MallowsKernel()

    for d in lengths:
        for n in sizes:
            for name, alg in algs.items():
                # SBQ is too expensive past n=100; emit a placeholder row.
                if name == "SBQ" and n > 100:
                    df = df.append(
                        {"Algorithm": name, "d": d, "n": n, "Discrepancy": "-", "std": "-"},
                        ignore_index=True)
                    continue

                disc = []
                with futures.ThreadPoolExecutor() as executor:
                    # repeat(n, repeats) caps the map at `repeats` calls;
                    # the unbounded repeat() iterators just broadcast args.
                    for result in executor.map(get_discrepancy, repeat(n, repeats), repeat(d),
                                               repeat(alg), repeat(kernel)):
                        disc.append(result)
                # NOTE(review): DataFrame.append was removed in pandas 2.0.
                df = df.append(
                    {"Algorithm": name, "d": d, "n": n, "Discrepancy": np.mean(disc),
                     "std": np.std(disc)},
                    ignore_index=True)
    print(df.to_latex(index=False))
    return df
222
+
223
+
224
def discrepancy_experiments():
    """Run the discrepancy sweep and print the final comparison table as LaTeX."""
    lengths = [10, 50, 200]   # permutation lengths d
    sizes = [10, 100, 1000]   # sample sizes n
    repeats = 25
    df = run_discrepancy_experiments(lengths, sizes, repeats)
    # Reshape: algorithms as columns, (d, n) pairs as (multi-)rows.
    df = df.pivot(index="Algorithm", columns=['d', 'n'], values=['Discrepancy'])
    df = df.sort_index(axis=1)
    # Drop the single-entry "Discrepancy" level left over from `values=`.
    df = df.transpose().droplevel(0)

    print(df.to_latex(multirow=True))
81
234
82
- import cProfile , pstats , io
83
- from pstats import SortKey
84
235
85
- pr = cProfile .Profile ()
86
- pr .enable ()
87
- plot_experiments ()
88
- pr .disable ()
89
- s = io .StringIO ()
90
- sortby = SortKey .CUMULATIVE
91
- ps = pstats .Stats (pr , stream = s ).sort_stats (sortby )
92
- ps .print_stats (20 )
93
- print (s .getvalue ())
236
+ kernel_experiments ()
237
+ kernel_argmax_experiments ()
238
+ new_experiments ()
239
+ discrepancy_experiments ()
0 commit comments