Skip to content

Commit a0bc143

Browse files
committed
April update
1 parent 263832a commit a0bc143

12 files changed

+882
-739
lines changed

algorithms.py

+264-344
Large diffs are not rendered by default.

datasets.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
def get_regression(n_foreground, n_background):
88
X, y = sklearn.datasets.make_regression(
9-
n_samples=100,
9+
n_samples=1000,
1010
n_features=10,
1111
noise=0.1,
1212
random_state=47)

experiments.py

+204-58
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,17 @@
22
import numpy as np
33
import matplotlib.pyplot as plt
44
import matplotlib
5-
from tqdm import tqdm
65
import algorithms
76
import datasets
87
import pandas as pd
98
import seaborn as sns
9+
import kernel_methods
10+
import sobol_sphere
11+
from concurrent import futures
12+
from itertools import repeat
13+
import joblib
14+
15+
mem = joblib.Memory(location='./tmp', verbose=1)
1016

1117
matplotlib.use('agg')
1218
plt.style.use("seaborn")
@@ -17,7 +23,74 @@ def rmse(a, b):
1723
return np.sqrt(((a - b) ** 2).mean())
1824

1925

20-
def plot_experiments():
26+
def get_eval_schedule(min_samples, max_evals):
27+
eval_schedule = [10 ** (x / 5) for x in range(1, 20)]
28+
eval_schedule = (np.round(np.divide(eval_schedule, min_samples)) * min_samples).astype(
29+
int)
30+
eval_schedule = eval_schedule[eval_schedule >= min_samples]
31+
eval_schedule = eval_schedule[eval_schedule <= max_evals]
32+
# remove duplicates
33+
return list(dict.fromkeys(eval_schedule))
34+
35+
36+
def get_partial_results(alg, alg_name, num_evals, required_repeats, data, data_name):
37+
df = pd.DataFrame(columns=["Dataset", "Algorithm", "Function evals", "Trial", "rmse"])
38+
model, X_background, X_foreground, exact_shap_values = data
39+
if num_evals > alg.max_evals(X_background.shape[1]):
40+
return df
41+
model_predict = lambda X: model.get_booster().inplace_predict(X, predict_type='margin')
42+
43+
for trial_i in range(required_repeats):
44+
shap_values = alg.shap_values(X_background, X_foreground,
45+
model_predict,
46+
num_evals)
47+
48+
df = df.append(
49+
{"Dataset": data_name, "Algorithm": alg_name, "marginal_evals": num_evals,
50+
"Trial": trial_i,
51+
"rmse": rmse(shap_values, exact_shap_values)},
52+
ignore_index=True)
53+
return df
54+
55+
56+
@mem.cache
57+
def run_experiments(datasets_set, algorithms_set, repeats,
58+
max_evals):
59+
deterministic_algorithms = ["Fibonacci"]
60+
61+
seed = 33
62+
np.random.seed(seed)
63+
cp.random.seed(seed)
64+
df = pd.DataFrame(columns=["Dataset", "Algorithm", "Function evals", "Trial", "rmse"])
65+
for data_name, data in datasets_set.items():
66+
model, X_background, X_foreground, exact_shap_values = data
67+
n_features = X_background.shape[1]
68+
for alg_name, alg in algorithms_set.items():
69+
eval_schedule = get_eval_schedule(alg.min_samples(n_features), max_evals)
70+
print("Dataset - " + data_name + ", Alg - " + alg_name)
71+
required_repeats = repeats
72+
if alg_name in deterministic_algorithms:
73+
required_repeats = 1
74+
with futures.ThreadPoolExecutor() as executor:
75+
for result in executor.map(get_partial_results, repeat(alg), repeat(alg_name),
76+
eval_schedule, repeat(required_repeats), repeat(data),
77+
repeat(data_name)):
78+
df = df.append(result)
79+
return df
80+
81+
82+
def plot_experiments(name, df):
83+
for d in df["Dataset"].unique():
84+
plt.figure(figsize=(4 * 1.3, 3 * 1.3))
85+
sns.lineplot(data=df.loc[df["Dataset"] == d], x="marginal_evals", y="rmse", hue="Algorithm")
86+
plt.xscale('log')
87+
plt.yscale('log')
88+
plt.tight_layout()
89+
plt.savefig('figures/' + name + '_' + d + '_shap.png')
90+
plt.clf()
91+
92+
93+
def kernel_experiments():
2194
repeats = 25
2295
foreground_examples = 10
2396
background_examples = 100
@@ -29,65 +102,138 @@ def plot_experiments():
29102
"breast_cancer": datasets.get_breast_cancer(foreground_examples, background_examples),
30103
}
31104
algorithms_set = {
32-
# "MC": algorithms.monte_carlo,
33-
# "Bayesian-MC": algorithms.monte_carlo_weighted,
34-
"SBQ": algorithms.sbq,
35-
# "MC-Orthogonal-Bayesian": algorithms.orthogonal_weighted,
36-
"MC-Orthogonal": algorithms.orthogonal,
37-
# "Castro-Complement": algorithms.monte_carlo_antithetic,
38-
# "Fibonacci": algorithms.fibonacci,
39-
# "Castro-ControlVariate": algorithms.castro_control_variate,
40-
# "Castro-QMC": algorithms.castro_qmc,
41-
# "KT-Herding": algorithms.kt_herding,
42-
# "Spearman-Herding": algorithms.spearman_herding,
43-
# "Spearman-Herding-Exact": algorithms.spearman_herding_exact,
105+
"Mallows-Herding-0.5": algorithms.KernelHerding(kernel_methods.MallowsKernel(l=0.5)),
106+
"Mallows-Herding-5": algorithms.KernelHerding(kernel_methods.MallowsKernel(l=5)),
107+
"Mallows-Herding-50": algorithms.KernelHerding(kernel_methods.MallowsKernel(l=50)),
108+
"KT-Herding": algorithms.KernelHerding(kernel_methods.KTKernel()),
109+
"Spearman-Herding": algorithms.KernelHerding(kernel_methods.SpearmanKernel()),
44110
}
45111

46-
deterministic_algorithms = ["Castro-QMC","Fibonacci"]
112+
df = run_experiments(datasets_set, algorithms_set, repeats, max_evals)
113+
plot_experiments("kernel/kernel", df)
47114

48-
seed = 32
49-
np.random.seed(seed)
50-
cp.random.seed(seed)
51-
for data_name, data in datasets_set.items():
52-
model, X_background, X_foreground, exact_shap_values = data
53-
model_predict = lambda X: model.get_booster().inplace_predict(X, predict_type='margin')
54-
n_features = X_background.shape[1]
55-
df = pd.DataFrame(columns=["Algorithm", "Function evals", "Trial", "rmse"])
56-
for alg_name, alg in algorithms_set.items():
57-
min_samples = algorithms.min_sample_size(alg, n_features)
58-
eval_schedule = [10 ** (x / 5) for x in range(1, 20)]
59-
eval_schedule = (np.round(np.divide(eval_schedule, min_samples)) * min_samples).astype(
60-
int)
61-
eval_schedule = eval_schedule[eval_schedule >= min_samples]
62-
eval_schedule = eval_schedule[eval_schedule <= max_evals]
63-
64-
for evals in tqdm(eval_schedule, desc="Dataset - " + data_name + ", Alg - " + alg_name):
65-
required_repeats = repeats
66-
if alg_name in deterministic_algorithms:
67-
required_repeats = 1
68-
for i in range(required_repeats):
69-
shap_values = alg(X_background, X_foreground,
70-
model_predict,
71-
evals)
72-
df = df.append({"Algorithm": alg_name, "n_permutations": evals/(n_features+1), "Trial": i,
73-
"rmse": rmse(shap_values, exact_shap_values)},
74-
ignore_index=True)
75-
sns.lineplot(data=df, x="n_permutations", y="rmse", hue="Algorithm")
76-
plt.xscale('log')
77-
plt.yscale('log')
78-
plt.savefig('figures/' + data_name + '_shap.png')
79-
plt.clf()
80115

116+
def kernel_argmax_experiments():
117+
repeats = 25
118+
foreground_examples = 10
119+
background_examples = 100
120+
max_evals = 5000
121+
datasets_set = {
122+
"cal_housing": datasets.get_cal_housing(foreground_examples, background_examples),
123+
}
124+
algorithms_set = {
125+
"Mallows-5-trials": algorithms.KernelHerding(kernel_methods.MallowsKernel(),
126+
max_trials=5),
127+
"Mallows-10-trials": algorithms.KernelHerding(kernel_methods.MallowsKernel(),
128+
max_trials=10),
129+
"Mallows-25-trials": algorithms.KernelHerding(kernel_methods.MallowsKernel(),
130+
max_trials=25),
131+
"Mallows-50-trials": algorithms.KernelHerding(kernel_methods.MallowsKernel(),
132+
max_trials=50),
133+
}
134+
135+
df = run_experiments(datasets_set, algorithms_set, repeats, max_evals)
136+
plot_experiments("kernel/kernel_trials", df)
137+
138+
139+
def incumbent_experiments():
140+
repeats = 25
141+
foreground_examples = 10
142+
background_examples = 100
143+
max_evals = 5000
144+
datasets_set = {
145+
"make_regression": datasets.get_regression(foreground_examples, background_examples),
146+
"cal_housing": datasets.get_cal_housing(foreground_examples, background_examples),
147+
"adult": datasets.get_adult(foreground_examples, background_examples),
148+
"breast_cancer": datasets.get_breast_cancer(foreground_examples, background_examples),
149+
}
150+
algorithms_set = {
151+
"MC": algorithms.MonteCarlo(),
152+
"MC-antithetic": algorithms.MonteCarloAntithetic(),
153+
"Stratified": algorithms.Stratified(),
154+
"Owen": algorithms.Owen(),
155+
"Owen-Halved": algorithms.OwenHalved(),
156+
}
157+
158+
df = run_experiments(datasets_set, algorithms_set, repeats, max_evals)
159+
plot_experiments("incumbent/incumbent", df)
160+
161+
162+
def new_experiments():
163+
repeats = 25
164+
foreground_examples = 10
165+
background_examples = 100
166+
max_evals = 5000
167+
datasets_set = {
168+
"make_regression": datasets.get_regression(foreground_examples, background_examples),
169+
"cal_housing": datasets.get_cal_housing(foreground_examples, background_examples),
170+
"adult": datasets.get_adult(foreground_examples, background_examples),
171+
"breast_cancer": datasets.get_breast_cancer(foreground_examples, background_examples),
172+
}
173+
algorithms_set = {
174+
"MC-antithetic": algorithms.MonteCarloAntithetic(),
175+
"Herding": algorithms.KernelHerding(kernel_methods.MallowsKernel()),
176+
"SBQ": algorithms.SequentialBayesianQuadrature(kernel_methods.MallowsKernel()),
177+
"Orthogonal": algorithms.OrthogonalSphericalCodes(),
178+
"Sobol": algorithms.Sobol(),
179+
}
180+
181+
df = run_experiments(datasets_set, algorithms_set, repeats, max_evals)
182+
plot_experiments("new/new", df)
183+
184+
185+
def get_discrepancy(n, d, alg, kernel):
186+
return kernel_methods.discrepancy(*alg(n, d), kernel)
187+
188+
189+
@mem.cache
190+
def run_discrepancy_experiments(lengths, sizes, repeats):
191+
algs = {
192+
"MC-Antithetic": lambda n, d: (algorithms.get_antithetic_permutations(n, d), None),
193+
"Herding": lambda n, d: (
194+
kernel_methods.kernel_herding(n, d, kernel_methods.MallowsKernel(), 25), None),
195+
"SBQ": lambda n, d: kernel_methods.sequential_bayesian_quadrature(n, d, kernel, 25),
196+
"Orthogonal": lambda n, d: (algorithms._orthogonal_permutations(n, d), None),
197+
"Sobol": lambda n, d: (sobol_sphere.sobol_permutations(n, d), None),
198+
}
199+
df = pd.DataFrame(columns=["Algorithm", "d", "n", "Discrepancy", "std"])
200+
kernel = kernel_methods.MallowsKernel()
201+
202+
for d in lengths:
203+
for n in sizes:
204+
for name, alg in algs.items():
205+
if name == "SBQ" and n > 100:
206+
df = df.append(
207+
{"Algorithm": name, "d": d, "n": n, "Discrepancy": "-", "std": "-"},
208+
ignore_index=True)
209+
continue
210+
211+
disc = []
212+
with futures.ThreadPoolExecutor() as executor:
213+
for result in executor.map(get_discrepancy, repeat(n, repeats), repeat(d),
214+
repeat(alg), repeat(kernel)):
215+
disc.append(result)
216+
df = df.append(
217+
{"Algorithm": name, "d": d, "n": n, "Discrepancy": np.mean(disc),
218+
"std": np.std(disc)},
219+
ignore_index=True)
220+
print(df.to_latex(index=False))
221+
return df
222+
223+
224+
def discrepancy_experiments():
225+
lengths = [10, 50, 200]
226+
sizes = [10, 100, 1000]
227+
repeats = 25
228+
df = run_discrepancy_experiments(lengths, sizes, repeats)
229+
df = df.pivot(index="Algorithm", columns=['d', 'n'], values=['Discrepancy'])
230+
df = df.sort_index(axis=1)
231+
df = df.transpose().droplevel(0)
232+
233+
print(df.to_latex( multirow=True))
81234

82-
import cProfile, pstats, io
83-
from pstats import SortKey
84235

85-
pr = cProfile.Profile()
86-
pr.enable()
87-
plot_experiments()
88-
pr.disable()
89-
s = io.StringIO()
90-
sortby = SortKey.CUMULATIVE
91-
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
92-
ps.print_stats(20)
93-
print(s.getvalue())
236+
kernel_experiments()
237+
kernel_argmax_experiments()
238+
new_experiments()
239+
discrepancy_experiments()

fibonacci_histogram.py

-15
This file was deleted.

figures/cal_housing_shap.png

8.52 KB
Loading

figures/make_regression_shap.png

7.43 KB
Loading

0 commit comments

Comments (0)