Skip to content

Commit

Permalink
bug fix, minor updates
Browse files Browse the repository at this point in the history
  • Loading branch information
beef-broccoli committed Oct 24, 2023
1 parent cd60d86 commit c408909
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 23 deletions.
2 changes: 1 addition & 1 deletion dataset-analysis/amidation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def plot_all_results(single_component='activator'):
# processing dataset
# use shorter name for bases
# use labels for nucleophiles
df = pd.read_csv('https://raw.githubusercontent.com/beef-broccoli/ochem-data/main/deebo/ami.csv')
df = pd.read_csv('https://raw.githubusercontent.com/beef-broccoli/ochem-data/main/deebo/amidation.csv')

short_name_dict = {
'1-Methylimidazole': 'NMI',
Expand Down
39 changes: 28 additions & 11 deletions dataset-analysis/arylation_scope_ligand.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,12 +438,12 @@ def plot_results_with_model_substrates_color_match_publication(cutoff=75, preset
fd = df.copy()
fd['combo'] = fd['electrophile_id'].astype('str') + fd['nucleophile_id'].astype('str')
#fd = fd.sort_values(by=['combo', 'ligand_name'])
max = fd.loc[fd.groupby(by=['combo'])['yield'].idxmax()]
maxes = fd.loc[fd.groupby(by=['combo'])['yield'].idxmax()]
#print(list(max['ligand_name'].unique()))
#print(max.loc[max['plot']!=0]['ligand_name'].value_counts())

# new way to assign colors for all ligands that give above cutoff yields
ligands_to_color = max.loc[max['yield']>cutoff]['ligand_name'].unique()
ligands_to_color = maxes.loc[maxes['yield']>cutoff]['ligand_name'].unique()

val_to_rgb = {} # {value: rgb}
def color(x):
Expand All @@ -457,29 +457,29 @@ def color(x):
val_to_rgb[d[x]] = preset_color_dict[x]
return d[x]

max['valid'] = df['yield'].apply(lambda x: 0 if x<cutoff else 1) # 0 for plotting, if highest yield < 75%
max['plot'] = df['ligand_name'].apply(color)
max['plot'] = max['plot']*max['valid']
max = max.pivot(index='nucleophile_id', columns='electrophile_id', values='plot')
max[max==0] = -1 # set all zeros to -1, this helps with plotting with a cmap, i can set the color for -1
maxes['valid'] = df['yield'].apply(lambda x: 0 if x<cutoff else 1) # 0 for plotting, if highest yield < 75%
maxes['plot'] = df['ligand_name'].apply(color)
maxes['plot'] = maxes['plot']*maxes['valid']
maxes = maxes.pivot(index='nucleophile_id', columns='electrophile_id', values='plot')
maxes[maxes==0] = -1 # set all zeros to -1, this helps with plotting with a cmap, i can set the color for -1

fig, ax = plt.subplots()
if preset_color_dict is not None:
# val_to_rgb is unordered dict, have to call one by one with a np.arange() list
listedcolors = [val_to_rgb[ii] for ii in np.arange(len(ligands_to_color))+1]
cmap = mpl.colors.ListedColormap(listedcolors)
else:
cmap = mpl.cm.get_cmap('Paired').copy()
cmap = mpl.colormaps.get_cmap('Paired').copy()
cmap.set_under('k')
im = ax.imshow(max, cmap=cmap, vmin=1)
im = ax.imshow(maxes, cmap=cmap, vmin=1)

# grid line
for i in range(8):
for j in range(8):
ax.add_patch(Rectangle((j-0.5, i-0.5), 1, 1, fill=False, edgecolor='white', lw=1))

ax.set_xticks(np.arange(8), labels=list(max.columns))
ax.set_yticks(np.arange(8), labels=list(max.index))
ax.set_xticks(np.arange(8), labels=list(maxes.columns))
ax.set_yticks(np.arange(8), labels=list(maxes.index))
ax.set_xlabel('Electrophile (aryl bromide)')
ax.set_ylabel('Nucleophile (imidazole)')

Expand Down Expand Up @@ -1012,6 +1012,23 @@ def simulate_etc(top=1, max_sample=3, n_simulations=10000):
def plot_ligand_perf_expansion(scenario=1, nlargest=5, preset_color_dict=None):
# preset_color_dict is used to ensure consistent colors for ligands throughout different plots
# one set of color is saved in arylation_colors.json
"""
Bar plots for ligand expansion schemes.
Plots the average yield of the top-<nlargest> ligands before each expansion.
Parameters
----------
scenario:
nlargest: int
decides how many top ligands are plotted
preset_color_dict: dict
dictionary with pre-specified ligand color {ligand_name: ligand_color}
if None (default), use customized colors
Returns
-------
"""

with open('colors.yml', 'r') as file:
COLORS = yaml.safe_load(file)
Expand Down
6 changes: 3 additions & 3 deletions dataset-analysis/cn.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import gif


def plot_all_results():
def plot_all_rmesults():
df = pd.read_csv('https://raw.githubusercontent.com/beef-broccoli/ochem-data/main/deebo/cn-processed.csv')
df = df[['base_name', 'ligand_name', 'substrate_id', 'additive_id', 'yield']]

Expand Down Expand Up @@ -107,8 +107,8 @@ def plot_acquisition_history_heatmap_cn(history_fp='./test/history.csv', sim=0,
----------
history_fp: str
file path of history.csv
roun: list-like
snapshot of heatmap at this round
roun: int
snapshot of heatmap up until round <roun>
sim: int
which simulation to plot
binary: bool
Expand Down
10 changes: 5 additions & 5 deletions deebo/_algotests_regret.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def ucb1_tuned(scenario, n_sims, n_horizon, folder_name):
algo = UCB1Tuned(n_arms)
algo.reset(n_arms)
results = test_algorithm_regret(algo, arms, n_sims, n_horizon)
filename = 'ucb1_tuned.csv'
filename = f'ucb1_tuned-{n_sims}s-{n_horizon}r.csv'
results.to_csv(output_dir / filename)

return None
Expand Down Expand Up @@ -296,7 +296,7 @@ def ts_beta(scenario, n_sims, n_horizon, folder_name):
algo = ThompsonSamplingBeta(n_arms)
algo.reset(n_arms)
results = test_algorithm_regret(algo, arms, n_sims, n_horizon)
filename = 'TS.csv'
filename = f'TS-{n_sims}s-{n_horizon}r.csv'
results.to_csv(output_dir / filename)

return None
Expand Down Expand Up @@ -430,8 +430,8 @@ def test_TS_gaussian(scenario, n_sims, n_horizon):
return None


def test_n_arms(folder_name, n_sims=1000, n_horizon=10000):
for s in [11, 12, 13, 14, 15]:
def test_n_arms(folder_name, n_sims=500, n_horizon=15000):
for s in [15]:
ts_beta(scenario=s,
n_sims=n_sims,
n_horizon=n_horizon,
Expand All @@ -448,5 +448,5 @@ def test_n_arms(folder_name, n_sims=1000, n_horizon=10000):

#test_eps_greedy(1, 3, 200, './test/')

test_n_arms('logs/scalibility')
test_n_arms('logs/scalability')

20 changes: 19 additions & 1 deletion deebo/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,8 @@ def scenario5_best_perfomers():
return None

def scalability():
fn_list = [f'logs/scalability/scenario{n}/optim/TS.csv' for n in [11, 12, 13, 14, 15]]
# # Average reward
# fn_list = [f'logs/scalability/scenario{n}/optim/TS.csv' for n in [11, 12, 13, 14, 15]]
# plot_average_reward(
# fn_list=fn_list,
# legend_list=['20', '50', '100', '500', '1000'],
Expand All @@ -601,6 +602,16 @@ def scalability():
# show_se=True,
# long_legend=False,
# )

# Accuracy
fn_list = ['logs/scalability/scenario11/optim/TS-1000s-10000r.csv',
'logs/scalability/scenario12/optim/TS-1000s-10000r.csv',
'logs/scalability/scenario13/optim/TS-1000s-10000r.csv',
'logs/scalability/scenario14/optim/TS-1000s-15000r.csv',
'logs/scalability/scenario15/optim/TS-500s-15000r.csv',]
fn_list = ['logs/scalability/scenario11/optim/ucb1_tuned-1000s-10000r.csv',
'logs/scalability/scenario12/optim/ucb1_tuned-1000s-10000r.csv',
'logs/scalability/scenario13/optim/ucb1_tuned-1000s-10000r.csv',]
plot_probs_choosing_best_arm(
fn_list=fn_list,
legend_list=['20', '50', '100', '500', '1000'],
Expand All @@ -609,6 +620,13 @@ def scalability():
best_arm_index=[19, 49, 99, 499, 999],
long_legend=False,
)
plot_average_reward(
fn_list=fn_list,
legend_list=['20', '50', '100'],
title='Accuracy with TS (beta prior)',
legend_title='# of arms',
long_legend=False,
)
return None

def normal_scenario1_best_performers(sd=0.5):
Expand Down
7 changes: 5 additions & 2 deletions deebo/chem_arms.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from tqdm.autonotebook import tqdm
# from tqdm import tqdm # if tqdm.autonotebook breaks
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import (r2_score,
mean_squared_error,
mean_absolute_error)


import algos_regret
Expand Down
2 changes: 2 additions & 0 deletions deebo/chem_simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import pandas as pd
import numpy as np

"""Functions to run simulations on different datasets"""


def deoxyf():
# fetch ground truth data
Expand Down

0 comments on commit c408909

Please sign in to comment.