bug fix, minor updates

ZimmermanGroup · Oct 24, 2023 · c408909 · c408909
1 parent cd60d86
commit c408909
Show file tree

Hide file tree

Showing 7 changed files with 63 additions and 23 deletions.
diff --git a/dataset-analysis/amidation.py b/dataset-analysis/amidation.py
@@ -24,7 +24,7 @@ def plot_all_results(single_component='activator'):
     # processing dataset
     # use shorter name for bases
     # use labels for nucleophiles
-    df = pd.read_csv('https://raw.githubusercontent.com/beef-broccoli/ochem-data/main/deebo/ami.csv')
+    df = pd.read_csv('https://raw.githubusercontent.com/beef-broccoli/ochem-data/main/deebo/amidation.csv')
 
     short_name_dict = {
         '1-Methylimidazole': 'NMI',

diff --git a/dataset-analysis/arylation_scope_ligand.py b/dataset-analysis/arylation_scope_ligand.py
@@ -438,12 +438,12 @@ def plot_results_with_model_substrates_color_match_publication(cutoff=75, preset
     fd = df.copy()
     fd['combo'] = fd['electrophile_id'].astype('str') + fd['nucleophile_id'].astype('str')
     #fd = fd.sort_values(by=['combo', 'ligand_name'])
-    max = fd.loc[fd.groupby(by=['combo'])['yield'].idxmax()]
+    maxes = fd.loc[fd.groupby(by=['combo'])['yield'].idxmax()]
     #print(list(max['ligand_name'].unique()))
     #print(max.loc[max['plot']!=0]['ligand_name'].value_counts())
 
     # new way to assign colors for all ligands that give above cutoff yields
-    ligands_to_color = max.loc[max['yield']>cutoff]['ligand_name'].unique()
+    ligands_to_color = maxes.loc[maxes['yield']>cutoff]['ligand_name'].unique()
 
     val_to_rgb = {}  # {value: rgb}
     def color(x):
@@ -457,29 +457,29 @@ def color(x):
                 val_to_rgb[d[x]] = preset_color_dict[x]
             return d[x]
 
-    max['valid'] = df['yield'].apply(lambda x: 0 if x<cutoff else 1)  # 0 for plotting, if highest yield < 75%
-    max['plot'] = df['ligand_name'].apply(color)
-    max['plot'] = max['plot']*max['valid']
-    max = max.pivot(index='nucleophile_id', columns='electrophile_id', values='plot')
-    max[max==0] = -1  # set all zeros to -1, this helps with plotting with a cmap, i can set the color for -1
+    maxes['valid'] = df['yield'].apply(lambda x: 0 if x<cutoff else 1)  # 0 for plotting, if highest yield < 75%
+    maxes['plot'] = df['ligand_name'].apply(color)
+    maxes['plot'] = maxes['plot']*maxes['valid']
+    maxes = maxes.pivot(index='nucleophile_id', columns='electrophile_id', values='plot')
+    maxes[maxes==0] = -1  # set all zeros to -1, this helps with plotting with a cmap, i can set the color for -1
 
     fig, ax = plt.subplots()
     if preset_color_dict is not None:
         # val_to_rgb is unordered dict, have to call one by one with a np.arange() list
         listedcolors = [val_to_rgb[ii] for ii in np.arange(len(ligands_to_color))+1]
         cmap = mpl.colors.ListedColormap(listedcolors)
     else:
-        cmap = mpl.cm.get_cmap('Paired').copy()
+        cmap = mpl.colormaps.get_cmap('Paired').copy()
     cmap.set_under('k')
-    im = ax.imshow(max, cmap=cmap, vmin=1)
+    im = ax.imshow(maxes, cmap=cmap, vmin=1)
 
     # grid line
     for i in range(8):
         for j in range(8):
             ax.add_patch(Rectangle((j-0.5, i-0.5), 1, 1, fill=False, edgecolor='white', lw=1))
 
-    ax.set_xticks(np.arange(8), labels=list(max.columns))
-    ax.set_yticks(np.arange(8), labels=list(max.index))
+    ax.set_xticks(np.arange(8), labels=list(maxes.columns))
+    ax.set_yticks(np.arange(8), labels=list(maxes.index))
     ax.set_xlabel('Electrophile (aryl bromide)')
     ax.set_ylabel('Nucleophile (imidazole)')
 
@@ -1012,6 +1012,23 @@ def simulate_etc(top=1, max_sample=3, n_simulations=10000):
 def plot_ligand_perf_expansion(scenario=1, nlargest=5, preset_color_dict=None):
     # preset_color_dict is used to ensure consistent colors for ligands throughout different plots
     # one set of color is saved in arylation_colors.json
+    """
+    Bar plots for ligand expansion schemes.
+    Plot the average yield of the top-<nlargest> ligands before each expansion.
+
+    Parameters
+    ----------
+    scenario: int
+    nlargest: int
+        decides how many top ligands are plotted
+    preset_color_dict: dict
+        dictionary with pre-specified ligand color {ligand_name: ligand_color}
+        if None (default), use customized colors
+
+    Returns
+    -------
+
+    """
 
     with open('colors.yml', 'r') as file:
         COLORS = yaml.safe_load(file)

diff --git a/dataset-analysis/cn.py b/dataset-analysis/cn.py
@@ -8,7 +8,7 @@
 import gif
 
 
-def plot_all_results():
+def plot_all_rmesults():
     df = pd.read_csv('https://raw.githubusercontent.com/beef-broccoli/ochem-data/main/deebo/cn-processed.csv')
     df = df[['base_name', 'ligand_name', 'substrate_id', 'additive_id', 'yield']]
 
@@ -107,8 +107,8 @@ def plot_acquisition_history_heatmap_cn(history_fp='./test/history.csv', sim=0,
     ----------
     history_fp: str
         file path of history.csv
-    roun: list-like
-        snapshot of heatmap at this round
+    roun: int
+        snapshot of heatmap up until round <roun>
     sim: int
         which simulation to plot
     binary: bool

diff --git a/deebo/_algotests_regret.py b/deebo/_algotests_regret.py
@@ -181,7 +181,7 @@ def ucb1_tuned(scenario, n_sims, n_horizon, folder_name):
     algo = UCB1Tuned(n_arms)
     algo.reset(n_arms)
     results = test_algorithm_regret(algo, arms, n_sims, n_horizon)
-    filename = 'ucb1_tuned.csv'
+    filename = f'ucb1_tuned-{n_sims}s-{n_horizon}r.csv'
     results.to_csv(output_dir / filename)
 
     return None
@@ -296,7 +296,7 @@ def ts_beta(scenario, n_sims, n_horizon, folder_name):
     algo = ThompsonSamplingBeta(n_arms)
     algo.reset(n_arms)
     results = test_algorithm_regret(algo, arms, n_sims, n_horizon)
-    filename = 'TS.csv'
+    filename = f'TS-{n_sims}s-{n_horizon}r.csv'
     results.to_csv(output_dir / filename)
 
     return None
@@ -430,8 +430,8 @@ def test_TS_gaussian(scenario, n_sims, n_horizon):
     return None
 
 
-def test_n_arms(folder_name, n_sims=1000, n_horizon=10000):
-    for s in [11, 12, 13, 14, 15]:
+def test_n_arms(folder_name, n_sims=500, n_horizon=15000):
+    for s in [15]:
         ts_beta(scenario=s,
                 n_sims=n_sims,
                 n_horizon=n_horizon,
@@ -448,5 +448,5 @@ def test_n_arms(folder_name, n_sims=1000, n_horizon=10000):
 
     #test_eps_greedy(1, 3, 200, './test/')
 
-    test_n_arms('logs/scalibility')
+    test_n_arms('logs/scalability')
 
diff --git a/deebo/analyze.py b/deebo/analyze.py
@@ -592,7 +592,8 @@ def scenario5_best_perfomers():
         return None
 
     def scalability():
-        fn_list = [f'logs/scalability/scenario{n}/optim/TS.csv' for n in [11, 12, 13, 14, 15]]
+        # # Average reward
+        # fn_list = [f'logs/scalability/scenario{n}/optim/TS.csv' for n in [11, 12, 13, 14, 15]]
         # plot_average_reward(
         #     fn_list=fn_list,
         #     legend_list=['20', '50', '100', '500', '1000'],
@@ -601,6 +602,16 @@ def scalability():
         #     show_se=True,
         #     long_legend=False,
         # )
+
+        # Accuracy
+        fn_list = ['logs/scalability/scenario11/optim/TS-1000s-10000r.csv',
+                   'logs/scalability/scenario12/optim/TS-1000s-10000r.csv',
+                   'logs/scalability/scenario13/optim/TS-1000s-10000r.csv',
+                   'logs/scalability/scenario14/optim/TS-1000s-15000r.csv',
+                   'logs/scalability/scenario15/optim/TS-500s-15000r.csv',]
+        fn_list = ['logs/scalability/scenario11/optim/ucb1_tuned-1000s-10000r.csv',
+                   'logs/scalability/scenario12/optim/ucb1_tuned-1000s-10000r.csv',
+                   'logs/scalability/scenario13/optim/ucb1_tuned-1000s-10000r.csv',]
         plot_probs_choosing_best_arm(
             fn_list=fn_list,
             legend_list=['20', '50', '100', '500', '1000'],
@@ -609,6 +620,13 @@ def scalability():
             best_arm_index=[19, 49, 99, 499, 999],
             long_legend=False,
         )
+        plot_average_reward(
+            fn_list=fn_list,
+            legend_list=['20', '50', '100'],
+            title='Accuracy with TS (beta prior)',
+            legend_title='# of arms',
+            long_legend=False,
+        )
         return None
 
     def normal_scenario1_best_performers(sd=0.5):

diff --git a/deebo/chem_arms.py b/deebo/chem_arms.py
@@ -6,12 +6,15 @@
 import pandas as pd
 import numpy as np
 import os
-from tqdm import tqdm
+from tqdm.autonotebook import tqdm
+# from tqdm import tqdm   # if tqdm.autonotebook breaks
 from sklearn.preprocessing import OneHotEncoder as OHE
 from sklearn.ensemble import RandomForestRegressor as RFR
 from sklearn.linear_model import LinearRegression as LR
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
+from sklearn.metrics import (r2_score,
+                             mean_squared_error,
+                             mean_absolute_error)
 
 
 import algos_regret

diff --git a/deebo/chem_simulate.py b/deebo/chem_simulate.py
@@ -6,6 +6,8 @@
 import pandas as pd
 import numpy as np
 
+"""Functions to run simulations on different datasets"""
+
 
 def deoxyf():
     # fetch ground truth data