Using qiita.allocation_equations table in util.py

Gossty · Gossty · commit 1c30fc8c0ea4 · 2025-01-09T13:02:30.000-08:00
diff --git a/qiita_db/support_files/patches/test_db_sql/93.sql b/qiita_db/support_files/patches/test_db_sql/93.sql
@@ -0,0 +1,9 @@
+INSERT INTO qiita.allocation_equations (equation_name, expression)
+VALUES ('mem_model1', 'k * np.log(x) + x * a + b'),
+       ('mem_model2', 'k * np.log(x) + b * np.log(x)**2 + a'),
+       ('mem_model3', 'k * np.log(x) + b * np.log(x)**2 + a * np.log(x)**3'),
+       ('mem_model4', 'k * np.log(x) + b * np.log(x)**2 + a * np.log(x)**2.5'),
+       ('time_model1', 'a + b + np.log(x) * k'),
+       ('time_model2', 'a + b * x + np.log(x) * k'),
+       ('time_model3', 'a + b * np.log(x)**2 + np.log(x) * k'),
+       ('time_model4', 'a * np.log(x)**3 + b * np.log(x)**2 + np.log(x) * k');
diff --git a/qiita_db/test/test_util.py b/qiita_db/test/test_util.py
@@ -1343,42 +1343,45 @@ def test_minimize_const(self):
         self.df[self.col_name] = self.df.samples * self.df['columns']
         fig, axs = plt.subplots(ncols=2, figsize=(10, 4), sharey=False)
 
-        bm, options = qdb.util._resource_allocation_plot_helper(
-            self.df, axs[0], 'MaxRSSRaw', qdb.util.MODELS_MEM, self.col_name)
+        mem_models, time_models = qdb.util._retrieve_equations()
+        bm_name, bm, options = qdb.util._resource_allocation_plot_helper(
+            self.df, axs[0], 'MaxRSSRaw', mem_models, self.col_name)
         # check that the algorithm chooses correct model for MaxRSSRaw and
         # has 0 failures
         k, a, b = options.x
         failures_df = qdb.util._resource_allocation_success_failures(
             self.df, k, a, b, bm, self.col_name, 'MaxRSSRaw')[-1]
         failures = failures_df.shape[0]
-        self.assertEqual(bm, qdb.util.mem_model3,
+
+        self.assertEqual(bm_name, 'mem_model3',
+                         msg=f"""Best memory model
+                         doesn't match
+                         {bm_name} != 'mem_model3'""")
+        self.assertEqual(bm, mem_models['mem_model3'],
                          msg=f"""Best memory model
                                  doesn't match
                                  Coefficients:{k} {a} {b}
-                                 {qdb.util.mem_model1}, "qdb.util.mem_model1"
-                                 {qdb.util.mem_model2}, "qdb.util.mem_model2"
-                                 {qdb.util.mem_model3}, "qdb.util.mem_model3"
-                                 {qdb.util.mem_model4}, "qdb.util.mem_model4"
                             """)
         self.assertEqual(failures, 0, "Number of failures must be 0")
 
         # check that the algorithm chooses correct model for ElapsedRaw and
         # has 1 failure
-        bm, options = qdb.util._resource_allocation_plot_helper(
-            self.df, axs[1], 'ElapsedRaw', qdb.util.MODELS_TIME, self.col_name)
+        bm_name, bm, options = qdb.util._resource_allocation_plot_helper(
+            self.df, axs[1], 'ElapsedRaw', time_models, self.col_name)
         k, a, b = options.x
         failures_df = qdb.util._resource_allocation_success_failures(
             self.df, k, a, b, bm, self.col_name, 'ElapsedRaw')[-1]
         failures = failures_df.shape[0]
 
+        self.assertEqual(bm_name, 'time_model1',
+                         msg=f"""Best time model
+                         doesn't match
+                         {bm_name} != 'time_model1'""")
+
         self.assertEqual(bm, qdb.util.time_model1,
                          msg=f"""Best time model
                                 doesn't match
                                 Coefficients:{k} {a} {b}
-                                 {qdb.util.time_model1}, "qdb.util.time_model1"
-                                 {qdb.util.time_model2}, "qdb.util.time_model2"
-                                 {qdb.util.time_model3}, "qdb.util.time_model3"
-                                 {qdb.util.time_model4}, "qdb.util.time_model4"
                                 """)
         self.assertEqual(failures, 1, "Number of failures must be 1")
 
diff --git a/qiita_db/util.py b/qiita_db/util.py
@@ -81,41 +81,23 @@
 from json import loads
 from scipy.optimize import minimize
 
-# memory constant functions defined for @resource_allocation_plot
-mem_model1 = (lambda x, k, a, b: k * np.log(x) + x * a + b)
-mem_model2 = (lambda x, k, a, b: k * np.log(x) + b * np.log(x)**2 + a)
-mem_model3 = (lambda x, k, a, b: k * np.log(x) + b * np.log(x)**2 +
-              a * np.log(x)**3)
-mem_model4 = (lambda x, k, a, b: k * np.log(x) + b * np.log(x)**2 +
-              a * np.log(x)**2.5)
-MODELS_MEM = [mem_model1, mem_model2, mem_model3, mem_model4]
-
-# time constant functions defined for @resource_allocation_plot
-time_model1 = (lambda x, k, a, b: a + b + np.log(x) * k)
-time_model2 = (lambda x, k, a, b: a + b * x + np.log(x) * k)
-time_model3 = (lambda x, k, a, b: a + b * np.log(x)**2 + np.log(x) * k)
-time_model4 = (lambda x, k, a, b: a * np.log(x)**3 + b * np.log(x)**2 +
-               np.log(x) * k)
-
-MODELS_TIME = [time_model1, time_model2, time_model3, time_model4]
-
 
 def get_model_name(model):
-    if model == mem_model1:
+    if model == 'mem_model1':
         return "k * log(x) + x * a + b"
-    elif model == mem_model2:
+    elif model == 'mem_model2':
         return "k * log(x) + b * log(x)^2 + a"
-    elif model == mem_model3:
+    elif model == 'mem_model3':
         return "k * log(x) + b * log(x)^2 + a * log(x)^3"
-    elif model == mem_model4:
+    elif model == 'mem_model4':
         return "k * log(x) + b * log(x)^2 + a * log(x)^2.5"
-    elif model == time_model1:
+    elif model == 'time_model1':
         return "a + b + log(x) * k"
-    elif model == time_model2:
+    elif model == 'time_model2':
         return "a + b * x + log(x) * k"
-    elif model == time_model3:
+    elif model == 'time_model3':
         return "a + b * log(x)^2 + log(x) * k"
-    elif model == time_model4:
+    elif model == 'time_model4':
         return "a * log(x)^3 + b * log(x)^2 + log(x) * k"
     else:
         return "Unknown model"
@@ -2387,19 +2369,63 @@ def resource_allocation_plot(df, col_name):
     fig, axs = plt.subplots(ncols=2, figsize=(10, 4), sharey=False)
 
     ax = axs[0]
+    mem_models, time_models = _retrieve_equations()
+
     # models for memory
     _resource_allocation_plot_helper(
-        df, ax, "MaxRSSRaw",  MODELS_MEM, col_name)
-
+        df, ax, "MaxRSSRaw",  mem_models, col_name)
     ax = axs[1]
     # models for time
     _resource_allocation_plot_helper(
-        df, ax, "ElapsedRaw",  MODELS_TIME, col_name)
+        df, ax, "ElapsedRaw",  time_models, col_name)
 
     return fig, axs
 
 
+def _retrieve_equations():
+    '''
+    Helper function for resource_allocation_plot.
+    Retrieves equations from db. Creates dictionary for memory and time models.
+    Expressions are passed to eval(), so the table content must be trusted.
+
+    Returns
+    -------
+    tuple
+        dict: memory models, equation name -> function(x, k, a, b)
+        dict: time models, equation name -> function(x, k, a, b)
+    '''
+    memory_models = {}
+    time_models = {}
+    with qdb.sql_connection.TRN:
+        sql = ''' SELECT * FROM qiita.allocation_equations; '''
+        qdb.sql_connection.TRN.add(sql)
+        res = qdb.sql_connection.TRN.execute_fetchindex()
+        for row in res:
+            # Bind the expression through a default argument: closures are
+            # late-bound, so a bare eval(row[2]) would make every model
+            # evaluate the expression of the last row fetched.
+            target = memory_models if 'mem' in row[1] else time_models
+            target[row[1]] = lambda x, k, a, b, _expr=row[2]: eval(_expr)
+        return (memory_models, time_models)
+
+
 def retrieve_resource_data(cname, sname, version, columns):
+    '''
+    Retrieves resource data from db and constructs a DataFrame with relevant
+    fields.
+
+    Parameters
+    ----------
+    cname - command name for which we retrieve the resources
+    sname - software name for which we retrieve the resources
+    version - version of software for which we retrieve the resources
+    columns - column names for the DataFrame returned by this function
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with resources.
+    '''
     with qdb.sql_connection.TRN:
         sql = """
             SELECT
@@ -2457,8 +2483,8 @@ def _resource_allocation_plot_helper(
         Specifies x axis for the graph
     curr: str, required
         Either MaxRSSRaw or ElapsedRaw (y axis)
-    models: list, required
-        List of functions that will be used for visualization
+    models: dictionary, required
+        Dictionary of functions that will be used for visualization
 
     """
 
@@ -2494,7 +2520,7 @@ def _resource_allocation_plot_helper(
     ax.set_xlabel(col_name)
 
     # 50 - number of maximum iterations, 3 - number of failures we tolerate
-    best_model, options = _resource_allocation_calculate(
+    best_model_name, best_model, options = _resource_allocation_calculate(
         df, x_data, y_data, models, curr, col_name, 50, 3)
     k, a, b = options.x
     x_plot = np.array(sorted(df[col_name].unique()))
@@ -2522,22 +2548,24 @@ def _resource_allocation_plot_helper(
                label="failures")
     success_df['node_name'] = success_df['node_name'].fillna('unknown')
     slurm_hosts = set(success_df['node_name'].tolist())
-    cmap = colormaps.get_cmap('Accent').resampled(len(slurm_hosts))
-    colors = [cmap(
-              i / (len(slurm_hosts) - 1)) for i in range(len(slurm_hosts))]
+    cmap = colormaps.get_cmap('Accent')
+    if len(slurm_hosts) > len(cmap.colors):
+        raise ValueError(f"""'Accent' colormap only has {len(cmap.colors)}
+                     colors, but {len(slurm_hosts)} hosts are provided.""")
+    colors = cmap.colors[:len(slurm_hosts)]
 
     for i, host in enumerate(slurm_hosts):
         host_df = success_df[success_df['node_name'] == host]
         ax.scatter(host_df[col_name], host_df[curr], color=colors[i], s=3,
                    label=host)
     ax.set_title(
                  f'k||a||b: {k}||{a}||{b}\n'
-                 f'model: {get_model_name(best_model)}\n'
+                 f'model: {get_model_name(best_model_name)}\n'
                  f'real: {mini} || {maxi}\n'
                  f'calculated: {cmin} || {cmax}\n'
                  f'failures: {failures}')
     ax.legend(loc='upper left')
-    return best_model, options
+    return best_model_name, best_model, options
 
 
 def _resource_allocation_calculate(
@@ -2555,27 +2583,30 @@ def _resource_allocation_calculate(
         current type (e.g. MaxRSSRaw)
     col_name: str, required
         Specifies x axis for the graph
-    models: list, required
-        List of functions that will be used for visualization
+    models: dictionary, required
+        Dictionary of functions that will be used for visualization
     depth: int, required
         Maximum number of iterations in binary search
     tolerance: int, required,
         Tolerance to number of failures possible to be considered as a model
 
     Returns
     ----------
+    best_model_name: string
+        the name of the best model from the table
     best_model: function
-        best fitting function for the current list models
+        best fitting function for the current dictionary models
     best_result: object
         object containing constants for the best model (e.g. k, a, b in kx+b*a)
     """
 
     init = [1, 1, 1]
+    best_model_name = None
     best_model = None
     best_result = None
     best_failures = np.inf
     best_max = np.inf
-    for model in models:
+    for model_name, model in models.items():
         # start values for binary search, where sl is left, sr is right
         # penalty weight must be positive & non-zero, hence, sl >= 1.
         # the upper bound for error can be an arbitrary large number
@@ -2646,9 +2677,10 @@ def _resource_allocation_calculate(
             if min_max <= best_max:
                 best_failures = prev_failures
                 best_max = min_max
+                best_model_name = model_name
                 best_model = model
                 best_result = res
-    return best_model, best_result
+    return best_model_name, best_model, best_result
 
 
 def _resource_allocation_custom_loss(params, x, y, model, p):