Skip to content

Commit 4163fe0

Browse files
committed
Initial commit to resource allocation plots
1 parent 62292ca commit 4163fe0

File tree

3 files changed

+214
-0
lines changed

3 files changed

+214
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,6 @@ qiita_pet/*.conf
7272

7373
# jupyter notebooks input data
7474
notebooks/*/*.tsv.gz
75+
76+
# jupyter notebooks input data
77+
notebooks/resource-allocation/data

qiita_db/test/test_util.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
from qiita_core.util import qiita_test_checker
2222
import qiita_db as qdb
2323

24+
from matplotlib.figure import Figure
25+
import numpy as np
2426

2527
@qiita_test_checker()
2628
class DBUtilTestsBase(TestCase):
@@ -1303,6 +1305,30 @@ def test_quick_mounts_purge(self):
13031305
qdb.util.quick_mounts_purge()
13041306

13051307

1308+
class ResourceAllocationPlotTests():
    """Checks for ``qdb.util.resource_allocation_plot``.

    NOTE(review): this class does not derive from ``TestCase`` and its check
    method is prefixed with an underscore, so a unittest-style runner will
    presumably not collect it. The input file lives in a git-ignored data
    directory, so it has to be run by hand where that file exists - confirm
    before wiring it into the automated suite.
    """

    def __init__(self) -> None:
        # Adjacent string literals are joined at compile time, so the path
        # stays on one logical line without an embedded newline (a
        # triple-quoted literal split across lines would keep the line break
        # inside the file name).
        self.PATH_TO_DATA = ('../../notebooks/resource-allocation/data/'
                             'jobs_2024-02-21.tsv.gz')
        self.CNAME = "Validate"
        self.SNAME = "Diversity types - alpha_vector"

    def _get_return_value(self):
        """Build the plot for the configured file, job type and sub type."""
        return qdb.util.resource_allocation_plot(self.PATH_TO_DATA, self.CNAME,
                                                 self.SNAME)

    def _test_return_value_type(self):
        """Assert that the first returned element is a matplotlib Figure."""
        fig, axs = self._get_return_value()
        # The message is shown only when the assertion fails, so it has to
        # describe the failure.
        assert isinstance(fig, Figure), \
            "Returned object is not a Matplotlib Figure"

    # TODO test individual functions. E.g. constants returned by minimize.
1325+
1326+
1327+
1328+
1329+
1330+
1331+
13061332
STUDY_INFO = {
13071333
'study_id': 1,
13081334
'owner': 'Dude',

qiita_db/util.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,12 @@
7373
from email.mime.multipart import MIMEMultipart
7474
from email.mime.text import MIMEText
7575

76+
import pandas as pd
77+
from datetime import timedelta
78+
import matplotlib.pyplot as plt
79+
import numpy as np
80+
from scipy.optimize import minimize
81+
7682

7783
def scrub_data(s):
7884
r"""Scrubs data fields of characters not allowed by PostgreSQL
@@ -2311,3 +2317,182 @@ def send_email(to, subject, body):
23112317
raise RuntimeError("Can't send email!")
23122318
finally:
23132319
smtp.close()
2320+
2321+
2322+
def resource_allocation_plot(file, cname, sname='all'):
    """Plots memory and elapsed time against input size for a job type

    Parameters
    ----------
    file : str, required
        File to load with ``pandas.read_csv`` as a tab-separated table.
        Usually provided as tsv.gz
    cname : str, required
        Job type; only rows whose ``cName`` equals it are plotted
    sname : str, optional
        Job sub type. Unless it is 'all', rows must also match it on
        ``sName``

    Returns
    -------
    fig : matplotlib figure
        Figure holding both subplots
    axs : sequence of two matplotlib axes
        ``axs[0]`` is drawn from ``MaxRSSRaw`` with ``mem_model1`` and
        ``axs[1]`` from ``ElapsedRaw`` with ``time_model1``

    Notes
    -----
    Assigns the module level names ``M1G`` and ``COL_NAME`` as a side effect;
    the plotting helper reads ``COL_NAME``.
    """
    # Constants, published at module level
    global M1G, COL_NAME
    M1G = 2**30
    COL_NAME = 'samples * columns'

    df = pd.read_csv(file, sep='\t', dtype={'extra_info': str})
    df['ElapsedRawTime'] = pd.to_timedelta(df['ElapsedRawTime'])

    # e.g. Diversity types - alpha_vector ; BIOM type - BIOM
    selected = (df.cName == cname)
    if sname != "all":
        selected = selected & (df.sName == sname)
    _df = df[selected].copy()

    fig, axs = plt.subplots(ncols=2, figsize=(10, 4), sharey=False)

    # rows lacking either factor cannot be placed on the x axis
    _df.dropna(subset=['samples', 'columns'], inplace=True)
    _df[COL_NAME] = _df['samples'] * _df['columns']

    # one subplot per measured quantity, each with its candidate models
    panels = (
        (axs[0], "MaxRSSRaw", [mem_model1]),
        (axs[1], "ElapsedRaw", [time_model1]),
    )
    for panel_ax, quantity, candidates in panels:
        _resource_allocation_plot_helper(_df, panel_ax, cname, sname,
                                         quantity, candidates)
    return fig, axs
2366+
2367+
2368+
def _resource_allocation_plot_helper(_df, ax, cname, sname, curr, models):
    """Helper function for resource allocation plot. Draws one subplot:
    observed values, the best fitting model curve and the observations that
    exceed the fitted value.

    Parameters
    ----------
    _df: pandas dataframe, required
        Filtered dataframe for the plot. A column named ``c<curr>`` holding
        the fitted value of every row is added to it in place
    ax : matplotlib axes, required
        Axes for current subplot
    cname: str, required
        Specified job type, used in the title
    sname: str, optional
        Specified job sub type, used in the title
    curr: str, required
        Either MaxRSSRaw or ElapsedRaw. MaxRSSRaw values are formatted with
        ``naturalsize``, anything else as a ``timedelta`` in seconds
    models: list, required
        List of functions that will be used for visualization

    """
    # NOTE(review): naturalsize is not defined in the visible part of the
    # file; presumably imported elsewhere - confirm.
    is_memory = curr == "MaxRSSRaw"

    def _pretty(value):
        # human readable size for memory, duration otherwise
        if is_memory:
            return naturalsize(value, gnu=True)
        return timedelta(seconds=float(value))

    x_data = _df[COL_NAME]
    y_data = _df[curr]
    ax.scatter(x_data, y_data, s=2)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_ylabel(curr)
    ax.set_xlabel("samples * columns")

    best_model, options = _resource_allocation_calculate(x_data, y_data,
                                                         models)
    k, a, b = options.x

    shift = 0

    def _predict(raw_x):
        # the log/exp round trip is kept even with shift == 0
        adjusted = np.exp(np.log(raw_x) - shift)
        return adjusted, best_model(adjusted, k, a, b)

    # fitted curve over the distinct, sorted x values
    curve_x, y_plot = _predict(np.array(sorted(_df[COL_NAME].unique())))
    ax.plot(curve_x, y_plot, linewidth=1, color='orange')

    # fitted value for every row
    fitted_col = f'c{curr}'
    _, per_row_fit = _predict(np.array(_df[COL_NAME]))
    _df[fitted_col] = per_row_fit

    observed = _df[curr]
    maxi = _pretty(observed.max())
    cmax = _pretty(max(y_plot))
    mini = _pretty(observed.min())
    cmin = _pretty(min(y_plot))

    # rows whose observed value is above the fitted one
    exceeds_fit = _df[curr] > _df[fitted_col]
    failures_df = _df[exceeds_fit]
    failures = len(failures_df)

    ax.scatter(failures_df[COL_NAME], failures_df[curr], color='red', s=3)

    ax.set_title(f'{cname}: {sname}\n real: {mini} || {maxi}\n'
                 f'calculated: {cmin} || {cmax}\n'
                 f'failures: {failures}')
2426+
2427+
2428+
def _resource_allocation_calculate(x, y, models):
2429+
"""Helper function for resource allocation plot. Calculates best_model and
2430+
best_result given the models list and x,y data.
2431+
2432+
Parameters
2433+
----------
2434+
x: pandas.Series (pandas column), required
2435+
Represents x data for the function calculation
2436+
y: pandas.Series (pandas column), required
2437+
Represents y data for the function calculation
2438+
models: list, required
2439+
List of functions that will be used for visualization
2440+
2441+
Returns
2442+
----------
2443+
best_model: function
2444+
best fitting function for the current list models
2445+
best_result: object
2446+
object containing constants for the best model (e.g. k, a, b in kx+b*a)
2447+
"""
2448+
2449+
init = [1, 1, 1]
2450+
best_model = None
2451+
best_loss = np.inf
2452+
best_result = None
2453+
for model in models:
2454+
bounds = [(0, float('inf')), (0, float('inf')), (0, float('inf'))]
2455+
options = minimize(_resource_allocation_custom_loss, init,
2456+
args=(x, y, model))
2457+
if options.fun < best_loss:
2458+
best_loss = options.fun
2459+
best_model = model
2460+
best_result = options
2461+
return best_model, best_result
2462+
2463+
2464+
def _resource_allocation_custom_loss(params, x, y, model):
2465+
"""Helper function for resource allocation plot. Calculates custom loss
2466+
for given model.
2467+
2468+
Parameters
2469+
----------
2470+
params: list, required
2471+
Initial list of integers for the given model
2472+
x: pandas.Series (pandas column), required
2473+
Represents x data for the function calculation
2474+
y: pandas.Series (pandas column), required
2475+
Represents y data for the function calculation
2476+
models: list, required
2477+
List of functions that will be used for visualization
2478+
2479+
Returns
2480+
----------
2481+
float
2482+
The mean of the list returned by the loss calculation (np.where)
2483+
"""
2484+
k, a, b = params
2485+
errors = y - model(x, k, a, b)
2486+
# Penalty weights
2487+
w1, w2 = 10000, 1
2488+
# positive error
2489+
loss = np.where(errors > 0, w1 * errors**2, w2 * errors**2)
2490+
return np.mean(loss)
2491+
2492+
2493+
def mem_model1(x, k, a, b):
    """Memory model: ``k * log(x) + a * x + b``

    Parameters
    ----------
    x: number or array-like
        Input size(s) the model is evaluated at
    k, a, b: number
        Constants for the logarithmic term, the linear term and the offset

    Returns
    -------
    number or array
        Model value(s) for ``x``
    """
    log_term = k*np.log(x)
    linear_term = x * a
    return log_term + linear_term + b
2495+
2496+
2497+
def time_model1(x, k, a, b):
    """Time model: ``a + b + k * log(x)``

    Parameters
    ----------
    x: number or array-like
        Input size(s) the model is evaluated at
    k: number
        Constant for the logarithmic term
    a, b: number
        Constants that are summed into the offset

    Returns
    -------
    number or array
        Model value(s) for ``x``
    """
    offset = a + b
    log_term = np.log(x) * k
    return offset + log_term

0 commit comments

Comments
 (0)