|
73 | 73 | from email.mime.multipart import MIMEMultipart
|
74 | 74 | from email.mime.text import MIMEText
|
75 | 75 |
|
| 76 | +import pandas as pd |
| 77 | +from datetime import timedelta |
| 78 | +import matplotlib.pyplot as plt |
| 79 | +import numpy as np |
| 80 | +from scipy.optimize import minimize |
| 81 | + |
76 | 82 |
|
77 | 83 | def scrub_data(s):
|
78 | 84 | r"""Scrubs data fields of characters not allowed by PostgreSQL
|
@@ -2311,3 +2317,182 @@ def send_email(to, subject, body):
|
2311 | 2317 | raise RuntimeError("Can't send email!")
|
2312 | 2318 | finally:
|
2313 | 2319 | smtp.close()
|
| 2320 | + |
| 2321 | + |
def resource_allocation_plot(file, cname, sname='all'):
    """Draws memory and runtime usage against input size for one job type

    The job statistics are read from `file`, filtered by `cname` (and by
    `sname` unless it is 'all'), and rows missing `samples` or `columns` are
    discarded. The left panel shows MaxRSSRaw fitted with `mem_model1`, the
    right panel shows ElapsedRaw fitted with `time_model1`; both use
    samples * columns as the x axis.

    Parameters
    ----------
    file : str, required
        Tab-separated file with the job statistics. Usually provided as
        tsv.gz
    cname : str, required
        Job type; matched against the `cName` column
    sname : str, optional
        Job sub type; matched against the `sName` column. The default,
        'all', disables the sub type filter

    Returns
    -------
    fig : matplotlib figure
        The figure holding both panels
    axs : array of matplotlib axes
        The two axes: memory first, elapsed time second
    """
    # These are published as module globals on every call;
    # _resource_allocation_plot_helper reads COL_NAME from module scope.
    global M1G, COL_NAME
    M1G = 2**30
    COL_NAME = 'samples * columns'

    stats = pd.read_csv(file, sep='\t', dtype={'extra_info': str})
    stats['ElapsedRawTime'] = pd.to_timedelta(stats.ElapsedRawTime)

    # Diversity types - alpha_vector ; BIOM type - BIOM
    selected = stats.cName == cname
    if sname != "all":
        selected = selected & (stats.sName == sname)
    _df = stats[selected].copy()

    fig, axs = plt.subplots(ncols=2, figsize=(10, 4), sharey=False)
    _df.dropna(subset=['samples', 'columns'], inplace=True)
    _df[COL_NAME] = _df.samples * _df['columns']

    # One (metric, candidate models) pair per panel, left to right
    panels = (("MaxRSSRaw", [mem_model1]), ("ElapsedRaw", [time_model1]))
    for ax, (metric, models) in zip(axs, panels):
        _resource_allocation_plot_helper(_df, ax, cname, sname, metric,
                                         models)
    return fig, axs
| 2366 | + |
| 2367 | + |
def _resource_allocation_plot_helper(_df, ax, cname, sname, curr, models):
    """Helper function for resource allocation plot. Builds plot for MaxRSSRaw
    and ElapsedRaw

    Scatters the real values, overlays the best fitting model in orange and
    marks in red every job whose real value is above the fitted one. The
    title reports the real and calculated min/max plus the number of such
    jobs ("failures").

    Note that a column named f'c{curr}' with the fitted values is added to
    `_df` in place.

    Parameters
    ----------
    _df: pandas dataframe, required
        Filtered dataframe for the plot. Must contain the columns
        'samples * columns' and `curr`
    ax : matplotlib axes, required
        Axes for current subplot
    cname: str, required
        Specified job type
    sname: str, optional
        Specified job sub type.
    curr: str, required
        Either MaxRSSRaw or ElapsedRaw
    models: list, required
        List of functions that will be used for visualization
    """
    # Kept local rather than read from the COL_NAME module global: that
    # global only exists once resource_allocation_plot has been called, so
    # relying on it made a direct call to this helper fail with NameError.
    col_name = 'samples * columns'

    def _pretty(value):
        # MaxRSSRaw is rendered as a human readable size, anything else
        # (ElapsedRaw) as a timedelta built from seconds.
        # NOTE(review): naturalsize is expected to be imported elsewhere in
        # this file (presumably humanize.naturalsize) - confirm.
        if curr == "MaxRSSRaw":
            return naturalsize(value, gnu=True)
        return timedelta(seconds=float(value))

    x_data, y_data = _df[col_name], _df[curr]
    ax.scatter(x_data, y_data, s=2)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_ylabel(curr)
    ax.set_xlabel(col_name)

    best_model, options = _resource_allocation_calculate(x_data, y_data,
                                                         models)
    k, a, b = options.x

    # Log-space shift applied to x before evaluating the model; currently
    # disabled (0), so the adjusted x is a log/exp round trip of x.
    shift = 0

    # Fitted curve, evaluated once per distinct x value
    x_plot = np.array(sorted(x_data.unique()))
    x_plot_adjusted = np.exp(np.log(x_plot) - shift)
    y_plot = best_model(x_plot_adjusted, k, a, b)
    ax.plot(x_plot_adjusted, y_plot, linewidth=1, color='orange')

    # Fitted value for every row, stored next to the real one
    x_rows_adjusted = np.exp(np.log(np.array(x_data)) - shift)
    _df[f'c{curr}'] = best_model(x_rows_adjusted, k, a, b)

    maxi = _pretty(_df[curr].max())
    cmax = _pretty(max(y_plot))
    mini = _pretty(_df[curr].min())
    cmin = _pretty(min(y_plot))

    # A "failure" is a job that used more than the model would allocate
    failures_df = _df[_df[curr] > _df[f'c{curr}']]
    failures = failures_df.shape[0]

    ax.scatter(failures_df[col_name], failures_df[curr], color='red', s=3)

    ax.set_title(f'{cname}: {sname}\n real: {mini} || {maxi}\n'
                 f'calculated: {cmin} || {cmax}\n'
                 f'failures: {failures}')
| 2426 | + |
| 2427 | + |
def _resource_allocation_calculate(x, y, models):
    """Helper function for resource allocation plot. Calculates best_model and
    best_result given the models list and x,y data.

    Every model is fitted by minimizing `_resource_allocation_custom_loss`
    starting from k = a = b = 1, with the three constants constrained to be
    non-negative. The model reaching the lowest loss wins.

    Parameters
    ----------
    x: pandas.Series (pandas column), required
        Represents x data for the function calculation
    y: pandas.Series (pandas column), required
        Represents y data for the function calculation
    models: list, required
        List of functions that will be used for visualization

    Returns
    ----------
    best_model: function
        best fitting function for the current list models; None if `models`
        is empty or no model produced a loss below infinity
    best_result: object
        scipy.optimize.minimize result of the best model; its `x` attribute
        holds the fitted constants k, a, b. None when best_model is None
    """
    init = [1, 1, 1]
    # Non-negativity constraint on k, a and b. The bounds used to be built
    # but never handed to minimize, so the optimization ran unconstrained.
    bounds = [(0, None)] * len(init)

    best_model = None
    best_loss = np.inf
    best_result = None
    for model in models:
        options = minimize(_resource_allocation_custom_loss, init,
                           args=(x, y, model), bounds=bounds)
        if options.fun < best_loss:
            best_loss = options.fun
            best_model = model
            best_result = options
    return best_model, best_result
| 2462 | + |
| 2463 | + |
def _resource_allocation_custom_loss(params, x, y, model):
    """Helper function for resource allocation plot. Asymmetric squared error
    of `model` against the observed data.

    Points the model under-estimates (observed value above the prediction)
    are weighted 10000 times more than points it over-estimates.

    Parameters
    ----------
    params: list, required
        The three constants (k, a, b) the model is evaluated with
    x: pandas.Series (pandas column), required
        Represents x data for the function calculation
    y: pandas.Series (pandas column), required
        Represents y data for the function calculation
    model: function, required
        Model to evaluate; called as model(x, k, a, b)

    Returns
    ----------
    float
        The mean of the weighted squared errors
    """
    # Penalty weights: under-estimation vs. over-estimation
    under_weight, over_weight = 10000, 1

    k, a, b = params
    residuals = y - model(x, k, a, b)
    squared = residuals**2
    # residual > 0 means the real value exceeded the prediction
    weighted = np.where(residuals > 0,
                        under_weight * squared,
                        over_weight * squared)
    return np.mean(weighted)
| 2491 | + |
| 2492 | + |
def mem_model1(x, k, a, b):
    """Memory model: logarithmic term plus linear term plus constant

    Parameters
    ----------
    x: number or array-like, required
        Input size the model is evaluated at
    k: number, required
        Weight of the log(x) term
    a: number, required
        Weight of the linear x term
    b: number, required
        Constant offset

    Returns
    -------
    number or array-like
        k * log(x) + a * x + b
    """
    log_term = k*np.log(x)
    linear_term = x * a
    return log_term + linear_term + b
| 2495 | + |
| 2496 | + |
def time_model1(x, k, a, b):
    """Time model: constant offset plus logarithmic term

    Parameters
    ----------
    x: number or array-like, required
        Input size the model is evaluated at
    k: number, required
        Weight of the log(x) term
    a: number, required
        First part of the constant offset
    b: number, required
        Second part of the constant offset; only the sum a + b affects the
        result

    Returns
    -------
    number or array-like
        a + b + k * log(x)
    """
    offset = a + b
    return offset + (np.log(x) * k)
0 commit comments