148 changes: 1 addition & 147 deletions .gitignore
@@ -1,147 +1 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# nfs files
.nfs*

#tar gz files
*.tar.gz

#outputs
*output/
.DS_Store
4 changes: 2 additions & 2 deletions README.md
@@ -4,7 +4,7 @@
## Description
This repository contains tools relevant for training and evaluating anomaly detection algorithms on CMS DQM data.
Core code is contained in `autodqm_ml`, core scripts are contained in `scripts` and some helpful examples are in `examples`.
See the README of each subdirectory for more information on each.
See the README of each subdirectory for more information on each. A more in-depth tutorial of the tool can be found [here](https://autodqm.github.io/autodqm_ml.github.io/).

## Installation
**1. Clone repository**
@@ -123,7 +123,7 @@ python scripts/train.py --input_file "data_fetched/pretraining/myOutputFile.parq
```
Here, the full set or a subset of the histograms listed in your `myHistList.json` file is entered as an argument (these are the features to train on). A quick way to obtain this list is to run the command
```
python scripts/json_to_string.py -i metadata/histogram_lists/myHistList.json
python scripts/json_to_string.py -i metadata/histogram_lists/myHistList.json -d "<detector>"
```
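For intuition, here is a minimal sketch of what such a conversion step could look like. It assumes the JSON file groups histogram names under a detector key and that the output is the comma-separated string expected by `scripts/train.py`; the file layout and flag handling here are assumptions for illustration, not the actual code of `scripts/json_to_string.py`.
```
# Hypothetical sketch -- not the real scripts/json_to_string.py.
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", required=True, help="JSON histogram list")
parser.add_argument("-d", "--detector", required=True, help="detector key, e.g. \"CSC\"")
args = parser.parse_args()

with open(args.input) as f:
    hist_lists = json.load(f)

# Join the chosen detector's histogram names into one comma-separated
# string that can be pasted into the train.py histogram argument.
print(",".join(hist_lists[args.detector]))
```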
FOR SMALL ORIGINAL-VS-RECO STUDIES: If you are interested in using the `scripts/assess.py` macro to generate plots comparing original and reconstructed histogram distributions (i.e. the original assessment version of the repo), add the argument `--reco_assess_plots True` to the `scripts/train.py` stage to output a parquet file containing the relevant histogram information. This is recommended only for a subset of the fetched runs and histograms, since generating the plots is time-consuming. A typical plotting assessment command for this would be
```
27 changes: 17 additions & 10 deletions scripts/sse_scores_to_roc.py
@@ -15,6 +15,7 @@
import json
import argparse
import awkward
from tqdm import tqdm

from autodqm_ml.utils import expand_path
from autodqm_ml.constants import kANOMALOUS, kGOOD
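For reviewers unfamiliar with it, `tqdm` (newly imported above) wraps any iterable and renders a console progress bar; the loops below use exactly this pattern. A minimal self-contained illustration, separate from this PR:
```
from tqdm import tqdm

# tqdm wraps the iterable and draws a progress bar on stderr;
# `desc` sets the label displayed next to the bar.
total = 0
for ii in tqdm(range(1_000_000), desc="summing"):
    total += ii
print(total)
```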
@@ -68,7 +69,9 @@ def main(args):
with open(args.output_dir + '/commands_sse_scores_to_roc.txt', 'w') as f:
for arg in arguments:
f.write(arg + ' ')


print("--------------------------------")
print('[1/5] Reading files...')
sse_df = pd.read_csv(args.input_file)
algorithm_name = str(sse_df['algo'].iloc[0]).upper()
if algorithm_name == "BETAB": algorithm_name = "Beta_Binomial"
@@ -86,13 +89,14 @@ def main(args):
sse_df_bad = sse_df.loc[sse_df['label'] == 1].reset_index()
sse_df_good = sse_df_good[['run_number'] + hist_cols]
sse_df_bad = sse_df_bad[['run_number'] + hist_cols]


print("[2/5] Iterating through histogram columns:")
# new threshold cut-offs per Si's recommendations
# 0th cut-off at 1st highest SSE + (1st - 2nd highest)*0.5
# 1st cut-off at mean<1st, 2nd> highest SSE
# Nth cut-off at mean<Nth, N+1th> highest SSE
cutoffs_across_hists = []
for histogram in hist_cols:
for histogram in tqdm(hist_cols):
sse_ordered = sorted(sse_df_good[histogram], reverse=True)
cutoff_0 = sse_ordered[0] + 0.5*(sse_ordered[0] - sse_ordered[1])
cutoff_thresholds = []
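To make the cut-off scheme above concrete, a small worked example with illustrative numbers (not taken from any real run):
```
# Illustrative values only: suppose the good-run SSEs for one histogram,
# sorted in descending order, are:
sse_ordered = [9.0, 7.0, 4.0, 2.0]

cutoff_0 = sse_ordered[0] + 0.5 * (sse_ordered[0] - sse_ordered[1])  # 10.0, above every good run
cutoff_1 = 0.5 * (sse_ordered[0] + sse_ordered[1])                   # 8.0, mean of 1st and 2nd
cutoff_2 = 0.5 * (sse_ordered[1] + sse_ordered[2])                   # 5.5, mean of 2nd and 3rd
```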
@@ -101,17 +105,19 @@
cutoff_ii = 0.5*(sse_ordered[ii]+sse_ordered[ii+1])
cutoff_thresholds.append(cutoff_ii)
cutoffs_across_hists.append(cutoff_thresholds)

cutoffs_across_hists = np.array(cutoffs_across_hists)

N_bad_hists = [5,3,1]
tFRF_ROC_good_X = []
tFRF_ROC_bad_Y = []

for nbh_ii in N_bad_hists:
print(f'[3/5] Iterating through N_bad_hists ({len(N_bad_hists)}):')
for idx, nbh_ii in enumerate(N_bad_hists):
print(f'+++++[{idx+1}/{len(N_bad_hists)}] - N={nbh_ii}:')
tFRF_ROC_good_X_init = []
tFRF_ROC_bad_Y_init = []
for cutoff_index in range(len(cutoffs_across_hists[0,:])):
for cutoff_index in tqdm(range(len(cutoffs_across_hists[0,:]))):
t_cutoff_index_g_FRF_rc = count_fraction_runs_above(sse_df_good, cutoffs_across_hists[:,cutoff_index], nbh_ii)
t_cutoff_index_b_FRF_rc = count_fraction_runs_above(sse_df_bad, cutoffs_across_hists[:,cutoff_index], nbh_ii)
tFRF_ROC_good_X_init.append(t_cutoff_index_g_FRF_rc)
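`count_fraction_runs_above` is defined earlier in this script, outside the diff. Inferred from its call sites here, a hedged sketch of what it plausibly computes; this is an assumption for the reader's benefit, not the script's actual implementation:
```
import numpy as np
import pandas as pd

def count_fraction_runs_above(runs_df: pd.DataFrame, cutoffs: np.ndarray, n_bad: int) -> float:
    # Assumed behaviour: fraction of runs for which at least `n_bad`
    # histogram columns have an SSE above their per-histogram cutoff.
    sse = runs_df.drop(columns=["run_number"]).to_numpy()  # shape (n_runs, n_hists)
    flags_per_run = (sse > cutoffs).sum(axis=1)            # cutoffs broadcasts over columns
    return float((flags_per_run >= n_bad).mean())
```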
@@ -122,10 +128,11 @@

tFRF_ROC_good_X.append(tFRF_ROC_good_X_init)
tFRF_ROC_bad_Y.append(tFRF_ROC_bad_Y_init)


print("[4/5] Iterating through cutoff indices:")
tMHF_ROC_good_X = []
tMHF_ROC_bad_Y = []
for cutoff_index in range(len(cutoffs_across_hists[0,:])):
for cutoff_index in tqdm(range(len(cutoffs_across_hists[0,:]))):
#if not cutoff_index % 8:
t_cutoff_index_g_MHF_rc = count_mean_runs_above(sse_df_good, cutoffs_across_hists[:,cutoff_index])
t_cutoff_index_b_MHF_rc = count_mean_runs_above(sse_df_bad, cutoffs_across_hists[:,cutoff_index])
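Likewise, `count_mean_runs_above` sits outside the diff; judging by the MHF naming and the plot labels, it plausibly returns the mean number of histograms flagged per run. Again a sketch under that assumption, not the real code:
```
import numpy as np
import pandas as pd

def count_mean_runs_above(runs_df: pd.DataFrame, cutoffs: np.ndarray) -> float:
    # Assumed behaviour: number of histogram columns above their
    # per-histogram cutoff, averaged over all runs in the frame.
    sse = runs_df.drop(columns=["run_number"]).to_numpy()  # shape (n_runs, n_hists)
    return float((sse > cutoffs).sum(axis=1).mean())
```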
@@ -138,7 +145,7 @@
#print("Mean values")
#print(tMHF_ROC_good_X)
#print(tMHF_ROC_bad_Y)

print('[5/5] Plotting ROC Curves...')
fig, axs = plt.subplots(ncols=2,nrows=1,figsize=(12,6))

axs[1].set_xlabel('Fraction of good runs with at least N histogram flags')
@@ -166,7 +173,7 @@
axs[0].legend(loc='lower right')

plt.savefig(args.output_dir + "/RF_HF_ROC_comparison_" + algorithm_name + ".pdf",bbox_inches='tight')
print("SAVED: " + args.output_dir + "/RF_HF_ROC_comparison_" + algorithm_name + ".pdf")
print("[SAVED] " + args.output_dir + "RF_HF_ROC_comparison_" + algorithm_name + ".pdf")

if __name__ == "__main__":
args = parse_arguments()