From c6ccb3951b101c5ec6845e57d76d0b463020ce88 Mon Sep 17 00:00:00 2001 From: RudraCodesForU Date: Wed, 10 Sep 2025 17:39:44 +0530 Subject: [PATCH 1/2] Updated the app.js --- web/app.js | 329 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 233 insertions(+), 96 deletions(-) diff --git a/web/app.js b/web/app.js index 7eada7f..fe33ada 100644 --- a/web/app.js +++ b/web/app.js @@ -1,14 +1,57 @@ +const DGM_META = { + "linear_gaussian": { + name: "Linear Gaussian", + equation: `\\begin{align} + X &= \\alpha^T Z + \\varepsilon_1 \\\\ + Y &= \\beta^T Z + \\text{effect\\_size} \\cdot X + \\varepsilon_2 + \\end{align} + \\text{where } \\varepsilon_1, \\varepsilon_2 \\sim \\mathcal{N}(0, \\sigma^2) \\text{ (Gaussian noise)}` + }, + "nonlinear_gaussian": { + name: "Nonlinear Gaussian", + equation: `\\begin{align} + X &= \\sin(\\text{effect\\_size} \\cdot \\sum_j Z_j) + \\varepsilon_1 \\\\ + Y &= \\exp(\\text{effect\\_size} \\cdot \\sum_j Z_j \\cdot 0.2) + \\varepsilon_2 + \\end{align} + \\text{where } \\varepsilon_1, \\varepsilon_2 \\sim \\mathcal{N}(0, \\sigma^2) \\text{ (Gaussian noise)}` + }, + "discrete_categorical": { + name: "Discrete Categorical", + equation: `\\begin{align} + Z_j &\\sim \\text{DiscreteUniform}(0, n_{\\text{categories}}-1) \\\\ + X &= \\sum_j Z_j + \\varepsilon_1 \\\\ + Y &= \\sum_j Z_j + \\varepsilon_2 + \\end{align} + \\text{where } \\varepsilon_1, \\varepsilon_2 \\sim \\mathcal{N}(0, \\sigma^2) \\text{ (Gaussian noise)}` + }, + "mixed_data": { + name: "Mixed Data", + equation: `\\begin{align} + X &= Z^T \\alpha + \\varepsilon_1 \\\\ + Y &= Z^T \\beta + \\text{effect\\_size} \\cdot X + \\varepsilon_2 + \\end{align} + \\text{where } \\varepsilon_1, \\varepsilon_2 \\sim \\mathcal{N}(0, \\sigma^2) \\text{ (Gaussian noise)}` + }, + "non_gaussian_continuous": { + name: "Non-Gaussian Continuous", + equation: `\\begin{align} + X &= |Z^T \\alpha| + \\varepsilon_1 \\\\ + Y &= (Z^T \\beta)^2 + \\text{effect\\_size} \\cdot X + \\varepsilon_2 + \\end{align} + \\text{where } \\varepsilon_1, \\varepsilon_2 \\sim \\text{Exponential}(\\lambda = 1.0) \\text{ (Exponential noise)}` + } +}; + let allData = []; let dgmMap = {}; let chartCalibration, chartPower; const dgmSelect = document.getElementById('dgm-select'); -const ciTestSelect = document.getElementById('ci-test-select'); -const effectSizeSelect = document.getElementById('effect-size-select'); -const significanceSelect = document.getElementById('significance-level-select'); +const sampleSizeSelect = document.getElementById('sample-size-select'); const csvInput = document.getElementById('csv-input'); +const dgmEquationEl = document.getElementById('dgm-equation'); -const DEFAULT_CSV = "results/default_ci_benchmark_summaries.csv"; +const DEFAULT_CSV = "results/ci_benchmark_summaries.csv"; window.addEventListener('DOMContentLoaded', () => { fetch(DEFAULT_CSV) @@ -21,20 +64,15 @@ window.addEventListener('DOMContentLoaded', () => { buildDGMMap(allData); populateDropdowns(); renderCharts(); - // Optional: show a small message to user - showStatus("Loaded default benchmark results.", "success"); }) - .catch(err => { - showStatus("Default results file not found. 
Please upload a CSV.", "warning"); + .catch(() => { + // file not found, don't crash }); }); csvInput.addEventListener('change', handleCSVUpload); - -dgmSelect.addEventListener('change', refreshControls); -ciTestSelect.addEventListener('change', refreshControls); -effectSizeSelect.addEventListener('change', refreshControls); -significanceSelect.addEventListener('change', refreshControls); +dgmSelect.addEventListener('change', onDGMChange); +sampleSizeSelect.addEventListener('change', renderCharts); function handleCSVUpload(event) { const file = event.target.files[0]; @@ -58,6 +96,7 @@ function parseCSV(csvText) { ) ); } + function parseValue(v) { if (!isNaN(parseFloat(v)) && isFinite(v)) return parseFloat(v); if (v === 'True') return true; @@ -68,131 +107,229 @@ function parseValue(v) { function buildDGMMap(data) { dgmMap = {}; data.forEach(row => { - const { dgm, ci_test, effect_size, significance_level } = row; - if (!dgmMap[dgm]) dgmMap[dgm] = {}; - if (!dgmMap[dgm][ci_test]) dgmMap[dgm][ci_test] = {}; - if (!dgmMap[dgm][ci_test][effect_size]) dgmMap[dgm][ci_test][effect_size] = {}; - dgmMap[dgm][ci_test][effect_size][significance_level] = true; + const { dgm, sample_size } = row; + if (!dgmMap[dgm]) dgmMap[dgm] = new Set(); + dgmMap[dgm].add(row.sample_size); + }); + Object.keys(dgmMap).forEach(dgm => { + dgmMap[dgm] = Array.from(dgmMap[dgm]).sort((a, b) => a - b); }); } +function setSelectOptions(select, list) { + select.innerHTML = ''; + for (const obj of list) { + const option = document.createElement('option'); + option.value = obj.value; + option.textContent = obj.label; + select.appendChild(option); + } +} + function populateDropdowns() { - setSelectOptions(dgmSelect, Object.keys(dgmMap)); + setSelectOptions(dgmSelect, Object.keys(dgmMap).map(d => ({ value: d, label: DGM_META[d]?.name || d }))); onDGMChange(); } function onDGMChange() { const dgm = dgmSelect.value; - const tests = dgm ? Object.keys(dgmMap[dgm]) : []; - setSelectOptions(ciTestSelect, tests); - onCITestChange(); -} -function onCITestChange() { - const dgm = dgmSelect.value, test = ciTestSelect.value; - const effects = dgm && test ? Object.keys(dgmMap[dgm][test]) : []; - setSelectOptions(effectSizeSelect, effects); - onEffectSizeChange(); -} -function onEffectSizeChange() { - const dgm = dgmSelect.value, test = ciTestSelect.value, effect = effectSizeSelect.value; - const sigs = dgm && test && effect ? Object.keys(dgmMap[dgm][test][effect]) : []; - setSelectOptions(significanceSelect, sigs); + const sizes = dgm ? 
dgmMap[dgm] : []; + setSelectOptions(sampleSizeSelect, sizes.map(s => ({ value: s, label: s }))); + updateDGMEquation(); + renderCharts(); } -dgmSelect.addEventListener('change', onDGMChange); -ciTestSelect.addEventListener('change', onCITestChange); -effectSizeSelect.addEventListener('change', onEffectSizeChange); - -function setSelectOptions(select, list) { - select.innerHTML = ''; - for (const v of list) { - const option = document.createElement('option'); - option.value = v; - option.textContent = v; - select.appendChild(option); - } +function updateDGMEquation() { + const dgm = dgmSelect.value; + dgmEquationEl.innerHTML = DGM_META[dgm]?.equation || ''; + if (window.MathJax) MathJax.typesetPromise([dgmEquationEl]); } -function refreshControls() { - renderCharts(); +function computeStats(data, groupBy, valueCol) { + const grouped = {}; + data.forEach(row => { + const key = groupBy(row); + if (!grouped[key]) grouped[key] = []; + grouped[key].push(row[valueCol]); + }); + + return Object.entries(grouped).map(([key, values]) => { + const mean = values.reduce((a, b) => a + b, 0) / values.length; + const variance = values.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / values.length; + const stderr = Math.sqrt(variance / values.length); + return { + key: parseFloat(key), + mean, + stderr, + upper: mean + stderr, + lower: Math.max(0, mean - stderr) + }; + }).sort((a, b) => a.key - b.key); } function renderCharts() { if (!allData.length) return; - const dgm = dgmSelect.value, test = ciTestSelect.value; - const effect = parseFloat(effectSizeSelect.value); - const significance = parseFloat(significanceSelect.value); - - const filtered = allData.filter(row => - row.dgm === dgm && row.ci_test === test - ); - - // Calibration Plot (Type I error vs Significance Level, effect_size == 0) - const calibData = filtered.filter(r => r.effect_size === 0); - const calibSLs = calibData.map(r => r.significance_level); - const calibT1 = calibData.map(r => r.type1_error); + const dgm = dgmSelect.value; + const sampleSize = Number(sampleSizeSelect.value); - // Power Plot (Power vs Sample Size, user-selected effect size & significance level) - const powerData = allData.filter(row => - row.dgm === dgm && - row.ci_test === test && - row.effect_size === effect && - row.significance_level === significance - ); - const sampleSizes = powerData.map(r => r.sample_size); - const powers = powerData.map(r => r.power); + // Calibration plot: Type I error vs significance_level, effect_size == 0 + const calibData = allData.filter(r => r.dgm === dgm && r.sample_size === sampleSize && r.effect_size === 0); + const ciTestsCalib = Array.from(new Set(calibData.map(r => r.ci_test))); + + const calibDatasets = ciTestsCalib.map((ciTest, idx) => { + const testData = calibData.filter(r => r.ci_test === ciTest); + const stats = computeStats(testData, row => row.significance_level, 'type1_error'); + + return [ + { + label: `${ciTest}`, + data: stats.map(s => ({ x: s.key, y: s.mean })), + borderColor: chartColor(idx, 1), + backgroundColor: chartColor(idx, 0.2), + pointRadius: 4, + fill: false, + tension: 0.2 + }, + // Added ribbon plot for standard error + { + label: `${ciTest} - Error Band`, + data: stats.map(s => ({ x: s.key, y: s.upper })), + borderColor: chartColor(idx, 0), + backgroundColor: chartColor(idx, 0.15), + fill: '+1', + pointRadius: 0, + tension: 0.2 + }, + { + label: `${ciTest} - Lower`, + data: stats.map(s => ({ x: s.key, y: s.lower })), + borderColor: chartColor(idx, 0), + backgroundColor: chartColor(idx, 0.15), + 
fill: false, + pointRadius: 0, + tension: 0.2 + } + ]; + }).flat(); - // Rendering Calibration Plot if (chartCalibration) chartCalibration.destroy(); chartCalibration = new Chart(document.getElementById('calibration-plot').getContext('2d'), { type: 'line', data: { - labels: calibSLs, - datasets: [{ - label: 'Type I Error', - data: calibT1, - borderColor: 'rgba(54, 162, 235, 1)', - backgroundColor: 'rgba(54, 162, 235, 0.2)', - pointRadius: 4, - fill: true, - }] + datasets: calibDatasets }, options: { responsive: true, - plugins: { - legend: { display: false } + maintainAspectRatio: true, + aspectRatio: 1.6, + plugins: { + legend: { + display: true, + filter: (legendItem) => !legendItem.text.includes('Band') && !legendItem.text.includes('Lower') + } }, scales: { - x: { title: { display: true, text: 'Significance Level' } }, - y: { title: { display: true, text: 'Type I Error' }, min: 0, max: 1 } + x: { + title: { display: true, text: 'Significance Level' }, + min: 0, + max: 1, + type: 'linear' + }, + y: { + title: { display: true, text: 'Type I Error' }, + min: 0, + max: 1 + } } } }); - // Rendering Power Plot + // Power plot: Power vs effect size, significance_level == 0.05 + const powerData = allData.filter(row => + row.dgm === dgm && + row.sample_size === sampleSize && + row.significance_level === 0.05 + ); + + const ciTestsPower = Array.from(new Set(powerData.map(r => r.ci_test))); + const powerDatasets = ciTestsPower.map((ciTest, idx) => { + const testData = powerData.filter(r => r.ci_test === ciTest); + const stats = computeStats(testData, row => row.effect_size, 'power'); + + return [ + { + label: `${ciTest}`, + data: stats.map(s => ({ x: s.key, y: s.mean })), + borderColor: chartColor(idx, 1), + backgroundColor: chartColor(idx, 0.2), + pointRadius: 3, + fill: false, + tension: 0.2 + }, + { + label: `${ciTest} - Error Band`, + data: stats.map(s => ({ x: s.key, y: s.upper })), + borderColor: chartColor(idx, 0), + backgroundColor: chartColor(idx, 0.15), + fill: '+1', + pointRadius: 0, + tension: 0.2 + }, + { + label: `${ciTest} - Lower`, + data: stats.map(s => ({ x: s.key, y: s.lower })), + borderColor: chartColor(idx, 0), + backgroundColor: chartColor(idx, 0.15), + fill: false, + pointRadius: 0, + tension: 0.2 + } + ]; + }).flat(); + if (chartPower) chartPower.destroy(); chartPower = new Chart(document.getElementById('power-plot').getContext('2d'), { type: 'line', data: { - labels: sampleSizes, - datasets: [{ - label: 'Power', - data: powers, - borderColor: 'rgba(255,99,132,1)', - backgroundColor: 'rgba(255,99,132,0.2)', - pointRadius: 4, - fill: true, - }] + datasets: powerDatasets }, options: { responsive: true, - plugins: { - legend: { display: false } + maintainAspectRatio: true, + aspectRatio: 1.6, + plugins: { + legend: { + display: true, + filter: (legendItem) => !legendItem.text.includes('Band') && !legendItem.text.includes('Lower') + } }, scales: { - x: { title: { display: true, text: 'Sample Size' } }, - y: { title: { display: true, text: 'Power' }, min: 0, max: 1 } + x: { + title: { display: true, text: 'Effect Size' }, + min: 0, + max: 1, + type: 'linear' + }, + y: { + title: { display: true, text: 'Power' }, + min: 0, + max: 1 + } } } }); +} + +function chartColor(idx, alpha = 1) { + const colors = [ + `rgba(54, 162, 235, ${alpha})`, // blue + `rgba(255,99,132,${alpha})`, // red + `rgba(255, 205, 86, ${alpha})`, // yellow + `rgba(75, 192, 192, ${alpha})`, // teal + `rgba(153, 102, 255, ${alpha})`, // purple + `rgba(201, 203, 207, ${alpha})`, // grey + `rgba(0, 200, 83, 
${alpha})`, // green + `rgba(255, 87, 34, ${alpha})`, // deep orange + ]; + return colors[idx % colors.length]; } \ No newline at end of file From 70ca190e09d0267ad2a3a1955e0dec6188524769 Mon Sep 17 00:00:00 2001 From: RudraCodesForU Date: Wed, 10 Sep 2025 17:41:09 +0530 Subject: [PATCH 2/2] Updated the ci_benchmark.py --- benchmarks/ci_benchmark.py | 172 +++++++++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 45 deletions(-) diff --git a/benchmarks/ci_benchmark.py b/benchmarks/ci_benchmark.py index d9ee660..748306c 100644 --- a/benchmarks/ci_benchmark.py +++ b/benchmarks/ci_benchmark.py @@ -47,7 +47,7 @@ def run_benchmark( effect_sizes=np.linspace(0, 1, 6), n_repeats=10, ): - + results = [] dgm_pbar = tqdm(dgms.items()) @@ -57,7 +57,6 @@ def run_benchmark( compatible_tests = dgm_to_citests[dgm_name] for n_cond_var in tqdm(n_cond_vars, desc="No. of conditional variables", leave=False): for n in tqdm(sample_sizes, desc="Sample Size", leave=False): - # Null case (conditionally independent, effect size = 0) for rep in range(n_repeats): df = dgm(n_samples=n, effect_size=0.0, @@ -70,16 +69,23 @@ def run_benchmark( for test_name in compatible_tests: ci_func = ci_tests[test_name] result = ci_func("X", "Y", z_cols, df, boolean=False) - # Robust extraction of p-value + if isinstance(result, tuple): - # Heuristic: p-value is usually last in tuple - if isinstance(result[-1], float): - p_val = result[-1] + if len(result) >= 2 and isinstance(result[-1], (float, np.floating)): + p_val = float(result[-1]) + elif isinstance(result[0], (float, np.floating)): + p_val = float(result[0]) else: - # fallback to first item - p_val = result[0] + float_vals = [x for x in result if isinstance(x, (float, np.floating))] + if float_vals: + p_val = float(float_vals[0]) + else: + raise ValueError(f"No valid float p-value found in result: {result}") else: - p_val = result + p_val = float(result) + + if not (0 <= p_val <= 1): + raise ValueError(f"Invalid p-value {p_val} for {test_name} - must be between 0 and 1") results.append( { @@ -90,10 +96,10 @@ def run_benchmark( "repeat": rep, "ci_test": test_name, "cond_independent": True, - "p_value": p_val, + "p_value": p_val, } ) - # Alternative case (conditionally dependent, effect size > 0) + for eff in effect_sizes: if eff == 0.0: continue @@ -108,14 +114,24 @@ def run_benchmark( for test_name in compatible_tests: ci_func = ci_tests[test_name] result = ci_func("X", "Y", z_cols, df, boolean=False) - # Robust extraction of p-value + if isinstance(result, tuple): - if isinstance(result[-1], float): - p_val = result[-1] + if len(result) >= 2 and isinstance(result[-1], (float, np.floating)): + p_val = float(result[-1]) + elif isinstance(result[0], (float, np.floating)): + p_val = float(result[0]) else: - p_val = result[0] + float_vals = [x for x in result if isinstance(x, (float, np.floating))] + if float_vals: + p_val = float(float_vals[0]) + else: + raise ValueError(f"No valid float p-value found in result: {result}") else: - p_val = result + p_val = float(result) + + if not (0 <= p_val <= 1): + raise ValueError(f"Invalid p-value {p_val} for {test_name} - must be between 0 and 1") + results.append( { @@ -126,7 +142,7 @@ def run_benchmark( "repeat": rep, "ci_test": test_name, "cond_independent": False, - "p_value": p_val, + "p_value": p_val, } ) @@ -134,24 +150,40 @@ def run_benchmark( def compute_summary(df_results, significance_levels=[0.001, 0.01, 0.05, 0.1]): - """ - Computes Type I/II errors and power at multiple significance levels using collected 
p-values. - """ + if df_results.empty: + raise ValueError("No benchmark results to compute summary from!") + + if 'p_value' not in df_results.columns: + raise ValueError("p_value column missing from results!") + summary_rows = [] group_cols = ["dgm", "sample_size", "n_cond_vars", "effect_size", "ci_test"] + for keys, group in df_results.groupby(group_cols): null_group = group[group["cond_independent"]] alt_group = group[~group["cond_independent"]] + for sl in significance_levels: + null_valid = null_group.dropna(subset=['p_value']) + alt_valid = alt_group.dropna(subset=['p_value']) + + null_loss_rate = 1 - (len(null_valid) / len(null_group)) if len(null_group) > 0 else 0 + alt_loss_rate = 1 - (len(alt_valid) / len(alt_group)) if len(alt_group) > 0 else 0 + + if null_loss_rate > 0.5: + print(f"WARNING: Lost {null_loss_rate:.1%} of null data for {keys} due to invalid p-values") + if alt_loss_rate > 0.5: + print(f"WARNING: Lost {alt_loss_rate:.1%} of alternative data for {keys} due to invalid p-values") + type1 = ( - (null_group["p_value"] < sl).mean() if not null_group.empty else np.nan + (null_valid["p_value"] < sl).mean() if len(null_valid) > 0 else np.nan ) type2 = ( - 1 - (alt_group["p_value"] < sl).mean() - if not alt_group.empty - else np.nan + 1 - (alt_valid["p_value"] < sl).mean() + if len(alt_valid) > 0 else np.nan ) power = 1 - type2 if not np.isnan(type2) else np.nan + summary_rows.append( dict( zip(group_cols, keys), @@ -159,11 +191,27 @@ def compute_summary(df_results, significance_levels=[0.001, 0.01, 0.05, 0.1]): type1_error=type1, type2_error=type2, power=power, - N_null=len(null_group), - N_alt=len(alt_group), + N_null=len(null_valid), + N_alt=len(alt_valid), + N_null_invalid=len(null_group) - len(null_valid), + N_alt_invalid=len(alt_group) - len(alt_valid), ) ) + df_summary = pd.DataFrame(summary_rows) + total_invalid = df_summary['N_null_invalid'].sum() + df_summary['N_alt_invalid'].sum() + total_tests = len(df_results) + invalid_rate = total_invalid / total_tests if total_tests > 0 else 0 + + print(f"Data Quality Report:") + print(f" Total tests run: {total_tests}") + print(f" Invalid p-values: {total_invalid} ({invalid_rate:.1%})") + + if invalid_rate > 0.3: + raise ValueError(f"Too many test failures: {invalid_rate:.1%} of tests produced invalid p-values") + elif invalid_rate > 0.1: + print(f" WARNING: High failure rate of {invalid_rate:.1%}") + return df_summary @@ -183,7 +231,7 @@ def plot_benchmarks(df_summary, plot_dir="plots"): fig, axes = plt.subplots( len(sample_sizes), len(n_cond_vars_list), - figsize=(4 * len(n_cond_vars_list), 2.5 * len(sample_sizes)), + figsize=(5 * len(n_cond_vars_list), 3 * len(sample_sizes)), sharex=True, sharey=True, ) @@ -202,13 +250,21 @@ def plot_benchmarks(df_summary, plot_dir="plots"): ] for method, color in zip(methods, palette): s = subset[subset["ci_test"] == method] - if not s.empty: + if not s.empty and not s["type2_error"].isna().all(): x_vals = np.log10(s["significance_level"]) y_vals = np.log10(s["type2_error"]) - sort_idx = np.argsort(x_vals) + valid_mask = ~(np.isnan(x_vals) | np.isnan(y_vals) | np.isinf(y_vals)) + if not valid_mask.any(): + print(f"WARNING: No valid data points for {method} in {dgm} plot") + continue + if valid_mask.sum() < len(x_vals) * 0.8: + lost_pct = (1 - valid_mask.sum() / len(x_vals)) * 100 + print(f"WARNING: Lost {lost_pct:.1f}% of data points for {method} due to invalid values") + + sort_idx = np.argsort(x_vals[valid_mask]) ax.plot( - x_vals.iloc[sort_idx], - y_vals.iloc[sort_idx], + 
x_vals[valid_mask].iloc[sort_idx], + y_vals[valid_mask].iloc[sort_idx], marker="o", linestyle="-", label=method, @@ -221,7 +277,17 @@ def plot_benchmarks(df_summary, plot_dir="plots"): if i == len(sample_sizes) - 1: ax.set_xlabel("log10 Significance Level") ax.grid(True, alpha=0.4) - handles, labels = axes[0, 0].get_legend_handles_labels() + ax.set_aspect('auto') + + handles, labels = [], [] + for ax_row in axes: + for ax in ax_row: + h, l = ax.get_legend_handles_labels() + for handle, label in zip(h, l): + if label not in labels: + handles.append(handle) + labels.append(label) + fig.legend( handles, labels, @@ -235,15 +301,18 @@ def plot_benchmarks(df_summary, plot_dir="plots"): ) plt.tight_layout(rect=[0, 0, 1, 0.97]) fname1 = f"{plot_dir}/{dgm}_effect{eff}_typeII_vs_signif.png" - plt.savefig(fname1, bbox_inches="tight") + plt.savefig(fname1, bbox_inches="tight", dpi=150) plt.close(fig) # ---Power vs Sample Size, all significance levels in plot --- for eff in sorted(df_dgm["effect_size"].unique()): + if eff == 0.0: + continue + fig, axes = plt.subplots( len(n_cond_vars_list), 1, - figsize=(7, 2.5 * len(n_cond_vars_list)), + figsize=(8, 3 * len(n_cond_vars_list)), sharex=True, sharey=True, ) @@ -259,12 +328,21 @@ def plot_benchmarks(df_summary, plot_dir="plots"): & (df_dgm["significance_level"] == sl) & (df_dgm["ci_test"] == method) ] - if not subset.empty: - sort_idx = np.argsort(subset["sample_size"]) + if not subset.empty and not subset["power"].isna().all(): + + valid_subset = subset.dropna(subset=['power']) + if len(valid_subset) == 0: + print(f"WARNING: No valid power data for {method} at significance level {sl}") + continue + if len(valid_subset) < len(subset) * 0.8: + lost_pct = (1 - len(valid_subset) / len(subset)) * 100 + print(f"WARNING: Lost {lost_pct:.1f}% of power data for {method} at sl={sl}") + + sort_idx = np.argsort(valid_subset["sample_size"]) linestyle = ["-", "--", "-.", ":"][idx % 4] ax.plot( - subset["sample_size"].iloc[sort_idx], - subset["power"].iloc[sort_idx], + valid_subset["sample_size"].iloc[sort_idx], + valid_subset["power"].iloc[sort_idx], marker="o", linestyle=linestyle, color=color, @@ -277,6 +355,8 @@ def plot_benchmarks(df_summary, plot_dir="plots"): if j == len(n_cond_vars_list) - 1: ax.set_xlabel("Sample Size") ax.grid(True, alpha=0.4) + ax.set_aspect('auto') + handles, labels = [], [] for ax in axes: handles_ax, labels_ax = ax.get_legend_handles_labels() @@ -296,31 +376,33 @@ def plot_benchmarks(df_summary, plot_dir="plots"): ) plt.tight_layout(rect=[0, 0, 1, 0.97]) fname2 = f"{plot_dir}/{dgm}_effect{eff}_power_vs_samplesize_allSL.png" - plt.savefig(fname2, bbox_inches="tight") + plt.savefig(fname2, bbox_inches="tight", dpi=150) plt.close(fig) if __name__ == "__main__": os.makedirs("results", exist_ok=True) + print("Starting benchmark execution...") df_results = run_benchmark() + print(f"Benchmark completed. 
Generated {len(df_results)} test results.") df_results.to_csv("results/ci_benchmark_raw_result.csv", index=False) df_summary = compute_summary(df_results) + print(f"Summary computed with {len(df_summary)} summary rows.") df_summary.to_csv("results/ci_benchmark_summaries.csv", index=False) - print(df_summary) + print(df_summary.head()) print( "\nDetailed results and summary saved to ci_benchmark_raw_result.csv and ci_benchmark_summaries.csv" ) - raw_csv_path = "results/ci_benchmark_raw_result.csv" - if os.path.exists(raw_csv_path): - os.remove(raw_csv_path) + print("Generating plots...") plot_benchmarks(df_summary) + print("Plots generated successfully.") - ## Making a copy of the result to the web directory + # Making a copy of the result to the web directory web_results_dir = os.path.join("web", "results") os.makedirs(web_results_dir, exist_ok=True) src = os.path.join("results", "ci_benchmark_summaries.csv") - dst = os.path.join(web_results_dir, "default_ci_benchmark_summaries.csv") + dst = os.path.join(web_results_dir, "ci_benchmark_summaries.csv") shutil.copyfile(src, dst) print(f"Copied summary CSV to {dst} for web UI.")
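
Follow-up note (not part of the patch itself): the p-value extraction block added in run_benchmark is duplicated verbatim in the null and alternative branches. A minimal sketch of how it could be factored into a single helper, using exactly the heuristics the patch already applies (tuple results searched for a float, last element preferred, values outside [0, 1] rejected); the helper name is illustrative only:

    import numpy as np

    def _extract_p_value(result, test_name=""):
        """Return a p-value from a CI-test result, mirroring the patch's heuristics."""
        if isinstance(result, tuple):
            # Prefer the last element when it is a float (the common (stat, p_value) shape).
            if len(result) >= 2 and isinstance(result[-1], (float, np.floating)):
                p_val = float(result[-1])
            elif isinstance(result[0], (float, np.floating)):
                p_val = float(result[0])
            else:
                # Fall back to the first float found anywhere in the tuple.
                float_vals = [x for x in result if isinstance(x, (float, np.floating))]
                if not float_vals:
                    raise ValueError(f"No valid float p-value found in result: {result}")
                p_val = float(float_vals[0])
        else:
            p_val = float(result)
        if not (0 <= p_val <= 1):
            raise ValueError(f"Invalid p-value {p_val} for {test_name} - must be between 0 and 1")
        return p_val

Both call sites would then reduce to p_val = _extract_p_value(ci_func("X", "Y", z_cols, df, boolean=False), test_name), keeping the validation behaviour identical in the null and alternative cases.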