Merged
6 changes: 3 additions & 3 deletions .github/workflows/data-analysis.yml
@@ -6,7 +6,7 @@ on:
- main
types: [opened, synchronize, reopened, ready_for_review]
paths:
- '**/combined_results.csv'
- '**/collected_results.csv'
workflow_dispatch:

permissions:
@@ -33,10 +33,10 @@ jobs:
pip install pandas tabulate matplotlib numpy

- name: Run validation script
run: python training/scripts/workflows/validate_results.py
run: python data/workflows/validate_results.py

- name: Generate summary report
run: python training/scripts/workflows/results_summary.py
run: python data/workflows/results_summary.py

- name: Determine target branch
id: branch
2 changes: 1 addition & 1 deletion .github/workflows/json-backup.yml
@@ -30,7 +30,7 @@ jobs:
pip install pandas

- name: Run JSON script
run: python training/scripts/workflows/backup_data.py
run: python data/workflows/backup_data.py

- name: Commit & push updates
uses: actions-js/push@master
12 changes: 6 additions & 6 deletions .github/workflows/sync-upstream.yml
@@ -19,13 +19,13 @@ jobs:
with:
fetch-depth: 0

- name: Verify combined_results.csv exists
- name: Verify collected_results.csv exists
run: |
if [ ! -f "data/combined_results.csv" ]; then
echo "No combined_results.csv found. Exiting."
if [ ! -f "data/collected_results.csv" ]; then
echo "No collected_results.csv found. Exiting."
exit 1
fi
echo "Found data/combined_results.csv"
echo "Found data/collected_results.csv"

- name: Clone the shared dataset fork
run: |
@@ -37,8 +37,8 @@

- name: Copy updated results to Group 6
run: |
cp "data/combined_results.csv" "shared_repo/Group 6/combined_results.csv"
echo "Copied combined_results.csv to Group 6 folder."
cp "data/collected_results.csv" "shared_repo/Group 6/combined_results.csv"
echo "Copied collected_results.csv to Group 6 folder as combined_results.csv."

- name: Commit and push updated results to fork
run: |
2 changes: 1 addition & 1 deletion .github/workflows/threshold-update.yml
@@ -27,7 +27,7 @@ jobs:
pip install pandas tabulate matplotlib numpy

- name: Generate thresholds
run: python training/scripts/workflows/compute_threshold.py
run: python data/workflows/compute_threshold.py

- name: Commit & push all updates
uses: actions-js/push@master
9 changes: 4 additions & 5 deletions data/CONTRIBUTING.md
@@ -2,8 +2,7 @@

All training runs automatically append a new entry to:

data/combined_results.csv &
data/combined_results.json
data/collected_results.csv

This file is shared by the whole team — so follow this workflow to keep everyone’s results organized and avoid data loss.

@@ -24,7 +23,7 @@ The script automatically adds your new results to the files above.
Commit and push your updates:

```bash
git add data/combined_results.csv data/combined_results.json
git add data/collected_results.csv
git commit -m "Add results for Run [YourRunID] ([Environment/ConfigName])"
git push
```
@@ -33,12 +32,12 @@ git push

If Git reports something like:
```
CONFLICT (content): Merge conflict in data/combined_results.csv
CONFLICT (content): Merge conflict in data/collected_results.csv
```

Open the file, keep both lines (each is a valid run), then:
```bash
git add data/combined_results.csv
git add data/collected_results.csv
git commit -m "Resolve CSV merge conflict"
git push
```
7,826 changes: 3,913 additions & 3,913 deletions data/combined_results.csv → data/collected_results.csv

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
Binary file modified data/plots/Average_Ram_vs_Average_Cpu.png
Binary file modified data/plots/Reward_Mean_vs_Steps.png
Binary file modified data/plots/Reward_Mean_vs_Total_Time.png
File renamed without changes.
11 changes: 11 additions & 0 deletions data/reports/validation_report.md
@@ -0,0 +1,11 @@
# Data Validation Report

Generated on 2026-01-14 16:42:39

⚠️ **Issues found:**
- Duplicate values in 'run_id': 702

## Dataset Overview
- Rows: 3913
- Columns: 34
- Missing values: 0
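The "Duplicate values in 'run_id'" warning above can be reproduced with a small pandas check. The data here is a hypothetical two-column miniature of `collected_results.csv`, and the check is only a sketch of what the validation script likely does — its actual logic is not shown in this diff:

```python
import pandas as pd

# Hypothetical miniature of collected_results.csv: run_id plus one metric.
df = pd.DataFrame({
    "run_id": ["r1", "r2", "r2", "r3"],
    "reward_mean": [0.5, 0.7, 0.7, 0.9],
})

# Count rows whose run_id repeats an earlier one -- the kind of check
# that would emit the duplicate-ID warning in the report above.
dup_count = int(df["run_id"].duplicated().sum())
```

With 702 duplicated IDs out of 3913 rows, it may be worth deciding whether `run_id` is meant to be unique per run or per (run, environment) pair before treating this as an error.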
File renamed without changes.
2 changes: 1 addition & 1 deletion data/thresholds/thresholds_2025-11-21_09-33.json
@@ -1,7 +1,7 @@
{
"version": "2025-11-21_09-33",
"generated_at": "2025-11-21T09:33:28",
"source_file": "/home/runner/work/unity-ml-drl-data/unity-ml-drl-data/data/combined_results.csv",
"source_file": "/home/runner/work/unity-ml-drl-data/unity-ml-drl-data/data/collected_results.csv",
"method": "empirical_reference",
"alpha": 0.8,
"window_last_steps": 50000,
10 changes: 0 additions & 10 deletions data/validation_report.md

This file was deleted.

1 change: 1 addition & 0 deletions data/workflows/.gitignore
@@ -0,0 +1 @@
__pycache__/
@@ -4,7 +4,7 @@
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(SCRIPT_DIR, '..', '..'))
data_dir = os.path.join(project_root, 'data')
csv_path = os.path.join(data_dir, 'combined_results.csv')
csv_path = os.path.join(data_dir, 'collected_results.csv')

df = pd.read_csv(csv_path)
thresholds = {
File renamed without changes.
16 changes: 16 additions & 0 deletions data/workflows/backup_data.py
@@ -0,0 +1,16 @@
import os
import pandas as pd
import json

from paths import CSV_FILE, JSON_FILE

df = pd.read_csv(CSV_FILE, dtype=str)
df = df.fillna("")
df = df.astype(str)
data = df.to_dict(orient="records")

with open(JSON_FILE, "w") as jf:
    json.dump(data, jf, indent=4, ensure_ascii=False)

print(f"Converted '{CSV_FILE}' to '{JSON_FILE}' successfully.")
print(f"Total records: {len(data)}")
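The conversion above can be exercised on an in-memory miniature. This sketch uses hypothetical two-row data but the same `dtype=str` / `fillna("")` / `astype(str)` chain as `backup_data.py`, which keeps every field a string and turns missing cells into empty strings rather than `NaN` in the JSON:

```python
import json
from io import StringIO

import pandas as pd

# Hypothetical two-row CSV standing in for collected_results.csv;
# the second row has a missing reward_mean.
csv_text = "run_id,reward_mean\nr1,0.5\nr2,\n"

# Same normalization chain as backup_data.py.
df = pd.read_csv(StringIO(csv_text), dtype=str).fillna("").astype(str)
records = df.to_dict(orient="records")

# Serialized exactly as the script writes JSON_FILE.
payload = json.dumps(records, indent=4, ensure_ascii=False)
```

Reading with `dtype=str` is the key design choice: it prevents pandas from coercing run IDs like `007` to integers or floats on the way into the backup.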
@@ -3,13 +3,11 @@
"""

import json
from pathlib import Path
import numpy as np
import pandas as pd
from datetime import datetime

DATA_FILE = Path(__file__).resolve().parent.parent.parent.parent / "data" / "combined_results.csv"
THRESHOLDS_DIR = Path(__file__).resolve().parent.parent.parent.parent / "data" / "thresholds"
from paths import CSV_FILE, THRESHOLD_DIR

# Default Parameters
TAIL_STEPS_DEFAULT = 50_000
@@ -34,11 +32,11 @@ def find_first_step_reaching_threshold(steps, rewards, threshold):
return None

def main():
if not DATA_FILE.exists():
print(f"Error: Data file not found: {DATA_FILE}")
if not CSV_FILE.exists():
print(f"Error: Data file not found: {CSV_FILE}")
return

df = pd.read_csv(DATA_FILE)
df = pd.read_csv(CSV_FILE)

if not {'environment', 'steps', 'reward_mean'}.issubset(df.columns):
print("Error: CSV missing required columns 'environment', 'steps', or 'reward_mean'")
@@ -83,19 +81,19 @@ def main():
payload = {
"version": version_ts,
"generated_at": datetime.now().isoformat(timespec="seconds"),
"source_file": str(DATA_FILE.resolve()),
"source_file": str(CSV_FILE.resolve()),
"method": "empirical_reference",
"alpha": ALPHA_DEFAULT,
"window_last_steps": TAIL_STEPS_DEFAULT,
"thresholds": thresholds,
}

# Ensure output directory exists
THRESHOLDS_DIR.mkdir(parents=True, exist_ok=True)
THRESHOLD_DIR.mkdir(parents=True, exist_ok=True)

# Write timestamped and "latest" files
timestamped_path = THRESHOLDS_DIR / f"thresholds_{version_ts}.json"
latest_path = THRESHOLDS_DIR / "latest_thresholds.json"
timestamped_path = THRESHOLD_DIR / f"thresholds_{version_ts}.json"
latest_path = THRESHOLD_DIR / "latest_thresholds.json"

with open(timestamped_path, "w") as f:
json.dump(payload, f, indent=2)
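The helper named in the hunk header above, `find_first_step_reaching_threshold(steps, rewards, threshold)`, has its body elided from this diff. A plausible minimal reconstruction under that signature — my sketch, not the repo's code — would scan the run in step order and return the first step whose reward clears the threshold:

```python
def find_first_step_reaching_threshold(steps, rewards, threshold):
    """Return the first step whose reward meets the threshold, else None."""
    for step, reward in zip(steps, rewards):
        if reward >= threshold:
            return step
    return None

# Hypothetical run: reward first clears 0.5 at step 2000.
first = find_first_step_reaching_threshold(
    [1000, 2000, 3000], [0.1, 0.6, 0.9], threshold=0.5
)
```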
@@ -1,11 +1,8 @@
from pathlib import Path
import pandas as pd
from paths import CSV_FILE, SNAPSHOTS_DIR

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[2]
DATA_DIR = PROJECT_ROOT / "data"
INPUT_CSV = DATA_DIR / "combined_results.csv"
OUTPUT_CSV = DATA_DIR / "prediction_snapshot.csv"
OUTPUT_CSV = SNAPSHOTS_DIR / "prediction_snapshot.csv"

threshold_cols = [
"threshold_value",
@@ -33,4 +30,4 @@ def filter_runs_with_threshold(input_path: Path, output_path: Path) -> None:
df_filtered.to_csv(output_path, index=False)

if __name__ == "__main__":
filter_runs_with_threshold(INPUT_CSV, OUTPUT_CSV)
filter_runs_with_threshold(CSV_FILE, OUTPUT_CSV)
9 changes: 9 additions & 0 deletions data/workflows/paths.py
@@ -0,0 +1,9 @@
from pathlib import Path

DATA_DIR = Path(__file__).resolve().parent.parent.parent / "data"
CSV_FILE = DATA_DIR / "collected_results.csv"
JSON_FILE = DATA_DIR / "collected_results.json"
THRESHOLD_DIR = DATA_DIR / "thresholds"
SNAPSHOTS_DIR = DATA_DIR / "snapshots"
REPORTS_DIR = DATA_DIR / "reports"
PLOTS_DIR = DATA_DIR / "plots"
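One subtlety in the new `paths.py`: `DATA_DIR` is derived from the module's own location — three `.parent` hops climb from `data/workflows/paths.py` to the repo root, then descend into `data/`. A minimal sketch of that resolution using a hypothetical checkout path:

```python
from pathlib import Path

# Hypothetical checkout location mirroring <repo>/data/workflows/paths.py.
script = Path("/repo/data/workflows/paths.py")

# parent -> /repo/data/workflows, parent.parent -> /repo/data,
# parent.parent.parent -> /repo, then / "data" -> /repo/data.
data_dir = script.parent.parent.parent / "data"
```

Because every script in `data/workflows/` sits at the same depth, importing these constants keeps all of them pointing at the same files no matter which working directory the GitHub Actions runner invokes them from.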
@@ -1,18 +1,19 @@
"""
Generates a Markdown summary report for training runs.

Reads data/combined_results.csv, calculates stats (mean, std),
Reads data/collected_results.csv, calculates stats (mean, std),
creates plots, finds insights/anomalies, and writes the report
to data/reports/summary_report.md.

Dependency for Github Actions workflows on repo.
Dependency for GitHub Actions workflows on repo.
"""

import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
from datetime import datetime
from paths import REPORTS_DIR, CSV_FILE, PLOTS_DIR

# CSV Header Definition
CSV_HEADERS = [
@@ -42,13 +43,7 @@ def log(message):
print(f"[{timestamp}] {message}")

log("=== Starting Summary Generation ===")

# Configuration and Paths
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(SCRIPT_DIR, '..', '..', '..'))
data_dir = os.path.join(project_root, 'data')
csv_path = os.path.join(data_dir, 'combined_results.csv')
summary_md = os.path.join(data_dir, 'summary_report.md')
SUMMARY_MD = os.path.join(REPORTS_DIR, 'summary_report.md')

# Column name configuration (must match CSV headers)
TIMESTAMP_COL = None
@@ -74,15 +69,14 @@ def log(message):

KEY_INSIGHT_COLS = ["reward_mean", "p_loss_mean", "average_cpu", "total_time"]

log(f"Project root: {project_root}")
log(f"Input CSV: {csv_path}")
log(f"Output Markdown: {summary_md}")
log(f"Input CSV: {CSV_FILE}")
log(f"Output Markdown: {SUMMARY_MD}")

# Load Data
df = pd.DataFrame()
if os.path.exists(csv_path):
if os.path.exists(CSV_FILE):
try:
df = pd.read_csv(csv_path)
df = pd.read_csv(CSV_FILE)
log("Successfully loaded data from CSV")

# Convert numeric columns, coercing errors to NaN
@@ -92,7 +86,7 @@
elif col not in df.columns:
log(f"[WARNING] Configured numeric column '{col}' not found in CSV.")

# Sort by timestamp, if it exists
# Sort by timestamp if it exists
if TIMESTAMP_COL and TIMESTAMP_COL in df.columns:
df[TIMESTAMP_COL] = pd.to_datetime(df[TIMESTAMP_COL], errors='coerce')
df = df.dropna(subset=[TIMESTAMP_COL])
@@ -105,7 +99,7 @@ def log(message):
except Exception as e:
log(f"[ERROR] Failed during CSV loading: {e}")
else:
log(f"[ERROR] CSV file not found at: {csv_path}.")
log(f"[ERROR] CSV file not found at: {CSV_FILE}.")

# Compute Statistics
summary_stats = {}
@@ -135,7 +129,7 @@ def log(message):

# === Combined plot generation: saves to disk AND embeds in markdown ===
def create_chart_base64(df_plot, x_col, y_col, title):
"""Creates scatter plot, saves to file, and returns Base64 Markdown image."""
"""Creates a scatter plot, saves to file, and returns Base64 Markdown image."""
# Validate input
if df_plot is None or df_plot.empty:
return None
@@ -158,11 +152,10 @@ def create_chart_base64(df_plot, x_col, y_col, title):
plt.grid(True, linestyle="--", alpha=0.5)
plt.tight_layout()

# Ensure plots directory exists (cross-platform)
plots_dir = os.path.join(project_root, "data", "plots")
os.makedirs(plots_dir, exist_ok=True)
# Ensure the plots directory exists (cross-platform)
os.makedirs(PLOTS_DIR, exist_ok=True)
safe_title = title.replace(" ", "_").replace("/", "_")
file_path = os.path.join(plots_dir, f"{safe_title}.png")
file_path = os.path.join(PLOTS_DIR, f"{safe_title}.png")

# Save plot directly to disk
plt.savefig(file_path, dpi=90)
@@ -227,7 +220,7 @@ def generate_insights(df, summary_stats):
if not anomalies_found:
insights.append("No significant anomalies detected (all values within 3 standard deviations).")

# Trend: Compare recent performance to overall average
# Trend: Compare recent performance to the overall average
if reward_col in averages:
recent_avg = df.tail(5)[reward_col].mean()
overall_avg = averages[reward_col]
@@ -302,8 +295,8 @@ def generate_insights(df, summary_stats):
report_parts.append(f"- {insight}")

# Write Markdown
os.makedirs(os.path.dirname(summary_md), exist_ok=True)
with open(summary_md, "w", encoding="utf-8") as f:
os.makedirs(os.path.dirname(SUMMARY_MD), exist_ok=True)
with open(SUMMARY_MD, "w", encoding="utf-8") as f:
f.write("\n".join(report_parts))
log("Successfully wrote summary report.")
except Exception as e: