diff --git a/docker/docker-compose.rocm70_9-1.yaml b/docker/docker-compose.rocm70_9-1.yaml
index 04946c6..6179857 100644
--- a/docker/docker-compose.rocm70_9-1.yaml
+++ b/docker/docker-compose.rocm70_9-1.yaml
@@ -3,7 +3,7 @@ services:
container_name: training-overlap-bugs-rocm70
build:
context: .
- dockerfile: Dockerfile.rocm70
+ dockerfile: Dockerfile.rocm70_9-1
user: root
privileged: true
network_mode: host
@@ -15,8 +15,6 @@ services:
security_opt:
- seccomp=unconfined
environment:
- - RCCL_FOLDER=/rccl
- - LD_LIBRARY_PATH=/rccl/build/release:$LD_LIBRARY_PATH
- TORCH_NCCL_HIGH_PRIORITY=1
volumes:
diff --git a/scripts/tracelens_single_config/README.md b/scripts/tracelens_single_config/README.md
new file mode 100644
index 0000000..2c7ca54
--- /dev/null
+++ b/scripts/tracelens_single_config/README.md
@@ -0,0 +1,166 @@
+# RCCL Warp Speed Performance Testing
+
+Test the RCCL `warp_speed_v1` branch from https://github.com/mustafabar/rccl.git and compare GPU/NCCL performance across CU/thread configurations.
+
+## Prerequisites
+
+```bash
+pip install pandas openpyxl matplotlib seaborn numpy
+```
+
+## Run Tests
+
+### Step 1: Start Container and Build RCCL
+
+```bash
+cd docker
+docker-compose -f docker-compose.rocm70_9-1.yaml build
+docker-compose -f docker-compose.rocm70_9-1.yaml up -d
+docker-compose -f docker-compose.rocm70_9-1.yaml exec torchenv-rocm70 bash
+
+# Inside container - build warp_speed_v1 (always rebuild)
+# Note: Set --amdgpu_targets to match your GPU architecture
+# Run 'rocminfo | grep gfx' to find your GPU target (e.g., gfx942, gfx950)
+cd /opt
+if [ -d "rccl" ]; then
+ cd rccl
+ git checkout warp_speed_v1
+ git pull
+else
+ git clone --recursive https://github.com/mustafabar/rccl.git
+ cd rccl
+ git checkout warp_speed_v1
+fi
+./install.sh -l --amdgpu_targets=gfx950
+
+cd /workspace/aorta
+```
+
+### Step 2: Run RCCL Tests
+
+```bash
+# Default 3 configurations
+./scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh
+
+# Custom configurations (CU_count,threads pairs)
+./scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh -p "56,256 37,384 32,512" -c ./config/single_node/gemm_overlap_comm.yaml
+```
+
+Output structure:
+```
+experiments/
+ rccl_warp_speed_YYYYMMDD_HHMMSS/
+ 56cu_256threads/
+ torch_profiler/ # Raw profiler traces
+ run_output.log # Training output log
+ 37cu_384threads/
+ 32cu_512threads/
+ rccl_warp_speed_summary_YYYYMMDD_HHMMSS.txt
+```
+
+### Step 3: Generate Reports (Outside Container)
+
+```bash
+# Exit container
+exit
+
+# Run complete analysis
+python scripts/tracelens_single_config/run_full_analysis.py \
+    --baseline experiments/rccl_warp_speed_YYYYMMDD_HHMMSS/56cu_256threads \
+    --test experiments/rccl_warp_speed_YYYYMMDD_HHMMSS/37cu_384threads \
+ --output comparison_results \
+ --all
+
+# Or skip TraceLens if already done
+python scripts/tracelens_single_config/run_full_analysis.py \
+    --baseline experiments/rccl_warp_speed_YYYYMMDD_HHMMSS/56cu_256threads \
+    --test experiments/rccl_warp_speed_YYYYMMDD_HHMMSS/37cu_384threads \
+ --output comparison_results \
+ --all --skip-tracelens
+```
+
+## Generated Excel Reports
+
+### Individual TraceLens Reports (per configuration)
+Each configuration generates:
+- `tracelens_analysis/individual_reports/perf_rank*.xlsx` - Per-rank performance breakdown
+- `tracelens_analysis/collective_reports/collective_all_ranks.xlsx` - Collective operations summary
+- `tracelens_analysis/gpu_timeline_summary_mean.xlsx` - GPU timeline averages
+
+### Final Analysis Report (`final_analysis_report.xlsx`)
+
+Contains multiple sheets:
+
+**Summary Sheets:**
+- `Summary_Dashboard` - High-level comparison metrics with percentage changes
+- `Summary_Comparison` - Side-by-side summary comparison
+- `GPU_ByRank_Comparison` - Detailed per-rank performance comparison
+- `Comparison_By_Rank` - Rank-wise metric comparison with differences
+
+**GPU Timeline Sheets:**
+- `All_Ranks_Combined` - Combined GPU timeline data from all ranks
+- `Summary` - Aggregated GPU timeline summary
+- `Rank_*` - Individual rank GPU timelines
+
+**Collective/NCCL Sheets:**
+- `nccl_summary_implicit_sync` - NCCL operations with implicit synchronization
+- `nccl_summary_long` - Long-running NCCL operations
+- `nccl_summary_implicit_sync_comparison` - Comparison of implicit sync operations
+- `nccl_summary_long_comparison` - Comparison of long operations
+
+**Raw Data Sheets (hidden by default):**
+- `gpu_timeline_combined` - Raw combined GPU timeline data
+- `gpu_timeline_comparison` - Raw GPU timeline comparison data
+- `collective_combined` - Raw collective operations data
+- `collective_comparison` - Raw collective comparison data
+
+### Comparison Reports
+
+- `gpu_timeline_combined.xlsx` - Baseline and test GPU metrics combined
+- `gpu_timeline_comparison.xlsx` - GPU metrics with comparison analysis
+- `collective_combined.xlsx` - Baseline and test collective operations combined
+- `collective_comparison.xlsx` - Collective operations with comparison analysis
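+
+The reports are regular `.xlsx` workbooks, so they can also be inspected programmatically. A minimal sketch with pandas (it assumes `run_full_analysis.py` places `final_analysis_report.xlsx` directly under the `--output` directory used in Step 3):
+
+```python
+import pandas as pd
+
+report = "comparison_results/final_analysis_report.xlsx"
+
+# High-level comparison table (sheet names as listed above)
+dashboard = pd.read_excel(report, sheet_name="Summary_Dashboard")
+print(dashboard)
+
+# Hidden raw-data sheets remain readable; list everything in the workbook
+print(pd.ExcelFile(report).sheet_names)
+```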
+
+## Generated Visualizations
+
+### HTML Report
+- `performance_analysis_report.html` - Complete report with all embedded plots
+
+### Individual Plot Files (12 Total)
+1. `plot1_percentage_change_overview.png` - Horizontal bar chart showing performance changes
+2. `plot2_absolute_time_comparison.png` - Bar chart comparing absolute times
+3. `plot3_performance_heatmap.png` - Heatmap of performance by rank
+4. `plot4_total_execution_time.png` - Line plot of total execution time per rank
+5. `plot5_computation_time.png` - Line plot of computation time across ranks
+6. `plot6_communication_time.png` - Line plot of communication time across ranks
+7. `plot7_idle_time.png` - Line plot of idle time across ranks
+8. `plot8_percentage_difference_all_metrics.png` - Bar plot showing percentage differences for all metrics
+9. `plot9_nccl_latency.png` - Bar chart of communication latency vs message size
+10. `plot10_algorithm_bandwidth.png` - Bar chart of algorithm bandwidth vs message size
+11. `plot11_bus_bandwidth.png` - Bar chart of bus bandwidth vs message size
+12. `plot12_nccl_summary.png` - Combined percentage summary and total latency
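+
+The plots can be regenerated on their own from an existing final report, without rerunning TraceLens or the comparison steps (the paths below assume the Step 3 output directory):
+
+```bash
+python scripts/tracelens_single_config/generate_enhanced_plots.py \
+    --input comparison_results/final_analysis_report.xlsx \
+    --output comparison_results/plots
+```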
+
+## Key Metrics Analyzed
+
+**GPU Metrics:**
+- `computation_time` - Time spent in computation
+- `total_comm_time` - Total communication time
+- `exposed_comm_time` - Non-overlapped communication time
+- `idle_time` - GPU idle time
+- `total_memcpy_time` - Memory copy time
+- `exposed_memcpy_time` - Non-overlapped memory copy time
+- `busy_time` - Total GPU busy time
+- `total_time` - Total execution time
+
+**NCCL Metrics:**
+- `comm_latency_mean` - Average communication latency
+- `algo bw (GB/s)_mean` - Algorithm bandwidth
+- `bus bw (GB/s)_mean` - Bus bandwidth
+- `Total comm latency (ms)` - Total communication latency
+- `count` - Number of operations
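+
+The comparison scripts flip the sign convention per metric so that "positive = better" holds everywhere: for latency/time a reduction is reported as a positive `percent_change`, while for bandwidth an increase is positive. A small illustrative calculation (the values are made up):
+
+```python
+base_latency, test_latency = 2.0, 1.6   # ms, lower is better
+base_bw, test_bw = 90.0, 99.0           # GB/s, higher is better
+
+# Time-like metrics: positive when the test run takes less time
+pct_change_latency = (base_latency - test_latency) / base_latency * 100   # +20.0
+
+# Bandwidth-like metrics: positive when the test run moves more data per second
+pct_change_bw = (test_bw - base_bw) / base_bw * 100                       # +10.0
+```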
+
+## Convert to PDF
+
+1. Open `performance_analysis_report.html` in browser
+2. Print to PDF (Ctrl+P or Cmd+P)
+3. Choose landscape orientation for better plot visibility
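+
+If no browser UI is available (e.g., on a remote node), a headless Chrome/Chromium install can usually perform the conversion from the command line (the binary name varies by system):
+
+```bash
+chromium --headless --print-to-pdf=performance_analysis_report.pdf \
+    performance_analysis_report.html
+```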
diff --git a/scripts/tracelens_single_config/add_collective_comparison.py b/scripts/tracelens_single_config/add_collective_comparison.py
new file mode 100644
index 0000000..6f3f310
--- /dev/null
+++ b/scripts/tracelens_single_config/add_collective_comparison.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
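+"""
+Add baseline-vs-test comparison sheets to a combined collective/NCCL report.
+
+Keeps the nccl summary sheets from the combined workbook, computes per-collective
+diffs, percent changes and ratios, and writes color-coded *_cmp sheets.
+Example invocation (paths and labels are illustrative):
+
+    python add_collective_comparison.py \
+        --input collective_combined.xlsx \
+        --output collective_comparison.xlsx \
+        --baseline-label 56cu_256threads --test-label 37cu_384threads
+"""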
+import pandas as pd
+import argparse
+from openpyxl.utils import get_column_letter
+from openpyxl.formatting.rule import ColorScaleRule
+
+
+def add_collective_comparison_sheets(input_path, output_path, baseline_label='baseline', test_label='test'):
+ print(f"Loading: {input_path}")
+ print(f" Baseline label: {baseline_label}")
+ print(f" Test label: {test_label}")
+
+ xl = pd.ExcelFile(input_path)
+
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+ # Copy only summary sheets
+ for sheet_name in xl.sheet_names:
+ # Only keep sheets with 'summary' in the name
+ if 'summary' not in sheet_name.lower():
+ print(f" Skip {sheet_name} (keeping only summary sheets)")
+ continue
+ df = pd.read_excel(input_path, sheet_name=sheet_name)
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
+ print(f" Copied {sheet_name}")
+
+ # Process summary sheets for comparison
+ for sheet_name in ['nccl_summary_implicit_sync', 'nccl_summary_long']:
+ if sheet_name not in xl.sheet_names:
+ continue
+
+ df = pd.read_excel(input_path, sheet_name=sheet_name)
+
+ # Get actual source values from the dataframe
+ sources = df['source'].unique()
+ # Determine which is baseline and which is test (baseline should be first)
+ if len(sources) >= 2:
+ actual_baseline = sources[0]
+ actual_test = sources[1]
+ else:
+ actual_baseline = baseline_label
+ actual_test = test_label
+
+ # Separate baseline and test
+ baseline_df = df[df['source'] == actual_baseline].copy()
+ test_df = df[df['source'] == actual_test].copy()
+
+ if len(baseline_df) == 0 or len(test_df) == 0:
+ print(f" Skip {sheet_name} - missing data")
+ continue
+
+ # Create comparison dataframe
+ comparison = pd.DataFrame()
+
+ # Identify key columns for grouping
+ group_cols = ['Collective name', 'dtype', 'In msg nelems']
+ if not all(col in baseline_df.columns for col in group_cols):
+ group_cols = ['Collective name']
+
+ # Group and compare
+ baseline_grouped = baseline_df.groupby(group_cols, as_index=False)
+ test_grouped = test_df.groupby(group_cols, as_index=False)
+
+ for name, base_group in baseline_grouped:
+ # Find matching test group
+ if isinstance(name, tuple):
+ mask = pd.Series([True] * len(test_df), index=test_df.index)
+ for col, val in zip(group_cols, name):
+ mask = mask & (test_df[col] == val)
+ else:
+ mask = (test_df[group_cols[0]] == name)
+
+ test_group = test_df.loc[mask]
+
+ if len(test_group) == 0:
+ continue
+
+ # Create comparison row
+ comp_row = {}
+
+ # Copy grouping columns
+ if isinstance(name, tuple):
+ for col, val in zip(group_cols, name):
+ comp_row[col] = val
+ else:
+ comp_row[group_cols[0]] = name
+
+ # Compare numeric columns
+ numeric_cols = ['comm_latency_mean', 'algo bw (GB/s)_mean', 'bus bw (GB/s)_mean',
+ 'Total comm latency (ms)', 'count']
+
+ for col in numeric_cols:
+ if col not in base_group.columns or col not in test_group.columns:
+ continue
+
+ base_val = base_group[col].values[0]
+ test_val = test_group[col].values[0]
+
+ comp_row[f'{baseline_label}_{col}'] = base_val
+ comp_row[f'{test_label}_{col}'] = test_val
+ comp_row[f'diff_{col}'] = test_val - base_val
+
+ # For latency/time: positive percent_change means faster (less time)
+ # For bandwidth: positive percent_change means better (more bandwidth)
+ if 'latency' in col.lower() or 'time' in col.lower():
+                    # Lower is better - positive when the test run is faster
+ pct_change = (base_val - test_val) / base_val * 100 if base_val != 0 else 0
+ comp_row[f'percent_change_{col}'] = pct_change
+ elif 'bw' in col.lower() or 'bandwidth' in col.lower():
+                    # Higher is better - positive when the test run has more bandwidth
+ pct_change = (test_val - base_val) / base_val * 100 if base_val != 0 else 0
+ comp_row[f'percent_change_{col}'] = pct_change
+
+ comp_row[f'ratio_{col}'] = test_val / base_val if base_val != 0 else 0
+
+ comparison = pd.concat([comparison, pd.DataFrame([comp_row])], ignore_index=True)
+
+ # Write comparison sheet (shorten name to fit Excel's 31 char limit)
+            # Shorten: replace 'nccl_summary_' with 'nccl_' and append '_cmp'
+ comparison_sheet_name = sheet_name.replace('nccl_summary_', 'nccl_') + '_cmp'
+ comparison.to_excel(writer, sheet_name=comparison_sheet_name, index=False)
+ print(f" Added {comparison_sheet_name}")
+
+ # Add conditional formatting to percent_change columns
+ print(f" Applying conditional formatting to {comparison_sheet_name}...")
+
+ ws = writer.sheets[comparison_sheet_name]
+
+ # Format all percent_change columns with color scale
+ for col_idx, col in enumerate(comparison.columns, start=1):
+ if 'percent_change' in col:
+                    # Convert column index to an Excel column letter (A, B, ..., Z, AA, ...)
+                    col_letter = get_column_letter(col_idx)
+
+ data_range = f'{col_letter}2:{col_letter}{len(comparison)+1}'
+
+ # Color scale: red (min/negative) -> white (0) -> green (max/positive)
+ ws.conditional_formatting.add(data_range,
+ ColorScaleRule(
+ start_type='min', start_color='F8696B', # Red
+ mid_type='num', mid_value=0, mid_color='FFFFFF', # White
+ end_type='max', end_color='63BE7B' # Green
+ ))
+
+ print(f" Formatted {col}")
+
+ print(f"\nSaved: {output_path}")
+ print("\nNew comparison sheets added")
+ print("percent_change interpretation:")
+ print(" For latency/time: Positive = faster (less time)")
+ print(" For bandwidth: Positive = better (more bandwidth)")
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Add comparison sheets to combined collective reports')
+ parser.add_argument('--input', required=True, help='Input combined collective Excel file')
+ parser.add_argument('--output', required=True, help='Output Excel file with comparison sheets')
+ parser.add_argument('--baseline-label', default='baseline', help='Label for baseline data')
+ parser.add_argument('--test-label', default='test', help='Label for test data')
+
+ args = parser.parse_args()
+
+ return add_collective_comparison_sheets(args.input, args.output, args.baseline_label, args.test_label)
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/add_comparison_sheets.py b/scripts/tracelens_single_config/add_comparison_sheets.py
new file mode 100755
index 0000000..765f391
--- /dev/null
+++ b/scripts/tracelens_single_config/add_comparison_sheets.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
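+"""
+Add baseline-vs-test comparison sheets to a combined GPU timeline report.
+
+Copies the original sheets and appends Comparison_By_Rank and Summary_Comparison
+with diffs, percent changes and color scales. Example invocation (paths and
+labels are illustrative):
+
+    python add_comparison_sheets.py \
+        --input gpu_timeline_combined.xlsx \
+        --output gpu_timeline_comparison.xlsx \
+        --baseline-label 56cu_256threads --test-label 37cu_384threads
+"""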
+import pandas as pd
+import argparse
+from openpyxl.styles import Color
+from openpyxl.formatting.rule import ColorScaleRule
+
+
+def add_comparison_sheets(input_path, output_path, baseline_label='baseline', test_label='test'):
+ print(f"Loading: {input_path}")
+ print(f" Baseline label: {baseline_label}")
+ print(f" Test label: {test_label}")
+
+ xl = pd.ExcelFile(input_path)
+
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+ # Copy all original sheets
+ for sheet_name in xl.sheet_names:
+ df = pd.read_excel(input_path, sheet_name=sheet_name)
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
+ print(f" Copied {sheet_name}")
+
+ # Add comparison sheets
+ all_combined = pd.read_excel(input_path, sheet_name='All_Ranks_Combined')
+
+ # Get actual source values from the dataframe
+ sources = all_combined['source'].unique()
+ # Determine which is baseline and which is test (baseline should be first)
+ if len(sources) >= 2:
+ actual_baseline = sources[0]
+ actual_test = sources[1]
+ else:
+ actual_baseline = baseline_label
+ actual_test = test_label
+
+ # Comparison 1: Side-by-side by rank
+ baseline_data = all_combined[all_combined['source'] == actual_baseline]
+ test_data = all_combined[all_combined['source'] == actual_test]
+
+ comparison_by_rank = pd.DataFrame()
+ for rank in sorted(baseline_data['rank'].unique()):
+ base_rank = baseline_data[baseline_data['rank'] == rank].set_index('type')
+ test_rank = test_data[test_data['rank'] == rank].set_index('type')
+
+ for metric_type in base_rank.index:
+ if metric_type in test_rank.index:
+ base_time = base_rank.loc[metric_type, 'time ms']
+ test_time = test_rank.loc[metric_type, 'time ms']
+ ratio_val = test_time / base_time if base_time != 0 else 0
+ # Percentage change: positive when test is faster (takes less time)
+ pct_change = (base_time - test_time) / base_time * 100 if base_time != 0 else 0
+
+ # Determine if better or worse
+ if pct_change > 1:
+ status = 'Better'
+ elif pct_change < -1:
+ status = 'Worse'
+ else:
+ status = 'Similar'
+
+ comparison_by_rank = pd.concat([comparison_by_rank, pd.DataFrame({
+ 'rank': [rank],
+ 'type': [metric_type],
+ f'{baseline_label}_time_ms': [base_time],
+ f'{test_label}_time_ms': [test_time],
+ 'diff_time_ms': [test_time - base_time],
+ 'percent_change': [pct_change],
+ 'status': [status],
+ 'ratio': [ratio_val],
+ f'{baseline_label}_percent': [base_rank.loc[metric_type, 'percent']],
+ f'{test_label}_percent': [test_rank.loc[metric_type, 'percent']],
+ 'diff_percent': [test_rank.loc[metric_type, 'percent'] - base_rank.loc[metric_type, 'percent']]
+ })], ignore_index=True)
+
+ comparison_by_rank.to_excel(writer, sheet_name='Comparison_By_Rank', index=False)
+ print(f" Added Comparison_By_Rank")
+
+ # Comparison 2: Summary comparison
+ summary = pd.read_excel(input_path, sheet_name='Summary')
+ baseline_summary = summary[summary['source'] == actual_baseline].set_index('type')
+ test_summary = summary[summary['source'] == actual_test].set_index('type')
+
+ summary_comparison = pd.DataFrame()
+ for metric_type in baseline_summary.index:
+ if metric_type in test_summary.index:
+ base_time = baseline_summary.loc[metric_type, 'time ms']
+ test_time = test_summary.loc[metric_type, 'time ms']
+ ratio_val = test_time / base_time if base_time != 0 else 0
+ # Percentage change: positive when test is faster (takes less time)
+ pct_change = (base_time - test_time) / base_time * 100 if base_time != 0 else 0
+
+ summary_comparison = pd.concat([summary_comparison, pd.DataFrame({
+ 'type': [metric_type],
+ f'{baseline_label}_time_ms': [base_time],
+ f'{test_label}_time_ms': [test_time],
+ 'diff_time_ms': [test_time - base_time],
+ 'percent_change': [pct_change],
+ 'ratio': [ratio_val],
+ f'{baseline_label}_percent': [baseline_summary.loc[metric_type, 'percent']],
+ f'{test_label}_percent': [test_summary.loc[metric_type, 'percent']],
+ 'diff_percent': [test_summary.loc[metric_type, 'percent'] - baseline_summary.loc[metric_type, 'percent']]
+ })], ignore_index=True)
+
+ summary_comparison.to_excel(writer, sheet_name='Summary_Comparison', index=False)
+ print(f" Added Summary_Comparison")
+
+ # Add conditional formatting to percent_change columns
+ print("\n Applying conditional formatting...")
+
+ # Create color scale: Red (negative) -> White (0) -> Green (positive)
+
+ # Format Comparison_By_Rank
+ ws_rank = writer.sheets['Comparison_By_Rank']
+ # Find percent_change column
+ for col_idx, col in enumerate(comparison_by_rank.columns, start=1):
+ if col == 'percent_change':
+ col_letter = chr(64 + col_idx) # Convert to Excel column letter
+ data_range = f'{col_letter}2:{col_letter}{len(comparison_by_rank)+1}'
+ # Color scale: red (min) -> white (0) -> green (max)
+ ws_rank.conditional_formatting.add(data_range,
+ ColorScaleRule(
+ start_type='min', start_color='F8696B', # Red
+ mid_type='num', mid_value=0, mid_color='FFFFFF', # White
+ end_type='max', end_color='63BE7B' # Green
+ ))
+ print(f" Formatted Comparison_By_Rank column {col}")
+ break
+
+ # Format Summary_Comparison
+ ws_summary = writer.sheets['Summary_Comparison']
+ for col_idx, col in enumerate(summary_comparison.columns, start=1):
+ if col == 'percent_change':
+ col_letter = chr(64 + col_idx)
+ data_range = f'{col_letter}2:{col_letter}{len(summary_comparison)+1}'
+ # Color scale: red (min) -> white (0) -> green (max)
+ ws_summary.conditional_formatting.add(data_range,
+ ColorScaleRule(
+ start_type='min', start_color='F8696B', # Red
+ mid_type='num', mid_value=0, mid_color='FFFFFF', # White
+ end_type='max', end_color='63BE7B' # Green
+ ))
+ print(f" Formatted Summary_Comparison column {col}")
+ break
+
+ print(f"\nSaved: {output_path}")
+ print("\nNew sheets:")
+ print(" Comparison_By_Rank - Side-by-side comparison for each rank")
+ print(" Summary_Comparison - Overall comparison")
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Add comparison sheets to combined GPU timeline')
+ parser.add_argument('--input', required=True, help='Input combined Excel file')
+ parser.add_argument('--output', required=True, help='Output Excel file with comparison sheets')
+ parser.add_argument('--baseline-label', default='baseline', help='Label for baseline data')
+ parser.add_argument('--test-label', default='test', help='Label for test data')
+
+ args = parser.parse_args()
+
+ return add_comparison_sheets(args.input, args.output, args.baseline_label, args.test_label)
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/combine_reports.py b/scripts/tracelens_single_config/combine_reports.py
new file mode 100755
index 0000000..e5a4a95
--- /dev/null
+++ b/scripts/tracelens_single_config/combine_reports.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
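+"""
+Combine a baseline and a test collective_all_ranks.xlsx into one workbook,
+tagging every row with a 'source' column derived from the run folder name.
+Example invocation (paths are illustrative):
+
+    python combine_reports.py \
+        --baseline <baseline_run>/collective_all_ranks.xlsx \
+        --test <test_run>/collective_all_ranks.xlsx \
+        --output collective_combined.xlsx
+"""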
+import pandas as pd
+import argparse
+from pathlib import Path
+
+
+def combine_collective_reports(baseline_path, test_path, output_path):
+ # Extract folder names from paths for labels
+ baseline_label = Path(baseline_path).parent.parent.name # Get the config folder name
+ test_label = Path(test_path).parent.parent.name # Get the config folder name
+
+ print(f"Loading baseline ({baseline_label}): {baseline_path}")
+ baseline_xl = pd.ExcelFile(baseline_path)
+
+ print(f"Loading test ({test_label}): {test_path}")
+ test_xl = pd.ExcelFile(test_path)
+
+ print(f"\nBaseline sheets: {baseline_xl.sheet_names}")
+ print(f"Test sheets: {test_xl.sheet_names}")
+
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+ for sheet_name in baseline_xl.sheet_names:
+ if sheet_name not in test_xl.sheet_names:
+ print(f" Skip {sheet_name} - not in test file")
+ continue
+
+ baseline_df = pd.read_excel(baseline_path, sheet_name=sheet_name)
+ test_df = pd.read_excel(test_path, sheet_name=sheet_name)
+
+ baseline_df['source'] = baseline_label
+ test_df['source'] = test_label
+
+ combined = pd.concat([baseline_df, test_df], ignore_index=True)
+
+ combined.to_excel(writer, sheet_name=sheet_name, index=False)
+ print(f" Combined {sheet_name}: {len(baseline_df)} + {len(test_df)} = {len(combined)} rows")
+
+ print(f"\nSaved: {output_path}")
+ return 0 # Return success code
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Combine two collective reports')
+ parser.add_argument('--baseline', required=True, help='Path to baseline collective_all_ranks.xlsx')
+ parser.add_argument('--test', required=True, help='Path to test collective_all_ranks.xlsx')
+ parser.add_argument('--output', required=True, help='Output path for combined Excel file')
+
+ args = parser.parse_args()
+
+ return combine_collective_reports(args.baseline, args.test, args.output)
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/create_final_report.py b/scripts/tracelens_single_config/create_final_report.py
new file mode 100755
index 0000000..16caac9
--- /dev/null
+++ b/scripts/tracelens_single_config/create_final_report.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python3
+"""
+Create final comprehensive report with combined and comparison data.
+Raw data sheets are hidden and all data is formatted as Excel tables.
+"""
+import pandas as pd
+import argparse
+from pathlib import Path
+from openpyxl import load_workbook
+from openpyxl.worksheet.table import Table, TableStyleInfo
+from openpyxl.styles import Color
+from openpyxl.formatting.rule import ColorScaleRule
+
+
+def get_column_letter(col_num):
+ """Convert column number to Excel column letter."""
+ result = ""
+ while col_num > 0:
+ col_num -= 1
+ result = chr(65 + (col_num % 26)) + result
+ col_num //= 26
+ return result
+
+
+def add_excel_table(worksheet, table_name, start_row=1):
+ """Convert worksheet data to Excel table format."""
+ # Find data range
+ max_row = worksheet.max_row
+ max_col = worksheet.max_column
+
+ if max_row <= start_row:
+ return # No data
+
+ # Ensure all column headers are strings
+ for col_idx in range(1, max_col + 1):
+ cell = worksheet.cell(row=start_row, column=col_idx)
+ if cell.value is not None and not isinstance(cell.value, str):
+ cell.value = str(cell.value)
+
+ # Create table reference using proper column letter conversion
+ start_cell = f"A{start_row}"
+ end_col_letter = get_column_letter(max_col)
+ end_cell = f"{end_col_letter}{max_row}"
+ table_ref = f"{start_cell}:{end_cell}"
+
+ # Create table with style
+ try:
+ tab = Table(displayName=table_name, ref=table_ref)
+ style = TableStyleInfo(
+ name="TableStyleMedium2",
+ showFirstColumn=False,
+ showLastColumn=False,
+ showRowStripes=True,
+ showColumnStripes=False
+ )
+ tab.tableStyleInfo = style
+
+ # Add table to worksheet
+ worksheet.add_table(tab)
+ except Exception as e:
+ print(f" Warning: Could not create table {table_name}: {e}")
+
+
+def create_final_report(gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file, baseline_label='Baseline', test_label='Test'):
+ """Create comprehensive report with all data."""
+
+ print("Creating comprehensive final report...")
+ print(f" Output: {output_file}")
+ print(f" Baseline: {baseline_label}")
+ print(f" Test: {test_label}")
+
+ # Track sheet info for hiding/organizing
+ raw_sheets = []
+ comparison_sheets = []
+ summary_sheets = []
+
+ with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
+
+ # === GPU TIMELINE SHEETS ===
+ print("\nAdding GPU Timeline sheets...")
+
+ # Read GPU combined (raw data)
+ gpu_comb_xl = pd.ExcelFile(gpu_combined)
+ sheet_mapping = {
+ 'Summary': 'GPU_Summary_Raw',
+ 'All_Ranks_Combined': 'GPU_AllRanks_Raw',
+ 'Per_Rank_Time_ms': 'GPU_Time_Raw',
+ 'Per_Rank_Percent': 'GPU_Pct_Raw'
+ }
+ for sheet_name in gpu_comb_xl.sheet_names:
+ df = pd.read_excel(gpu_combined, sheet_name=sheet_name)
+ new_name = sheet_mapping.get(sheet_name, f"GPU_{sheet_name}_Raw")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ raw_sheets.append(new_name)
+ print(f" Added {new_name} (will be hidden)")
+
+ # Read GPU comparison
+ gpu_comp_xl = pd.ExcelFile(gpu_comparison)
+ comp_mapping = {
+ 'Summary_Comparison': 'GPU_Summary_Cmp',
+ 'Comparison_By_Rank': 'GPU_ByRank_Cmp'
+ }
+ for sheet_name in gpu_comp_xl.sheet_names:
+ if 'Comparison' in sheet_name:
+ df = pd.read_excel(gpu_comparison, sheet_name=sheet_name)
+ new_name = comp_mapping.get(sheet_name, f"GPU_{sheet_name}")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ comparison_sheets.append(new_name)
+ print(f" Added {new_name}")
+
+ # === COLLECTIVE SHEETS ===
+ print("\nAdding Collective/NCCL sheets...")
+
+ # Read collective combined (raw data for hidden sheets)
+ coll_comb_xl = pd.ExcelFile(coll_combined)
+ coll_mapping = {
+ 'nccl_summary_implicit_sync': 'NCCL_ImplSync_Raw',
+ 'nccl_summary_long': 'NCCL_Long_Raw'
+ }
+ for sheet_name in coll_comb_xl.sheet_names:
+ if 'summary' in sheet_name.lower():
+ df = pd.read_excel(coll_combined, sheet_name=sheet_name)
+ new_name = coll_mapping.get(sheet_name, f"NCCL_{sheet_name}_Raw")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ raw_sheets.append(new_name)
+ print(f" Added {new_name} (will be hidden)")
+
+ # Read collective comparison - include ALL sheets
+ coll_comp_xl = pd.ExcelFile(coll_comparison)
+ for sheet_name in coll_comp_xl.sheet_names:
+ df = pd.read_excel(coll_comparison, sheet_name=sheet_name)
+
+ # Determine appropriate naming
+ if 'nccl' in sheet_name.lower():
+ if '_cmp' in sheet_name or 'comparison' in sheet_name.lower():
+ new_name = f"NCCL_{sheet_name.replace('nccl_', '').title().replace('_', '')}"
+ else:
+ new_name = f"NCCL_{sheet_name}"
+ else:
+ new_name = sheet_name
+
+ df.to_excel(writer, sheet_name=new_name, index=False)
+
+ if '_cmp' in sheet_name.lower() or 'comparison' in sheet_name.lower():
+ comparison_sheets.append(new_name)
+ else:
+ raw_sheets.append(new_name)
+
+ print(f" Added {new_name}")
+
+ # === CREATE SUMMARY DASHBOARD ===
+ print("\nCreating Summary Dashboard...")
+
+ # Read key metrics for dashboard
+ gpu_summary = pd.read_excel(gpu_comparison, sheet_name='Summary_Comparison')
+
+ # Create dashboard data
+ dashboard_data = {
+ 'Metric': [],
+ baseline_label: [],
+ test_label: [],
+ 'Improvement (%)': [],
+ 'Status': []
+ }
+
+ # Add GPU metrics
+ # Find the actual column names (they may be config-specific like '32cu_512threads_time_ms')
+ time_cols = [col for col in gpu_summary.columns if 'time_ms' in col and 'diff' not in col and 'percent' not in col]
+ if len(time_cols) >= 2:
+ baseline_col = time_cols[0]
+ test_col = time_cols[1]
+ else:
+ # Fallback to default names
+ baseline_col = 'baseline_time_ms' if 'baseline_time_ms' in gpu_summary.columns else time_cols[0] if time_cols else None
+ test_col = 'test_time_ms' if 'test_time_ms' in gpu_summary.columns else time_cols[1] if len(time_cols) > 1 else None
+
+ if baseline_col and test_col:
+ for _, row in gpu_summary.iterrows():
+ metric_type = row['type']
+ dashboard_data['Metric'].append(f"GPU_{metric_type}")
+ dashboard_data[baseline_label].append(round(row[baseline_col], 2))
+ dashboard_data[test_label].append(round(row[test_col], 2))
+ dashboard_data['Improvement (%)'].append(round(row['percent_change'], 2) if 'percent_change' in row else 0)
+
+ pct_val = row['percent_change'] if 'percent_change' in row else 0
+ dashboard_data['Status'].append('Better' if pct_val > 0 else 'Worse' if pct_val < -1 else 'Similar')
+
+ dashboard_df = pd.DataFrame(dashboard_data)
+ dashboard_df.to_excel(writer, sheet_name='Summary_Dashboard', index=False)
+ summary_sheets.append('Summary_Dashboard')
+ print(f" Added Summary_Dashboard")
+
+ # Now modify the workbook to hide sheets and add tables
+ print("\nApplying formatting...")
+ wb = load_workbook(output_file)
+
+ # Hide raw data sheets
+ for sheet_name in raw_sheets:
+ if sheet_name in wb.sheetnames:
+ wb[sheet_name].sheet_state = 'hidden'
+ print(f" Hidden: {sheet_name}")
+
+ # Convert all sheets to tables
+ for sheet_name in wb.sheetnames:
+ ws = wb[sheet_name]
+
+ # Skip if sheet is empty
+ if ws.max_row <= 1:
+ continue
+
+ # Create unique table name from sheet name (remove special chars)
+ table_name = sheet_name.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')
+ # Ensure name starts with letter and is max 255 chars
+ if not table_name[0].isalpha():
+ table_name = 'Tbl_' + table_name
+ table_name = table_name[:255]
+
+ add_excel_table(ws, table_name)
+ print(f" Converted to table: {sheet_name}")
+
+ # Add conditional formatting for percent_change columns
+ if 'Cmp' in sheet_name or 'Comparison' in sheet_name:
+ # Find percent_change columns
+ for col_idx in range(1, ws.max_column + 1):
+ cell_value = ws.cell(row=1, column=col_idx).value
+ if cell_value and 'percent_change' in str(cell_value):
+ col_letter = get_column_letter(col_idx)
+ data_range = f'{col_letter}2:{col_letter}{ws.max_row}'
+
+ # Apply color scale: red (min/negative) -> white (0) -> green (max/positive)
+ try:
+ ws.conditional_formatting.add(data_range,
+ ColorScaleRule(
+ start_type='min', start_color='F8696B', # Red
+ mid_type='num', mid_value=0, mid_color='FFFFFF', # White
+ end_type='max', end_color='63BE7B' # Green
+ ))
+ print(f" Applied color scale to {sheet_name} column {cell_value}")
+ except Exception as e:
+ print(f" Warning: Could not apply formatting to {cell_value}: {e}")
+
+ # Move Summary Dashboard to first position
+ if 'Summary_Dashboard' in wb.sheetnames:
+ dashboard_sheet = wb['Summary_Dashboard']
+ wb.move_sheet(dashboard_sheet, offset=-(len(wb.sheetnames)-1))
+ wb.active = 0 # Set dashboard as active sheet
+ print("\n Moved Summary_Dashboard to first position")
+
+ # Save workbook
+ wb.save(output_file)
+ print(f"\nFinal report saved: {output_file}")
+
+ # Report structure
+ print("\nReport Structure:")
+ print(" Visible Sheets (Analysis):")
+ print(f" - Summary_Dashboard")
+ for sheet in comparison_sheets:
+ print(f" - {sheet}")
+ print("\n Hidden Sheets (Raw Data):")
+ for sheet in raw_sheets:
+ print(f" - {sheet}")
+ print("\n All data formatted as Excel tables with filters")
+ print(" Percent change columns are color-coded (green=better, red=worse)")
+ print("\nUsers can unhide raw data sheets in Excel: Right-click any sheet tab → Unhide")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Create final comprehensive report with all data',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Example:
+ python create_final_report.py \\
+ --gpu-combined gpu_timeline_combined.xlsx \\
+ --gpu-comparison gpu_timeline_comparison.xlsx \\
+ --coll-combined collective_combined.xlsx \\
+ --coll-comparison collective_comparison.xlsx \\
+ --output final_analysis_report.xlsx
+ """
+ )
+
+ parser.add_argument('--gpu-combined', required=True,
+ help='Path to GPU timeline combined file')
+ parser.add_argument('--gpu-comparison', required=True,
+ help='Path to GPU timeline comparison file')
+ parser.add_argument('--coll-combined', required=True,
+ help='Path to collective combined file')
+ parser.add_argument('--coll-comparison', required=True,
+ help='Path to collective comparison file')
+ parser.add_argument('--output', required=True,
+ help='Output path for final report')
+ parser.add_argument('--baseline-label', default='Baseline',
+ help='Label for baseline configuration')
+ parser.add_argument('--test-label', default='Test',
+ help='Label for test configuration')
+
+ args = parser.parse_args()
+
+ # Validate inputs
+ for file_arg in ['gpu_combined', 'gpu_comparison', 'coll_combined', 'coll_comparison']:
+ file_path = getattr(args, file_arg)
+ if not Path(file_path).exists():
+ print(f"Error: File not found: {file_path}")
+ return 1
+
+ create_final_report(
+ args.gpu_combined,
+ args.gpu_comparison,
+ args.coll_combined,
+ args.coll_comparison,
+ args.output,
+ args.baseline_label.replace('_', ' '),
+ args.test_label.replace('_', ' ')
+ )
+
+ return 0
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/generate_enhanced_plots.py b/scripts/tracelens_single_config/generate_enhanced_plots.py
new file mode 100755
index 0000000..226310d
--- /dev/null
+++ b/scripts/tracelens_single_config/generate_enhanced_plots.py
@@ -0,0 +1,766 @@
+#!/usr/bin/env python3
+"""
+Enhanced plot generation matching the PDF report style.
+Generates exactly 12 plots as specified.
+"""
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import argparse
+from pathlib import Path
+import warnings
+import base64
+from datetime import datetime
+warnings.filterwarnings('ignore')
+
+plt.style.use('seaborn-v0_8-whitegrid')
+sns.set_palette("husl")
+
+
+def plot1_percentage_change(summary_data, output_dir):
+ """Plot 1: Percentage Change Overview."""
+ print("\nGenerating Plot 1: Percentage Change Overview")
+
+ columns = summary_data.columns.tolist()
+ baseline_label = columns[1] if len(columns) > 1 else 'Baseline'
+ test_label = columns[2] if len(columns) > 2 else 'Test'
+
+ if 'Improvement (%)' not in summary_data.columns:
+ print(" No improvement data found")
+ return
+
+ metrics = summary_data['Metric'].values
+ values = summary_data['Improvement (%)'].values
+
+ fig, ax = plt.subplots(figsize=(12, 8))
+ colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values]
+ bars = ax.barh(metrics, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
+
+ for bar, val in zip(bars, values):
+ x_pos = bar.get_width()
+ ax.text(x_pos + (0.5 if x_pos > 0 else -0.5), bar.get_y() + bar.get_height()/2,
+ f'{val:.1f}%', ha='left' if x_pos > 0 else 'right', va='center', fontweight='bold')
+
+ ax.axvline(x=0, color='black', linestyle='-', linewidth=1)
+ ax.set_xlabel('Percent Change (%)', fontsize=12)
+ ax.set_title(f'GPU Metrics: Percent Change ({baseline_label} vs {test_label})\nPositive = Improvement ({test_label} Faster)',
+ fontsize=14, fontweight='bold')
+ ax.grid(True, alpha=0.3, axis='x')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot1_percentage_change_overview.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot1_percentage_change_overview.png")
+
+
+def plot2_absolute_time_comparison(summary_data, output_dir):
+ """Plot 2: Absolute Time Comparison."""
+ print("\nGenerating Plot 2: Absolute Time Comparison")
+
+ columns = summary_data.columns.tolist()
+ baseline_label = columns[1] if len(columns) > 1 else 'Baseline'
+ test_label = columns[2] if len(columns) > 2 else 'Test'
+
+ metrics = summary_data['Metric'].values
+ baseline_values = summary_data[baseline_label].values
+ test_values = summary_data[test_label].values
+
+ fig, ax = plt.subplots(figsize=(14, 8))
+
+ x = np.arange(len(metrics))
+ width = 0.35
+
+ bars1 = ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue')
+ bars2 = ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange')
+
+ ax.set_xlabel('Metric Type', fontsize=12)
+ ax.set_ylabel('Time (ms)', fontsize=12)
+ ax.set_title('GPU Metrics: Absolute Time Comparison', fontsize=14, fontweight='bold')
+ ax.set_xticks(x)
+ ax.set_xticklabels(metrics, rotation=45, ha='right')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot2_absolute_time_comparison.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot2_absolute_time_comparison.png")
+
+
+def plot3_performance_heatmap(byrank_data, output_dir):
+ """Plot 3: Performance Heatmap by Rank."""
+ print("\nGenerating Plot 3: Performance Heatmap by Rank")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ metrics = byrank_data['type'].unique() if 'type' in byrank_data.columns else []
+ ranks = sorted(byrank_data['rank'].unique()) if 'rank' in byrank_data.columns else []
+
+ time_cols = [col for col in byrank_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+ time_col = time_cols[-1] if len(time_cols) > 1 else time_cols[0] if time_cols else None
+
+ if not time_col:
+ print(" No time column found")
+ return
+
+ heatmap_data = byrank_data.pivot_table(index='type', columns='rank', values=time_col, aggfunc='mean')
+
+ fig, ax = plt.subplots(figsize=(12, 8))
+ sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='YlOrRd', cbar_kws={'label': 'Time (ms)'}, ax=ax)
+
+ ax.set_title('Performance Heatmap by Rank (Time in ms)', fontsize=14, fontweight='bold')
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Metric Type', fontsize=12)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot3_performance_heatmap.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot3_performance_heatmap.png")
+
+
+def plot4_total_execution_time(byrank_data, output_dir):
+ """Plot 4: Total Execution Time by Rank (Line Plot)."""
+ print("\nGenerating Plot 4: Total Execution Time by Rank")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ total_time_data = byrank_data[byrank_data['type'] == 'total_time']
+ if total_time_data.empty:
+ print(" No total_time data found")
+ return
+
+ ranks = sorted(total_time_data['rank'].unique())
+ time_cols = [col for col in total_time_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ for col in time_cols[:2]:
+ times = [total_time_data[total_time_data['rank'] == r][col].values[0] if not total_time_data[total_time_data['rank'] == r].empty else 0 for r in ranks]
+ label = col.replace('_time_ms', '').replace('_', ' ')
+ ax.plot(ranks, times, marker='o', markersize=8, linewidth=2, label=label, alpha=0.8)
+
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Total Execution Time (ms)', fontsize=12)
+ ax.set_title('Total Execution Time by Rank', fontsize=14, fontweight='bold')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+ ax.set_xticks(ranks)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot4_total_execution_time.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot4_total_execution_time.png")
+
+
+def plot5_computation_time(byrank_data, output_dir):
+ """Plot 5: Computation Time Across Ranks."""
+ print("\nGenerating Plot 5: Computation Time Across Ranks")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ comp_data = byrank_data[byrank_data['type'] == 'computation_time']
+ if comp_data.empty:
+ print(" No computation_time data found")
+ return
+
+ ranks = sorted(comp_data['rank'].unique())
+ time_cols = [col for col in comp_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ for col in time_cols[:2]:
+ times = [comp_data[comp_data['rank'] == r][col].values[0] if not comp_data[comp_data['rank'] == r].empty else 0 for r in ranks]
+ label = col.replace('_time_ms', '').replace('_', ' ')
+ ax.plot(ranks, times, marker='s', markersize=8, linewidth=2, label=label, alpha=0.8)
+
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Computation Time (ms)', fontsize=12)
+ ax.set_title('Computation Time Across Ranks', fontsize=14, fontweight='bold')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+ ax.set_xticks(ranks)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot5_computation_time.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot5_computation_time.png")
+
+
+def plot6_communication_time(byrank_data, output_dir):
+ """Plot 6: Total Communication Time Across Ranks."""
+ print("\nGenerating Plot 6: Total Communication Time Across Ranks")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ comm_data = byrank_data[byrank_data['type'] == 'total_comm_time']
+ if comm_data.empty:
+ print(" No total_comm_time data found")
+ return
+
+ ranks = sorted(comm_data['rank'].unique())
+ time_cols = [col for col in comm_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ for col in time_cols[:2]:
+ times = [comm_data[comm_data['rank'] == r][col].values[0] if not comm_data[comm_data['rank'] == r].empty else 0 for r in ranks]
+ label = col.replace('_time_ms', '').replace('_', ' ')
+ ax.plot(ranks, times, marker='^', markersize=8, linewidth=2, label=label, alpha=0.8)
+
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Communication Time (ms)', fontsize=12)
+ ax.set_title('Total Communication Time Across Ranks', fontsize=14, fontweight='bold')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+ ax.set_xticks(ranks)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot6_communication_time.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot6_communication_time.png")
+
+
+def plot7_idle_time(byrank_data, output_dir):
+ """Plot 7: Idle Time Across Ranks."""
+ print("\nGenerating Plot 7: Idle Time Across Ranks")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ idle_data = byrank_data[byrank_data['type'] == 'idle_time']
+ if idle_data.empty:
+ print(" No idle_time data found")
+ return
+
+ ranks = sorted(idle_data['rank'].unique())
+ time_cols = [col for col in idle_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ for col in time_cols[:2]:
+ times = [idle_data[idle_data['rank'] == r][col].values[0] if not idle_data[idle_data['rank'] == r].empty else 0 for r in ranks]
+ label = col.replace('_time_ms', '').replace('_', ' ')
+ ax.plot(ranks, times, marker='D', markersize=8, linewidth=2, label=label, alpha=0.8)
+
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Idle Time (ms)', fontsize=12)
+ ax.set_title('Idle Time Across Ranks', fontsize=14, fontweight='bold')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+ ax.set_xticks(ranks)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot7_idle_time.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot7_idle_time.png")
+
+
+def plot8_percentage_time_difference(byrank_data, output_dir):
+ """Plot 8: Percentage Time Difference Across Ranks (8 subplots in 2x4 grid)."""
+ print("\nGenerating Plot 8: Percentage Time Difference (8 subplots)")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ metrics = ['busy_time', 'computation_time', 'total_comm_time', 'exposed_comm_time',
+ 'idle_time', 'total_memcpy_time', 'exposed_memcpy_time', 'total_time']
+
+ pct_cols = [col for col in byrank_data.columns if 'percent_change' in col.lower()]
+ if not pct_cols:
+ print(" No percent_change column found")
+ return
+
+ pct_col = pct_cols[0]
+ ranks = sorted(byrank_data['rank'].unique()) if 'rank' in byrank_data.columns else []
+
+ # Create 2x4 subplot grid
+ fig, axes = plt.subplots(2, 4, figsize=(20, 10))
+ axes = axes.flatten()
+
+ for idx, metric in enumerate(metrics):
+ ax = axes[idx]
+ metric_data = byrank_data[byrank_data['type'] == metric]
+
+ if not metric_data.empty:
+ values = [metric_data[metric_data['rank'] == r][pct_col].values[0] if not metric_data[metric_data['rank'] == r].empty else 0 for r in ranks]
+
+ colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values]
+ ax.bar(ranks, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
+
+ ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
+ ax.set_xlabel('Rank', fontsize=10)
+ ax.set_ylabel('Percent Change (%)', fontsize=10)
+ ax.set_title(metric.replace('_', ' ').title(), fontsize=11, fontweight='bold')
+ ax.grid(True, alpha=0.3, axis='y')
+ ax.set_xticks(ranks)
+
+ plt.suptitle('Percentage Time Difference Across Ranks (All Metrics)', fontsize=16, fontweight='bold')
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot8_percentage_difference_all_metrics.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot8_percentage_difference_all_metrics.png")
+
+
+def plot9_nccl_latency(nccl_data, output_dir):
+ """Plot 9: Communication Latency Comparison per Message Size."""
+ print("\nGenerating Plot 9: Communication Latency vs Message Size")
+
+ if nccl_data is None or nccl_data.empty:
+ print(" No NCCL data available")
+ return
+
+ if 'In msg nelems' not in nccl_data.columns:
+ print(" Required columns not found")
+ return
+
+ latency_cols = [col for col in nccl_data.columns if 'comm_latency' in col.lower() or 'latency_mean' in col.lower()]
+ if not latency_cols:
+ print(" No latency columns found")
+ return
+
+ fig, ax = plt.subplots(figsize=(14, 7))
+
+ nccl_sorted = nccl_data.sort_values('In msg nelems')
+ msg_sizes = nccl_sorted['In msg nelems'].values
+
+ x = np.arange(len(msg_sizes))
+ width = 0.35
+
+ if len(latency_cols) >= 2:
+ baseline_values = nccl_sorted[latency_cols[0]].values
+ test_values = nccl_sorted[latency_cols[1]].values
+
+ baseline_label = latency_cols[0].replace('_comm_latency_mean', '').replace('_', ' ').title()
+ test_label = latency_cols[1].replace('_comm_latency_mean', '').replace('_', ' ').title()
+
+ ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue')
+ ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange')
+ else:
+ ax.bar(x, nccl_sorted[latency_cols[0]].values, alpha=0.8, color='steelblue')
+
+ ax.set_xlabel('Message Size (elements)', fontsize=12)
+ ax.set_ylabel('Communication Latency (ms)', fontsize=12)
+ ax.set_title('Communication Latency Comparison per Message Size', fontsize=14, fontweight='bold')
+ ax.set_xticks(x)
+ ax.set_xticklabels([f'{int(s):,}' for s in msg_sizes], rotation=45, ha='right')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot9_nccl_latency.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot9_nccl_latency.png")
+
+
+def plot10_algorithm_bandwidth(nccl_data, output_dir):
+ """Plot 10: Algorithm Bandwidth."""
+ print("\nGenerating Plot 10: Algorithm Bandwidth")
+
+ if nccl_data is None or nccl_data.empty:
+ print(" No NCCL data available")
+ return
+
+ algo_bw_cols = [col for col in nccl_data.columns if 'algo bw' in col.lower()]
+ if not algo_bw_cols or 'In msg nelems' not in nccl_data.columns:
+ print(" Required columns not found")
+ return
+
+ fig, ax = plt.subplots(figsize=(14, 7))
+
+ nccl_sorted = nccl_data.sort_values('In msg nelems')
+ msg_sizes = nccl_sorted['In msg nelems'].values
+
+ x = np.arange(len(msg_sizes))
+ width = 0.35
+
+ if len(algo_bw_cols) >= 2:
+ baseline_values = nccl_sorted[algo_bw_cols[0]].values
+ test_values = nccl_sorted[algo_bw_cols[1]].values
+
+ baseline_label = algo_bw_cols[0].replace('_algo bw (GB/s)_mean', '').replace('_', ' ').title()
+ test_label = algo_bw_cols[1].replace('_algo bw (GB/s)_mean', '').replace('_', ' ').title()
+
+ ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue')
+ ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange')
+ else:
+ ax.bar(x, nccl_sorted[algo_bw_cols[0]].values, alpha=0.8, color='steelblue')
+
+ ax.set_xlabel('Message Size (elements)', fontsize=12)
+ ax.set_ylabel('Algorithm Bandwidth (GB/s)', fontsize=12)
+ ax.set_title('Algorithm Bandwidth Comparison per Message Size', fontsize=14, fontweight='bold')
+ ax.set_xticks(x)
+ ax.set_xticklabels([f'{int(s):,}' for s in msg_sizes], rotation=45, ha='right')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot10_algorithm_bandwidth.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot10_algorithm_bandwidth.png")
+
+
+def plot11_bus_bandwidth(nccl_data, output_dir):
+ """Plot 11: Bus Bandwidth."""
+ print("\nGenerating Plot 11: Bus Bandwidth")
+
+ if nccl_data is None or nccl_data.empty:
+ print(" No NCCL data available")
+ return
+
+ bus_bw_cols = [col for col in nccl_data.columns if 'bus bw' in col.lower()]
+ if not bus_bw_cols or 'In msg nelems' not in nccl_data.columns:
+ print(" Required columns not found")
+ return
+
+ fig, ax = plt.subplots(figsize=(14, 7))
+
+ nccl_sorted = nccl_data.sort_values('In msg nelems')
+ msg_sizes = nccl_sorted['In msg nelems'].values
+
+ x = np.arange(len(msg_sizes))
+ width = 0.35
+
+ if len(bus_bw_cols) >= 2:
+ baseline_values = nccl_sorted[bus_bw_cols[0]].values
+ test_values = nccl_sorted[bus_bw_cols[1]].values
+
+ baseline_label = bus_bw_cols[0].replace('_bus bw (GB/s)_mean', '').replace('_', ' ').title()
+ test_label = bus_bw_cols[1].replace('_bus bw (GB/s)_mean', '').replace('_', ' ').title()
+
+ ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue')
+ ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange')
+ else:
+ ax.bar(x, nccl_sorted[bus_bw_cols[0]].values, alpha=0.8, color='steelblue')
+
+ ax.set_xlabel('Message Size (elements)', fontsize=12)
+ ax.set_ylabel('Bus Bandwidth (GB/s)', fontsize=12)
+ ax.set_title('Bus Bandwidth Comparison per Message Size', fontsize=14, fontweight='bold')
+ ax.set_xticks(x)
+ ax.set_xticklabels([f'{int(s):,}' for s in msg_sizes], rotation=45, ha='right')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot11_bus_bandwidth.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot11_bus_bandwidth.png")
+
+
+def plot12_nccl_summary(nccl_data, output_dir):
+ """Plot 12: NCCL Percentage Summary and Total Communication Latency."""
+ print("\nGenerating Plot 12: NCCL Summary (Percentage & Total Latency)")
+
+ if nccl_data is None or nccl_data.empty:
+ print(" No NCCL data available")
+ return
+
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
+
+ # Left: Percentage change summary for key metrics
+ pct_cols = [col for col in nccl_data.columns if 'percent_change' in col.lower()]
+ if pct_cols and len(pct_cols) > 0:
+ metrics = []
+ values = []
+
+ for col in pct_cols:
+ metric_name = col.replace('percent_change_', '').replace('_', ' ').title()
+ metrics.append(metric_name)
+ avg_value = nccl_data[col].mean()
+ values.append(avg_value)
+
+ if metrics:
+ colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values]
+ bars = ax1.barh(metrics, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
+
+ for bar, val in zip(bars, values):
+ x_pos = bar.get_width()
+ ax1.text(x_pos + (1 if x_pos > 0 else -1), bar.get_y() + bar.get_height()/2,
+ f'{val:.1f}%', ha='left' if x_pos > 0 else 'right', va='center', fontweight='bold')
+
+ ax1.axvline(x=0, color='black', linestyle='-', linewidth=1)
+ ax1.set_xlabel('Percent Change (%)', fontsize=12)
+ ax1.set_title('NCCL Metrics: Average Percent Change', fontsize=13, fontweight='bold')
+ ax1.grid(True, alpha=0.3)
+ else:
+ ax1.text(0.5, 0.5, 'No percentage change data available',
+ ha='center', va='center', transform=ax1.transAxes, fontsize=12)
+
+ # Right: Total communication latency comparison
+ total_latency_cols = [col for col in nccl_data.columns if ('Total comm latency' in col or 'total_latency' in col.lower()) and 'percent' not in col.lower()]
+
+ if total_latency_cols and len(total_latency_cols) >= 1:
+ labels = []
+ totals = []
+
+ for col in total_latency_cols[:2]:
+ label = col.replace('_Total comm latency (ms)', '').replace('_total_latency', '').replace('_', ' ').strip().title()
+ if not label:
+ label = 'Total'
+ total = nccl_data[col].sum()
+ labels.append(label)
+ totals.append(total)
+
+ if totals:
+ colors = ['steelblue', 'darkorange'] if len(totals) > 1 else ['steelblue']
+ bars = ax2.bar(labels, totals, color=colors[:len(totals)], alpha=0.8, edgecolor='black', linewidth=1)
+
+ for bar, val in zip(bars, totals):
+ ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height(),
+ f'{val:.1f} ms', ha='center', va='bottom', fontsize=12, fontweight='bold')
+
+ if len(totals) == 2 and totals[0] > 0:
+ improvement = (totals[0] - totals[1]) / totals[0] * 100
+ y_pos = max(totals) * 0.6
+ ax2.text(0.5, y_pos, f'Improvement: {improvement:.1f}%',
+ ha='center', fontsize=13, fontweight='bold',
+ bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.6, edgecolor='black'))
+
+ ax2.set_ylabel('Total Communication Latency (ms)', fontsize=12)
+ ax2.set_title('Total Communication Latency Comparison', fontsize=13, fontweight='bold')
+ ax2.grid(True, alpha=0.3, axis='y')
+ else:
+ ax2.text(0.5, 0.5, 'No total latency data available',
+ ha='center', va='center', transform=ax2.transAxes, fontsize=12)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot12_nccl_summary.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot12_nccl_summary.png")
+
+
+def generate_html_report(input_path, output_dir, baseline_label='Baseline', test_label='Test'):
+ """Generate HTML report with all plots embedded."""
+ print("\nGenerating HTML Report...")
+
+ plot_files = sorted(output_dir.glob('plot*.png'))
+
+    html_content = f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>RCCL Performance Analysis: {baseline_label} vs {test_label}</title>
+</head>
+<body>
+    <h1>RCCL Performance Analysis Report</h1>
+    <p>Comparing: {baseline_label} vs {test_label}</p>
+    <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
+
+    <h2>GPU Performance Metrics</h2>
+"""
+
+ plot_titles = {
+ 'plot1': 'Percentage Change Overview',
+ 'plot2': 'Absolute Time Comparison',
+ 'plot3': 'Performance Heatmap by Rank',
+ 'plot4': 'Total Execution Time by Rank',
+ 'plot5': 'Computation Time Across Ranks',
+ 'plot6': 'Communication Time Across Ranks',
+ 'plot7': 'Idle Time Across Ranks',
+ 'plot8': 'Percentage Time Difference (All Metrics)',
+ 'plot9': 'NCCL Communication Latency',
+ 'plot10': 'NCCL Algorithm Bandwidth',
+ 'plot11': 'NCCL Bus Bandwidth',
+ 'plot12': 'NCCL Summary'
+ }
+
+ # Add GPU plots first (plot1-plot8)
+ for plot_file in plot_files:
+ plot_num = plot_file.stem.split('_')[0]
+ if plot_num not in ['plot1', 'plot2', 'plot3', 'plot4', 'plot5', 'plot6', 'plot7', 'plot8']:
+ continue
+
+ title = plot_titles.get(plot_num, plot_file.stem.replace('_', ' ').title())
+
+ with open(plot_file, 'rb') as f:
+ img_data = base64.b64encode(f.read()).decode()
+
+        html_content += f"""
+    <div>
+        <h3>{title}</h3>
+        <img src="data:image/png;base64,{img_data}" alt="{title}" style="max-width: 100%;">
+    </div>
+"""
+
+ # Add NCCL section
+    html_content += "\n    <h2>NCCL/Collective Performance</h2>\n"
+
+ # Add NCCL plots (plot9-plot12)
+ for plot_file in plot_files:
+ plot_num = plot_file.stem.split('_')[0]
+ if plot_num not in ['plot9', 'plot10', 'plot11', 'plot12']:
+ continue
+
+ title = plot_titles.get(plot_num, plot_file.stem.replace('_', ' ').title())
+
+ with open(plot_file, 'rb') as f:
+ img_data = base64.b64encode(f.read()).decode()
+
+        html_content += f"""
+    <div>
+        <h3>{title}</h3>
+        <img src="data:image/png;base64,{img_data}" alt="{title}" style="max-width: 100%;">
+    </div>
+"""
+
+    html_content += """
+    <hr>
+    <p>Generated by TraceLens Analysis Pipeline</p>
+</body>
+</html>
+"""
+
+ html_path = output_dir / 'performance_analysis_report.html'
+ with open(html_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+
+ print(f" HTML report saved to: {html_path}")
+ return html_path
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Generate 12 analysis plots')
+ parser.add_argument('--input', required=True, help='Path to final_analysis_report.xlsx')
+ parser.add_argument('--output', default='plots', help='Output directory for plots')
+
+ args = parser.parse_args()
+
+ input_path = Path(args.input)
+ output_dir = Path(args.output)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ if not input_path.exists():
+ print(f"Error: Input file not found: {input_path}")
+ return 1
+
+ sheets = pd.read_excel(input_path, sheet_name=None)
+
+ print(f"\nGenerating 12 plots from {input_path.name}...")
+
+ # Extract baseline and test labels from Summary_Dashboard
+ baseline_label = 'Baseline'
+ test_label = 'Test'
+
+ summary_sheet = sheets.get('Summary_Dashboard')
+ if summary_sheet is not None:
+ columns = summary_sheet.columns.tolist()
+ if len(columns) >= 3:
+ baseline_label = columns[1]
+ test_label = columns[2]
+
+ plot1_percentage_change(summary_sheet, output_dir)
+ plot2_absolute_time_comparison(summary_sheet, output_dir)
+
+ # GPU by-rank data
+ byrank_sheet = None
+ for name in ['GPU_ByRank_Cmp', 'GPU_ByRank_Comparison', 'Comparison_By_Rank']:
+ if name in sheets:
+ byrank_sheet = sheets[name]
+ break
+
+ if byrank_sheet is not None:
+ plot3_performance_heatmap(byrank_sheet, output_dir)
+ plot4_total_execution_time(byrank_sheet, output_dir)
+ plot5_computation_time(byrank_sheet, output_dir)
+ plot6_communication_time(byrank_sheet, output_dir)
+ plot7_idle_time(byrank_sheet, output_dir)
+ plot8_percentage_time_difference(byrank_sheet, output_dir)
+
+ # NCCL data
+ nccl_sheet = None
+ for name in sheets:
+ if 'nccl' in name.lower() and ('cmp' in name.lower() or 'comparison' in name.lower()):
+ nccl_sheet = sheets[name]
+ break
+
+ # Try to get the actual NCCL data sheets (not just comparison)
+    if nccl_sheet is None or nccl_sheet.empty:
+ for name in sheets:
+ if 'nccl' in name.lower() and 'summary' in name.lower():
+ nccl_sheet = sheets[name]
+ break
+
+ if nccl_sheet is not None and not nccl_sheet.empty:
+ plot9_nccl_latency(nccl_sheet, output_dir)
+ plot10_algorithm_bandwidth(nccl_sheet, output_dir)
+ plot11_bus_bandwidth(nccl_sheet, output_dir)
+ plot12_nccl_summary(nccl_sheet, output_dir)
+
+ # Generate HTML report with configuration labels
+ html_path = generate_html_report(input_path, output_dir, baseline_label, test_label)
+
+ print(f"\n{'='*60}")
+ print(f"All 12 plots generated successfully!")
+ print(f"Output directory: {output_dir}")
+ print(f"\nHTML Report: {html_path}")
+ print(" - Open in browser to view all plots")
+ print(" - Print to PDF: Ctrl+P or Cmd+P")
+ print(f"{'='*60}")
+
+ return 0
+
+
+if __name__ == '__main__':
+ import sys
+ sys.exit(main())
diff --git a/scripts/tracelens_single_config/generate_merged_html.py b/scripts/tracelens_single_config/generate_merged_html.py
new file mode 100644
index 0000000..1bb89f6
--- /dev/null
+++ b/scripts/tracelens_single_config/generate_merged_html.py
@@ -0,0 +1,182 @@
+import os
+from pathlib import Path
+import base64
+import argparse
+
+
+def get_image_data(image_path):
+ try:
+ with open(image_path, "rb") as f:
+ return base64.b64encode(f.read()).decode("utf-8")
+ except Exception as e:
+ print(f"Error getting image data: {e}")
+ return None
+
+
+def create_final_html(plot_file_path, output_path):
+ html_header = """
+
+
+
+
+ Performance Analysis Report
+
+
+
+
+ Performance Analysis Report
+
+
+
+ Executive Summary
+
+ Comparison of GPU performance metrics
+ implementations across 8 ranks.
+ """
+
+ summary_section = f"""
+ 1. Overall GPU Metrics Comparison
+ """
+
+ summary_chart = get_image_data(plot_file_path / "improvement_chart.png")
+ if summary_chart is not None:
+        summary_section += f"""
+        <h3>Percentage Change Overview</h3>
+        <img src="data:image/png;base64,{summary_chart}" alt="Percentage Change Overview" style="max-width:100%;">
+        <p>Overall performance change across key GPU metrics. Negative values indicate improvement.</p>
+        """
+ absolute_time_chart = get_image_data(plot_file_path / "abs_time_comparison.png")
+ if absolute_time_chart is not None:
+        summary_section += f"""
+        <h3>Absolute Time Comparison</h3>
+        <img src="data:image/png;base64,{absolute_time_chart}" alt="Absolute Time Comparison" style="max-width:100%;">
+        <p>Side-by-side comparison of absolute execution times for all GPU metrics.</p>
+        """
+
+    cross_rank_comparison_section = f"""
+    <h2>2. Cross-Rank Performance Comparison</h2>
+    """
+    gpu_time_heatmap = get_image_data(plot_file_path / "gpu_time_heatmap.png")
+    if gpu_time_heatmap is not None:
+        cross_rank_comparison_section += f"""
+        <h3>Performance Heatmap by Rank</h3>
+        <img src="data:image/png;base64,{gpu_time_heatmap}" alt="Performance Heatmap by Rank" style="max-width:100%;">
+        <p>Comprehensive heatmap showing percent change for all metrics across all ranks. Green indicates better performance (positive % change).</p>
+        """
+
+ item_list = {
+ "total_time": {
+ "name": "Total Time",
+ "description": "Total execution time comparison across all ranks, showing end-to-end performance characteristics.",
+ "chart_path": plot_file_path / "total_time_by_rank.png",
+ },
+ "computation_time": {
+ "name": "Computation Time",
+ "description": "Pure computation time excluding communication overhead, analyzed per rank.",
+ "chart_path": plot_file_path / "computation_time_by_rank.png",
+ },
+ "total_comm_time": {
+ "name": "Communication Time",
+ "description": "Total time spent in collective communication operations across ranks.",
+ "chart_path": plot_file_path / "total_comm_time_by_rank.png",
+ },
+ "idle_time": {
+ "name": "Idle Time",
+ "description": "GPU idle time comparison showing resource utilization efficiency per rank.",
+ "chart_path": plot_file_path / "idle_time_by_rank.png",
+ },
+ "gpu_time_change_percentage_summaryby_rank": {
+ "name": "Detailed Percentage Change by Metric",
+ "description": "Detailed breakdown of percent change for each metric type across all ranks.",
+ "chart_path": plot_file_path
+ / "gpu_time_change_percentage_summaryby_rank.png",
+ },
+ }
+ for item in item_list.keys():
+ cross_rank_comparison_chart = get_image_data(item_list[item]["chart_path"])
+ if cross_rank_comparison_chart is not None:
+            cross_rank_comparison_section += f"""
+            <h3>{item_list[item]['name']}</h3>
+            <img src="data:image/png;base64,{cross_rank_comparison_chart}" alt="{item_list[item]['name']}" style="max-width:100%;">
+            <p>{item_list[item]['description']}</p>
+            """
+
+ summary_section += cross_rank_comparison_section
+
+    nccl_charts_section = f"""
+    <h2>3. NCCL Collective Operations Analysis</h2>
+    """
+ nccl_chart_item_list = {
+ "NCCL Communication Latency": "Mean communication latency for NCCL allreduce operations across different message sizes",
+ "NCCL Algorithm Bandwidth": "Algorithm bandwidth achieved for different message sizes in NCCL collective operations.",
+ "NCCL Bus Bandwidth": "Bus bandwidth utilization across NCCL operations and message sizes.",
+ "NCCL Performance Percentage Change": "Percent change in communication latency and bandwidth metrics for each message sizec configuration",
+ "NCCL Total Communication Latency": "Aggregate communication latency summed across all operations for each message size.",
+ }
+ for item in nccl_chart_item_list.keys():
+ nccl_image_data = get_image_data(
+ plot_file_path / f'{item.replace(" ", "_")}_comparison.png'
+ )
+ if nccl_image_data is not None:
+            nccl_charts_section += f"""
+            <h3>{item}</h3>
+            <img src="data:image/png;base64,{nccl_image_data}" alt="{item}" style="max-width:100%;">
+            <p>{nccl_chart_item_list[item]}</p>
+            """
+
+    summary_section += nccl_charts_section
+
+    footer_section = """
+</body>
+</html>
+    """
+    summary_section += footer_section
+
+ final_html = html_header + summary_section
+ with open(output_path, "w") as f:
+ f.write(final_html)
+ print(f"Final HTML file created at: {output_path}")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Create a final HTML file for the analysis report."
+ )
+ parser.add_argument(
+ "-p",
+ "--plot-files-directory",
+ type=Path,
+ required=True,
+ help="Path to the plot files direcotry.",
+ )
+ parser.add_argument(
+ "-o", "--output-html", type=None, default=None, help="Path to the output file."
+ )
+ args = parser.parse_args()
+ output_path = (
+ args.output_html
+ if args.output_html
+ else args.plot_files_directory.parent / "final_analysis_report.html"
+ )
+ create_final_html(args.plot_files_directory, output_path)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/tracelens_single_config/merge_tracelens_analysis.py b/scripts/tracelens_single_config/merge_tracelens_analysis.py
new file mode 100644
index 0000000..fade0d5
--- /dev/null
+++ b/scripts/tracelens_single_config/merge_tracelens_analysis.py
@@ -0,0 +1,465 @@
+import argparse
+from pathlib import Path
+import subprocess
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def run_command(cmd, description):
+ """Execute a command and handle errors."""
+ print(f"\n{'='*80}")
+ print(f"{description}")
+ print(f"{'='*80}")
+ print(f"Command: {' '.join(cmd)}")
+
+ result = subprocess.run(cmd, capture_output=True, text=True)
+
+ if result.returncode != 0:
+ print(f"Error: {description} failed!")
+ print(f"Stderr: {result.stderr}")
+ return False
+
+ print(result.stdout)
+ return True
+
+
+def plot_nccl_data_per_msg(df, labels, output_dir: Path):
+ """
+ Plot comm_latency_mean for each message size from NCCL data.
+ """
+ output_path = Path(output_dir)
+ output_path.mkdir(parents=True, exist_ok=True)
+
+ # Get unique index values (Collective_MsgSize)
+ indices = df["index"].values
+
+ x = np.arange(len(indices))
+ width = 0.8 / len(labels)
+ # Vibrant color palette
+ vibrant_colors = [
+ "#E63946",
+ "#2A9D8F",
+ "#E9C46A",
+ "#264653",
+ "#F4A261",
+ "#8338EC",
+ "#06D6A0",
+ "#FF006E",
+ ]
+
+ plot_items = {
+ "NCCL Communication Latency": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Communication Latency (ms)",
+ "y_col": "comm_latency_mean",
+ },
+ "NCCL Algorithm Bandwidth": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Algorithm Bandwidth (GB/s)",
+ "y_col": "algo bw (GB/s)_mean",
+ },
+ "NCCL Bus Bandwidth": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Bus Bandwidth (GB/s)",
+ "y_col": "bus bw (GB/s)_mean",
+ },
+ "NCCL Total Communication Latency": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Total Communication Latency (ms)",
+ "y_col": "Total comm latency (ms)",
+ },
+ }
+
+ for plot_item in plot_items.keys():
+ fig, ax = plt.subplots(figsize=(14, 7))
+ for i, label in enumerate(labels):
+ col_name = f"{plot_items[plot_item]['y_col']}_{label}"
+ print(f"Plotting {col_name}")
+ if col_name in df.columns:
+ values = df[col_name].values
+ color = vibrant_colors[i % len(vibrant_colors)]
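+                # Offset each configuration's bars so the group is centered on the x tick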
+ offset = (i - len(labels) / 2 + 0.5) * width
+ ax.bar(
+ x + offset,
+ values,
+ width,
+ label=label,
+ color=color,
+ alpha=0.85,
+ edgecolor="black",
+ linewidth=0.5,
+ )
+ else:
+ print(f"Column {col_name} not found in dataframe")
+
+ ax.set_xlabel(plot_items[plot_item]["x_label"], fontsize=12, fontweight="bold")
+ ax.set_ylabel(plot_items[plot_item]["y_label"], fontsize=12, fontweight="bold")
+ ax.set_title(f"{plot_item} per Message Size", fontsize=14, fontweight="bold")
+ ax.set_xticks(x)
+ ax.set_xticklabels(indices, rotation=45, ha="right", fontsize=9)
+ ax.legend(loc="upper left")
+ ax.grid(True, alpha=0.3, axis="y")
+
+ plt.tight_layout()
+ output_file = output_path / f'{plot_item.replace(" ", "_")}_comparison.png'
+ plt.savefig(output_file, dpi=150, bbox_inches="tight")
+ plt.close()
+ print(f"Saved: {output_file}")
+ print("Completed plotting NCCL data per message size")
+
+
+def plot_all_types_per_rank(df, labels, output_dir: Path):
+ """
+ Plot data for every rank, where every unique type is a different file.
+
+ Parameters:
+ -----------
+ df : DataFrame
+ Merged gpu_time_per_rank_df with columns like 'type', 'rank0_label1', 'rank0_label2', etc.
+ labels : list
+ List of configuration labels (e.g., ['32cu_512threads', '37cu_384threads'])
+    output_dir : Path
+        Directory to save plots
+ """
+ output_path = Path(output_dir)
+ output_path.mkdir(parents=True, exist_ok=True)
+
+ unique_types = df["type"].unique()
+
+ # Find rank columns (extract rank numbers from column names)
+ # Columns are like: rank0_32cu_512threads, rank1_32cu_512threads, etc.
+ sample_label = labels[0]
+ rank_cols = [
+ col for col in df.columns if col.endswith(f"_{sample_label}") and col != "type"
+ ]
+ ranks = [col.replace(f"_{sample_label}", "") for col in rank_cols]
+
+ print(f"Found ranks: {ranks}")
+ print(f"Found types: {unique_types}")
+
+ for metric_type in unique_types:
+ type_data = df[df["type"] == metric_type]
+
+ if type_data.empty:
+ continue
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ x = np.arange(len(ranks))
+ # Vibrant color palette
+ vibrant_colors = [
+ "#E63946",
+ "#2A9D8F",
+ "#E9C46A",
+ "#264653",
+ "#F4A261",
+ "#8338EC",
+ "#06D6A0",
+ "#FF006E",
+ ]
+ markers = ["o", "s", "^", "D", "v", "p", "h", "*"]
+
+ for i, label in enumerate(labels):
+ values = []
+ for rank in ranks:
+ col_name = f"{rank}_{label}"
+ if col_name in type_data.columns:
+ val = type_data[col_name].values[0]
+ values.append(val if pd.notna(val) else 0)
+ else:
+ values.append(0)
+
+ color = vibrant_colors[i % len(vibrant_colors)]
+ marker = markers[i % len(markers)]
+ ax.plot(
+ x,
+ values,
+ label=label,
+ color=color,
+ marker=marker,
+ markersize=8,
+ linewidth=2,
+ alpha=0.85,
+ )
+
+ ax.set_xlabel("Rank", fontsize=12, fontweight="bold")
+ ax.set_ylabel("Time (ms)", fontsize=12, fontweight="bold")
+ ax.set_title(f"{metric_type} - Time per Rank", fontsize=14, fontweight="bold")
+ ax.set_xticks(x)
+ ax.set_xticklabels(ranks)
+ ax.legend(loc="upper right")
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+
+ # Save with sanitized filename
+ safe_name = metric_type.replace("/", "_").replace(" ", "_").replace(":", "_")
+ output_file = output_path / f"{safe_name}_by_rank.png"
+ plt.savefig(output_file, dpi=150, bbox_inches="tight")
+ plt.close()
+ print(f"Saved: {output_file}")
+
+
+def plot_gpu_time_summary(df, labels, output_dir: Path):
+
+ types = df["type"].values
+ values = []
+
+ for label in labels:
+ values.append(df[f"time ms_{label}"].values)
+
+ fig, ax = plt.subplots(figsize=(10, 5))
+
+ x = np.arange(len(types))
+ width = 0.15
+ for i, value in enumerate(values):
+ offset = (i - len(labels) / 2 + 0.5) * width
+ bars = ax.bar(x + offset, value, width, label=labels[i])
+
+ ax.set_xlabel("Type")
+ ax.set_ylabel("Time (ms)")
+ ax.set_title("GPU Time Summary by Rank")
+ ax.set_xticks(x)
+ ax.set_xticklabels(types, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis="y")
+
+ plt.tight_layout()
+ plt.savefig(output_dir / "abs_time_comparison.png")
+ plt.close()
+
+
+"""
+def plot_improvement_chart(df, output_path):
+ fig, ax = plt.subplots(figsize=(10, 6))
+
+ # Color bars based on positive/negative values
+ colors = ['#2ecc71' if val > 0 else '#e74c3c' for val in df['Improvement (%)']]
+
+ bars = ax.barh(df['Metric'], df['Improvement (%)'], color=colors)
+ ax.yaxis.grid(True, linestyle='--', alpha=0.7, color='gray')
+ ax.set_axisbelow(True)
+
+ ax.spines['top'].set_visible(False)
+ ax.spines['right'].set_visible(False)
+ ax.spines['bottom'].set_visible(False)
+ ax.spines['left'].set_visible(False)
+
+ # Customize the chart
+ ax.set_ylabel('Metric', fontsize=12)
+ ax.set_xlabel('Change (%)', fontsize=12)
+ ax.set_title(
+ 'GPU Metrics Percentage Change (Test vs Baseline)\n(Positive = Test is better)',
+ fontsize=14, fontweight='bold'
+ )
+
+ plt.tight_layout()
+ plt.savefig(output_path / 'improvement_chart.png', dpi=150)
+ plt.close()
+
+"""
+
+
+def plot_gpu_time_percentage_change(df, labels, output_dir: Path):
+ """
+ Create separate horizontal bar charts for each label comparing against baseline (labels[0]).
+ """
+ types = df["type"].values
+ base_label = labels[0]
+
+ # Vibrant color palette
+ vibrant_colors = [
+ "#E63946",
+ "#2A9D8F",
+ "#E9C46A",
+ "#264653",
+ "#F4A261",
+ "#8338EC",
+ "#06D6A0",
+ "#FF006E",
+ ]
+
+    n_cmp = max(1, len(labels) - 1)
+    fig, axes = plt.subplots(
+        nrows=1, ncols=n_cmp, figsize=(10 * n_cmp, max(8, len(types) * 0.5)), squeeze=False
+    )
+    for i, label in enumerate(labels[1:]):
+        ax = axes[0, i]
+ col_name = f"percentage_change_{label}"
+ if col_name not in df.columns:
+ print(f"Column {col_name} not found, skipping")
+ continue
+
+ values = df[col_name].values
+
+
+ # Color bars based on positive/negative values (green = improvement, red = regression)
+ colors = ["#2ecc71" if val < 0 else "#e74c3c" for val in values]
+
+ # Horizontal bar chart
+ y = np.arange(len(types))
+ bars = ax.barh(
+ y, values, color=colors, alpha=0.85, edgecolor="black", linewidth=0.5
+ )
+
+ # Add vertical line at 0
+ ax.axvline(x=0, color="black", linestyle="-", linewidth=1)
+
+ ax.set_yticks(y)
+ ax.set_yticklabels(types, fontsize=10)
+ ax.set_xlabel("Percentage Change (%)", fontsize=12, fontweight="bold")
+ ax.set_ylabel("Type", fontsize=12, fontweight="bold")
+ ax.set_title(
+ f"GPU Time Percentage Change: {label} vs {base_label}\n(Negative = Improvement)",
+ fontsize=14,
+ fontweight="bold",
+ )
+ ax.grid(True, alpha=0.3, axis="x")
+
+ plt.tight_layout()
+
+ output_file = output_dir / f"improvement_chart.png"
+ plt.savefig(output_file, dpi=150, bbox_inches="tight")
+ plt.close()
+ print(f"Saved: {output_file}")
+
+
+def calculate_gpu_time_percentage_change(df, labels):
+ base_label = labels[0]
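+    # Signed percentage change vs. the first (baseline) label: (label - baseline) / baseline * 100; negative means faster than baseline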
+ for label in labels[1:]:
+ df[f"percentage_change_{label}"] = (
+ (df[f"time ms_{label}"] - df[f"time ms_{base_label}"])
+ / df[f"time ms_{base_label}"]
+ * 100
+ )
+ return df
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--inputs",
+ type=str,
+ nargs="+",
+ required=True,
+ help="List of directories containing gpu_timeline_summary_mean.xlsx",
+ )
+
+ parser.add_argument(
+ "--output", type=Path, default="./output.xlsx", help="Output xls file name"
+ )
+ args = parser.parse_args()
+
+ labels = []
+ summary_dfs = []
+ gpu_time_per_rank_dfs = []
+ nccl_dfs = []
+
+ for directory in args.inputs:
+ dir_path = Path(directory)
+ label = dir_path.stem
+
+ if not dir_path.exists():
+ print(f"Directory not found: {dir_path}")
+ continue
+
+ input_excel_file = (
+ dir_path / "tracelens_analysis" / "gpu_timeline_summary_mean.xlsx"
+ )
+ nccl_excel_file = (
+ dir_path
+ / "tracelens_analysis"
+ / "collective_reports"
+ / "collective_all_ranks.xlsx"
+ )
+ if not input_excel_file.exists():
+ print(f"Excel file not found: {input_excel_file}")
+ continue
+
+ labels.append(label)
+
+ # Read and rename columns with label suffix
+ summary = pd.read_excel(input_excel_file, sheet_name="Summary")
+ gpu_time = pd.read_excel(input_excel_file, sheet_name="Per_Rank_Time_ms")
+
+ # Rename non-key columns with label suffix
+ summary = summary.rename(
+ columns={col: f"{col}_{label}" for col in summary.columns if col != "type"}
+ )
+ gpu_time = gpu_time.rename(
+ columns={col: f"{col}_{label}" for col in gpu_time.columns if col != "type"}
+ )
+
+ summary_dfs.append(summary)
+ gpu_time_per_rank_dfs.append(gpu_time)
+ print(f"Loaded: {label}")
+
+ if nccl_excel_file.exists():
+ nccl_df = pd.read_excel(
+ nccl_excel_file, sheet_name="nccl_summary_implicit_sync"
+ )
+
+ # Create index column by appending "Collective name" and "In msg nelems"
+ nccl_df["index"] = (
+ nccl_df["Collective name"].astype(str)
+ + "_"
+ + nccl_df["In msg nelems"].astype(str)
+ )
+
+ # Rename non-key columns with label suffix (exclude 'index' as it's the merge key)
+ nccl_df = nccl_df.rename(
+ columns={
+ col: f"{col}_{label}" for col in nccl_df.columns if col != "index"
+ }
+ )
+ nccl_dfs.append(nccl_df)
+ print(f"Loaded: {label} NCCL")
+ else:
+ print(f"NCCL file not found: {nccl_excel_file}")
+
+    # Merge all DataFrames (GPU sheets on 'type', NCCL sheets on 'index')
+    summary_df = summary_dfs[0]
+    gpu_time_per_rank_df = gpu_time_per_rank_dfs[0]
+    nccl_df = nccl_dfs[0]
+
+    for i in range(1, len(summary_dfs)):
+        summary_df = pd.merge(summary_df, summary_dfs[i], on="type", how="outer")
+        gpu_time_per_rank_df = pd.merge(
+            gpu_time_per_rank_df, gpu_time_per_rank_dfs[i], on="type", how="outer"
+        )
+
+    # NCCL sheets may be missing for some inputs, so merge them over their own list
+    for i in range(1, len(nccl_dfs)):
+        nccl_df = pd.merge(nccl_df, nccl_dfs[i], on="index", how="outer")
+
+    summary_df = calculate_gpu_time_percentage_change(summary_df, labels)
+
+ with pd.ExcelWriter(args.output, engine="openpyxl") as writer:
+ summary_df.to_excel(writer, sheet_name="Summary", index=False)
+ gpu_time_per_rank_df.to_excel(
+ writer, sheet_name="Per_Rank_Time_ms", index=False
+ )
+ nccl_df.to_excel(writer, sheet_name="NCCL_Summary", index=False)
+
+ output_dir = Path(args.output).parent / "plots"
+ output_dir.mkdir(parents=True, exist_ok=True)
+ plot_gpu_time_percentage_change(summary_df, labels, output_dir)
+ plot_gpu_time_summary(summary_df, labels, output_dir)
+ plot_all_types_per_rank(gpu_time_per_rank_df, labels, output_dir)
+ plot_nccl_data_per_msg(nccl_df, labels, output_dir)
+
+ html_script_path = Path(__file__).parent / "generate_merged_html.py"
+ cmd = [
+ "python3",
+ str(html_script_path),
+ "--plot-files-directory",
+ str(output_dir),
+ "--output-html",
+ str(args.output.parent / "final_analysis_report.html"),
+ ]
+ if run_command(cmd, "Creating final HTML"):
+ print(
+ f"Final HTML file created at: {args.output.parent / 'final_analysis_report.html'}"
+ )
+ else:
+ print("Failed to create final HTML file")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/tracelens_single_config/process_gpu_timeline.py b/scripts/tracelens_single_config/process_gpu_timeline.py
new file mode 100755
index 0000000..7fd00b7
--- /dev/null
+++ b/scripts/tracelens_single_config/process_gpu_timeline.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import argparse
+from pathlib import Path
+
+
+def geometric_mean(values):
+ values = np.array(values)
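+    # Replace exact zeros with a tiny epsilon so np.log() does not produce -inf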
+ values = np.where(values == 0, 1e-10, values)
+ return np.exp(np.mean(np.log(values)))
+
+
+def process_gpu_timeline(reports_dir, use_geo_mean=False):
+ reports_path = Path(reports_dir)
+
+ if not reports_path.exists():
+ print(f"Error: Directory not found: {reports_dir}")
+ return 1
+
+ print(f"Processing GPU timeline from: {reports_dir}")
+ print(f"Aggregation: {'Geometric Mean' if use_geo_mean else 'Arithmetic Mean'}")
+
+ perf_files = sorted(reports_path.glob('perf_rank*.xlsx'))
+
+ if not perf_files:
+ print("Error: No perf_rank*.xlsx files found")
+ return 1
+
+ print(f"Found {len(perf_files)} rank files")
+
+ rank_data = []
+ for file_path in perf_files:
+ rank_num = int(file_path.stem.replace('perf_rank', ''))
+ try:
+ df = pd.read_excel(file_path, sheet_name='gpu_timeline')
+ df['rank'] = rank_num
+ rank_data.append(df)
+ print(f" Rank {rank_num}: OK")
+ except Exception as e:
+ print(f" Rank {rank_num}: Error - {e}")
+
+ if not rank_data:
+ print("Error: No valid data loaded")
+ return 1
+
+ combined = pd.concat(rank_data, ignore_index=True)
+
+ agg_func = geometric_mean if use_geo_mean else 'mean'
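+    # pandas accepts either a callable (geometric_mean) or the built-in 'mean' aggregator here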
+ aggregated = combined.groupby('type').agg({
+ 'time ms': agg_func,
+ 'percent': agg_func
+ }).reset_index()
+
+ aggregated['num_ranks'] = len(perf_files)
+
+ method_suffix = 'geomean' if use_geo_mean else 'mean'
+ output_path = reports_path.parent / f'gpu_timeline_summary_{method_suffix}.xlsx'
+
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+ aggregated.to_excel(writer, sheet_name='Summary', index=False)
+
+ combined_sorted = combined.sort_values(['rank', 'type'])
+ combined_sorted.to_excel(writer, sheet_name='All_Ranks_Combined', index=False)
+
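+        # Each (type, rank) pair is expected to appear once, so aggfunc='first' just reshapes into a type x rank matrix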
+ per_rank = combined.pivot_table(
+ values='time ms',
+ index='type',
+ columns='rank',
+ aggfunc='first'
+ )
+ per_rank.to_excel(writer, sheet_name='Per_Rank_Time_ms')
+
+ per_rank_pct = combined.pivot_table(
+ values='percent',
+ index='type',
+ columns='rank',
+ aggfunc='first'
+ )
+ per_rank_pct.to_excel(writer, sheet_name='Per_Rank_Percent')
+
+ print(f"\nSaved: {output_path}")
+ print("\nSummary:")
+ print(aggregated.to_string(index=False))
+
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Aggregate GPU timeline across ranks')
+ parser.add_argument('--reports-dir', required=True, help='Path to individual_reports directory')
+ parser.add_argument('--geo-mean', action='store_true', help='Use geometric mean')
+
+ args = parser.parse_args()
+
+ return process_gpu_timeline(args.reports_dir, args.geo_mean)
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/run_full_analysis.py b/scripts/tracelens_single_config/run_full_analysis.py
new file mode 100755
index 0000000..ec38581
--- /dev/null
+++ b/scripts/tracelens_single_config/run_full_analysis.py
@@ -0,0 +1,392 @@
+#!/usr/bin/env python3
+"""
+Master script for complete TraceLens analysis pipeline.
+Runs analysis on baseline and test traces, then performs all comparisons.
+"""
+import argparse
+import subprocess
+import os
+import sys
+from pathlib import Path
+
+
+def run_command(cmd, description):
+ """Execute a command and handle errors."""
+ print(f"\n{'='*80}")
+ print(f"{description}")
+ print(f"{'='*80}")
+ print(f"Command: {' '.join(cmd)}")
+
+ result = subprocess.run(cmd, capture_output=True, text=True)
+
+ if result.returncode != 0:
+ print(f"Error: {description} failed!")
+ print(f"Stderr: {result.stderr}")
+ return False
+
+ print(result.stdout)
+ return True
+
+
+def run_tracelens_analysis(trace_dir, output_name, individual_only=False, collective_only=False):
+ """Run TraceLens analysis on a single trace directory."""
+ print(f"\nAnalyzing: {trace_dir}")
+
+ # Build command
+ script_path = Path(__file__).parent / "run_tracelens_single_config.sh"
+ cmd = ["bash", str(script_path), trace_dir]
+
+ if individual_only:
+ cmd.append("--individual-only")
+ elif collective_only:
+ cmd.append("--collective-only")
+
+ return run_command(cmd, f"TraceLens analysis for {output_name}")
+
+
+def process_gpu_timeline(reports_dir):
+ """Process GPU timeline from individual reports."""
+ script_path = Path(__file__).parent / "process_gpu_timeline.py"
+ cmd = ["python3", str(script_path), "--reports-dir", reports_dir]
+
+ return run_command(cmd, "Processing GPU timeline")
+
+
+def combine_reports(baseline_file, test_file, output_file):
+ """Combine baseline and test reports."""
+ script_path = Path(__file__).parent / "combine_reports.py"
+ cmd = ["python3", str(script_path),
+ "--baseline", baseline_file,
+ "--test", test_file,
+ "--output", output_file]
+
+ return run_command(cmd, f"Combining reports to {output_file}")
+
+
+def add_comparison_sheets(input_file, output_file, baseline_label=None, test_label=None):
+ """Add comparison sheets for GPU timeline."""
+ script_path = Path(__file__).parent / "add_comparison_sheets.py"
+ cmd = ["python3", str(script_path),
+ "--input", input_file,
+ "--output", output_file]
+ if baseline_label:
+ cmd.extend(["--baseline-label", baseline_label])
+ if test_label:
+ cmd.extend(["--test-label", test_label])
+
+ return run_command(cmd, "Adding GPU timeline comparison sheets")
+
+
+def add_collective_comparison(input_file, output_file, baseline_label=None, test_label=None):
+ """Add comparison sheets for collective operations."""
+ script_path = Path(__file__).parent / "add_collective_comparison.py"
+ cmd = ["python3", str(script_path),
+ "--input", input_file,
+ "--output", output_file]
+ if baseline_label:
+ cmd.extend(["--baseline-label", baseline_label])
+ if test_label:
+ cmd.extend(["--test-label", test_label])
+
+ return run_command(cmd, "Adding collective comparison sheets")
+
+
+def create_final_report(gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file, baseline_label=None, test_label=None):
+ """Create comprehensive final report with all data."""
+ script_path = Path(__file__).parent / "create_final_report.py"
+ cmd = ["python3", str(script_path),
+ "--gpu-combined", gpu_combined,
+ "--gpu-comparison", gpu_comparison,
+ "--coll-combined", coll_combined,
+ "--coll-comparison", coll_comparison,
+ "--output", output_file]
+
+ if baseline_label:
+ cmd.extend(["--baseline-label", baseline_label])
+ if test_label:
+ cmd.extend(["--test-label", test_label])
+
+ return run_command(cmd, "Creating comprehensive final report")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Complete TraceLens analysis pipeline with comparisons',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ # Full analysis with everything including final report
+ python run_full_analysis.py \\
+ --baseline /path/to/baseline/traces \\
+ --test /path/to/test/traces \\
+ --output /path/to/output \\
+ --all
+
+ # Only GPU timeline comparison
+ python run_full_analysis.py \\
+ --baseline /path/to/baseline \\
+ --test /path/to/test \\
+ --output /path/to/output \\
+ --gpu-timeline
+
+ # Create final report (skip TraceLens if already done)
+ python run_full_analysis.py \\
+ --baseline /path/to/baseline \\
+ --test /path/to/test \\
+ --output /path/to/output \\
+ --gpu-timeline --collective --final-report \\
+ --skip-tracelens
+
+ """
+ )
+
+ # Required arguments
+ parser.add_argument('--baseline', required=True,
+ help='Path to baseline trace directory')
+ parser.add_argument('--test', required=True,
+ help='Path to test trace directory')
+ parser.add_argument('--output', required=True,
+ help='Output directory for comparison results')
+
+ # Analysis options
+ parser.add_argument('--skip-tracelens', action='store_true',
+ help='Skip TraceLens report generation (if already done)')
+ parser.add_argument('--individual-only', action='store_true',
+ help='Generate only individual reports')
+ parser.add_argument('--collective-only', action='store_true',
+ help='Generate only collective reports')
+
+ # Comparison options
+ parser.add_argument('--gpu-timeline', action='store_true',
+ help='Perform GPU timeline comparison')
+ parser.add_argument('--collective', action='store_true',
+ help='Perform collective/NCCL comparison')
+ parser.add_argument('--final-report', action='store_true',
+ help='Create comprehensive final report with tables and hidden raw data')
+ parser.add_argument('--generate-plots', action='store_true',
+ help='Generate visualization plots and HTML report from final report')
+ parser.add_argument('--all', action='store_true',
+ help='Perform all analyses and comparisons including final report, plots, and HTML report')
+
+ args = parser.parse_args()
+
+ # Handle --all flag
+ if args.all:
+ args.gpu_timeline = True
+ args.collective = True
+ args.final_report = True
+ args.generate_plots = True
+
+ # Validate inputs
+ baseline_path = Path(args.baseline)
+ test_path = Path(args.test)
+ output_path = Path(args.output)
+
+ if not baseline_path.exists():
+ print(f"Error: Baseline path not found: {args.baseline}")
+ return 1
+
+ if not test_path.exists():
+ print(f"Error: Test path not found: {args.test}")
+ return 1
+
+ # Create output directory
+ output_path.mkdir(parents=True, exist_ok=True)
+
+ print("\n" + "="*80)
+ print("TRACELENS FULL ANALYSIS PIPELINE")
+ print("="*80)
+ print(f"Baseline: {args.baseline}")
+ print(f"Test: {args.test}")
+ print(f"Output: {args.output}")
+ print(f"Options:")
+ print(f" Skip TraceLens: {args.skip_tracelens}")
+ print(f" GPU timeline: {args.gpu_timeline}")
+ print(f" Collective: {args.collective}")
+ print(f" Final report: {args.final_report}")
+
+ # Step 1: Run TraceLens analysis on both directories
+ if not args.skip_tracelens:
+ print("\n" + "="*80)
+ print("STEP 1: Running TraceLens Analysis")
+ print("="*80)
+
+ if not run_tracelens_analysis(args.baseline, "baseline",
+ args.individual_only, args.collective_only):
+ return 1
+
+ if not run_tracelens_analysis(args.test, "test",
+ args.individual_only, args.collective_only):
+ return 1
+ else:
+ print("\nSkipping TraceLens report generation (--skip-tracelens flag)")
+
+ # Determine analysis directories
+ baseline_analysis = baseline_path / "tracelens_analysis"
+ test_analysis = test_path / "tracelens_analysis"
+
+ if not baseline_analysis.exists():
+ print(f"Error: Baseline analysis not found: {baseline_analysis}")
+ print("Run without --skip-tracelens flag first")
+ return 1
+
+ if not test_analysis.exists():
+ print(f"Error: Test analysis not found: {test_analysis}")
+ print("Run without --skip-tracelens flag first")
+ return 1
+
+ # Extract config labels from paths
+ baseline_label = baseline_path.name # e.g., "56cu_256threads"
+ test_label = test_path.name # e.g., "37cu_384threads"
+
+ # Step 2: GPU Timeline Comparison
+ if args.gpu_timeline:
+ print("\n" + "="*80)
+ print("STEP 2: GPU Timeline Comparison")
+ print(f" Baseline: {baseline_label}")
+ print(f" Test: {test_label}")
+ print("="*80)
+
+ # Process GPU timelines
+ baseline_reports = baseline_analysis / "individual_reports"
+ test_reports = test_analysis / "individual_reports"
+
+ if not baseline_reports.exists() or not test_reports.exists():
+ print("Error: Individual reports not found. Run without --individual-only flag")
+ return 1
+
+ print(f"\nProcessing baseline GPU timeline ({baseline_label})...")
+ if not process_gpu_timeline(str(baseline_reports)):
+ return 1
+
+ print(f"\nProcessing test GPU timeline ({test_label})...")
+ if not process_gpu_timeline(str(test_reports)):
+ return 1
+
+ # Combine GPU timeline summaries
+ baseline_gpu = baseline_analysis / "gpu_timeline_summary_mean.xlsx"
+ test_gpu = test_analysis / "gpu_timeline_summary_mean.xlsx"
+ combined_gpu = output_path / "gpu_timeline_combined.xlsx"
+
+ if not combine_reports(str(baseline_gpu), str(test_gpu), str(combined_gpu)):
+ return 1
+
+ # Add comparison sheets
+ gpu_comparison = output_path / "gpu_timeline_comparison.xlsx"
+ if not add_comparison_sheets(str(combined_gpu), str(gpu_comparison), baseline_label, test_label):
+ return 1
+
+ print(f"\nGPU timeline comparison saved to: {gpu_comparison}")
+
+ # Step 3: Collective Comparison
+ if args.collective:
+ print("\n" + "="*80)
+ print("STEP 3: Collective/NCCL Comparison")
+ print(f" Baseline: {baseline_label}")
+ print(f" Test: {test_label}")
+ print("="*80)
+
+ baseline_collective = baseline_analysis / "collective_reports" / "collective_all_ranks.xlsx"
+ test_collective = test_analysis / "collective_reports" / "collective_all_ranks.xlsx"
+
+ if not baseline_collective.exists() or not test_collective.exists():
+ print("Error: Collective reports not found. Run without --collective-only flag")
+ return 1
+
+ # Combine collective reports
+ combined_collective = output_path / "collective_combined.xlsx"
+ if not combine_reports(str(baseline_collective), str(test_collective),
+ str(combined_collective)):
+ return 1
+
+ # Add collective comparison
+ collective_comparison = output_path / "collective_comparison.xlsx"
+ if not add_collective_comparison(str(combined_collective),
+ str(collective_comparison), baseline_label, test_label):
+ return 1
+
+ print(f"\nCollective comparison saved to: {collective_comparison}")
+
+ # Step 4: Create final comprehensive report
+ if args.final_report and args.gpu_timeline and args.collective:
+ print("\n" + "="*80)
+ print("STEP 4: Creating Final Comprehensive Report")
+ print("="*80)
+
+ gpu_combined = output_path / "gpu_timeline_combined.xlsx"
+ gpu_comparison = output_path / "gpu_timeline_comparison.xlsx"
+ collective_combined = output_path / "collective_combined.xlsx"
+ collective_comparison = output_path / "collective_comparison.xlsx"
+ final_report = output_path / "final_analysis_report.xlsx"
+
+ if not create_final_report(str(gpu_combined), str(gpu_comparison),
+ str(collective_combined), str(collective_comparison),
+ str(final_report), baseline_label, test_label):
+ return 1
+
+ print(f"\nFinal comprehensive report saved to: {final_report}")
+ print(" - Summary Dashboard as first sheet")
+ print(" - All comparison sheets visible")
+ print(" - Raw data sheets hidden (can be unhidden in Excel)")
+ print(" - All data formatted as Excel tables with filters")
+ print(" - Color coding applied (green=better, red=worse)")
+
+ # Step 5: Generate visualization plots
+ if args.generate_plots and args.final_report:
+ print("\n" + "="*80)
+ print("STEP 5: Generating Visualization Plots")
+ print("="*80)
+
+ final_report = output_path / "final_analysis_report.xlsx"
+ plots_dir = output_path / "plots"
+
+ if final_report.exists():
+ script_path = Path(__file__).parent / "generate_enhanced_plots.py"
+ cmd = ["python3", str(script_path),
+ "--input", str(final_report),
+ "--output", str(plots_dir)]
+
+ # The script generates HTML report by default
+ if run_command(cmd, "Generating visualization plots and HTML report"):
+ print(f"\nOutput saved to: {plots_dir}/")
+ print("\n Generated plots:")
+ print(" - Percentage Change Overview")
+ print(" - Absolute Time Comparison")
+ print(" - Performance Heatmap by Rank")
+ print(" - Total Execution Time by Rank")
+ print(" - Time Breakdown by Rank")
+ print(" - Percentage Breakdown by Rank")
+ print(" - NCCL/Collective Metrics")
+ print("\n HTML Report: plots/performance_analysis_report.html")
+ print(" - Open in browser to view complete report")
+ print(" - Print to PDF: Ctrl+P (or Cmd+P on Mac)")
+ else:
+ print(" Final report not found, skipping plot generation")
+
+ # Summary
+ print("\n" + "="*80)
+ print("ANALYSIS COMPLETE!")
+ print("="*80)
+ print(f"\nResults saved to: {output_path}")
+
+ files = list(output_path.glob("*.xlsx"))
+ if files:
+ print("\nGenerated Excel files:")
+ for f in sorted(files):
+ print(f" - {f.name}")
+
+ if args.generate_plots:
+ plots_dir = output_path / "plots"
+ if plots_dir.exists():
+ plot_files = list(plots_dir.glob("*.png"))
+ if plot_files:
+ print("\nGenerated plots:")
+ for f in sorted(plot_files):
+ print(f" - plots/{f.name}")
+
+ print("\nAnalysis pipeline completed successfully!")
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh b/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh
new file mode 100755
index 0000000..b30ac09
--- /dev/null
+++ b/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh
@@ -0,0 +1,302 @@
+#!/bin/bash
+
+# Compare specific RCCL Warp Speed configurations
+# Usage: ./run_rccl_warp_speed_comparison.sh [OPTIONS]
+# -c CONFIG_FILE Config file (default: config/distributed.yaml)
+# -p PAIRS CU,threads pairs (e.g., "56,256 37,384 32,512")
+# -h Show help
+#
+# Examples:
+# # Use default 3 configurations
+# ./run_rccl_warp_speed_comparison.sh
+#
+# # Custom configurations
+# ./run_rccl_warp_speed_comparison.sh -p "56,256 37,384 32,512"
+#
+# # Different config file with custom pairs
+# ./run_rccl_warp_speed_comparison.sh -c myconfig.yaml -p "40,256 30,512"
+
+CONFIG_FILE="config/distributed.yaml"
+CUSTOM_PAIRS=""
+
+# Parse command line arguments
+while getopts "c:p:h" opt; do
+ case $opt in
+ c)
+ CONFIG_FILE="$OPTARG"
+ ;;
+ p)
+ CUSTOM_PAIRS="$OPTARG"
+ ;;
+ h)
+ echo "Usage: $0 [OPTIONS]"
+ echo " -c CONFIG_FILE Config file (default: config/single_node/gemm_overlap_comm.yaml)"
+ echo " -p PAIRS CU,threads pairs (e.g., \"56,256 37,384 32,512\")"
+ echo " -h Show help"
+ echo ""
+ echo "Examples:"
+ echo " # Use default 3 configurations"
+ echo " $0"
+ echo ""
+ echo " # Custom configurations"
+ echo " $0 -p \"56,256 37,384 32,512\""
+ echo ""
+ echo " # Different config file with custom pairs"
+ echo " $0 -c myconfig.yaml -p \"40,256 30,512\""
+ exit 0
+ ;;
+ \?)
+ echo "Invalid option: -$OPTARG"
+ exit 1
+ ;;
+ esac
+done
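+
+# Training command and common overrides shared by every configuration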
+BASE_CMD="torchrun --nproc_per_node 8 train.py --config ${CONFIG_FILE}"
+BASE_OVERRIDES="--override training.max_steps=100 --override profiling.tensorboard=false"
+
+# Base output directory
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+BASE_OUTPUT_DIR="experiments/rccl_warp_speed_${TIMESTAMP}"
+
+# Create base output directory
+mkdir -p "${BASE_OUTPUT_DIR}"
+
+# Log file
+SWEEP_LOG="${BASE_OUTPUT_DIR}/rccl_warp_speed_comparison_${TIMESTAMP}.log"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+# Function to log with timestamp
+log() {
+ local message="$1"
+ local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
+ echo "[${timestamp}] ${message}" | tee -a "${SWEEP_LOG}"
+}
+
+# Cleanup function for Ctrl+C
+cleanup() {
+ echo ""
+ echo -e "${RED}=== Caught interrupt signal (Ctrl+C) ===${NC}" | tee -a "${SWEEP_LOG}"
+ log "Cleaning up all training processes..."
+ sudo pkill -9 -f "train.py" 2>/dev/null || true
+ sudo pkill -9 -f "torchrun" 2>/dev/null || true
+ log "Cleanup complete. Exiting."
+ exit 130
+}
+
+trap cleanup SIGINT SIGTERM
+
+echo -e "${GREEN}=== RCCL Warp Speed Configuration Comparison ===${NC}" | tee "${SWEEP_LOG}"
+log "Config file: ${CONFIG_FILE}"
+log "Results directory: ${BASE_OUTPUT_DIR}"
+echo ""
+
+# Check RCCL version and configuration
+echo -e "${BLUE}=== RCCL Version Check ===${NC}" | tee -a "${SWEEP_LOG}"
+
+# Check if custom RCCL is available
+if [ -d "/opt/rccl/build/release" ]; then
+ echo -e "${GREEN}[OK] Custom RCCL found at /opt/rccl/build/release${NC}" | tee -a "${SWEEP_LOG}"
+
+ # Check branch and commit
+ if [ -d "/opt/rccl/.git" ]; then
+ cd /opt/rccl
+ RCCL_BRANCH=$(git branch --show-current 2>/dev/null)
+ RCCL_COMMIT=$(git log --oneline -1 2>/dev/null)
+ cd - > /dev/null
+
+ echo " Branch: ${RCCL_BRANCH}" | tee -a "${SWEEP_LOG}"
+ echo " Commit: ${RCCL_COMMIT}" | tee -a "${SWEEP_LOG}"
+
+ # Verify it's warp_speed_v1
+ if [[ "${RCCL_BRANCH}" == "warp_speed_v1" ]]; then
+ echo -e " ${GREEN}[OK] Using warp_speed_v1 branch${NC}" | tee -a "${SWEEP_LOG}"
+ else
+ echo -e " ${YELLOW}[WARNING] Not on warp_speed_v1 branch (current: ${RCCL_BRANCH})${NC}" | tee -a "${SWEEP_LOG}"
+ fi
+ fi
+
+ # Check library size to verify it's built
+ RCCL_LIB_SIZE=$(ls -lh /opt/rccl/build/release/librccl.so.1.0 2>/dev/null | awk '{print $5}')
+ echo " Library size: ${RCCL_LIB_SIZE}" | tee -a "${SWEEP_LOG}"
+else
+ echo -e "${YELLOW}[WARNING] Custom RCCL not found, will use PyTorch bundled version${NC}" | tee -a "${SWEEP_LOG}"
+ echo " PyTorch's bundled RCCL may not have warp_speed features!" | tee -a "${SWEEP_LOG}"
+fi
+
+# Test if RCCL responds to warp_speed environment variables
+echo "" | tee -a "${SWEEP_LOG}"
+echo "Testing warp_speed environment variable response..." | tee -a "${SWEEP_LOG}"
+export RCCL_WARP_SPEED_ENABLE=1
+export RCCL_WARP_SPEED_CU_COUNT=56
+export NCCL_DEBUG=VERSION
+
+python -c "
+import torch
+print('PyTorch version:', torch.__version__)
+if torch.cuda.is_available():
+ print('ROCm/CUDA available:', True)
+ print('Device count:', torch.cuda.device_count())
+" 2>&1 | tee -a "${SWEEP_LOG}"
+
+# Clean up test variables
+unset RCCL_WARP_SPEED_CU_COUNT
+unset NCCL_DEBUG
+
+echo -e "${BLUE}===========================${NC}" | tee -a "${SWEEP_LOG}"
+echo ""
+
+# Define configurations to test
+# Format: "NAME|CU_COUNT|THREADS_PER_BLOCK"
+if [ -n "$CUSTOM_PAIRS" ]; then
+ # Parse custom pairs
+ CONFIGS=()
+ for pair in $CUSTOM_PAIRS; do
+ IFS=',' read -r cu threads <<< "$pair"
+ CONFIGS+=("${cu}cu_${threads}threads|${cu}|${threads}")
+ done
+ log "Using custom configurations: ${CUSTOM_PAIRS}"
+else
+ # Use default configurations
+ CONFIGS=(
+ "56cu_256threads|56|256"
+ "37cu_384threads|37|384"
+ "32cu_512threads|32|512"
+ )
+ log "Using default RCCL Warp Speed configurations"
+fi
+
+# Track results
+declare -A RUN_STATUS
+declare -A RUN_TIMES
+
+# Run each configuration
+for config in "${CONFIGS[@]}"; do
+ IFS='|' read -r NAME CU_COUNT THREADS <<< "$config"
+
+ OUTPUT_DIR="${BASE_OUTPUT_DIR}/${NAME}"
+
+ echo -e "${YELLOW}========================================${NC}" | tee -a "${SWEEP_LOG}"
+ log "Running configuration: ${NAME}"
+ log " RCCL_WARP_SPEED_CU_COUNT=${CU_COUNT}"
+ log " RCCL_THREADS_PER_BLOCK=${THREADS}"
+ log " Output directory: ${OUTPUT_DIR}"
+ echo -e "${YELLOW}========================================${NC}" | tee -a "${SWEEP_LOG}"
+
+ # Create output directory
+ mkdir -p "${OUTPUT_DIR}"
+
+ # Record start time
+ START_TIME=$(date +%s)
+
+ # Export environment variables so child processes inherit them
+ export RCCL_WARP_SPEED_ENABLE=1
+ export RCCL_UNROLL_FACTOR=1
+ export RCCL_WARP_SPEED_CU_COUNT=${CU_COUNT}
+ export RCCL_THREADS_PER_BLOCK=${THREADS}
+ export HSA_ENABLE_SDMA=0
+ export PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1
+
+ # Use custom RCCL if available
+ if [ -d "/opt/rccl/build/release" ]; then
+ export LD_LIBRARY_PATH=/opt/rccl/build/release:${LD_LIBRARY_PATH:-}
+ log " Using custom RCCL from /opt/rccl/build/release"
+ fi
+
+ # Run the command
+ ${BASE_CMD} ${BASE_OVERRIDES} \
+ --override training.output_dir=${OUTPUT_DIR} \
+ 2>&1 | tee "${OUTPUT_DIR}/run_output.log"
+
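+    # PIPESTATUS[0] captures the exit code of the training command, not of tee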
+ EXIT_CODE=${PIPESTATUS[0]}
+ END_TIME=$(date +%s)
+ DURATION=$((END_TIME - START_TIME))
+
+ # Unset environment variables to avoid affecting next run
+ unset RCCL_WARP_SPEED_CU_COUNT
+ unset RCCL_THREADS_PER_BLOCK
+
+ RUN_TIMES[${NAME}]=${DURATION}
+
+ if [ $EXIT_CODE -eq 0 ]; then
+ log "[OK] Completed ${NAME} (duration: ${DURATION}s)"
+ RUN_STATUS[${NAME}]="SUCCESS"
+ else
+ log "[ERROR] Failed ${NAME} (exit code: $EXIT_CODE, duration: ${DURATION}s)"
+ RUN_STATUS[${NAME}]="FAILED"
+ fi
+
+ # Fix permissions if running as root in container
+ if [ "$EUID" -eq 0 ]; then
+ chmod -R 755 "${OUTPUT_DIR}" 2>/dev/null || true
+ fi
+
+ echo ""
+ log "Waiting 5 seconds before next run..."
+ sleep 5
+done
+
+# Generate summary report
+echo -e "${BLUE}========================================${NC}" | tee -a "${SWEEP_LOG}"
+echo -e "${BLUE}SUMMARY REPORT${NC}" | tee -a "${SWEEP_LOG}"
+echo -e "${BLUE}========================================${NC}" | tee -a "${SWEEP_LOG}"
+
+SUMMARY_FILE="${BASE_OUTPUT_DIR}/rccl_warp_speed_summary_${TIMESTAMP}.txt"
+{
+ echo "RCCL Warp Speed Configuration Comparison"
+ echo "Generated: $(date)"
+ echo ""
+ printf "%-20s %-10s %-15s %-10s\n" "CONFIGURATION" "CU_COUNT" "THREADS" "STATUS"
+ echo "----------------------------------------------------------------"
+
+ for config in "${CONFIGS[@]}"; do
+ IFS='|' read -r NAME CU_COUNT THREADS <<< "$config"
+ STATUS="${RUN_STATUS[${NAME}]:-UNKNOWN}"
+ DURATION="${RUN_TIMES[${NAME}]:-N/A}"
+ printf "%-20s %-10s %-15s %-10s (duration: %ss)\n" "${NAME}" "${CU_COUNT}" "${THREADS}" "${STATUS}" "${DURATION}"
+ done
+
+ echo ""
+ echo "Output directories:"
+ for config in "${CONFIGS[@]}"; do
+ IFS='|' read -r NAME CU_COUNT THREADS <<< "$config"
+ echo " ${NAME}: ${BASE_OUTPUT_DIR}/${NAME}/"
+ done
+
+ echo ""
+ echo "Trace files for each configuration:"
+ for config in "${CONFIGS[@]}"; do
+ IFS='|' read -r NAME CU_COUNT THREADS <<< "$config"
+ echo " ${NAME}: ${BASE_OUTPUT_DIR}/${NAME}/torch_profiler/"
+ done
+} | tee "${SUMMARY_FILE}"
+
+log "Summary saved to: ${SUMMARY_FILE}"
+
+# Fix permissions for the entire output directory if running as root
+if [ "$EUID" -eq 0 ]; then
+ echo "Fixing permissions for output directory..." | tee -a "${SWEEP_LOG}"
+ chmod -R 755 "${BASE_OUTPUT_DIR}" 2>/dev/null || true
+fi
+
+echo ""
+echo -e "${GREEN}========================================${NC}"
+echo -e "${GREEN}Next Steps: Run TraceLens Analysis${NC}"
+echo -e "${GREEN}========================================${NC}"
+echo ""
+echo "To analyze and compare these configurations:"
+echo ""
+echo "./scripts/tracelens_single_config/run_tracelens_analysis.sh ${BASE_OUTPUT_DIR}"
+echo ""
+echo "This will generate:"
+echo " - Individual reports for each rank (all 3 configs)"
+echo " - Collective reports (all 3 configs)"
+echo " - Comparison reports across the 3 configurations"
+echo ""
+
+log "All runs completed! Run TraceLens analysis next."
diff --git a/scripts/tracelens_single_config/run_tracelens_single_config.sh b/scripts/tracelens_single_config/run_tracelens_single_config.sh
new file mode 100755
index 0000000..96831ff
--- /dev/null
+++ b/scripts/tracelens_single_config/run_tracelens_single_config.sh
@@ -0,0 +1,266 @@
+#!/bin/bash
+# TraceLens Analysis for Single Configuration (No Sweep)
+# Usage: ./run_tracelens_single_config.sh <trace_directory> [options]
+#
+# The script accepts either:
+# - Path to parent directory containing torch_profiler/
+# - Path to torch_profiler/ directory directly
+#
+# Examples:
+# ./run_tracelens_single_config.sh /path/to/traces
+# ./run_tracelens_single_config.sh /path/to/traces/torch_profiler
+#
+# Note: Uses GEMM-patched TraceLens wrapper to recognize ROCm Tensile kernels
+
+set -e
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Use patched TraceLens wrapper for GEMM recognition
+TRACELENS_WRAPPER="python $SCRIPT_DIR/../tracelens_with_gemm_patch.py"
+
+# Parse options
+RUN_INDIVIDUAL=true
+RUN_COLLECTIVE=true
+
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --individual-only)
+ RUN_COLLECTIVE=false
+ shift
+ ;;
+ --collective-only)
+ RUN_INDIVIDUAL=false
+ shift
+ ;;
+ *)
+ INPUT_DIR="$1"
+ shift
+ ;;
+ esac
+done
+
+# Check if directory provided
+if [ -z "$INPUT_DIR" ]; then
+ echo "Error: Please provide trace directory"
+ echo ""
+ echo "Usage: $0 [options]"
+ echo ""
+ echo "Options:"
+ echo " --individual-only Generate only individual reports"
+ echo " --collective-only Generate only collective report"
+ echo ""
+ echo "Examples:"
+ echo " $0 /path/to/traces"
+ echo " $0 /path/to/traces --individual-only"
+ echo " $0 /path/to/traces --collective-only"
+ echo ""
+ exit 1
+fi
+
+# Verify directory exists
+if [ ! -d "$INPUT_DIR" ]; then
+ echo "Error: Directory not found: $INPUT_DIR"
+ exit 1
+fi
+
+# Auto-detect structure: is this torch_profiler/ or its parent?
+TORCH_PROF_DIR=""
+BASE_DIR=""
+
+# Check if INPUT_DIR contains rank directories (i.e., it IS torch_profiler/)
+if find "$INPUT_DIR" -maxdepth 1 -type d -name "rank*" | grep -q .; then
+ TORCH_PROF_DIR="$INPUT_DIR"
+ BASE_DIR=$(dirname "$INPUT_DIR")
+ echo "Detected torch_profiler directory: $TORCH_PROF_DIR"
+# Check if INPUT_DIR contains torch_profiler/ subdirectory
+elif [ -d "$INPUT_DIR/torch_profiler" ]; then
+ TORCH_PROF_DIR="$INPUT_DIR/torch_profiler"
+ BASE_DIR="$INPUT_DIR"
+ echo "Found torch_profiler subdirectory: $TORCH_PROF_DIR"
+else
+ echo "Error: Cannot find rank directories in expected structure"
+ echo ""
+ echo "Expected one of:"
+ echo " 1. Directory with rank0/, rank1/, ... subdirectories (torch_profiler/)"
+ echo " 2. Parent directory containing torch_profiler/rank0/, rank1/, ..."
+ echo ""
+ echo "Provided: $INPUT_DIR"
+ exit 1
+fi
+
+echo "════════════════════════════════════════════════════════════════"
+echo " TraceLens Analysis - Single Configuration"
+echo "════════════════════════════════════════════════════════════════"
+echo ""
+echo "Input directory: $INPUT_DIR"
+echo "Torch profiler traces: $TORCH_PROF_DIR"
+echo ""
+
+# Create output directory in the base directory
+OUTPUT_DIR="${BASE_DIR}/tracelens_analysis"
+mkdir -p "$OUTPUT_DIR"
+mkdir -p "$OUTPUT_DIR/individual_reports"
+mkdir -p "$OUTPUT_DIR/collective_reports"
+
+# Detect number of ranks
+NUM_RANKS=$(find "$TORCH_PROF_DIR" -maxdepth 1 -type d -name "rank*" | wc -l)
+
+if [ $NUM_RANKS -eq 0 ]; then
+ echo "Error: No rank directories found in $TORCH_PROF_DIR"
+ exit 1
+fi
+
+echo "Detected $NUM_RANKS ranks"
+
+# Show sample trace files
+echo ""
+echo "Sample trace files:"
+for rank_dir in $(find "$TORCH_PROF_DIR" -maxdepth 1 -type d -name "rank*" | sort | head -3); do
+ rank_name=$(basename "$rank_dir")
+ trace_file=$(find "$rank_dir" -name "*.json" | head -1)
+ if [ -n "$trace_file" ]; then
+ echo " $rank_name: $(basename "$trace_file")"
+ fi
+done
+if [ "$RUN_INDIVIDUAL" = true ]; then
+ echo ""
+ echo "════════════════════════════════════════════════════════════════"
+ echo "Step 1: Generating Individual Performance Reports"
+ echo "════════════════════════════════════════════════════════════════"
+ echo ""
+
+# Process each rank
+for rank_idx in $(seq 0 $((NUM_RANKS - 1))); do
+ # Try multiple directory naming patterns
+ RANK_DIR=""
+ if [ -d "$TORCH_PROF_DIR/rank${rank_idx}" ]; then
+ RANK_DIR="$TORCH_PROF_DIR/rank${rank_idx}"
+ elif [ -d "$TORCH_PROF_DIR/rank_${rank_idx}" ]; then
+ RANK_DIR="$TORCH_PROF_DIR/rank_${rank_idx}"
+ elif [ -d "$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)" ]; then
+ RANK_DIR="$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)"
+ fi
+
+ if [ -z "$RANK_DIR" ] || [ ! -d "$RANK_DIR" ]; then
+ echo " Skip rank ${rank_idx} - directory not found"
+ continue
+ fi
+
+ # Find trace file
+ TRACE=$(find "$RANK_DIR" -name "*.json" -type f | head -1)
+
+ if [ -z "$TRACE" ]; then
+ echo "⚠️ Skip rank ${rank_idx} - no trace file found"
+ continue
+ fi
+
+ OUTPUT="$OUTPUT_DIR/individual_reports/perf_rank${rank_idx}.xlsx"
+
+ echo "Processing rank ${rank_idx}..."
+ echo " Trace: $(basename "$TRACE")"
+
+ $TRACELENS_WRAPPER generate_perf_report \
+ --profile_json_path "$TRACE" \
+ --output_xlsx_path "$OUTPUT" \
+ --include_unlinked_kernels \
+ --short_kernel_study \
+ --short_kernel_threshold_us 50 \
+ --topk_ops 100 \
+ --topk_roofline_ops 100
+
+ echo " Done: $OUTPUT"
+ echo ""
+done
+
+fi
+
+if [ "$RUN_COLLECTIVE" = true ]; then
+ echo ""
+ echo "════════════════════════════════════════════════════════════════"
+ echo "Step 2: Generating Multi-Rank Collective Report"
+ echo "════════════════════════════════════════════════════════════════"
+ echo ""
+
+# Find a sample trace file to get the filename pattern
+SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank0" -name "*.json" -type f | head -1)
+if [ -z "$SAMPLE_TRACE" ]; then
+ # Try alternative rank naming
+ SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank_0" -name "*.json" -type f | head -1)
+fi
+
+if [ -z "$SAMPLE_TRACE" ]; then
+ # Try rank_00
+ SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank_00" -name "*.json" -type f | head -1)
+fi
+
+if [ -n "$SAMPLE_TRACE" ]; then
+ OUTPUT="$OUTPUT_DIR/collective_reports/collective_all_ranks.xlsx"
+
+ echo "Generating collective report for all $NUM_RANKS ranks..."
+
+ # Create symlinks with consistent names for collective report
+ for rank_idx in $(seq 0 $((NUM_RANKS - 1))); do
+ RANK_DIR="$TORCH_PROF_DIR/rank${rank_idx}"
+ if [ -d "$RANK_DIR" ]; then
+ TRACE=$(find "$RANK_DIR" -name "*.json" -type f | head -1)
+ if [ -n "$TRACE" ]; then
+ ln -sf "$(basename "$TRACE")" "$RANK_DIR/trace.json"
+ fi
+ fi
+ done
+
+ echo " Trace pattern: rank*/trace.json"
+
+ $TRACELENS_WRAPPER generate_multi_rank_collective \
+ --trace_pattern "$TORCH_PROF_DIR/rank*/trace.json" \
+ --world_size $NUM_RANKS \
+ --output_xlsx_path "$OUTPUT" \
+ --detailed_analysis \
+ --use_multiprocessing
+
+ echo " Done: $OUTPUT"
+else
+ echo " Could not generate collective report - no trace files found"
+fi
+
+fi
+
+echo ""
+echo "════════════════════════════════════════════════════════════════"
+echo "Analysis Complete!"
+echo "════════════════════════════════════════════════════════════════"
+echo ""
+echo "📁 Results saved to:"
+echo " $OUTPUT_DIR/"
+echo ""
+
+# Count generated reports
+INDIV_COUNT=$(find "$OUTPUT_DIR/individual_reports" -name "*.xlsx" 2>/dev/null | wc -l)
+COLL_COUNT=$(find "$OUTPUT_DIR/collective_reports" -name "*.xlsx" 2>/dev/null | wc -l)
+
+echo "Generated reports:"
+echo " Individual reports (per rank): $INDIV_COUNT"
+echo " Collective reports (all ranks): $COLL_COUNT"
+echo ""
+
+echo "📊 Report Files:"
+echo ""
+echo "Individual Performance Reports:"
+if [ $INDIV_COUNT -gt 0 ]; then
+ find "$OUTPUT_DIR/individual_reports" -name "*.xlsx" | sort | sed 's/^/ /'
+else
+ echo " (none generated)"
+fi
+echo ""
+
+echo "Collective Reports:"
+if [ $COLL_COUNT -gt 0 ]; then
+ find "$OUTPUT_DIR/collective_reports" -name "*.xlsx" | sed 's/^/ /'
+else
+ echo " (none generated)"
+fi
+
+echo ""
+echo "Done!"
diff --git a/scripts/tracelens_with_gemm_patch.py b/scripts/tracelens_with_gemm_patch.py
new file mode 100755
index 0000000..6a200d9
--- /dev/null
+++ b/scripts/tracelens_with_gemm_patch.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""
+TraceLens with GEMM Recognition Patches
+
+This script applies GEMM recognition patches and runs TraceLens commands.
+
+Usage:
+ python tracelens_with_gemm_patch.py generate_perf_report [args...]
+ python tracelens_with_gemm_patch.py generate_multi_rank_collective [args...]
+ python tracelens_with_gemm_patch.py compare_perf_reports [args...]
+"""
+
+import re
+import sys
+
+
+def apply_gemm_patches():
+ """Apply all GEMM recognition patches to TraceLens."""
+
+ print("Applying TraceLens GEMM recognition patches...")
+
+ # Patch kernel_name_parser for enhanced ROCm GEMM recognition
+ try:
+ from TraceLens.PerfModel import kernel_name_parser
+
+ def patched_is_rocm_gemm(kernel_name):
+ """
+ Enhanced ROCm GEMM pattern matching for Tensile kernels.
+ Recognizes: Cijk_Alik_Bljk_... and variants with arbitrary prefixes.
+ """
+ pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$"
+ return bool(re.match(pattern, kernel_name))
+
+ def patched_parse_rocm_gemm(kernel_name):
+ """Parse ROCm GEMM kernel details."""
+ # Parse transpose flags
+ trans_a, trans_b = None, None
+ if "_Ailk_" in kernel_name:
+ trans_a = False
+ elif "_Alik_" in kernel_name:
+ trans_a = True
+ if "_Bljk_" in kernel_name:
+ trans_b = False
+ elif "_Bjlk_" in kernel_name:
+ trans_b = True
+
+ # Parse macro tile size (MT64x16x64)
+ macro_tile_match = re.search(r"MT(\d+)x(\d+)x(\d+)", kernel_name)
+ if macro_tile_match:
+ mt_m = int(macro_tile_match.group(1))
+ mt_n = int(macro_tile_match.group(2))
+ depth_u = int(macro_tile_match.group(3))
+ else:
+ mt_m, mt_n, depth_u = None, None, None
+
+ return {
+ "transpose": (trans_a, trans_b),
+ "mt_m": mt_m,
+ "mt_n": mt_n,
+ "depth_u": depth_u,
+ }
+
+ def patched_gemm_name_parser(kernel_name):
+ """Enhanced GEMM name parser with better ROCm support."""
+ if patched_is_rocm_gemm(kernel_name):
+ return patched_parse_rocm_gemm(kernel_name)
+ elif kernel_name_parser.is_cuda_gemm(kernel_name):
+ return kernel_name_parser.parse_cuda_gemm(kernel_name)
+ return None
+
+ kernel_name_parser.is_rocm_gemm = patched_is_rocm_gemm
+ kernel_name_parser.parse_rocm_gemm = patched_parse_rocm_gemm
+ kernel_name_parser.gemm_name_parser = patched_gemm_name_parser
+
+ print(" [OK] Patched kernel_name_parser (ROCm GEMM recognition)")
+ except ImportError as e:
+ print(f" [WARN] Could not patch kernel_name_parser: {e}")
+
+ # Patch Trace2Tree util for is_gemm_kernel function
+ try:
+ from TraceLens.Trace2Tree import util as trace_util
+
+ def patched_is_gemm_kernel(kernel_event: dict) -> bool:
+ """Enhanced GEMM kernel detection."""
+ assert kernel_event["cat"] == "kernel"
+ kernel_name = kernel_event["name"]
+
+ # ROCm Tensile GEMM pattern: C[xyz]_A[xyz]_B[xyz]
+ pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$"
+ is_rocm_gemm = bool(re.match(pattern, kernel_name))
+
+ # CUDA GEMM pattern
+ is_cuda_gemm = kernel_name.startswith("nvjet") or "cublasLt" in kernel_name
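+        # Assumed CUDA-side examples: kernels whose names start with "nvjet"
+        # (newer cuBLAS GEMM kernels) or that contain "cublasLt".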
+
+ return is_rocm_gemm or is_cuda_gemm
+
+ trace_util.is_gemm_kernel = patched_is_gemm_kernel
+ print(" [OK] Patched Trace2Tree.util (is_gemm_kernel)")
+ except ImportError as e:
+ print(f" [WARN] Could not patch Trace2Tree.util: {e}")
+
+ # Patch TraceEventUtils to enhance GEMM keys
+ try:
+ from TraceLens import util as tracelens_util
+
+ if hasattr(tracelens_util, 'TraceEventUtils'):
+ if hasattr(tracelens_util.TraceEventUtils, 'JaxOpKeys'):
+ original_gemm_keys = tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys
+ enhanced_gemm_keys = ["Cijk", "gemm", "nvjet", "cublasLt", "C[a-z]{3}_A[a-z]{3}_B[a-z]{3}"]
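+                # Note: the last entry is a regex-style pattern; it only takes effect if
+                # GemmKeys are matched as regexes downstream. With plain substring
+                # matching it is inert (assumption about TraceLens internals).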
+
+ all_keys = list(set(original_gemm_keys + enhanced_gemm_keys))
+ tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys = all_keys
+
+ print(" [OK] Patched TraceEventUtils.JaxOpKeys (GEMM keys enhanced)")
+ except (ImportError, AttributeError) as e:
+ print(f" [WARN] Could not patch TraceEventUtils: {e}")
+
+ # Patch torch_op_mapping for better categorization
+ try:
+ from TraceLens.PerfModel import torch_op_mapping
+
+ original_categorize = torch_op_mapping.categorize_torch_op
+
+ def patched_categorize_torch_op(row):
+ """Enhanced categorization with better GEMM detection."""
+ result = original_categorize(row)
+
+ # If result is 'other', check for GEMM patterns in kernel names
+ if result == "other" and "kernel_details" in row and len(row["kernel_details"]) > 0:
+ kernel_name = row["kernel_details"][0]["name"]
+ pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$"
+ if re.match(pattern, kernel_name):
+ return "GEMM"
+
+ return result
+
+ torch_op_mapping.categorize_torch_op = patched_categorize_torch_op
+ print(" [OK] Patched torch_op_mapping (categorize_torch_op)")
+ except ImportError as e:
+ print(f" [WARN] Could not patch torch_op_mapping: {e}")
+
+ print("[OK] All GEMM patches applied successfully!\n")
+
+
+def main():
+ if len(sys.argv) < 2:
+ print("Usage: tracelens_with_gemm_patch.py [args...]")
+ print("")
+ print("Commands:")
+ print(" generate_perf_report - Generate individual performance report")
+ print(" generate_multi_rank_collective - Generate multi-rank collective report")
+ print(" compare_perf_reports - Compare performance reports")
+ sys.exit(1)
+
+ # Apply patches before importing TraceLens reporting modules
+ apply_gemm_patches()
+
+ # Import TraceLens after patches are applied
+ from TraceLens.Reporting.generate_perf_report_pytorch import main as generate_perf_report_main
+ from TraceLens.Reporting.generate_multi_rank_collective_report_pytorch import main as generate_multi_rank_collective_report_main
+ from TraceLens.Reporting.compare_perf_reports_pytorch import main as compare_perf_reports_main
+
+ command = sys.argv[1]
+
+ # Remove the command from argv so TraceLens sees only its args
+ sys.argv = [sys.argv[0]] + sys.argv[2:]
+
+ if command == "generate_perf_report":
+ generate_perf_report_main()
+ elif command == "generate_multi_rank_collective":
+ generate_multi_rank_collective_report_main()
+ elif command == "compare_perf_reports":
+ compare_perf_reports_main()
+ else:
+ print(f"Error: Unknown command '{command}'")
+ print("")
+ print("Available commands:")
+ print(" generate_perf_report")
+ print(" generate_multi_rank_collective")
+ print(" compare_perf_reports")
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/aorta/training/fsdp_trainer.py b/src/aorta/training/fsdp_trainer.py
index 0a6fbc5..d3a03e0 100644
--- a/src/aorta/training/fsdp_trainer.py
+++ b/src/aorta/training/fsdp_trainer.py
@@ -273,6 +273,7 @@ def build_ddp_model(
if device.type == "cuda":
device_ids = [device.index if device.index is not None else torch.cuda.current_device()]
+ print(f"===> {device_ids} {model}")
ddp_model = DDP(
model,
device_ids=device_ids,
@@ -735,6 +736,7 @@ def main(args: Optional[argparse.Namespace] = None, *, enable_rocm_metrics: bool
training_cfg.max_steps or training_cfg.epochs * len(dataloader),
)
+ print(f"Calling main trainer with device {env['device']}")
profiler = StreamProfiler(env["device"])
try: