diff --git a/docker/docker-compose.rocm70_9-1.yaml b/docker/docker-compose.rocm70_9-1.yaml
index 04946c6..6179857 100644
--- a/docker/docker-compose.rocm70_9-1.yaml
+++ b/docker/docker-compose.rocm70_9-1.yaml
@@ -3,7 +3,7 @@ services:
container_name: training-overlap-bugs-rocm70
build:
context: .
- dockerfile: Dockerfile.rocm70
+ dockerfile: Dockerfile.rocm70_9-1
user: root
privileged: true
network_mode: host
@@ -15,8 +15,6 @@ services:
security_opt:
- seccomp=unconfined
environment:
- - RCCL_FOLDER=/rccl
- - LD_LIBRARY_PATH=/rccl/build/release:$LD_LIBRARY_PATH
- TORCH_NCCL_HIGH_PRIORITY=1
volumes:
diff --git a/scripts/tracelens_single_config/README.md b/scripts/tracelens_single_config/README.md
new file mode 100644
index 0000000..2c7ca54
--- /dev/null
+++ b/scripts/tracelens_single_config/README.md
@@ -0,0 +1,166 @@
+# RCCL Warp Speed Performance Testing
+
+Test the RCCL `warp_speed_v1` branch from https://github.com/mustafabar/rccl.git and compare GPU/NCCL performance across CU/thread configurations.
+
+## Prerequisites
+
+```bash
+pip install pandas openpyxl matplotlib seaborn numpy
+```
+
+## Run Tests
+
+### Step 1: Start Container and Build RCCL
+
+```bash
+cd docker
+docker-compose -f docker-compose.rocm70_9-1.yaml build
+docker-compose -f docker-compose.rocm70_9-1.yaml up -d
+docker-compose -f docker-compose.rocm70_9-1.yaml exec torchenv-rocm70 bash
+
+# Inside container - build warp_speed_v1 (always rebuild)
+# Note: Set --amdgpu_targets to match your GPU architecture
+# Run 'rocminfo | grep gfx' to find your GPU target (e.g., gfx942, gfx950)
+cd /opt
+if [ -d "rccl" ]; then
+ cd rccl
+ git checkout warp_speed_v1
+ git pull
+else
+ git clone --recursive https://github.com/mustafabar/rccl.git
+ cd rccl
+ git checkout warp_speed_v1
+fi
+./install.sh -l --amdgpu_targets=gfx950
+
+cd /workspace/aorta
+```
+
+### Step 2: Run RCCL Tests
+
+```bash
+# Default 3 configurations
+./scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh
+
+# Custom configurations (CU_count,threads pairs)
+./scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh -p "56,256 37,384 32,512" -c ./config/single_node/gemm_overlap_comm.yaml
+```
+
+Output structure:
+```
+experiments/
+ rccl_warp_speed_YYYYMMDD_HHMMSS/
+ 56cu_256threads/
+ torch_profiler/ # Raw profiler traces
+ run_output.log # Training output log
+ 37cu_384threads/
+ 32cu_512threads/
+ rccl_warp_speed_summary_YYYYMMDD_HHMMSS.txt
+```
+
+### Step 3: Generate Reports (Outside Container)
+
+```bash
+# Exit container
+exit
+
+# Run complete analysis
+python scripts/tracelens_single_config/run_full_analysis.py \
+    --baseline experiments/rccl_warp_speed_YYYYMMDD_HHMMSS/56cu_256threads \
+    --test experiments/rccl_warp_speed_YYYYMMDD_HHMMSS/37cu_384threads \
+ --output comparison_results \
+ --all
+
+# Or skip TraceLens if already done
+python scripts/tracelens_single_config/run_full_analysis.py \
+    --baseline experiments/rccl_warp_speed_YYYYMMDD_HHMMSS/56cu_256threads \
+    --test experiments/rccl_warp_speed_YYYYMMDD_HHMMSS/37cu_384threads \
+ --output comparison_results \
+ --all --skip-tracelens
+```
+
+## Generated Excel Reports
+
+### Individual TraceLens Reports (per configuration)
+Each configuration generates:
+- `tracelens_analysis/individual_reports/perf_rank*.xlsx` - Per-rank performance breakdown
+- `tracelens_analysis/collective_reports/collective_all_ranks.xlsx` - Collective operations summary
+- `tracelens_analysis/gpu_timeline_summary_mean.xlsx` - GPU timeline averages
+
+### Final Analysis Report (`final_analysis_report.xlsx`)
+
+Contains multiple sheets:
+
+**Summary Sheets:**
+- `Summary_Dashboard` - High-level comparison metrics with percentage changes
+- `Summary_Comparison` - Side-by-side summary comparison
+- `GPU_ByRank_Comparison` - Detailed per-rank performance comparison
+- `Comparison_By_Rank` - Rank-wise metric comparison with differences
+
+**GPU Timeline Sheets:**
+- `All_Ranks_Combined` - Combined GPU timeline data from all ranks
+- `Summary` - Aggregated GPU timeline summary
+- `Rank_*` - Individual rank GPU timelines
+
+**Collective/NCCL Sheets:**
+- `nccl_summary_implicit_sync` - NCCL operations with implicit synchronization
+- `nccl_summary_long` - Long-running NCCL operations
+- `nccl_summary_implicit_sync_comparison` - Comparison of implicit sync operations
+- `nccl_summary_long_comparison` - Comparison of long operations
+
+**Raw Data Sheets (hidden by default):**
+- `gpu_timeline_combined` - Raw combined GPU timeline data
+- `gpu_timeline_comparison` - Raw GPU timeline comparison data
+- `collective_combined` - Raw collective operations data
+- `collective_comparison` - Raw collective comparison data
+
+### Comparison Reports
+
+- `gpu_timeline_combined.xlsx` - Baseline and test GPU metrics combined
+- `gpu_timeline_comparison.xlsx` - GPU metrics with comparison analysis
+- `collective_combined.xlsx` - Baseline and test collective operations combined
+- `collective_comparison.xlsx` - Collective operations with comparison analysis
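+
+The reports are regular `.xlsx` workbooks, so they can also be inspected programmatically. A minimal sketch with pandas (it assumes `run_full_analysis.py` places `final_analysis_report.xlsx` directly under the `--output` directory used in Step 3):
+
+```python
+import pandas as pd
+
+report = "comparison_results/final_analysis_report.xlsx"
+
+# High-level comparison table (sheet names as listed above)
+dashboard = pd.read_excel(report, sheet_name="Summary_Dashboard")
+print(dashboard)
+
+# Hidden raw-data sheets remain readable; list everything in the workbook
+print(pd.ExcelFile(report).sheet_names)
+```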
+
+## Generated Visualizations
+
+### HTML Report
+- `performance_analysis_report.html` - Complete report with all embedded plots
+
+### Individual Plot Files (12 Total)
+1. `plot1_percentage_change_overview.png` - Horizontal bar chart showing performance changes
+2. `plot2_absolute_time_comparison.png` - Bar chart comparing absolute times
+3. `plot3_performance_heatmap.png` - Heatmap of performance by rank
+4. `plot4_total_execution_time.png` - Line plot of total execution time per rank
+5. `plot5_computation_time.png` - Line plot of computation time across ranks
+6. `plot6_communication_time.png` - Line plot of communication time across ranks
+7. `plot7_idle_time.png` - Line plot of idle time across ranks
+8. `plot8_percentage_difference_all_metrics.png` - Bar plot showing percentage differences for all metrics
+9. `plot9_nccl_latency.png` - Bar chart of communication latency vs message size
+10. `plot10_algorithm_bandwidth.png` - Bar chart of algorithm bandwidth vs message size
+11. `plot11_bus_bandwidth.png` - Bar chart of bus bandwidth vs message size
+12. `plot12_nccl_summary.png` - Combined percentage summary and total latency
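+
+The plots can be regenerated on their own from an existing final report, without rerunning TraceLens or the comparison steps (the paths below assume the Step 3 output directory):
+
+```bash
+python scripts/tracelens_single_config/generate_enhanced_plots.py \
+    --input comparison_results/final_analysis_report.xlsx \
+    --output comparison_results/plots
+```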
+
+## Key Metrics Analyzed
+
+**GPU Metrics:**
+- `computation_time` - Time spent in computation
+- `total_comm_time` - Total communication time
+- `exposed_comm_time` - Non-overlapped communication time
+- `idle_time` - GPU idle time
+- `total_memcpy_time` - Memory copy time
+- `exposed_memcpy_time` - Non-overlapped memory copy time
+- `busy_time` - Total GPU busy time
+- `total_time` - Total execution time
+
+**NCCL Metrics:**
+- `comm_latency_mean` - Average communication latency
+- `algo bw (GB/s)_mean` - Algorithm bandwidth
+- `bus bw (GB/s)_mean` - Bus bandwidth
+- `Total comm latency (ms)` - Total communication latency
+- `count` - Number of operations
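+
+The comparison scripts flip the sign convention per metric so that "positive = better" holds everywhere: for latency/time a reduction is reported as a positive `percent_change`, while for bandwidth an increase is positive. A small illustrative calculation (the values are made up):
+
+```python
+base_latency, test_latency = 2.0, 1.6   # ms, lower is better
+base_bw, test_bw = 90.0, 99.0           # GB/s, higher is better
+
+# Time-like metrics: positive when the test run takes less time
+pct_change_latency = (base_latency - test_latency) / base_latency * 100   # +20.0
+
+# Bandwidth-like metrics: positive when the test run moves more data per second
+pct_change_bw = (test_bw - base_bw) / base_bw * 100                       # +10.0
+```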
+
+## Convert to PDF
+
+1. Open `performance_analysis_report.html` in browser
+2. Print to PDF (Ctrl+P or Cmd+P)
+3. Choose landscape orientation for better plot visibility
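+
+If no browser UI is available (e.g., on a remote node), a headless Chrome/Chromium install can usually perform the conversion from the command line (the binary name varies by system):
+
+```bash
+chromium --headless --print-to-pdf=performance_analysis_report.pdf \
+    performance_analysis_report.html
+```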
diff --git a/scripts/tracelens_single_config/add_collective_comparison.py b/scripts/tracelens_single_config/add_collective_comparison.py
new file mode 100644
index 0000000..6f3f310
--- /dev/null
+++ b/scripts/tracelens_single_config/add_collective_comparison.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
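+"""
+Add baseline-vs-test comparison sheets to a combined collective/NCCL report.
+
+Keeps the nccl summary sheets from the combined workbook, computes per-collective
+diffs, percent changes and ratios, and writes color-coded *_cmp sheets.
+Example invocation (paths and labels are illustrative):
+
+    python add_collective_comparison.py \
+        --input collective_combined.xlsx \
+        --output collective_comparison.xlsx \
+        --baseline-label 56cu_256threads --test-label 37cu_384threads
+"""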
+import pandas as pd
+import argparse
+from openpyxl.utils import get_column_letter
+from openpyxl.formatting.rule import ColorScaleRule
+
+
+def add_collective_comparison_sheets(input_path, output_path, baseline_label='baseline', test_label='test'):
+ print(f"Loading: {input_path}")
+ print(f" Baseline label: {baseline_label}")
+ print(f" Test label: {test_label}")
+
+ xl = pd.ExcelFile(input_path)
+
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+ # Copy only summary sheets
+ for sheet_name in xl.sheet_names:
+ # Only keep sheets with 'summary' in the name
+ if 'summary' not in sheet_name.lower():
+ print(f" Skip {sheet_name} (keeping only summary sheets)")
+ continue
+ df = pd.read_excel(input_path, sheet_name=sheet_name)
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
+ print(f" Copied {sheet_name}")
+
+ # Process summary sheets for comparison
+ for sheet_name in ['nccl_summary_implicit_sync', 'nccl_summary_long']:
+ if sheet_name not in xl.sheet_names:
+ continue
+
+ df = pd.read_excel(input_path, sheet_name=sheet_name)
+
+ # Get actual source values from the dataframe
+ sources = df['source'].unique()
+ # Determine which is baseline and which is test (baseline should be first)
+ if len(sources) >= 2:
+ actual_baseline = sources[0]
+ actual_test = sources[1]
+ else:
+ actual_baseline = baseline_label
+ actual_test = test_label
+
+ # Separate baseline and test
+ baseline_df = df[df['source'] == actual_baseline].copy()
+ test_df = df[df['source'] == actual_test].copy()
+
+ if len(baseline_df) == 0 or len(test_df) == 0:
+ print(f" Skip {sheet_name} - missing data")
+ continue
+
+ # Create comparison dataframe
+ comparison = pd.DataFrame()
+
+ # Identify key columns for grouping
+ group_cols = ['Collective name', 'dtype', 'In msg nelems']
+ if not all(col in baseline_df.columns for col in group_cols):
+ group_cols = ['Collective name']
+
+ # Group and compare
+ baseline_grouped = baseline_df.groupby(group_cols, as_index=False)
+ test_grouped = test_df.groupby(group_cols, as_index=False)
+
+ for name, base_group in baseline_grouped:
+ # Find matching test group
+ if isinstance(name, tuple):
+ mask = pd.Series([True] * len(test_df), index=test_df.index)
+ for col, val in zip(group_cols, name):
+ mask = mask & (test_df[col] == val)
+ else:
+ mask = (test_df[group_cols[0]] == name)
+
+ test_group = test_df.loc[mask]
+
+ if len(test_group) == 0:
+ continue
+
+ # Create comparison row
+ comp_row = {}
+
+ # Copy grouping columns
+ if isinstance(name, tuple):
+ for col, val in zip(group_cols, name):
+ comp_row[col] = val
+ else:
+ comp_row[group_cols[0]] = name
+
+ # Compare numeric columns
+ numeric_cols = ['comm_latency_mean', 'algo bw (GB/s)_mean', 'bus bw (GB/s)_mean',
+ 'Total comm latency (ms)', 'count']
+
+ for col in numeric_cols:
+ if col not in base_group.columns or col not in test_group.columns:
+ continue
+
+ base_val = base_group[col].values[0]
+ test_val = test_group[col].values[0]
+
+ comp_row[f'{baseline_label}_{col}'] = base_val
+ comp_row[f'{test_label}_{col}'] = test_val
+ comp_row[f'diff_{col}'] = test_val - base_val
+
+ # For latency/time: positive percent_change means faster (less time)
+ # For bandwidth: positive percent_change means better (more bandwidth)
+ if 'latency' in col.lower() or 'time' in col.lower():
+                    # Lower is better - positive when the test run is faster
+ pct_change = (base_val - test_val) / base_val * 100 if base_val != 0 else 0
+ comp_row[f'percent_change_{col}'] = pct_change
+ elif 'bw' in col.lower() or 'bandwidth' in col.lower():
+                    # Higher is better - positive when the test run has more bandwidth
+ pct_change = (test_val - base_val) / base_val * 100 if base_val != 0 else 0
+ comp_row[f'percent_change_{col}'] = pct_change
+
+ comp_row[f'ratio_{col}'] = test_val / base_val if base_val != 0 else 0
+
+ comparison = pd.concat([comparison, pd.DataFrame([comp_row])], ignore_index=True)
+
+ # Write comparison sheet (shorten name to fit Excel's 31 char limit)
+            # Shorten: replace 'nccl_summary_' with 'nccl_' and append '_cmp'
+ comparison_sheet_name = sheet_name.replace('nccl_summary_', 'nccl_') + '_cmp'
+ comparison.to_excel(writer, sheet_name=comparison_sheet_name, index=False)
+ print(f" Added {comparison_sheet_name}")
+
+ # Add conditional formatting to percent_change columns
+ print(f" Applying conditional formatting to {comparison_sheet_name}...")
+
+ ws = writer.sheets[comparison_sheet_name]
+
+ # Format all percent_change columns with color scale
+ for col_idx, col in enumerate(comparison.columns, start=1):
+ if 'percent_change' in col:
+                    # Convert column index to an Excel column letter (A, B, ..., Z, AA, ...)
+                    col_letter = get_column_letter(col_idx)
+
+ data_range = f'{col_letter}2:{col_letter}{len(comparison)+1}'
+
+ # Color scale: red (min/negative) -> white (0) -> green (max/positive)
+ ws.conditional_formatting.add(data_range,
+ ColorScaleRule(
+ start_type='min', start_color='F8696B', # Red
+ mid_type='num', mid_value=0, mid_color='FFFFFF', # White
+ end_type='max', end_color='63BE7B' # Green
+ ))
+
+ print(f" Formatted {col}")
+
+ print(f"\nSaved: {output_path}")
+ print("\nNew comparison sheets added")
+ print("percent_change interpretation:")
+ print(" For latency/time: Positive = faster (less time)")
+ print(" For bandwidth: Positive = better (more bandwidth)")
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Add comparison sheets to combined collective reports')
+ parser.add_argument('--input', required=True, help='Input combined collective Excel file')
+ parser.add_argument('--output', required=True, help='Output Excel file with comparison sheets')
+ parser.add_argument('--baseline-label', default='baseline', help='Label for baseline data')
+ parser.add_argument('--test-label', default='test', help='Label for test data')
+
+ args = parser.parse_args()
+
+ return add_collective_comparison_sheets(args.input, args.output, args.baseline_label, args.test_label)
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/add_comparison_sheets.py b/scripts/tracelens_single_config/add_comparison_sheets.py
new file mode 100755
index 0000000..765f391
--- /dev/null
+++ b/scripts/tracelens_single_config/add_comparison_sheets.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
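+"""
+Add baseline-vs-test comparison sheets to a combined GPU timeline report.
+
+Copies the original sheets and appends Comparison_By_Rank and Summary_Comparison
+with diffs, percent changes and color scales. Example invocation (paths and
+labels are illustrative):
+
+    python add_comparison_sheets.py \
+        --input gpu_timeline_combined.xlsx \
+        --output gpu_timeline_comparison.xlsx \
+        --baseline-label 56cu_256threads --test-label 37cu_384threads
+"""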
+import pandas as pd
+import argparse
+from openpyxl.styles import Color
+from openpyxl.formatting.rule import ColorScaleRule
+
+
+def add_comparison_sheets(input_path, output_path, baseline_label='baseline', test_label='test'):
+ print(f"Loading: {input_path}")
+ print(f" Baseline label: {baseline_label}")
+ print(f" Test label: {test_label}")
+
+ xl = pd.ExcelFile(input_path)
+
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+ # Copy all original sheets
+ for sheet_name in xl.sheet_names:
+ df = pd.read_excel(input_path, sheet_name=sheet_name)
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
+ print(f" Copied {sheet_name}")
+
+ # Add comparison sheets
+ all_combined = pd.read_excel(input_path, sheet_name='All_Ranks_Combined')
+
+ # Get actual source values from the dataframe
+ sources = all_combined['source'].unique()
+ # Determine which is baseline and which is test (baseline should be first)
+ if len(sources) >= 2:
+ actual_baseline = sources[0]
+ actual_test = sources[1]
+ else:
+ actual_baseline = baseline_label
+ actual_test = test_label
+
+ # Comparison 1: Side-by-side by rank
+ baseline_data = all_combined[all_combined['source'] == actual_baseline]
+ test_data = all_combined[all_combined['source'] == actual_test]
+
+ comparison_by_rank = pd.DataFrame()
+ for rank in sorted(baseline_data['rank'].unique()):
+ base_rank = baseline_data[baseline_data['rank'] == rank].set_index('type')
+ test_rank = test_data[test_data['rank'] == rank].set_index('type')
+
+ for metric_type in base_rank.index:
+ if metric_type in test_rank.index:
+ base_time = base_rank.loc[metric_type, 'time ms']
+ test_time = test_rank.loc[metric_type, 'time ms']
+ ratio_val = test_time / base_time if base_time != 0 else 0
+ # Percentage change: positive when test is faster (takes less time)
+ pct_change = (base_time - test_time) / base_time * 100 if base_time != 0 else 0
+
+ # Determine if better or worse
+ if pct_change > 1:
+ status = 'Better'
+ elif pct_change < -1:
+ status = 'Worse'
+ else:
+ status = 'Similar'
+
+ comparison_by_rank = pd.concat([comparison_by_rank, pd.DataFrame({
+ 'rank': [rank],
+ 'type': [metric_type],
+ f'{baseline_label}_time_ms': [base_time],
+ f'{test_label}_time_ms': [test_time],
+ 'diff_time_ms': [test_time - base_time],
+ 'percent_change': [pct_change],
+ 'status': [status],
+ 'ratio': [ratio_val],
+ f'{baseline_label}_percent': [base_rank.loc[metric_type, 'percent']],
+ f'{test_label}_percent': [test_rank.loc[metric_type, 'percent']],
+ 'diff_percent': [test_rank.loc[metric_type, 'percent'] - base_rank.loc[metric_type, 'percent']]
+ })], ignore_index=True)
+
+ comparison_by_rank.to_excel(writer, sheet_name='Comparison_By_Rank', index=False)
+ print(f" Added Comparison_By_Rank")
+
+ # Comparison 2: Summary comparison
+ summary = pd.read_excel(input_path, sheet_name='Summary')
+ baseline_summary = summary[summary['source'] == actual_baseline].set_index('type')
+ test_summary = summary[summary['source'] == actual_test].set_index('type')
+
+ summary_comparison = pd.DataFrame()
+ for metric_type in baseline_summary.index:
+ if metric_type in test_summary.index:
+ base_time = baseline_summary.loc[metric_type, 'time ms']
+ test_time = test_summary.loc[metric_type, 'time ms']
+ ratio_val = test_time / base_time if base_time != 0 else 0
+ # Percentage change: positive when test is faster (takes less time)
+ pct_change = (base_time - test_time) / base_time * 100 if base_time != 0 else 0
+
+ summary_comparison = pd.concat([summary_comparison, pd.DataFrame({
+ 'type': [metric_type],
+ f'{baseline_label}_time_ms': [base_time],
+ f'{test_label}_time_ms': [test_time],
+ 'diff_time_ms': [test_time - base_time],
+ 'percent_change': [pct_change],
+ 'ratio': [ratio_val],
+ f'{baseline_label}_percent': [baseline_summary.loc[metric_type, 'percent']],
+ f'{test_label}_percent': [test_summary.loc[metric_type, 'percent']],
+ 'diff_percent': [test_summary.loc[metric_type, 'percent'] - baseline_summary.loc[metric_type, 'percent']]
+ })], ignore_index=True)
+
+ summary_comparison.to_excel(writer, sheet_name='Summary_Comparison', index=False)
+ print(f" Added Summary_Comparison")
+
+ # Add conditional formatting to percent_change columns
+ print("\n Applying conditional formatting...")
+
+ # Create color scale: Red (negative) -> White (0) -> Green (positive)
+
+ # Format Comparison_By_Rank
+ ws_rank = writer.sheets['Comparison_By_Rank']
+ # Find percent_change column
+ for col_idx, col in enumerate(comparison_by_rank.columns, start=1):
+ if col == 'percent_change':
+ col_letter = chr(64 + col_idx) # Convert to Excel column letter
+ data_range = f'{col_letter}2:{col_letter}{len(comparison_by_rank)+1}'
+ # Color scale: red (min) -> white (0) -> green (max)
+ ws_rank.conditional_formatting.add(data_range,
+ ColorScaleRule(
+ start_type='min', start_color='F8696B', # Red
+ mid_type='num', mid_value=0, mid_color='FFFFFF', # White
+ end_type='max', end_color='63BE7B' # Green
+ ))
+ print(f" Formatted Comparison_By_Rank column {col}")
+ break
+
+ # Format Summary_Comparison
+ ws_summary = writer.sheets['Summary_Comparison']
+ for col_idx, col in enumerate(summary_comparison.columns, start=1):
+ if col == 'percent_change':
+ col_letter = chr(64 + col_idx)
+ data_range = f'{col_letter}2:{col_letter}{len(summary_comparison)+1}'
+ # Color scale: red (min) -> white (0) -> green (max)
+ ws_summary.conditional_formatting.add(data_range,
+ ColorScaleRule(
+ start_type='min', start_color='F8696B', # Red
+ mid_type='num', mid_value=0, mid_color='FFFFFF', # White
+ end_type='max', end_color='63BE7B' # Green
+ ))
+ print(f" Formatted Summary_Comparison column {col}")
+ break
+
+ print(f"\nSaved: {output_path}")
+ print("\nNew sheets:")
+ print(" Comparison_By_Rank - Side-by-side comparison for each rank")
+ print(" Summary_Comparison - Overall comparison")
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Add comparison sheets to combined GPU timeline')
+ parser.add_argument('--input', required=True, help='Input combined Excel file')
+ parser.add_argument('--output', required=True, help='Output Excel file with comparison sheets')
+ parser.add_argument('--baseline-label', default='baseline', help='Label for baseline data')
+ parser.add_argument('--test-label', default='test', help='Label for test data')
+
+ args = parser.parse_args()
+
+ return add_comparison_sheets(args.input, args.output, args.baseline_label, args.test_label)
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/combine_reports.py b/scripts/tracelens_single_config/combine_reports.py
new file mode 100755
index 0000000..e5a4a95
--- /dev/null
+++ b/scripts/tracelens_single_config/combine_reports.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
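+"""
+Combine a baseline and a test collective_all_ranks.xlsx into one workbook,
+tagging every row with a 'source' column derived from the run folder name.
+Example invocation (paths are illustrative):
+
+    python combine_reports.py \
+        --baseline <baseline_run>/collective_all_ranks.xlsx \
+        --test <test_run>/collective_all_ranks.xlsx \
+        --output collective_combined.xlsx
+"""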
+import pandas as pd
+import argparse
+from pathlib import Path
+
+
+def combine_collective_reports(baseline_path, test_path, output_path):
+ # Extract folder names from paths for labels
+ baseline_label = Path(baseline_path).parent.parent.name # Get the config folder name
+ test_label = Path(test_path).parent.parent.name # Get the config folder name
+
+ print(f"Loading baseline ({baseline_label}): {baseline_path}")
+ baseline_xl = pd.ExcelFile(baseline_path)
+
+ print(f"Loading test ({test_label}): {test_path}")
+ test_xl = pd.ExcelFile(test_path)
+
+ print(f"\nBaseline sheets: {baseline_xl.sheet_names}")
+ print(f"Test sheets: {test_xl.sheet_names}")
+
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+ for sheet_name in baseline_xl.sheet_names:
+ if sheet_name not in test_xl.sheet_names:
+ print(f" Skip {sheet_name} - not in test file")
+ continue
+
+ baseline_df = pd.read_excel(baseline_path, sheet_name=sheet_name)
+ test_df = pd.read_excel(test_path, sheet_name=sheet_name)
+
+ baseline_df['source'] = baseline_label
+ test_df['source'] = test_label
+
+ combined = pd.concat([baseline_df, test_df], ignore_index=True)
+
+ combined.to_excel(writer, sheet_name=sheet_name, index=False)
+ print(f" Combined {sheet_name}: {len(baseline_df)} + {len(test_df)} = {len(combined)} rows")
+
+ print(f"\nSaved: {output_path}")
+ return 0 # Return success code
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Combine two collective reports')
+ parser.add_argument('--baseline', required=True, help='Path to baseline collective_all_ranks.xlsx')
+ parser.add_argument('--test', required=True, help='Path to test collective_all_ranks.xlsx')
+ parser.add_argument('--output', required=True, help='Output path for combined Excel file')
+
+ args = parser.parse_args()
+
+ return combine_collective_reports(args.baseline, args.test, args.output)
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/create_final_report.py b/scripts/tracelens_single_config/create_final_report.py
new file mode 100755
index 0000000..16caac9
--- /dev/null
+++ b/scripts/tracelens_single_config/create_final_report.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python3
+"""
+Create final comprehensive report with combined and comparison data.
+Raw data sheets are hidden and all data is formatted as Excel tables.
+"""
+import pandas as pd
+import argparse
+from pathlib import Path
+from openpyxl import load_workbook
+from openpyxl.worksheet.table import Table, TableStyleInfo
+from openpyxl.styles import Color
+from openpyxl.formatting.rule import ColorScaleRule
+
+
+def get_column_letter(col_num):
+ """Convert column number to Excel column letter."""
+ result = ""
+ while col_num > 0:
+ col_num -= 1
+ result = chr(65 + (col_num % 26)) + result
+ col_num //= 26
+ return result
+
+
+def add_excel_table(worksheet, table_name, start_row=1):
+ """Convert worksheet data to Excel table format."""
+ # Find data range
+ max_row = worksheet.max_row
+ max_col = worksheet.max_column
+
+ if max_row <= start_row:
+ return # No data
+
+ # Ensure all column headers are strings
+ for col_idx in range(1, max_col + 1):
+ cell = worksheet.cell(row=start_row, column=col_idx)
+ if cell.value is not None and not isinstance(cell.value, str):
+ cell.value = str(cell.value)
+
+ # Create table reference using proper column letter conversion
+ start_cell = f"A{start_row}"
+ end_col_letter = get_column_letter(max_col)
+ end_cell = f"{end_col_letter}{max_row}"
+ table_ref = f"{start_cell}:{end_cell}"
+
+ # Create table with style
+ try:
+ tab = Table(displayName=table_name, ref=table_ref)
+ style = TableStyleInfo(
+ name="TableStyleMedium2",
+ showFirstColumn=False,
+ showLastColumn=False,
+ showRowStripes=True,
+ showColumnStripes=False
+ )
+ tab.tableStyleInfo = style
+
+ # Add table to worksheet
+ worksheet.add_table(tab)
+ except Exception as e:
+ print(f" Warning: Could not create table {table_name}: {e}")
+
+
+def create_final_report(gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file, baseline_label='Baseline', test_label='Test'):
+ """Create comprehensive report with all data."""
+
+ print("Creating comprehensive final report...")
+ print(f" Output: {output_file}")
+ print(f" Baseline: {baseline_label}")
+ print(f" Test: {test_label}")
+
+ # Track sheet info for hiding/organizing
+ raw_sheets = []
+ comparison_sheets = []
+ summary_sheets = []
+
+ with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
+
+ # === GPU TIMELINE SHEETS ===
+ print("\nAdding GPU Timeline sheets...")
+
+ # Read GPU combined (raw data)
+ gpu_comb_xl = pd.ExcelFile(gpu_combined)
+ sheet_mapping = {
+ 'Summary': 'GPU_Summary_Raw',
+ 'All_Ranks_Combined': 'GPU_AllRanks_Raw',
+ 'Per_Rank_Time_ms': 'GPU_Time_Raw',
+ 'Per_Rank_Percent': 'GPU_Pct_Raw'
+ }
+ for sheet_name in gpu_comb_xl.sheet_names:
+ df = pd.read_excel(gpu_combined, sheet_name=sheet_name)
+ new_name = sheet_mapping.get(sheet_name, f"GPU_{sheet_name}_Raw")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ raw_sheets.append(new_name)
+ print(f" Added {new_name} (will be hidden)")
+
+ # Read GPU comparison
+ gpu_comp_xl = pd.ExcelFile(gpu_comparison)
+ comp_mapping = {
+ 'Summary_Comparison': 'GPU_Summary_Cmp',
+ 'Comparison_By_Rank': 'GPU_ByRank_Cmp'
+ }
+ for sheet_name in gpu_comp_xl.sheet_names:
+ if 'Comparison' in sheet_name:
+ df = pd.read_excel(gpu_comparison, sheet_name=sheet_name)
+ new_name = comp_mapping.get(sheet_name, f"GPU_{sheet_name}")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ comparison_sheets.append(new_name)
+ print(f" Added {new_name}")
+
+ # === COLLECTIVE SHEETS ===
+ print("\nAdding Collective/NCCL sheets...")
+
+ # Read collective combined (raw data for hidden sheets)
+ coll_comb_xl = pd.ExcelFile(coll_combined)
+ coll_mapping = {
+ 'nccl_summary_implicit_sync': 'NCCL_ImplSync_Raw',
+ 'nccl_summary_long': 'NCCL_Long_Raw'
+ }
+ for sheet_name in coll_comb_xl.sheet_names:
+ if 'summary' in sheet_name.lower():
+ df = pd.read_excel(coll_combined, sheet_name=sheet_name)
+ new_name = coll_mapping.get(sheet_name, f"NCCL_{sheet_name}_Raw")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ raw_sheets.append(new_name)
+ print(f" Added {new_name} (will be hidden)")
+
+ # Read collective comparison - include ALL sheets
+ coll_comp_xl = pd.ExcelFile(coll_comparison)
+ for sheet_name in coll_comp_xl.sheet_names:
+ df = pd.read_excel(coll_comparison, sheet_name=sheet_name)
+
+ # Determine appropriate naming
+ if 'nccl' in sheet_name.lower():
+ if '_cmp' in sheet_name or 'comparison' in sheet_name.lower():
+ new_name = f"NCCL_{sheet_name.replace('nccl_', '').title().replace('_', '')}"
+ else:
+ new_name = f"NCCL_{sheet_name}"
+ else:
+ new_name = sheet_name
+
+ df.to_excel(writer, sheet_name=new_name, index=False)
+
+ if '_cmp' in sheet_name.lower() or 'comparison' in sheet_name.lower():
+ comparison_sheets.append(new_name)
+ else:
+ raw_sheets.append(new_name)
+
+ print(f" Added {new_name}")
+
+ # === CREATE SUMMARY DASHBOARD ===
+ print("\nCreating Summary Dashboard...")
+
+ # Read key metrics for dashboard
+ gpu_summary = pd.read_excel(gpu_comparison, sheet_name='Summary_Comparison')
+
+ # Create dashboard data
+ dashboard_data = {
+ 'Metric': [],
+ baseline_label: [],
+ test_label: [],
+ 'Improvement (%)': [],
+ 'Status': []
+ }
+
+ # Add GPU metrics
+ # Find the actual column names (they may be config-specific like '32cu_512threads_time_ms')
+ time_cols = [col for col in gpu_summary.columns if 'time_ms' in col and 'diff' not in col and 'percent' not in col]
+ if len(time_cols) >= 2:
+ baseline_col = time_cols[0]
+ test_col = time_cols[1]
+ else:
+ # Fallback to default names
+ baseline_col = 'baseline_time_ms' if 'baseline_time_ms' in gpu_summary.columns else time_cols[0] if time_cols else None
+ test_col = 'test_time_ms' if 'test_time_ms' in gpu_summary.columns else time_cols[1] if len(time_cols) > 1 else None
+
+ if baseline_col and test_col:
+ for _, row in gpu_summary.iterrows():
+ metric_type = row['type']
+ dashboard_data['Metric'].append(f"GPU_{metric_type}")
+ dashboard_data[baseline_label].append(round(row[baseline_col], 2))
+ dashboard_data[test_label].append(round(row[test_col], 2))
+ dashboard_data['Improvement (%)'].append(round(row['percent_change'], 2) if 'percent_change' in row else 0)
+
+ pct_val = row['percent_change'] if 'percent_change' in row else 0
+ dashboard_data['Status'].append('Better' if pct_val > 0 else 'Worse' if pct_val < -1 else 'Similar')
+
+ dashboard_df = pd.DataFrame(dashboard_data)
+ dashboard_df.to_excel(writer, sheet_name='Summary_Dashboard', index=False)
+ summary_sheets.append('Summary_Dashboard')
+ print(f" Added Summary_Dashboard")
+
+ # Now modify the workbook to hide sheets and add tables
+ print("\nApplying formatting...")
+ wb = load_workbook(output_file)
+
+ # Hide raw data sheets
+ for sheet_name in raw_sheets:
+ if sheet_name in wb.sheetnames:
+ wb[sheet_name].sheet_state = 'hidden'
+ print(f" Hidden: {sheet_name}")
+
+ # Convert all sheets to tables
+ for sheet_name in wb.sheetnames:
+ ws = wb[sheet_name]
+
+ # Skip if sheet is empty
+ if ws.max_row <= 1:
+ continue
+
+ # Create unique table name from sheet name (remove special chars)
+ table_name = sheet_name.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')
+ # Ensure name starts with letter and is max 255 chars
+ if not table_name[0].isalpha():
+ table_name = 'Tbl_' + table_name
+ table_name = table_name[:255]
+
+ add_excel_table(ws, table_name)
+ print(f" Converted to table: {sheet_name}")
+
+ # Add conditional formatting for percent_change columns
+ if 'Cmp' in sheet_name or 'Comparison' in sheet_name:
+ # Find percent_change columns
+ for col_idx in range(1, ws.max_column + 1):
+ cell_value = ws.cell(row=1, column=col_idx).value
+ if cell_value and 'percent_change' in str(cell_value):
+ col_letter = get_column_letter(col_idx)
+ data_range = f'{col_letter}2:{col_letter}{ws.max_row}'
+
+ # Apply color scale: red (min/negative) -> white (0) -> green (max/positive)
+ try:
+ ws.conditional_formatting.add(data_range,
+ ColorScaleRule(
+ start_type='min', start_color='F8696B', # Red
+ mid_type='num', mid_value=0, mid_color='FFFFFF', # White
+ end_type='max', end_color='63BE7B' # Green
+ ))
+ print(f" Applied color scale to {sheet_name} column {cell_value}")
+ except Exception as e:
+ print(f" Warning: Could not apply formatting to {cell_value}: {e}")
+
+ # Move Summary Dashboard to first position
+ if 'Summary_Dashboard' in wb.sheetnames:
+ dashboard_sheet = wb['Summary_Dashboard']
+ wb.move_sheet(dashboard_sheet, offset=-(len(wb.sheetnames)-1))
+ wb.active = 0 # Set dashboard as active sheet
+ print("\n Moved Summary_Dashboard to first position")
+
+ # Save workbook
+ wb.save(output_file)
+ print(f"\nFinal report saved: {output_file}")
+
+ # Report structure
+ print("\nReport Structure:")
+ print(" Visible Sheets (Analysis):")
+ print(f" - Summary_Dashboard")
+ for sheet in comparison_sheets:
+ print(f" - {sheet}")
+ print("\n Hidden Sheets (Raw Data):")
+ for sheet in raw_sheets:
+ print(f" - {sheet}")
+ print("\n All data formatted as Excel tables with filters")
+ print(" Percent change columns are color-coded (green=better, red=worse)")
+ print("\nUsers can unhide raw data sheets in Excel: Right-click any sheet tab → Unhide")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Create final comprehensive report with all data',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Example:
+ python create_final_report.py \\
+ --gpu-combined gpu_timeline_combined.xlsx \\
+ --gpu-comparison gpu_timeline_comparison.xlsx \\
+ --coll-combined collective_combined.xlsx \\
+ --coll-comparison collective_comparison.xlsx \\
+ --output final_analysis_report.xlsx
+ """
+ )
+
+ parser.add_argument('--gpu-combined', required=True,
+ help='Path to GPU timeline combined file')
+ parser.add_argument('--gpu-comparison', required=True,
+ help='Path to GPU timeline comparison file')
+ parser.add_argument('--coll-combined', required=True,
+ help='Path to collective combined file')
+ parser.add_argument('--coll-comparison', required=True,
+ help='Path to collective comparison file')
+ parser.add_argument('--output', required=True,
+ help='Output path for final report')
+ parser.add_argument('--baseline-label', default='Baseline',
+ help='Label for baseline configuration')
+ parser.add_argument('--test-label', default='Test',
+ help='Label for test configuration')
+
+ args = parser.parse_args()
+
+ # Validate inputs
+ for file_arg in ['gpu_combined', 'gpu_comparison', 'coll_combined', 'coll_comparison']:
+ file_path = getattr(args, file_arg)
+ if not Path(file_path).exists():
+ print(f"Error: File not found: {file_path}")
+ return 1
+
+ create_final_report(
+ args.gpu_combined,
+ args.gpu_comparison,
+ args.coll_combined,
+ args.coll_comparison,
+ args.output,
+ args.baseline_label.replace('_', ' '),
+ args.test_label.replace('_', ' ')
+ )
+
+ return 0
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/generate_enhanced_plots.py b/scripts/tracelens_single_config/generate_enhanced_plots.py
new file mode 100755
index 0000000..226310d
--- /dev/null
+++ b/scripts/tracelens_single_config/generate_enhanced_plots.py
@@ -0,0 +1,766 @@
+#!/usr/bin/env python3
+"""
+Enhanced plot generation matching the PDF report style.
+Generates exactly 12 plots as specified.
+"""
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import argparse
+from pathlib import Path
+import warnings
+import base64
+from datetime import datetime
+warnings.filterwarnings('ignore')
+
+plt.style.use('seaborn-v0_8-whitegrid')
+sns.set_palette("husl")
+
+
+def plot1_percentage_change(summary_data, output_dir):
+ """Plot 1: Percentage Change Overview."""
+ print("\nGenerating Plot 1: Percentage Change Overview")
+
+ columns = summary_data.columns.tolist()
+ baseline_label = columns[1] if len(columns) > 1 else 'Baseline'
+ test_label = columns[2] if len(columns) > 2 else 'Test'
+
+ if 'Improvement (%)' not in summary_data.columns:
+ print(" No improvement data found")
+ return
+
+ metrics = summary_data['Metric'].values
+ values = summary_data['Improvement (%)'].values
+
+ fig, ax = plt.subplots(figsize=(12, 8))
+ colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values]
+ bars = ax.barh(metrics, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
+
+ for bar, val in zip(bars, values):
+ x_pos = bar.get_width()
+ ax.text(x_pos + (0.5 if x_pos > 0 else -0.5), bar.get_y() + bar.get_height()/2,
+ f'{val:.1f}%', ha='left' if x_pos > 0 else 'right', va='center', fontweight='bold')
+
+ ax.axvline(x=0, color='black', linestyle='-', linewidth=1)
+ ax.set_xlabel('Percent Change (%)', fontsize=12)
+ ax.set_title(f'GPU Metrics: Percent Change ({baseline_label} vs {test_label})\nPositive = Improvement ({test_label} Faster)',
+ fontsize=14, fontweight='bold')
+ ax.grid(True, alpha=0.3, axis='x')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot1_percentage_change_overview.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot1_percentage_change_overview.png")
+
+
+def plot2_absolute_time_comparison(summary_data, output_dir):
+ """Plot 2: Absolute Time Comparison."""
+ print("\nGenerating Plot 2: Absolute Time Comparison")
+
+ columns = summary_data.columns.tolist()
+ baseline_label = columns[1] if len(columns) > 1 else 'Baseline'
+ test_label = columns[2] if len(columns) > 2 else 'Test'
+
+ metrics = summary_data['Metric'].values
+ baseline_values = summary_data[baseline_label].values
+ test_values = summary_data[test_label].values
+
+ fig, ax = plt.subplots(figsize=(14, 8))
+
+ x = np.arange(len(metrics))
+ width = 0.35
+
+ bars1 = ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue')
+ bars2 = ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange')
+
+ ax.set_xlabel('Metric Type', fontsize=12)
+ ax.set_ylabel('Time (ms)', fontsize=12)
+ ax.set_title('GPU Metrics: Absolute Time Comparison', fontsize=14, fontweight='bold')
+ ax.set_xticks(x)
+ ax.set_xticklabels(metrics, rotation=45, ha='right')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot2_absolute_time_comparison.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot2_absolute_time_comparison.png")
+
+
+def plot3_performance_heatmap(byrank_data, output_dir):
+ """Plot 3: Performance Heatmap by Rank."""
+ print("\nGenerating Plot 3: Performance Heatmap by Rank")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ metrics = byrank_data['type'].unique() if 'type' in byrank_data.columns else []
+ ranks = sorted(byrank_data['rank'].unique()) if 'rank' in byrank_data.columns else []
+
+ time_cols = [col for col in byrank_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+ time_col = time_cols[-1] if len(time_cols) > 1 else time_cols[0] if time_cols else None
+
+ if not time_col:
+ print(" No time column found")
+ return
+
+ heatmap_data = byrank_data.pivot_table(index='type', columns='rank', values=time_col, aggfunc='mean')
+
+ fig, ax = plt.subplots(figsize=(12, 8))
+ sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='YlOrRd', cbar_kws={'label': 'Time (ms)'}, ax=ax)
+
+ ax.set_title('Performance Heatmap by Rank (Time in ms)', fontsize=14, fontweight='bold')
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Metric Type', fontsize=12)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot3_performance_heatmap.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot3_performance_heatmap.png")
+
+
+def plot4_total_execution_time(byrank_data, output_dir):
+ """Plot 4: Total Execution Time by Rank (Line Plot)."""
+ print("\nGenerating Plot 4: Total Execution Time by Rank")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ total_time_data = byrank_data[byrank_data['type'] == 'total_time']
+ if total_time_data.empty:
+ print(" No total_time data found")
+ return
+
+ ranks = sorted(total_time_data['rank'].unique())
+ time_cols = [col for col in total_time_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ for col in time_cols[:2]:
+ times = [total_time_data[total_time_data['rank'] == r][col].values[0] if not total_time_data[total_time_data['rank'] == r].empty else 0 for r in ranks]
+ label = col.replace('_time_ms', '').replace('_', ' ')
+ ax.plot(ranks, times, marker='o', markersize=8, linewidth=2, label=label, alpha=0.8)
+
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Total Execution Time (ms)', fontsize=12)
+ ax.set_title('Total Execution Time by Rank', fontsize=14, fontweight='bold')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+ ax.set_xticks(ranks)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot4_total_execution_time.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot4_total_execution_time.png")
+
+
+def plot5_computation_time(byrank_data, output_dir):
+ """Plot 5: Computation Time Across Ranks."""
+ print("\nGenerating Plot 5: Computation Time Across Ranks")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ comp_data = byrank_data[byrank_data['type'] == 'computation_time']
+ if comp_data.empty:
+ print(" No computation_time data found")
+ return
+
+ ranks = sorted(comp_data['rank'].unique())
+ time_cols = [col for col in comp_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ for col in time_cols[:2]:
+ times = [comp_data[comp_data['rank'] == r][col].values[0] if not comp_data[comp_data['rank'] == r].empty else 0 for r in ranks]
+ label = col.replace('_time_ms', '').replace('_', ' ')
+ ax.plot(ranks, times, marker='s', markersize=8, linewidth=2, label=label, alpha=0.8)
+
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Computation Time (ms)', fontsize=12)
+ ax.set_title('Computation Time Across Ranks', fontsize=14, fontweight='bold')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+ ax.set_xticks(ranks)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot5_computation_time.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot5_computation_time.png")
+
+
+def plot6_communication_time(byrank_data, output_dir):
+ """Plot 6: Total Communication Time Across Ranks."""
+ print("\nGenerating Plot 6: Total Communication Time Across Ranks")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ comm_data = byrank_data[byrank_data['type'] == 'total_comm_time']
+ if comm_data.empty:
+ print(" No total_comm_time data found")
+ return
+
+ ranks = sorted(comm_data['rank'].unique())
+ time_cols = [col for col in comm_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ for col in time_cols[:2]:
+ times = [comm_data[comm_data['rank'] == r][col].values[0] if not comm_data[comm_data['rank'] == r].empty else 0 for r in ranks]
+ label = col.replace('_time_ms', '').replace('_', ' ')
+ ax.plot(ranks, times, marker='^', markersize=8, linewidth=2, label=label, alpha=0.8)
+
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Communication Time (ms)', fontsize=12)
+ ax.set_title('Total Communication Time Across Ranks', fontsize=14, fontweight='bold')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+ ax.set_xticks(ranks)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot6_communication_time.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot6_communication_time.png")
+
+
+def plot7_idle_time(byrank_data, output_dir):
+ """Plot 7: Idle Time Across Ranks."""
+ print("\nGenerating Plot 7: Idle Time Across Ranks")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ idle_data = byrank_data[byrank_data['type'] == 'idle_time']
+ if idle_data.empty:
+ print(" No idle_time data found")
+ return
+
+ ranks = sorted(idle_data['rank'].unique())
+ time_cols = [col for col in idle_data.columns if 'time' in col.lower() and 'diff' not in col.lower()]
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ for col in time_cols[:2]:
+ times = [idle_data[idle_data['rank'] == r][col].values[0] if not idle_data[idle_data['rank'] == r].empty else 0 for r in ranks]
+ label = col.replace('_time_ms', '').replace('_', ' ')
+ ax.plot(ranks, times, marker='D', markersize=8, linewidth=2, label=label, alpha=0.8)
+
+ ax.set_xlabel('Rank', fontsize=12)
+ ax.set_ylabel('Idle Time (ms)', fontsize=12)
+ ax.set_title('Idle Time Across Ranks', fontsize=14, fontweight='bold')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+ ax.set_xticks(ranks)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot7_idle_time.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot7_idle_time.png")
+
+
+def plot8_percentage_time_difference(byrank_data, output_dir):
+ """Plot 8: Percentage Time Difference Across Ranks (8 subplots in 2x4 grid)."""
+ print("\nGenerating Plot 8: Percentage Time Difference (8 subplots)")
+
+ if byrank_data is None or byrank_data.empty:
+ print(" No by-rank data available")
+ return
+
+ metrics = ['busy_time', 'computation_time', 'total_comm_time', 'exposed_comm_time',
+ 'idle_time', 'total_memcpy_time', 'exposed_memcpy_time', 'total_time']
+
+ pct_cols = [col for col in byrank_data.columns if 'percent_change' in col.lower()]
+ if not pct_cols:
+ print(" No percent_change column found")
+ return
+
+ pct_col = pct_cols[0]
+ ranks = sorted(byrank_data['rank'].unique()) if 'rank' in byrank_data.columns else []
+
+ # Create 2x4 subplot grid
+ fig, axes = plt.subplots(2, 4, figsize=(20, 10))
+ axes = axes.flatten()
+
+ for idx, metric in enumerate(metrics):
+ ax = axes[idx]
+ metric_data = byrank_data[byrank_data['type'] == metric]
+
+ if not metric_data.empty:
+ values = [metric_data[metric_data['rank'] == r][pct_col].values[0] if not metric_data[metric_data['rank'] == r].empty else 0 for r in ranks]
+
+ colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values]
+ ax.bar(ranks, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
+
+ ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
+ ax.set_xlabel('Rank', fontsize=10)
+ ax.set_ylabel('Percent Change (%)', fontsize=10)
+ ax.set_title(metric.replace('_', ' ').title(), fontsize=11, fontweight='bold')
+ ax.grid(True, alpha=0.3, axis='y')
+ ax.set_xticks(ranks)
+
+ plt.suptitle('Percentage Time Difference Across Ranks (All Metrics)', fontsize=16, fontweight='bold')
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot8_percentage_difference_all_metrics.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot8_percentage_difference_all_metrics.png")
+
+
+def plot9_nccl_latency(nccl_data, output_dir):
+ """Plot 9: Communication Latency Comparison per Message Size."""
+ print("\nGenerating Plot 9: Communication Latency vs Message Size")
+
+ if nccl_data is None or nccl_data.empty:
+ print(" No NCCL data available")
+ return
+
+ if 'In msg nelems' not in nccl_data.columns:
+ print(" Required columns not found")
+ return
+
+ latency_cols = [col for col in nccl_data.columns if 'comm_latency' in col.lower() or 'latency_mean' in col.lower()]
+ if not latency_cols:
+ print(" No latency columns found")
+ return
+
+ fig, ax = plt.subplots(figsize=(14, 7))
+
+ nccl_sorted = nccl_data.sort_values('In msg nelems')
+ msg_sizes = nccl_sorted['In msg nelems'].values
+
+ x = np.arange(len(msg_sizes))
+ width = 0.35
+
+ if len(latency_cols) >= 2:
+ baseline_values = nccl_sorted[latency_cols[0]].values
+ test_values = nccl_sorted[latency_cols[1]].values
+
+ baseline_label = latency_cols[0].replace('_comm_latency_mean', '').replace('_', ' ').title()
+ test_label = latency_cols[1].replace('_comm_latency_mean', '').replace('_', ' ').title()
+
+ ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue')
+ ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange')
+ else:
+ ax.bar(x, nccl_sorted[latency_cols[0]].values, alpha=0.8, color='steelblue')
+
+ ax.set_xlabel('Message Size (elements)', fontsize=12)
+ ax.set_ylabel('Communication Latency (ms)', fontsize=12)
+ ax.set_title('Communication Latency Comparison per Message Size', fontsize=14, fontweight='bold')
+ ax.set_xticks(x)
+ ax.set_xticklabels([f'{int(s):,}' for s in msg_sizes], rotation=45, ha='right')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot9_nccl_latency.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot9_nccl_latency.png")
+
+
+def plot10_algorithm_bandwidth(nccl_data, output_dir):
+ """Plot 10: Algorithm Bandwidth."""
+ print("\nGenerating Plot 10: Algorithm Bandwidth")
+
+ if nccl_data is None or nccl_data.empty:
+ print(" No NCCL data available")
+ return
+
+ algo_bw_cols = [col for col in nccl_data.columns if 'algo bw' in col.lower()]
+ if not algo_bw_cols or 'In msg nelems' not in nccl_data.columns:
+ print(" Required columns not found")
+ return
+
+ fig, ax = plt.subplots(figsize=(14, 7))
+
+ nccl_sorted = nccl_data.sort_values('In msg nelems')
+ msg_sizes = nccl_sorted['In msg nelems'].values
+
+ x = np.arange(len(msg_sizes))
+ width = 0.35
+
+ if len(algo_bw_cols) >= 2:
+ baseline_values = nccl_sorted[algo_bw_cols[0]].values
+ test_values = nccl_sorted[algo_bw_cols[1]].values
+
+ baseline_label = algo_bw_cols[0].replace('_algo bw (GB/s)_mean', '').replace('_', ' ').title()
+ test_label = algo_bw_cols[1].replace('_algo bw (GB/s)_mean', '').replace('_', ' ').title()
+
+ ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue')
+ ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange')
+ else:
+ ax.bar(x, nccl_sorted[algo_bw_cols[0]].values, alpha=0.8, color='steelblue')
+
+ ax.set_xlabel('Message Size (elements)', fontsize=12)
+ ax.set_ylabel('Algorithm Bandwidth (GB/s)', fontsize=12)
+ ax.set_title('Algorithm Bandwidth Comparison per Message Size', fontsize=14, fontweight='bold')
+ ax.set_xticks(x)
+ ax.set_xticklabels([f'{int(s):,}' for s in msg_sizes], rotation=45, ha='right')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot10_algorithm_bandwidth.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot10_algorithm_bandwidth.png")
+
+
+def plot11_bus_bandwidth(nccl_data, output_dir):
+ """Plot 11: Bus Bandwidth."""
+ print("\nGenerating Plot 11: Bus Bandwidth")
+
+ if nccl_data is None or nccl_data.empty:
+ print(" No NCCL data available")
+ return
+
+ bus_bw_cols = [col for col in nccl_data.columns if 'bus bw' in col.lower()]
+ if not bus_bw_cols or 'In msg nelems' not in nccl_data.columns:
+ print(" Required columns not found")
+ return
+
+ fig, ax = plt.subplots(figsize=(14, 7))
+
+ nccl_sorted = nccl_data.sort_values('In msg nelems')
+ msg_sizes = nccl_sorted['In msg nelems'].values
+
+ x = np.arange(len(msg_sizes))
+ width = 0.35
+
+ if len(bus_bw_cols) >= 2:
+ baseline_values = nccl_sorted[bus_bw_cols[0]].values
+ test_values = nccl_sorted[bus_bw_cols[1]].values
+
+ baseline_label = bus_bw_cols[0].replace('_bus bw (GB/s)_mean', '').replace('_', ' ').title()
+ test_label = bus_bw_cols[1].replace('_bus bw (GB/s)_mean', '').replace('_', ' ').title()
+
+ ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue')
+ ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange')
+ else:
+ ax.bar(x, nccl_sorted[bus_bw_cols[0]].values, alpha=0.8, color='steelblue')
+
+ ax.set_xlabel('Message Size (elements)', fontsize=12)
+ ax.set_ylabel('Bus Bandwidth (GB/s)', fontsize=12)
+ ax.set_title('Bus Bandwidth Comparison per Message Size', fontsize=14, fontweight='bold')
+ ax.set_xticks(x)
+ ax.set_xticklabels([f'{int(s):,}' for s in msg_sizes], rotation=45, ha='right')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot11_bus_bandwidth.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot11_bus_bandwidth.png")
+
+
+def plot12_nccl_summary(nccl_data, output_dir):
+ """Plot 12: NCCL Percentage Summary and Total Communication Latency."""
+ print("\nGenerating Plot 12: NCCL Summary (Percentage & Total Latency)")
+
+ if nccl_data is None or nccl_data.empty:
+ print(" No NCCL data available")
+ return
+
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
+
+ # Left: Percentage change summary for key metrics
+ pct_cols = [col for col in nccl_data.columns if 'percent_change' in col.lower()]
+ if pct_cols and len(pct_cols) > 0:
+ metrics = []
+ values = []
+
+ for col in pct_cols:
+ metric_name = col.replace('percent_change_', '').replace('_', ' ').title()
+ metrics.append(metric_name)
+ avg_value = nccl_data[col].mean()
+ values.append(avg_value)
+
+ if metrics:
+ colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values]
+ bars = ax1.barh(metrics, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
+
+ for bar, val in zip(bars, values):
+ x_pos = bar.get_width()
+ ax1.text(x_pos + (1 if x_pos > 0 else -1), bar.get_y() + bar.get_height()/2,
+ f'{val:.1f}%', ha='left' if x_pos > 0 else 'right', va='center', fontweight='bold')
+
+ ax1.axvline(x=0, color='black', linestyle='-', linewidth=1)
+ ax1.set_xlabel('Percent Change (%)', fontsize=12)
+ ax1.set_title('NCCL Metrics: Average Percent Change', fontsize=13, fontweight='bold')
+ ax1.grid(True, alpha=0.3)
+ else:
+ ax1.text(0.5, 0.5, 'No percentage change data available',
+ ha='center', va='center', transform=ax1.transAxes, fontsize=12)
+
+ # Right: Total communication latency comparison
+ total_latency_cols = [col for col in nccl_data.columns if ('Total comm latency' in col or 'total_latency' in col.lower()) and 'percent' not in col.lower()]
+
+ if total_latency_cols and len(total_latency_cols) >= 1:
+ labels = []
+ totals = []
+
+ for col in total_latency_cols[:2]:
+ label = col.replace('_Total comm latency (ms)', '').replace('_total_latency', '').replace('_', ' ').strip().title()
+ if not label:
+ label = 'Total'
+ total = nccl_data[col].sum()
+ labels.append(label)
+ totals.append(total)
+
+ if totals:
+ colors = ['steelblue', 'darkorange'] if len(totals) > 1 else ['steelblue']
+ bars = ax2.bar(labels, totals, color=colors[:len(totals)], alpha=0.8, edgecolor='black', linewidth=1)
+
+ for bar, val in zip(bars, totals):
+ ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height(),
+ f'{val:.1f} ms', ha='center', va='bottom', fontsize=12, fontweight='bold')
+
+ if len(totals) == 2 and totals[0] > 0:
+ improvement = (totals[0] - totals[1]) / totals[0] * 100
+ y_pos = max(totals) * 0.6
+ ax2.text(0.5, y_pos, f'Improvement: {improvement:.1f}%',
+ ha='center', fontsize=13, fontweight='bold',
+ bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.6, edgecolor='black'))
+
+ ax2.set_ylabel('Total Communication Latency (ms)', fontsize=12)
+ ax2.set_title('Total Communication Latency Comparison', fontsize=13, fontweight='bold')
+ ax2.grid(True, alpha=0.3, axis='y')
+ else:
+ ax2.text(0.5, 0.5, 'No total latency data available',
+ ha='center', va='center', transform=ax2.transAxes, fontsize=12)
+
+ plt.tight_layout()
+ plt.savefig(output_dir / 'plot12_nccl_summary.png', dpi=150, bbox_inches='tight')
+ plt.close()
+ print(f" Saved: plot12_nccl_summary.png")
+
+
+def generate_html_report(input_path, output_dir, baseline_label='Baseline', test_label='Test'):
+ """Generate HTML report with all plots embedded."""
+ print("\nGenerating HTML Report...")
+
+ plot_files = sorted(output_dir.glob('plot*.png'))
+
+    html_content = f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>RCCL Performance Analysis: {baseline_label} vs {test_label}</title>
+</head>
+<body>
+    <h1>RCCL Performance Analysis Report</h1>
+    <p>Comparing: {baseline_label} vs {test_label}</p>
+    <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
+
+    <h2>GPU Performance Metrics</h2>
+"""
+
+ plot_titles = {
+ 'plot1': 'Percentage Change Overview',
+ 'plot2': 'Absolute Time Comparison',
+ 'plot3': 'Performance Heatmap by Rank',
+ 'plot4': 'Total Execution Time by Rank',
+ 'plot5': 'Computation Time Across Ranks',
+ 'plot6': 'Communication Time Across Ranks',
+ 'plot7': 'Idle Time Across Ranks',
+ 'plot8': 'Percentage Time Difference (All Metrics)',
+ 'plot9': 'NCCL Communication Latency',
+ 'plot10': 'NCCL Algorithm Bandwidth',
+ 'plot11': 'NCCL Bus Bandwidth',
+ 'plot12': 'NCCL Summary'
+ }
+
+ # Add GPU plots first (plot1-plot8)
+ for plot_file in plot_files:
+ plot_num = plot_file.stem.split('_')[0]
+ if plot_num not in ['plot1', 'plot2', 'plot3', 'plot4', 'plot5', 'plot6', 'plot7', 'plot8']:
+ continue
+
+ title = plot_titles.get(plot_num, plot_file.stem.replace('_', ' ').title())
+
+ with open(plot_file, 'rb') as f:
+ img_data = base64.b64encode(f.read()).decode()
+
+        html_content += f"""
+    <div>
+        <h3>{title}</h3>
+        <img src="data:image/png;base64,{img_data}" alt="{title}" style="max-width: 100%;">
+    </div>
+"""
+
+ # Add NCCL section
+    html_content += "\n    <h2>NCCL/Collective Performance</h2>\n"
+
+ # Add NCCL plots (plot9-plot12)
+ for plot_file in plot_files:
+ plot_num = plot_file.stem.split('_')[0]
+ if plot_num not in ['plot9', 'plot10', 'plot11', 'plot12']:
+ continue
+
+ title = plot_titles.get(plot_num, plot_file.stem.replace('_', ' ').title())
+
+ with open(plot_file, 'rb') as f:
+ img_data = base64.b64encode(f.read()).decode()
+
+        html_content += f"""
+    <div>
+        <h3>{title}</h3>
+        <img src="data:image/png;base64,{img_data}" alt="{title}" style="max-width: 100%;">
+    </div>
+"""
+
+    html_content += """
+    <hr>
+    <p>Generated by TraceLens Analysis Pipeline</p>
+</body>
+</html>
+"""
+
+ html_path = output_dir / 'performance_analysis_report.html'
+ with open(html_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+
+ print(f" HTML report saved to: {html_path}")
+ return html_path
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Generate 12 analysis plots')
+ parser.add_argument('--input', required=True, help='Path to final_analysis_report.xlsx')
+ parser.add_argument('--output', default='plots', help='Output directory for plots')
+
+ args = parser.parse_args()
+
+ input_path = Path(args.input)
+ output_dir = Path(args.output)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ if not input_path.exists():
+ print(f"Error: Input file not found: {input_path}")
+ return 1
+
+ sheets = pd.read_excel(input_path, sheet_name=None)
+
+ print(f"\nGenerating 12 plots from {input_path.name}...")
+
+ # Extract baseline and test labels from Summary_Dashboard
+ baseline_label = 'Baseline'
+ test_label = 'Test'
+
+ summary_sheet = sheets.get('Summary_Dashboard')
+ if summary_sheet is not None:
+ columns = summary_sheet.columns.tolist()
+ if len(columns) >= 3:
+ baseline_label = columns[1]
+ test_label = columns[2]
+
+ plot1_percentage_change(summary_sheet, output_dir)
+ plot2_absolute_time_comparison(summary_sheet, output_dir)
+
+ # GPU by-rank data
+ byrank_sheet = None
+ for name in ['GPU_ByRank_Cmp', 'GPU_ByRank_Comparison', 'Comparison_By_Rank']:
+ if name in sheets:
+ byrank_sheet = sheets[name]
+ break
+
+ if byrank_sheet is not None:
+ plot3_performance_heatmap(byrank_sheet, output_dir)
+ plot4_total_execution_time(byrank_sheet, output_dir)
+ plot5_computation_time(byrank_sheet, output_dir)
+ plot6_communication_time(byrank_sheet, output_dir)
+ plot7_idle_time(byrank_sheet, output_dir)
+ plot8_percentage_time_difference(byrank_sheet, output_dir)
+
+ # NCCL data
+ nccl_sheet = None
+ for name in sheets:
+ if 'nccl' in name.lower() and ('cmp' in name.lower() or 'comparison' in name.lower()):
+ nccl_sheet = sheets[name]
+ break
+
+ # Try to get the actual NCCL data sheets (not just comparison)
+    if nccl_sheet is None or nccl_sheet.empty:
+ for name in sheets:
+ if 'nccl' in name.lower() and 'summary' in name.lower():
+ nccl_sheet = sheets[name]
+ break
+
+ if nccl_sheet is not None and not nccl_sheet.empty:
+ plot9_nccl_latency(nccl_sheet, output_dir)
+ plot10_algorithm_bandwidth(nccl_sheet, output_dir)
+ plot11_bus_bandwidth(nccl_sheet, output_dir)
+ plot12_nccl_summary(nccl_sheet, output_dir)
+
+ # Generate HTML report with configuration labels
+ html_path = generate_html_report(input_path, output_dir, baseline_label, test_label)
+
+ print(f"\n{'='*60}")
+ print(f"All 12 plots generated successfully!")
+ print(f"Output directory: {output_dir}")
+ print(f"\nHTML Report: {html_path}")
+ print(" - Open in browser to view all plots")
+ print(" - Print to PDF: Ctrl+P or Cmd+P")
+ print(f"{'='*60}")
+
+ return 0
+
+
+if __name__ == '__main__':
+ import sys
+ sys.exit(main())
diff --git a/scripts/tracelens_single_config/generate_merged_html.py b/scripts/tracelens_single_config/generate_merged_html.py
new file mode 100644
index 0000000..1bb89f6
--- /dev/null
+++ b/scripts/tracelens_single_config/generate_merged_html.py
@@ -0,0 +1,182 @@
+import os
+from pathlib import Path
+import base64
+import argparse
+
+
+def get_image_data(image_path):
+ try:
+ with open(image_path, "rb") as f:
+ return base64.b64encode(f.read()).decode("utf-8")
+ except Exception as e:
+ print(f"Error getting image data: {e}")
+ return None
+
+
+def create_final_html(plot_file_path, output_path):
+ html_header = """
+
+
+
+
+ Performance Analysis Report
+
+
+
+
+ Performance Analysis Report
+
+
+
+ Executive Summary
+
+ Comparison of GPU performance metrics
+ implementations across 8 ranks.
+ """
+
+ summary_section = f"""
+ 1. Overall GPU Metrics Comparison
+ """
+
+ summary_chart = get_image_data(plot_file_path / "improvement_chart.png")
+ if summary_chart is not None:
+        summary_section += f"""
+        <h3>Percentage Change Overview</h3>
+        <img src="data:image/png;base64,{summary_chart}" alt="Percentage Change Overview" style="max-width:100%;">
+        <p>Overall performance change across key GPU metrics. Negative values indicate improvement.</p>
+        """
+ absolute_time_chart = get_image_data(plot_file_path / "abs_time_comparison.png")
+ if absolute_time_chart is not None:
+        summary_section += f"""
+        <h3>Absolute Time Comparison</h3>
+        <img src="data:image/png;base64,{absolute_time_chart}" alt="Absolute Time Comparison" style="max-width:100%;">
+        <p>Side-by-side comparison of absolute execution times for all GPU metrics.</p>
+        """
+
+    cross_rank_comparison_section = f"""
+    <h2>2. Cross-Rank Performance Comparison</h2>
+    """
+    gpu_time_heatmap = get_image_data(plot_file_path / "gpu_time_heatmap.png")
+    if gpu_time_heatmap is not None:
+        cross_rank_comparison_section += f"""
+        <h3>Performance Heatmap by Rank</h3>
+        <img src="data:image/png;base64,{gpu_time_heatmap}" alt="Performance Heatmap by Rank" style="max-width:100%;">
+        <p>Comprehensive heatmap showing percent change for all metrics across all ranks. Green indicates better performance (positive % change).</p>
+        """
+
+ item_list = {
+ "total_time": {
+ "name": "Total Time",
+ "description": "Total execution time comparison across all ranks, showing end-to-end performance characteristics.",
+ "chart_path": plot_file_path / "total_time_by_rank.png",
+ },
+ "computation_time": {
+ "name": "Computation Time",
+ "description": "Pure computation time excluding communication overhead, analyzed per rank.",
+ "chart_path": plot_file_path / "computation_time_by_rank.png",
+ },
+ "total_comm_time": {
+ "name": "Communication Time",
+ "description": "Total time spent in collective communication operations across ranks.",
+ "chart_path": plot_file_path / "total_comm_time_by_rank.png",
+ },
+ "idle_time": {
+ "name": "Idle Time",
+ "description": "GPU idle time comparison showing resource utilization efficiency per rank.",
+ "chart_path": plot_file_path / "idle_time_by_rank.png",
+ },
+ "gpu_time_change_percentage_summaryby_rank": {
+ "name": "Detailed Percentage Change by Metric",
+ "description": "Detailed breakdown of percent change for each metric type across all ranks.",
+ "chart_path": plot_file_path
+ / "gpu_time_change_percentage_summaryby_rank.png",
+ },
+ }
+ for item in item_list.keys():
+ cross_rank_comparison_chart = get_image_data(item_list[item]["chart_path"])
+ if cross_rank_comparison_chart is not None:
+            cross_rank_comparison_section += f"""
+            <h3>{item_list[item]['name']}</h3>
+            <img src="data:image/png;base64,{cross_rank_comparison_chart}" alt="{item_list[item]['name']}" style="max-width:100%;">
+            <p>{item_list[item]['description']}</p>
+            """
+
+ summary_section += cross_rank_comparison_section
+
+    nccl_charts_section = f"""
+    <h2>3. NCCL Collective Operations Analysis</h2>
+    """
+ nccl_chart_item_list = {
+ "NCCL Communication Latency": "Mean communication latency for NCCL allreduce operations across different message sizes",
+ "NCCL Algorithm Bandwidth": "Algorithm bandwidth achieved for different message sizes in NCCL collective operations.",
+ "NCCL Bus Bandwidth": "Bus bandwidth utilization across NCCL operations and message sizes.",
+ "NCCL Performance Percentage Change": "Percent change in communication latency and bandwidth metrics for each message sizec configuration",
+ "NCCL Total Communication Latency": "Aggregate communication latency summed across all operations for each message size.",
+ }
+ for item in nccl_chart_item_list.keys():
+ nccl_image_data = get_image_data(
+ plot_file_path / f'{item.replace(" ", "_")}_comparison.png'
+ )
+ if nccl_image_data is not None:
+            nccl_charts_section += f"""
+            <h3>{item}</h3>
+            <img src="data:image/png;base64,{nccl_image_data}" alt="{item}" style="max-width:100%;">
+            <p>{nccl_chart_item_list[item]}</p>
+            """
+
+    summary_section += nccl_charts_section
+
+    footer_section = """
+</body>
+</html>
+    """
+    summary_section += footer_section
+
+ final_html = html_header + summary_section
+ with open(output_path, "w") as f:
+ f.write(final_html)
+ print(f"Final HTML file created at: {output_path}")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Create a final HTML file for the analysis report."
+ )
+ parser.add_argument(
+ "-p",
+ "--plot-files-directory",
+ type=Path,
+ required=True,
+ help="Path to the plot files direcotry.",
+ )
+ parser.add_argument(
+ "-o", "--output-html", type=None, default=None, help="Path to the output file."
+ )
+ args = parser.parse_args()
+ output_path = (
+ args.output_html
+ if args.output_html
+ else args.plot_files_directory.parent / "final_analysis_report.html"
+ )
+ create_final_html(args.plot_files_directory, output_path)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/tracelens_single_config/merge_tracelens_analysis.py b/scripts/tracelens_single_config/merge_tracelens_analysis.py
new file mode 100644
index 0000000..fade0d5
--- /dev/null
+++ b/scripts/tracelens_single_config/merge_tracelens_analysis.py
@@ -0,0 +1,465 @@
+import argparse
+from pathlib import Path
+import subprocess
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def run_command(cmd, description):
+ """Execute a command and handle errors."""
+ print(f"\n{'='*80}")
+ print(f"{description}")
+ print(f"{'='*80}")
+ print(f"Command: {' '.join(cmd)}")
+
+ result = subprocess.run(cmd, capture_output=True, text=True)
+
+ if result.returncode != 0:
+ print(f"Error: {description} failed!")
+ print(f"Stderr: {result.stderr}")
+ return False
+
+ print(result.stdout)
+ return True
+
+
+def plot_nccl_data_per_msg(df, labels, output_dir: Path):
+ """
+ Plot comm_latency_mean for each message size from NCCL data.
+ """
+ output_path = Path(output_dir)
+ output_path.mkdir(parents=True, exist_ok=True)
+
+ # Get unique index values (Collective_MsgSize)
+ indices = df["index"].values
+
+ x = np.arange(len(indices))
+ width = 0.8 / len(labels)
+ # Vibrant color palette
+ vibrant_colors = [
+ "#E63946",
+ "#2A9D8F",
+ "#E9C46A",
+ "#264653",
+ "#F4A261",
+ "#8338EC",
+ "#06D6A0",
+ "#FF006E",
+ ]
+
+ plot_items = {
+ "NCCL Communication Latency": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Communication Latency (ms)",
+ "y_col": "comm_latency_mean",
+ },
+ "NCCL Algorithm Bandwidth": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Algorithm Bandwidth (GB/s)",
+ "y_col": "algo bw (GB/s)_mean",
+ },
+ "NCCL Bus Bandwidth": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Bus Bandwidth (GB/s)",
+ "y_col": "bus bw (GB/s)_mean",
+ },
+ "NCCL Total Communication Latency": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Total Communication Latency (ms)",
+ "y_col": "Total comm latency (ms)",
+ },
+ }
+
+ for plot_item in plot_items.keys():
+ fig, ax = plt.subplots(figsize=(14, 7))
+ for i, label in enumerate(labels):
+ col_name = f"{plot_items[plot_item]['y_col']}_{label}"
+ print(f"Plotting {col_name}")
+ if col_name in df.columns:
+ values = df[col_name].values
+ color = vibrant_colors[i % len(vibrant_colors)]
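+                # Offset each configuration's bars so the group is centered on the x tick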
+ offset = (i - len(labels) / 2 + 0.5) * width
+ ax.bar(
+ x + offset,
+ values,
+ width,
+ label=label,
+ color=color,
+ alpha=0.85,
+ edgecolor="black",
+ linewidth=0.5,
+ )
+ else:
+ print(f"Column {col_name} not found in dataframe")
+
+ ax.set_xlabel(plot_items[plot_item]["x_label"], fontsize=12, fontweight="bold")
+ ax.set_ylabel(plot_items[plot_item]["y_label"], fontsize=12, fontweight="bold")
+ ax.set_title(f"{plot_item} per Message Size", fontsize=14, fontweight="bold")
+ ax.set_xticks(x)
+ ax.set_xticklabels(indices, rotation=45, ha="right", fontsize=9)
+ ax.legend(loc="upper left")
+ ax.grid(True, alpha=0.3, axis="y")
+
+ plt.tight_layout()
+ output_file = output_path / f'{plot_item.replace(" ", "_")}_comparison.png'
+ plt.savefig(output_file, dpi=150, bbox_inches="tight")
+ plt.close()
+ print(f"Saved: {output_file}")
+ print("Completed plotting NCCL data per message size")
+
+
+def plot_all_types_per_rank(df, labels, output_dir: Path):
+ """
+ Plot data for every rank, where every unique type is a different file.
+
+ Parameters:
+ -----------
+ df : DataFrame
+ Merged gpu_time_per_rank_df with columns like 'type', 'rank0_label1', 'rank0_label2', etc.
+ labels : list
+ List of configuration labels (e.g., ['32cu_512threads', '37cu_384threads'])
+    output_dir : Path
+        Directory to save plots
+ """
+ output_path = Path(output_dir)
+ output_path.mkdir(parents=True, exist_ok=True)
+
+ unique_types = df["type"].unique()
+
+ # Find rank columns (extract rank numbers from column names)
+ # Columns are like: rank0_32cu_512threads, rank1_32cu_512threads, etc.
+ sample_label = labels[0]
+ rank_cols = [
+ col for col in df.columns if col.endswith(f"_{sample_label}") and col != "type"
+ ]
+ ranks = [col.replace(f"_{sample_label}", "") for col in rank_cols]
+
+ print(f"Found ranks: {ranks}")
+ print(f"Found types: {unique_types}")
+
+ for metric_type in unique_types:
+ type_data = df[df["type"] == metric_type]
+
+ if type_data.empty:
+ continue
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ x = np.arange(len(ranks))
+ # Vibrant color palette
+ vibrant_colors = [
+ "#E63946",
+ "#2A9D8F",
+ "#E9C46A",
+ "#264653",
+ "#F4A261",
+ "#8338EC",
+ "#06D6A0",
+ "#FF006E",
+ ]
+ markers = ["o", "s", "^", "D", "v", "p", "h", "*"]
+
+ for i, label in enumerate(labels):
+ values = []
+ for rank in ranks:
+ col_name = f"{rank}_{label}"
+ if col_name in type_data.columns:
+ val = type_data[col_name].values[0]
+ values.append(val if pd.notna(val) else 0)
+ else:
+ values.append(0)
+
+ color = vibrant_colors[i % len(vibrant_colors)]
+ marker = markers[i % len(markers)]
+ ax.plot(
+ x,
+ values,
+ label=label,
+ color=color,
+ marker=marker,
+ markersize=8,
+ linewidth=2,
+ alpha=0.85,
+ )
+
+ ax.set_xlabel("Rank", fontsize=12, fontweight="bold")
+ ax.set_ylabel("Time (ms)", fontsize=12, fontweight="bold")
+ ax.set_title(f"{metric_type} - Time per Rank", fontsize=14, fontweight="bold")
+ ax.set_xticks(x)
+ ax.set_xticklabels(ranks)
+ ax.legend(loc="upper right")
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+
+ # Save with sanitized filename
+ safe_name = metric_type.replace("/", "_").replace(" ", "_").replace(":", "_")
+ output_file = output_path / f"{safe_name}_by_rank.png"
+ plt.savefig(output_file, dpi=150, bbox_inches="tight")
+ plt.close()
+ print(f"Saved: {output_file}")
+
+
+def plot_gpu_time_summary(df, labels, output_dir: Path):
+
+ types = df["type"].values
+ values = []
+
+ for label in labels:
+ values.append(df[f"time ms_{label}"].values)
+
+ fig, ax = plt.subplots(figsize=(10, 5))
+
+ x = np.arange(len(types))
+ width = 0.15
+ for i, value in enumerate(values):
+ offset = (i - len(labels) / 2 + 0.5) * width
+ bars = ax.bar(x + offset, value, width, label=labels[i])
+
+ ax.set_xlabel("Type")
+ ax.set_ylabel("Time (ms)")
+ ax.set_title("GPU Time Summary by Rank")
+ ax.set_xticks(x)
+ ax.set_xticklabels(types, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis="y")
+
+ plt.tight_layout()
+ plt.savefig(output_dir / "abs_time_comparison.png")
+ plt.close()
+
+
+"""
+def plot_improvement_chart(df, output_path):
+ fig, ax = plt.subplots(figsize=(10, 6))
+
+ # Color bars based on positive/negative values
+ colors = ['#2ecc71' if val > 0 else '#e74c3c' for val in df['Improvement (%)']]
+
+ bars = ax.barh(df['Metric'], df['Improvement (%)'], color=colors)
+ ax.yaxis.grid(True, linestyle='--', alpha=0.7, color='gray')
+ ax.set_axisbelow(True)
+
+ ax.spines['top'].set_visible(False)
+ ax.spines['right'].set_visible(False)
+ ax.spines['bottom'].set_visible(False)
+ ax.spines['left'].set_visible(False)
+
+ # Customize the chart
+ ax.set_ylabel('Metric', fontsize=12)
+ ax.set_xlabel('Change (%)', fontsize=12)
+ ax.set_title(
+ 'GPU Metrics Percentage Change (Test vs Baseline)\n(Positive = Test is better)',
+ fontsize=14, fontweight='bold'
+ )
+
+ plt.tight_layout()
+ plt.savefig(output_path / 'improvement_chart.png', dpi=150)
+ plt.close()
+
+"""
+
+
+def plot_gpu_time_percentage_change(df, labels, output_dir: Path):
+ """
+ Create separate horizontal bar charts for each label comparing against baseline (labels[0]).
+ """
+ types = df["type"].values
+ base_label = labels[0]
+
+ # Vibrant color palette
+ vibrant_colors = [
+ "#E63946",
+ "#2A9D8F",
+ "#E9C46A",
+ "#264653",
+ "#F4A261",
+ "#8338EC",
+ "#06D6A0",
+ "#FF006E",
+ ]
+
+    n_cmp = max(1, len(labels) - 1)
+    fig, axes = plt.subplots(
+        nrows=1, ncols=n_cmp, figsize=(10 * n_cmp, max(8, len(types) * 0.5)), squeeze=False
+    )
+    for i, label in enumerate(labels[1:]):
+        ax = axes[0, i]
+ col_name = f"percentage_change_{label}"
+ if col_name not in df.columns:
+ print(f"Column {col_name} not found, skipping")
+ continue
+
+ values = df[col_name].values
+
+
+ # Color bars based on positive/negative values (green = improvement, red = regression)
+ colors = ["#2ecc71" if val < 0 else "#e74c3c" for val in values]
+
+ # Horizontal bar chart
+ y = np.arange(len(types))
+ bars = ax.barh(
+ y, values, color=colors, alpha=0.85, edgecolor="black", linewidth=0.5
+ )
+
+ # Add vertical line at 0
+ ax.axvline(x=0, color="black", linestyle="-", linewidth=1)
+
+ ax.set_yticks(y)
+ ax.set_yticklabels(types, fontsize=10)
+ ax.set_xlabel("Percentage Change (%)", fontsize=12, fontweight="bold")
+ ax.set_ylabel("Type", fontsize=12, fontweight="bold")
+ ax.set_title(
+ f"GPU Time Percentage Change: {label} vs {base_label}\n(Negative = Improvement)",
+ fontsize=14,
+ fontweight="bold",
+ )
+ ax.grid(True, alpha=0.3, axis="x")
+
+ plt.tight_layout()
+
+ output_file = output_dir / f"improvement_chart.png"
+ plt.savefig(output_file, dpi=150, bbox_inches="tight")
+ plt.close()
+ print(f"Saved: {output_file}")
+
+
+def calculate_gpu_time_percentage_change(df, labels):
+ base_label = labels[0]
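+    # Signed percentage change vs. the first (baseline) label: (label - baseline) / baseline * 100; negative means faster than baseline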
+ for label in labels[1:]:
+ df[f"percentage_change_{label}"] = (
+ (df[f"time ms_{label}"] - df[f"time ms_{base_label}"])
+ / df[f"time ms_{base_label}"]
+ * 100
+ )
+ return df
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--inputs",
+ type=str,
+ nargs="+",
+ required=True,
+ help="List of directories containing gpu_timeline_summary_mean.xlsx",
+ )
+
+ parser.add_argument(
+ "--output", type=Path, default="./output.xlsx", help="Output xls file name"
+ )
+ args = parser.parse_args()
+
+ labels = []
+ summary_dfs = []
+ gpu_time_per_rank_dfs = []
+ nccl_dfs = []
+
+ for directory in args.inputs:
+ dir_path = Path(directory)
+ label = dir_path.stem
+
+ if not dir_path.exists():
+ print(f"Directory not found: {dir_path}")
+ continue
+
+ input_excel_file = (
+ dir_path / "tracelens_analysis" / "gpu_timeline_summary_mean.xlsx"
+ )
+ nccl_excel_file = (
+ dir_path
+ / "tracelens_analysis"
+ / "collective_reports"
+ / "collective_all_ranks.xlsx"
+ )
+ if not input_excel_file.exists():
+ print(f"Excel file not found: {input_excel_file}")
+ continue
+
+ labels.append(label)
+
+ # Read and rename columns with label suffix
+ summary = pd.read_excel(input_excel_file, sheet_name="Summary")
+ gpu_time = pd.read_excel(input_excel_file, sheet_name="Per_Rank_Time_ms")
+
+ # Rename non-key columns with label suffix
+ summary = summary.rename(
+ columns={col: f"{col}_{label}" for col in summary.columns if col != "type"}
+ )
+ gpu_time = gpu_time.rename(
+ columns={col: f"{col}_{label}" for col in gpu_time.columns if col != "type"}
+ )
+
+ summary_dfs.append(summary)
+ gpu_time_per_rank_dfs.append(gpu_time)
+ print(f"Loaded: {label}")
+
+ if nccl_excel_file.exists():
+ nccl_df = pd.read_excel(
+ nccl_excel_file, sheet_name="nccl_summary_implicit_sync"
+ )
+
+ # Create index column by appending "Collective name" and "In msg nelems"
+ nccl_df["index"] = (
+ nccl_df["Collective name"].astype(str)
+ + "_"
+ + nccl_df["In msg nelems"].astype(str)
+ )
+
+ # Rename non-key columns with label suffix (exclude 'index' as it's the merge key)
+ nccl_df = nccl_df.rename(
+ columns={
+ col: f"{col}_{label}" for col in nccl_df.columns if col != "index"
+ }
+ )
+ nccl_dfs.append(nccl_df)
+ print(f"Loaded: {label} NCCL")
+ else:
+ print(f"NCCL file not found: {nccl_excel_file}")
+
+    # Merge all DataFrames (GPU sheets on 'type', NCCL sheets on 'index')
+    summary_df = summary_dfs[0]
+    gpu_time_per_rank_df = gpu_time_per_rank_dfs[0]
+    nccl_df = nccl_dfs[0]
+
+    for i in range(1, len(summary_dfs)):
+        summary_df = pd.merge(summary_df, summary_dfs[i], on="type", how="outer")
+        gpu_time_per_rank_df = pd.merge(
+            gpu_time_per_rank_df, gpu_time_per_rank_dfs[i], on="type", how="outer"
+        )
+
+    # NCCL sheets may be missing for some inputs, so merge them over their own list
+    for i in range(1, len(nccl_dfs)):
+        nccl_df = pd.merge(nccl_df, nccl_dfs[i], on="index", how="outer")
+
+    summary_df = calculate_gpu_time_percentage_change(summary_df, labels)
+
+ with pd.ExcelWriter(args.output, engine="openpyxl") as writer:
+ summary_df.to_excel(writer, sheet_name="Summary", index=False)
+ gpu_time_per_rank_df.to_excel(
+ writer, sheet_name="Per_Rank_Time_ms", index=False
+ )
+ nccl_df.to_excel(writer, sheet_name="NCCL_Summary", index=False)
+
+ output_dir = Path(args.output).parent / "plots"
+ output_dir.mkdir(parents=True, exist_ok=True)
+ plot_gpu_time_percentage_change(summary_df, labels, output_dir)
+ plot_gpu_time_summary(summary_df, labels, output_dir)
+ plot_all_types_per_rank(gpu_time_per_rank_df, labels, output_dir)
+ plot_nccl_data_per_msg(nccl_df, labels, output_dir)
+
+ html_script_path = Path(__file__).parent / "generate_merged_html.py"
+ cmd = [
+ "python3",
+ str(html_script_path),
+ "--plot-files-directory",
+ str(output_dir),
+ "--output-html",
+ str(args.output.parent / "final_analysis_report.html"),
+ ]
+ if run_command(cmd, "Creating final HTML"):
+ print(
+ f"Final HTML file created at: {args.output.parent / 'final_analysis_report.html'}"
+ )
+ else:
+ print("Failed to create final HTML file")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/tracelens_single_config/process_gpu_timeline.py b/scripts/tracelens_single_config/process_gpu_timeline.py
new file mode 100755
index 0000000..7fd00b7
--- /dev/null
+++ b/scripts/tracelens_single_config/process_gpu_timeline.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import argparse
+from pathlib import Path
+
+
+def geometric_mean(values):
+ values = np.array(values)
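+    # Replace exact zeros with a tiny epsilon so np.log() does not produce -inf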
+ values = np.where(values == 0, 1e-10, values)
+ return np.exp(np.mean(np.log(values)))
+
+
+def process_gpu_timeline(reports_dir, use_geo_mean=False):
+ reports_path = Path(reports_dir)
+
+ if not reports_path.exists():
+ print(f"Error: Directory not found: {reports_dir}")
+ return 1
+
+ print(f"Processing GPU timeline from: {reports_dir}")
+ print(f"Aggregation: {'Geometric Mean' if use_geo_mean else 'Arithmetic Mean'}")
+
+ perf_files = sorted(reports_path.glob('perf_rank*.xlsx'))
+
+ if not perf_files:
+ print("Error: No perf_rank*.xlsx files found")
+ return 1
+
+ print(f"Found {len(perf_files)} rank files")
+
+ rank_data = []
+ for file_path in perf_files:
+ rank_num = int(file_path.stem.replace('perf_rank', ''))
+ try:
+ df = pd.read_excel(file_path, sheet_name='gpu_timeline')
+ df['rank'] = rank_num
+ rank_data.append(df)
+ print(f" Rank {rank_num}: OK")
+ except Exception as e:
+ print(f" Rank {rank_num}: Error - {e}")
+
+ if not rank_data:
+ print("Error: No valid data loaded")
+ return 1
+
+ combined = pd.concat(rank_data, ignore_index=True)
+
+ agg_func = geometric_mean if use_geo_mean else 'mean'
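+    # pandas accepts either a callable (geometric_mean) or the built-in 'mean' aggregator here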
+ aggregated = combined.groupby('type').agg({
+ 'time ms': agg_func,
+ 'percent': agg_func
+ }).reset_index()
+
+ aggregated['num_ranks'] = len(perf_files)
+
+ method_suffix = 'geomean' if use_geo_mean else 'mean'
+ output_path = reports_path.parent / f'gpu_timeline_summary_{method_suffix}.xlsx'
+
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+ aggregated.to_excel(writer, sheet_name='Summary', index=False)
+
+ combined_sorted = combined.sort_values(['rank', 'type'])
+ combined_sorted.to_excel(writer, sheet_name='All_Ranks_Combined', index=False)
+
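+        # Each (type, rank) pair is expected to appear once, so aggfunc='first' just reshapes into a type x rank matrix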
+ per_rank = combined.pivot_table(
+ values='time ms',
+ index='type',
+ columns='rank',
+ aggfunc='first'
+ )
+ per_rank.to_excel(writer, sheet_name='Per_Rank_Time_ms')
+
+ per_rank_pct = combined.pivot_table(
+ values='percent',
+ index='type',
+ columns='rank',
+ aggfunc='first'
+ )
+ per_rank_pct.to_excel(writer, sheet_name='Per_Rank_Percent')
+
+ print(f"\nSaved: {output_path}")
+ print("\nSummary:")
+ print(aggregated.to_string(index=False))
+
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Aggregate GPU timeline across ranks')
+ parser.add_argument('--reports-dir', required=True, help='Path to individual_reports directory')
+ parser.add_argument('--geo-mean', action='store_true', help='Use geometric mean')
+
+ args = parser.parse_args()
+
+ return process_gpu_timeline(args.reports_dir, args.geo_mean)
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/scripts/tracelens_single_config/run_full_analysis.py b/scripts/tracelens_single_config/run_full_analysis.py
new file mode 100755
index 0000000..ec38581
--- /dev/null
+++ b/scripts/tracelens_single_config/run_full_analysis.py
@@ -0,0 +1,392 @@
+#!/usr/bin/env python3
+"""
+Master script for complete TraceLens analysis pipeline.
+Runs analysis on baseline and test traces, then performs all comparisons.
+"""
+import argparse
+import subprocess
+import os
+import sys
+from pathlib import Path
+
+
+def run_command(cmd, description):
+ """Execute a command and handle errors."""
+ print(f"\n{'='*80}")
+ print(f"{description}")
+ print(f"{'='*80}")
+ print(f"Command: {' '.join(cmd)}")
+
+ result = subprocess.run(cmd, capture_output=True, text=True)
+
+ if result.returncode != 0:
+ print(f"Error: {description} failed!")
+ print(f"Stderr: {result.stderr}")
+ return False
+
+ print(result.stdout)
+ return True
+
+
+def run_tracelens_analysis(trace_dir, output_name, individual_only=False, collective_only=False):
+ """Run TraceLens analysis on a single trace directory."""
+ print(f"\nAnalyzing: {trace_dir}")
+
+ # Build command
+ script_path = Path(__file__).parent / "run_tracelens_single_config.sh"
+ cmd = ["bash", str(script_path), trace_dir]
+
+ if individual_only:
+ cmd.append("--individual-only")
+ elif collective_only:
+ cmd.append("--collective-only")
+
+ return run_command(cmd, f"TraceLens analysis for {output_name}")
+
+
+def process_gpu_timeline(reports_dir):
+ """Process GPU timeline from individual reports."""
+ script_path = Path(__file__).parent / "process_gpu_timeline.py"
+ cmd = ["python3", str(script_path), "--reports-dir", reports_dir]
+
+ return run_command(cmd, "Processing GPU timeline")
+
+
+def combine_reports(baseline_file, test_file, output_file):
+ """Combine baseline and test reports."""
+ script_path = Path(__file__).parent / "combine_reports.py"
+ cmd = ["python3", str(script_path),
+ "--baseline", baseline_file,
+ "--test", test_file,
+ "--output", output_file]
+
+ return run_command(cmd, f"Combining reports to {output_file}")
+
+
+def add_comparison_sheets(input_file, output_file, baseline_label=None, test_label=None):
+ """Add comparison sheets for GPU timeline."""
+ script_path = Path(__file__).parent / "add_comparison_sheets.py"
+ cmd = ["python3", str(script_path),
+ "--input", input_file,
+ "--output", output_file]
+ if baseline_label:
+ cmd.extend(["--baseline-label", baseline_label])
+ if test_label:
+ cmd.extend(["--test-label", test_label])
+
+ return run_command(cmd, "Adding GPU timeline comparison sheets")
+
+
+def add_collective_comparison(input_file, output_file, baseline_label=None, test_label=None):
+ """Add comparison sheets for collective operations."""
+ script_path = Path(__file__).parent / "add_collective_comparison.py"
+ cmd = ["python3", str(script_path),
+ "--input", input_file,
+ "--output", output_file]
+ if baseline_label:
+ cmd.extend(["--baseline-label", baseline_label])
+ if test_label:
+ cmd.extend(["--test-label", test_label])
+
+ return run_command(cmd, "Adding collective comparison sheets")
+
+
+def create_final_report(gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file, baseline_label=None, test_label=None):
+ """Create comprehensive final report with all data."""
+ script_path = Path(__file__).parent / "create_final_report.py"
+ cmd = ["python3", str(script_path),
+ "--gpu-combined", gpu_combined,
+ "--gpu-comparison", gpu_comparison,
+ "--coll-combined", coll_combined,
+ "--coll-comparison", coll_comparison,
+ "--output", output_file]
+
+ if baseline_label:
+ cmd.extend(["--baseline-label", baseline_label])
+ if test_label:
+ cmd.extend(["--test-label", test_label])
+
+ return run_command(cmd, "Creating comprehensive final report")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Complete TraceLens analysis pipeline with comparisons',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ # Full analysis with everything including final report
+ python run_full_analysis.py \\
+ --baseline /path/to/baseline/traces \\
+ --test /path/to/test/traces \\
+ --output /path/to/output \\
+ --all
+
+ # Only GPU timeline comparison
+ python run_full_analysis.py \\
+ --baseline /path/to/baseline \\
+ --test /path/to/test \\
+ --output /path/to/output \\
+ --gpu-timeline
+
+ # Create final report (skip TraceLens if already done)
+ python run_full_analysis.py \\
+ --baseline /path/to/baseline \\
+ --test /path/to/test \\
+ --output /path/to/output \\
+ --gpu-timeline --collective --final-report \\
+ --skip-tracelens
+
+ """
+ )
+
+ # Required arguments
+ parser.add_argument('--baseline', required=True,
+ help='Path to baseline trace directory')
+ parser.add_argument('--test', required=True,
+ help='Path to test trace directory')
+ parser.add_argument('--output', required=True,
+ help='Output directory for comparison results')
+
+ # Analysis options
+ parser.add_argument('--skip-tracelens', action='store_true',
+ help='Skip TraceLens report generation (if already done)')
+ parser.add_argument('--individual-only', action='store_true',
+ help='Generate only individual reports')
+ parser.add_argument('--collective-only', action='store_true',
+ help='Generate only collective reports')
+
+ # Comparison options
+ parser.add_argument('--gpu-timeline', action='store_true',
+ help='Perform GPU timeline comparison')
+ parser.add_argument('--collective', action='store_true',
+ help='Perform collective/NCCL comparison')
+ parser.add_argument('--final-report', action='store_true',
+ help='Create comprehensive final report with tables and hidden raw data')
+ parser.add_argument('--generate-plots', action='store_true',
+ help='Generate visualization plots and HTML report from final report')
+ parser.add_argument('--all', action='store_true',
+ help='Perform all analyses and comparisons including final report, plots, and HTML report')
+
+ args = parser.parse_args()
+
+ # Handle --all flag
+ if args.all:
+ args.gpu_timeline = True
+ args.collective = True
+ args.final_report = True
+ args.generate_plots = True
+
+ # Validate inputs
+ baseline_path = Path(args.baseline)
+ test_path = Path(args.test)
+ output_path = Path(args.output)
+
+ if not baseline_path.exists():
+ print(f"Error: Baseline path not found: {args.baseline}")
+ return 1
+
+ if not test_path.exists():
+ print(f"Error: Test path not found: {args.test}")
+ return 1
+
+ # Create output directory
+ output_path.mkdir(parents=True, exist_ok=True)
+
+ print("\n" + "="*80)
+ print("TRACELENS FULL ANALYSIS PIPELINE")
+ print("="*80)
+ print(f"Baseline: {args.baseline}")
+ print(f"Test: {args.test}")
+ print(f"Output: {args.output}")
+ print(f"Options:")
+ print(f" Skip TraceLens: {args.skip_tracelens}")
+ print(f" GPU timeline: {args.gpu_timeline}")
+ print(f" Collective: {args.collective}")
+ print(f" Final report: {args.final_report}")
+
+ # Step 1: Run TraceLens analysis on both directories
+ if not args.skip_tracelens:
+ print("\n" + "="*80)
+ print("STEP 1: Running TraceLens Analysis")
+ print("="*80)
+
+ if not run_tracelens_analysis(args.baseline, "baseline",
+ args.individual_only, args.collective_only):
+ return 1
+
+ if not run_tracelens_analysis(args.test, "test",
+ args.individual_only, args.collective_only):
+ return 1
+ else:
+ print("\nSkipping TraceLens report generation (--skip-tracelens flag)")
+
+ # Determine analysis directories
+ baseline_analysis = baseline_path / "tracelens_analysis"
+ test_analysis = test_path / "tracelens_analysis"
+
+ if not baseline_analysis.exists():
+ print(f"Error: Baseline analysis not found: {baseline_analysis}")
+ print("Run without --skip-tracelens flag first")
+ return 1
+
+ if not test_analysis.exists():
+ print(f"Error: Test analysis not found: {test_analysis}")
+ print("Run without --skip-tracelens flag first")
+ return 1
+
+ # Extract config labels from paths
+ baseline_label = baseline_path.name # e.g., "56cu_256threads"
+ test_label = test_path.name # e.g., "37cu_384threads"
+
+ # Step 2: GPU Timeline Comparison
+ if args.gpu_timeline:
+ print("\n" + "="*80)
+ print("STEP 2: GPU Timeline Comparison")
+ print(f" Baseline: {baseline_label}")
+ print(f" Test: {test_label}")
+ print("="*80)
+
+ # Process GPU timelines
+ baseline_reports = baseline_analysis / "individual_reports"
+ test_reports = test_analysis / "individual_reports"
+
+ if not baseline_reports.exists() or not test_reports.exists():
+ print("Error: Individual reports not found. Run without --individual-only flag")
+ return 1
+
+ print(f"\nProcessing baseline GPU timeline ({baseline_label})...")
+ if not process_gpu_timeline(str(baseline_reports)):
+ return 1
+
+ print(f"\nProcessing test GPU timeline ({test_label})...")
+ if not process_gpu_timeline(str(test_reports)):
+ return 1
+
+ # Combine GPU timeline summaries
+ baseline_gpu = baseline_analysis / "gpu_timeline_summary_mean.xlsx"
+ test_gpu = test_analysis / "gpu_timeline_summary_mean.xlsx"
+ combined_gpu = output_path / "gpu_timeline_combined.xlsx"
+
+ if not combine_reports(str(baseline_gpu), str(test_gpu), str(combined_gpu)):
+ return 1
+
+ # Add comparison sheets
+ gpu_comparison = output_path / "gpu_timeline_comparison.xlsx"
+ if not add_comparison_sheets(str(combined_gpu), str(gpu_comparison), baseline_label, test_label):
+ return 1
+
+ print(f"\nGPU timeline comparison saved to: {gpu_comparison}")
+
+ # Step 3: Collective Comparison
+ if args.collective:
+ print("\n" + "="*80)
+ print("STEP 3: Collective/NCCL Comparison")
+ print(f" Baseline: {baseline_label}")
+ print(f" Test: {test_label}")
+ print("="*80)
+
+ baseline_collective = baseline_analysis / "collective_reports" / "collective_all_ranks.xlsx"
+ test_collective = test_analysis / "collective_reports" / "collective_all_ranks.xlsx"
+
+ if not baseline_collective.exists() or not test_collective.exists():
+ print("Error: Collective reports not found. Run without --collective-only flag")
+ return 1
+
+ # Combine collective reports
+ combined_collective = output_path / "collective_combined.xlsx"
+ if not combine_reports(str(baseline_collective), str(test_collective),
+ str(combined_collective)):
+ return 1
+
+ # Add collective comparison
+ collective_comparison = output_path / "collective_comparison.xlsx"
+ if not add_collective_comparison(str(combined_collective),
+ str(collective_comparison), baseline_label, test_label):
+ return 1
+
+ print(f"\nCollective comparison saved to: {collective_comparison}")
+
+ # Step 4: Create final comprehensive report
+ if args.final_report and args.gpu_timeline and args.collective:
+ print("\n" + "="*80)
+ print("STEP 4: Creating Final Comprehensive Report")
+ print("="*80)
+
+ gpu_combined = output_path / "gpu_timeline_combined.xlsx"
+ gpu_comparison = output_path / "gpu_timeline_comparison.xlsx"
+ collective_combined = output_path / "collective_combined.xlsx"
+ collective_comparison = output_path / "collective_comparison.xlsx"
+ final_report = output_path / "final_analysis_report.xlsx"
+
+ if not create_final_report(str(gpu_combined), str(gpu_comparison),
+ str(collective_combined), str(collective_comparison),
+ str(final_report), baseline_label, test_label):
+ return 1
+
+ print(f"\nFinal comprehensive report saved to: {final_report}")
+ print(" - Summary Dashboard as first sheet")
+ print(" - All comparison sheets visible")
+ print(" - Raw data sheets hidden (can be unhidden in Excel)")
+ print(" - All data formatted as Excel tables with filters")
+ print(" - Color coding applied (green=better, red=worse)")
+
+ # Step 5: Generate visualization plots
+ if args.generate_plots and args.final_report:
+ print("\n" + "="*80)
+ print("STEP 5: Generating Visualization Plots")
+ print("="*80)
+
+ final_report = output_path / "final_analysis_report.xlsx"
+ plots_dir = output_path / "plots"
+
+ if final_report.exists():
+ script_path = Path(__file__).parent / "generate_enhanced_plots.py"
+ cmd = ["python3", str(script_path),
+ "--input", str(final_report),
+ "--output", str(plots_dir)]
+
+ # The script generates HTML report by default
+ if run_command(cmd, "Generating visualization plots and HTML report"):
+ print(f"\nOutput saved to: {plots_dir}/")
+ print("\n Generated plots:")
+ print(" - Percentage Change Overview")
+ print(" - Absolute Time Comparison")
+ print(" - Performance Heatmap by Rank")
+ print(" - Total Execution Time by Rank")
+ print(" - Time Breakdown by Rank")
+ print(" - Percentage Breakdown by Rank")
+ print(" - NCCL/Collective Metrics")
+ print("\n HTML Report: plots/performance_analysis_report.html")
+ print(" - Open in browser to view complete report")
+ print(" - Print to PDF: Ctrl+P (or Cmd+P on Mac)")
+ else:
+ print(" Final report not found, skipping plot generation")
+
+ # Summary
+ print("\n" + "="*80)
+ print("ANALYSIS COMPLETE!")
+ print("="*80)
+ print(f"\nResults saved to: {output_path}")
+
+ files = list(output_path.glob("*.xlsx"))
+ if files:
+ print("\nGenerated Excel files:")
+ for f in sorted(files):
+ print(f" - {f.name}")
+
+ if args.generate_plots:
+ plots_dir = output_path / "plots"
+ if plots_dir.exists():
+ plot_files = list(plots_dir.glob("*.png"))
+ if plot_files:
+ print("\nGenerated plots:")
+ for f in sorted(plot_files):
+ print(f" - plots/{f.name}")
+
+ print("\nAnalysis pipeline completed successfully!")
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh b/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh
new file mode 100755
index 0000000..b30ac09
--- /dev/null
+++ b/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh
@@ -0,0 +1,302 @@
+#!/bin/bash
+
+# Compare specific RCCL Warp Speed configurations
+# Usage: ./run_rccl_warp_speed_comparison.sh [OPTIONS]
+# -c CONFIG_FILE Config file (default: config/distributed.yaml)
+# -p PAIRS CU,threads pairs (e.g., "56,256 37,384 32,512")
+# -h Show help
+#
+# Examples:
+# # Use default 3 configurations
+# ./run_rccl_warp_speed_comparison.sh
+#
+# # Custom configurations
+# ./run_rccl_warp_speed_comparison.sh -p "56,256 37,384 32,512"
+#
+# # Different config file with custom pairs
+# ./run_rccl_warp_speed_comparison.sh -c myconfig.yaml -p "40,256 30,512"
+
+CONFIG_FILE="config/distributed.yaml"
+CUSTOM_PAIRS=""
+
+# Parse command line arguments
+while getopts "c:p:h" opt; do
+ case $opt in
+ c)
+ CONFIG_FILE="$OPTARG"
+ ;;
+ p)
+ CUSTOM_PAIRS="$OPTARG"
+ ;;
+ h)
+ echo "Usage: $0 [OPTIONS]"
+ echo " -c CONFIG_FILE Config file (default: config/single_node/gemm_overlap_comm.yaml)"
+ echo " -p PAIRS CU,threads pairs (e.g., \"56,256 37,384 32,512\")"
+ echo " -h Show help"
+ echo ""
+ echo "Examples:"
+ echo " # Use default 3 configurations"
+ echo " $0"
+ echo ""
+ echo " # Custom configurations"
+ echo " $0 -p \"56,256 37,384 32,512\""
+ echo ""
+ echo " # Different config file with custom pairs"
+ echo " $0 -c myconfig.yaml -p \"40,256 30,512\""
+ exit 0
+ ;;
+ \?)
+ echo "Invalid option: -$OPTARG"
+ exit 1
+ ;;
+ esac
+done
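+
+# Training command and common overrides shared by every configuration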
+BASE_CMD="torchrun --nproc_per_node 8 train.py --config ${CONFIG_FILE}"
+BASE_OVERRIDES="--override training.max_steps=100 --override profiling.tensorboard=false"
+
+# Base output directory
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+BASE_OUTPUT_DIR="experiments/rccl_warp_speed_${TIMESTAMP}"
+
+# Create base output directory
+mkdir -p "${BASE_OUTPUT_DIR}"
+
+# Log file
+SWEEP_LOG="${BASE_OUTPUT_DIR}/rccl_warp_speed_comparison_${TIMESTAMP}.log"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+# Function to log with timestamp
+log() {
+ local message="$1"
+ local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
+ echo "[${timestamp}] ${message}" | tee -a "${SWEEP_LOG}"
+}
+
+# Cleanup function for Ctrl+C
+cleanup() {
+ echo ""
+ echo -e "${RED}=== Caught interrupt signal (Ctrl+C) ===${NC}" | tee -a "${SWEEP_LOG}"
+ log "Cleaning up all training processes..."
+ sudo pkill -9 -f "train.py" 2>/dev/null || true
+ sudo pkill -9 -f "torchrun" 2>/dev/null || true
+ log "Cleanup complete. Exiting."
+ exit 130
+}
+
+trap cleanup SIGINT SIGTERM
+
+echo -e "${GREEN}=== RCCL Warp Speed Configuration Comparison ===${NC}" | tee "${SWEEP_LOG}"
+log "Config file: ${CONFIG_FILE}"
+log "Results directory: ${BASE_OUTPUT_DIR}"
+echo ""
+
+# Check RCCL version and configuration
+echo -e "${BLUE}=== RCCL Version Check ===${NC}" | tee -a "${SWEEP_LOG}"
+
+# Check if custom RCCL is available
+if [ -d "/opt/rccl/build/release" ]; then
+ echo -e "${GREEN}[OK] Custom RCCL found at /opt/rccl/build/release${NC}" | tee -a "${SWEEP_LOG}"
+
+ # Check branch and commit
+ if [ -d "/opt/rccl/.git" ]; then
+ cd /opt/rccl
+ RCCL_BRANCH=$(git branch --show-current 2>/dev/null)
+ RCCL_COMMIT=$(git log --oneline -1 2>/dev/null)
+ cd - > /dev/null
+
+ echo " Branch: ${RCCL_BRANCH}" | tee -a "${SWEEP_LOG}"
+ echo " Commit: ${RCCL_COMMIT}" | tee -a "${SWEEP_LOG}"
+
+ # Verify it's warp_speed_v1
+ if [[ "${RCCL_BRANCH}" == "warp_speed_v1" ]]; then
+ echo -e " ${GREEN}[OK] Using warp_speed_v1 branch${NC}" | tee -a "${SWEEP_LOG}"
+ else
+ echo -e " ${YELLOW}[WARNING] Not on warp_speed_v1 branch (current: ${RCCL_BRANCH})${NC}" | tee -a "${SWEEP_LOG}"
+ fi
+ fi
+
+ # Check library size to verify it's built
+ RCCL_LIB_SIZE=$(ls -lh /opt/rccl/build/release/librccl.so.1.0 2>/dev/null | awk '{print $5}')
+ echo " Library size: ${RCCL_LIB_SIZE}" | tee -a "${SWEEP_LOG}"
+else
+ echo -e "${YELLOW}[WARNING] Custom RCCL not found, will use PyTorch bundled version${NC}" | tee -a "${SWEEP_LOG}"
+ echo " PyTorch's bundled RCCL may not have warp_speed features!" | tee -a "${SWEEP_LOG}"
+fi
+
+# Test if RCCL responds to warp_speed environment variables
+echo "" | tee -a "${SWEEP_LOG}"
+echo "Testing warp_speed environment variable response..." | tee -a "${SWEEP_LOG}"
+export RCCL_WARP_SPEED_ENABLE=1
+export RCCL_WARP_SPEED_CU_COUNT=56
+export NCCL_DEBUG=VERSION
+
+python -c "
+import torch
+print('PyTorch version:', torch.__version__)
+if torch.cuda.is_available():
+ print('ROCm/CUDA available:', True)
+ print('Device count:', torch.cuda.device_count())
+" 2>&1 | tee -a "${SWEEP_LOG}"
+
+# Clean up test variables
+unset RCCL_WARP_SPEED_CU_COUNT
+unset NCCL_DEBUG
+
+echo -e "${BLUE}===========================${NC}" | tee -a "${SWEEP_LOG}"
+echo ""
+
+# Define configurations to test
+# Format: "NAME|CU_COUNT|THREADS_PER_BLOCK"
+if [ -n "$CUSTOM_PAIRS" ]; then
+ # Parse custom pairs
+ CONFIGS=()
+ for pair in $CUSTOM_PAIRS; do
+ IFS=',' read -r cu threads <<< "$pair"
+ CONFIGS+=("${cu}cu_${threads}threads|${cu}|${threads}")
+ done
+ log "Using custom configurations: ${CUSTOM_PAIRS}"
+else
+ # Use default configurations
+ CONFIGS=(
+ "56cu_256threads|56|256"
+ "37cu_384threads|37|384"
+ "32cu_512threads|32|512"
+ )
+ log "Using default RCCL Warp Speed configurations"
+fi
+
+# Track results
+declare -A RUN_STATUS
+declare -A RUN_TIMES
+
+# Run each configuration
+for config in "${CONFIGS[@]}"; do
+ IFS='|' read -r NAME CU_COUNT THREADS <<< "$config"
+
+ OUTPUT_DIR="${BASE_OUTPUT_DIR}/${NAME}"
+
+ echo -e "${YELLOW}========================================${NC}" | tee -a "${SWEEP_LOG}"
+ log "Running configuration: ${NAME}"
+ log " RCCL_WARP_SPEED_CU_COUNT=${CU_COUNT}"
+ log " RCCL_THREADS_PER_BLOCK=${THREADS}"
+ log " Output directory: ${OUTPUT_DIR}"
+ echo -e "${YELLOW}========================================${NC}" | tee -a "${SWEEP_LOG}"
+
+ # Create output directory
+ mkdir -p "${OUTPUT_DIR}"
+
+ # Record start time
+ START_TIME=$(date +%s)
+
+ # Export environment variables so child processes inherit them
+ export RCCL_WARP_SPEED_ENABLE=1
+ export RCCL_UNROLL_FACTOR=1
+ export RCCL_WARP_SPEED_CU_COUNT=${CU_COUNT}
+ export RCCL_THREADS_PER_BLOCK=${THREADS}
+ export HSA_ENABLE_SDMA=0
+ export PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1
+
+ # Use custom RCCL if available
+ if [ -d "/opt/rccl/build/release" ]; then
+ export LD_LIBRARY_PATH=/opt/rccl/build/release:${LD_LIBRARY_PATH:-}
+ log " Using custom RCCL from /opt/rccl/build/release"
+ fi
+
+ # Run the command
+ ${BASE_CMD} ${BASE_OVERRIDES} \
+ --override training.output_dir=${OUTPUT_DIR} \
+ 2>&1 | tee "${OUTPUT_DIR}/run_output.log"
+
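+    # PIPESTATUS[0] captures the exit code of the training command, not of tee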
+ EXIT_CODE=${PIPESTATUS[0]}
+ END_TIME=$(date +%s)
+ DURATION=$((END_TIME - START_TIME))
+
+ # Unset environment variables to avoid affecting next run
+ unset RCCL_WARP_SPEED_CU_COUNT
+ unset RCCL_THREADS_PER_BLOCK
+
+ RUN_TIMES[${NAME}]=${DURATION}
+
+ if [ $EXIT_CODE -eq 0 ]; then
+ log "[OK] Completed ${NAME} (duration: ${DURATION}s)"
+ RUN_STATUS[${NAME}]="SUCCESS"
+ else
+ log "[ERROR] Failed ${NAME} (exit code: $EXIT_CODE, duration: ${DURATION}s)"
+ RUN_STATUS[${NAME}]="FAILED"
+ fi
+
+ # Fix permissions if running as root in container
+ if [ "$EUID" -eq 0 ]; then
+ chmod -R 755 "${OUTPUT_DIR}" 2>/dev/null || true
+ fi
+
+ echo ""
+ log "Waiting 5 seconds before next run..."
+ sleep 5
+done
+
+# Generate summary report
+echo -e "${BLUE}========================================${NC}" | tee -a "${SWEEP_LOG}"
+echo -e "${BLUE}SUMMARY REPORT${NC}" | tee -a "${SWEEP_LOG}"
+echo -e "${BLUE}========================================${NC}" | tee -a "${SWEEP_LOG}"
+
+SUMMARY_FILE="${BASE_OUTPUT_DIR}/rccl_warp_speed_summary_${TIMESTAMP}.txt"
+{
+ echo "RCCL Warp Speed Configuration Comparison"
+ echo "Generated: $(date)"
+ echo ""
+ printf "%-20s %-10s %-15s %-10s\n" "CONFIGURATION" "CU_COUNT" "THREADS" "STATUS"
+ echo "----------------------------------------------------------------"
+
+ for config in "${CONFIGS[@]}"; do
+ IFS='|' read -r NAME CU_COUNT THREADS <<< "$config"
+ STATUS="${RUN_STATUS[${NAME}]:-UNKNOWN}"
+ DURATION="${RUN_TIMES[${NAME}]:-N/A}"
+ printf "%-20s %-10s %-15s %-10s (duration: %ss)\n" "${NAME}" "${CU_COUNT}" "${THREADS}" "${STATUS}" "${DURATION}"
+ done
+
+ echo ""
+ echo "Output directories:"
+ for config in "${CONFIGS[@]}"; do
+ IFS='|' read -r NAME CU_COUNT THREADS <<< "$config"
+ echo " ${NAME}: ${BASE_OUTPUT_DIR}/${NAME}/"
+ done
+
+ echo ""
+ echo "Trace files for each configuration:"
+ for config in "${CONFIGS[@]}"; do
+ IFS='|' read -r NAME CU_COUNT THREADS <<< "$config"
+ echo " ${NAME}: ${BASE_OUTPUT_DIR}/${NAME}/torch_profiler/"
+ done
+} | tee "${SUMMARY_FILE}"
+
+log "Summary saved to: ${SUMMARY_FILE}"
+
+# Fix permissions for the entire output directory if running as root
+if [ "$EUID" -eq 0 ]; then
+ echo "Fixing permissions for output directory..." | tee -a "${SWEEP_LOG}"
+ chmod -R 755 "${BASE_OUTPUT_DIR}" 2>/dev/null || true
+fi
+
+echo ""
+echo -e "${GREEN}========================================${NC}"
+echo -e "${GREEN}Next Steps: Run TraceLens Analysis${NC}"
+echo -e "${GREEN}========================================${NC}"
+echo ""
+echo "To analyze and compare these configurations:"
+echo ""
+echo "./scripts/tracelens_single_config/run_tracelens_analysis.sh ${BASE_OUTPUT_DIR}"
+echo ""
+echo "This will generate:"
+echo " - Individual reports for each rank (all 3 configs)"
+echo " - Collective reports (all 3 configs)"
+echo " - Comparison reports across the 3 configurations"
+echo ""
+
+log "All runs completed! Run TraceLens analysis next."
diff --git a/scripts/tracelens_single_config/run_tracelens_single_config.sh b/scripts/tracelens_single_config/run_tracelens_single_config.sh
new file mode 100755
index 0000000..96831ff
--- /dev/null
+++ b/scripts/tracelens_single_config/run_tracelens_single_config.sh
@@ -0,0 +1,266 @@
+#!/bin/bash
+# TraceLens Analysis for Single Configuration (No Sweep)
+# Usage: ./run_tracelens_single_config.sh <trace_directory> [options]
+#
+# The script accepts either:
+# - Path to parent directory containing torch_profiler/
+# - Path to torch_profiler/ directory directly
+#
+# Examples:
+# ./run_tracelens_single_config.sh /path/to/traces
+# ./run_tracelens_single_config.sh /path/to/traces/torch_profiler
+#
+# Note: Uses GEMM-patched TraceLens wrapper to recognize ROCm Tensile kernels
+
+set -e
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Use patched TraceLens wrapper for GEMM recognition
+TRACELENS_WRAPPER="python $SCRIPT_DIR/../tracelens_with_gemm_patch.py"
+
+# Parse options
+RUN_INDIVIDUAL=true
+RUN_COLLECTIVE=true
+
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --individual-only)
+ RUN_COLLECTIVE=false
+ shift
+ ;;
+ --collective-only)
+ RUN_INDIVIDUAL=false
+ shift
+ ;;
+ *)
+ INPUT_DIR="$1"
+ shift
+ ;;
+ esac
+done
+
+# Check if directory provided
+if [ -z "$INPUT_DIR" ]; then
+ echo "Error: Please provide trace directory"
+ echo ""
+ echo "Usage: $0 [options]"
+ echo ""
+ echo "Options:"
+ echo " --individual-only Generate only individual reports"
+ echo " --collective-only Generate only collective report"
+ echo ""
+ echo "Examples:"
+ echo " $0 /path/to/traces"
+ echo " $0 /path/to/traces --individual-only"
+ echo " $0 /path/to/traces --collective-only"
+ echo ""
+ exit 1
+fi
+
+# Verify directory exists
+if [ ! -d "$INPUT_DIR" ]; then
+ echo "Error: Directory not found: $INPUT_DIR"
+ exit 1
+fi
+
+# Auto-detect structure: is this torch_profiler/ or its parent?
+TORCH_PROF_DIR=""
+BASE_DIR=""
+
+# Check if INPUT_DIR contains rank directories (i.e., it IS torch_profiler/)
+if find "$INPUT_DIR" -maxdepth 1 -type d -name "rank*" | grep -q .; then
+ TORCH_PROF_DIR="$INPUT_DIR"
+ BASE_DIR=$(dirname "$INPUT_DIR")
+ echo "Detected torch_profiler directory: $TORCH_PROF_DIR"
+# Check if INPUT_DIR contains torch_profiler/ subdirectory
+elif [ -d "$INPUT_DIR/torch_profiler" ]; then
+ TORCH_PROF_DIR="$INPUT_DIR/torch_profiler"
+ BASE_DIR="$INPUT_DIR"
+ echo "Found torch_profiler subdirectory: $TORCH_PROF_DIR"
+else
+ echo "Error: Cannot find rank directories in expected structure"
+ echo ""
+ echo "Expected one of:"
+ echo " 1. Directory with rank0/, rank1/, ... subdirectories (torch_profiler/)"
+ echo " 2. Parent directory containing torch_profiler/rank0/, rank1/, ..."
+ echo ""
+ echo "Provided: $INPUT_DIR"
+ exit 1
+fi
+
+echo "════════════════════════════════════════════════════════════════"
+echo " TraceLens Analysis - Single Configuration"
+echo "════════════════════════════════════════════════════════════════"
+echo ""
+echo "Input directory: $INPUT_DIR"
+echo "Torch profiler traces: $TORCH_PROF_DIR"
+echo ""
+
+# Create output directory in the base directory
+OUTPUT_DIR="${BASE_DIR}/tracelens_analysis"
+mkdir -p "$OUTPUT_DIR"
+mkdir -p "$OUTPUT_DIR/individual_reports"
+mkdir -p "$OUTPUT_DIR/collective_reports"
+
+# Detect number of ranks
+NUM_RANKS=$(find "$TORCH_PROF_DIR" -maxdepth 1 -type d -name "rank*" | wc -l)
+
+if [ $NUM_RANKS -eq 0 ]; then
+ echo "Error: No rank directories found in $TORCH_PROF_DIR"
+ exit 1
+fi
+
+echo "Detected $NUM_RANKS ranks"
+
+# Show sample trace files
+echo ""
+echo "Sample trace files:"
+for rank_dir in $(find "$TORCH_PROF_DIR" -maxdepth 1 -type d -name "rank*" | sort | head -3); do
+ rank_name=$(basename "$rank_dir")
+ trace_file=$(find "$rank_dir" -name "*.json" | head -1)
+ if [ -n "$trace_file" ]; then
+ echo " $rank_name: $(basename "$trace_file")"
+ fi
+done
+if [ "$RUN_INDIVIDUAL" = true ]; then
+ echo ""
+ echo "════════════════════════════════════════════════════════════════"
+ echo "Step 1: Generating Individual Performance Reports"
+ echo "════════════════════════════════════════════════════════════════"
+ echo ""
+
+# Process each rank
+for rank_idx in $(seq 0 $((NUM_RANKS - 1))); do
+ # Try multiple directory naming patterns
+ RANK_DIR=""
+ if [ -d "$TORCH_PROF_DIR/rank${rank_idx}" ]; then
+ RANK_DIR="$TORCH_PROF_DIR/rank${rank_idx}"
+ elif [ -d "$TORCH_PROF_DIR/rank_${rank_idx}" ]; then
+ RANK_DIR="$TORCH_PROF_DIR/rank_${rank_idx}"
+ elif [ -d "$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)" ]; then
+ RANK_DIR="$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)"
+ fi
+
+ if [ -z "$RANK_DIR" ] || [ ! -d "$RANK_DIR" ]; then
+ echo " Skip rank ${rank_idx} - directory not found"
+ continue
+ fi
+
+ # Find trace file
+ TRACE=$(find "$RANK_DIR" -name "*.json" -type f | head -1)
+
+ if [ -z "$TRACE" ]; then
+ echo "⚠️ Skip rank ${rank_idx} - no trace file found"
+ continue
+ fi
+
+ OUTPUT="$OUTPUT_DIR/individual_reports/perf_rank${rank_idx}.xlsx"
+
+ echo "Processing rank ${rank_idx}..."
+ echo " Trace: $(basename "$TRACE")"
+
+ $TRACELENS_WRAPPER generate_perf_report \
+ --profile_json_path "$TRACE" \
+ --output_xlsx_path "$OUTPUT" \
+ --include_unlinked_kernels \
+ --short_kernel_study \
+ --short_kernel_threshold_us 50 \
+ --topk_ops 100 \
+ --topk_roofline_ops 100
+
+ echo " Done: $OUTPUT"
+ echo ""
+done
+
+fi
+
+if [ "$RUN_COLLECTIVE" = true ]; then
+ echo ""
+ echo "════════════════════════════════════════════════════════════════"
+ echo "Step 2: Generating Multi-Rank Collective Report"
+ echo "════════════════════════════════════════════════════════════════"
+ echo ""
+
+# Find a sample trace file to get the filename pattern
+SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank0" -name "*.json" -type f | head -1)
+if [ -z "$SAMPLE_TRACE" ]; then
+ # Try alternative rank naming
+ SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank_0" -name "*.json" -type f | head -1)
+fi
+
+if [ -z "$SAMPLE_TRACE" ]; then
+ # Try rank_00
+ SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank_00" -name "*.json" -type f | head -1)
+fi
+
+if [ -n "$SAMPLE_TRACE" ]; then
+ OUTPUT="$OUTPUT_DIR/collective_reports/collective_all_ranks.xlsx"
+
+ echo "Generating collective report for all $NUM_RANKS ranks..."
+
+ # Create symlinks with consistent names for collective report
+ for rank_idx in $(seq 0 $((NUM_RANKS - 1))); do
+ RANK_DIR="$TORCH_PROF_DIR/rank${rank_idx}"
+ if [ -d "$RANK_DIR" ]; then
+ TRACE=$(find "$RANK_DIR" -name "*.json" -type f | head -1)
+ if [ -n "$TRACE" ]; then
+ ln -sf "$(basename "$TRACE")" "$RANK_DIR/trace.json"
+ fi
+ fi
+ done
+
+ echo " Trace pattern: rank*/trace.json"
+
+ $TRACELENS_WRAPPER generate_multi_rank_collective \
+ --trace_pattern "$TORCH_PROF_DIR/rank*/trace.json" \
+ --world_size $NUM_RANKS \
+ --output_xlsx_path "$OUTPUT" \
+ --detailed_analysis \
+ --use_multiprocessing
+
+ echo " Done: $OUTPUT"
+else
+ echo " Could not generate collective report - no trace files found"
+fi
+
+fi
+
+echo ""
+echo "════════════════════════════════════════════════════════════════"
+echo "Analysis Complete!"
+echo "════════════════════════════════════════════════════════════════"
+echo ""
+echo "📁 Results saved to:"
+echo " $OUTPUT_DIR/"
+echo ""
+
+# Count generated reports
+INDIV_COUNT=$(find "$OUTPUT_DIR/individual_reports" -name "*.xlsx" 2>/dev/null | wc -l)
+COLL_COUNT=$(find "$OUTPUT_DIR/collective_reports" -name "*.xlsx" 2>/dev/null | wc -l)
+
+echo "Generated reports:"
+echo " Individual reports (per rank): $INDIV_COUNT"
+echo " Collective reports (all ranks): $COLL_COUNT"
+echo ""
+
+echo "📊 Report Files:"
+echo ""
+echo "Individual Performance Reports:"
+if [ $INDIV_COUNT -gt 0 ]; then
+ find "$OUTPUT_DIR/individual_reports" -name "*.xlsx" | sort | sed 's/^/ /'
+else
+ echo " (none generated)"
+fi
+echo ""
+
+echo "Collective Reports:"
+if [ $COLL_COUNT -gt 0 ]; then
+ find "$OUTPUT_DIR/collective_reports" -name "*.xlsx" | sed 's/^/ /'
+else
+ echo " (none generated)"
+fi
+
+echo ""
+echo "Done!"
diff --git a/scripts/tracelens_with_gemm_patch.py b/scripts/tracelens_with_gemm_patch.py
new file mode 100755
index 0000000..6a200d9
--- /dev/null
+++ b/scripts/tracelens_with_gemm_patch.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""
+TraceLens with GEMM Recognition Patches
+
+This script applies GEMM recognition patches and runs TraceLens commands.
+
+Usage:
+ python tracelens_with_gemm_patch.py generate_perf_report [args...]
+ python tracelens_with_gemm_patch.py generate_multi_rank_collective [args...]
+ python tracelens_with_gemm_patch.py compare_perf_reports [args...]
+"""
+
+import re
+import sys
+
+
+def apply_gemm_patches():
+ """Apply all GEMM recognition patches to TraceLens."""
+
+ print("Applying TraceLens GEMM recognition patches...")
+
+ # Patch kernel_name_parser for enhanced ROCm GEMM recognition
+ try:
+ from TraceLens.PerfModel import kernel_name_parser
+
+ def patched_is_rocm_gemm(kernel_name):
+ """
+ Enhanced ROCm GEMM pattern matching for Tensile kernels.
+ Recognizes: Cijk_Alik_Bljk_... and variants with arbitrary prefixes.
+ """
+ pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$"
+ return bool(re.match(pattern, kernel_name))
+
+ def patched_parse_rocm_gemm(kernel_name):
+ """Parse ROCm GEMM kernel details."""
+ # Parse transpose flags
+ trans_a, trans_b = None, None
+ if "_Ailk_" in kernel_name:
+ trans_a = False
+ elif "_Alik_" in kernel_name:
+ trans_a = True
+ if "_Bljk_" in kernel_name:
+ trans_b = False
+ elif "_Bjlk_" in kernel_name:
+ trans_b = True
+
+ # Parse macro tile size (MT64x16x64)
+ macro_tile_match = re.search(r"MT(\d+)x(\d+)x(\d+)", kernel_name)
+ if macro_tile_match:
+ mt_m = int(macro_tile_match.group(1))
+ mt_n = int(macro_tile_match.group(2))
+ depth_u = int(macro_tile_match.group(3))
+ else:
+ mt_m, mt_n, depth_u = None, None, None
+
+ return {
+ "transpose": (trans_a, trans_b),
+ "mt_m": mt_m,
+ "mt_n": mt_n,
+ "depth_u": depth_u,
+ }
+
+ def patched_gemm_name_parser(kernel_name):
+ """Enhanced GEMM name parser with better ROCm support."""
+ if patched_is_rocm_gemm(kernel_name):
+ return patched_parse_rocm_gemm(kernel_name)
+ elif kernel_name_parser.is_cuda_gemm(kernel_name):
+ return kernel_name_parser.parse_cuda_gemm(kernel_name)
+ return None
+
+ kernel_name_parser.is_rocm_gemm = patched_is_rocm_gemm
+ kernel_name_parser.parse_rocm_gemm = patched_parse_rocm_gemm
+ kernel_name_parser.gemm_name_parser = patched_gemm_name_parser
+
+ print(" [OK] Patched kernel_name_parser (ROCm GEMM recognition)")
+ except ImportError as e:
+ print(f" [WARN] Could not patch kernel_name_parser: {e}")
+
+ # Patch Trace2Tree util for is_gemm_kernel function
+ try:
+ from TraceLens.Trace2Tree import util as trace_util
+
+ def patched_is_gemm_kernel(kernel_event: dict) -> bool:
+ """Enhanced GEMM kernel detection."""
+ assert kernel_event["cat"] == "kernel"
+ kernel_name = kernel_event["name"]
+
+ # ROCm Tensile GEMM pattern: C[xyz]_A[xyz]_B[xyz]
+ pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$"
+ is_rocm_gemm = bool(re.match(pattern, kernel_name))
+
+ # CUDA GEMM pattern
+ is_cuda_gemm = kernel_name.startswith("nvjet") or "cublasLt" in kernel_name
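+        # Assumed CUDA-side examples: kernels whose names start with "nvjet"
+        # (newer cuBLAS GEMM kernels) or that contain "cublasLt".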
+
+ return is_rocm_gemm or is_cuda_gemm
+
+ trace_util.is_gemm_kernel = patched_is_gemm_kernel
+ print(" [OK] Patched Trace2Tree.util (is_gemm_kernel)")
+ except ImportError as e:
+ print(f" [WARN] Could not patch Trace2Tree.util: {e}")
+
+ # Patch TraceEventUtils to enhance GEMM keys
+ try:
+ from TraceLens import util as tracelens_util
+
+ if hasattr(tracelens_util, 'TraceEventUtils'):
+ if hasattr(tracelens_util.TraceEventUtils, 'JaxOpKeys'):
+ original_gemm_keys = tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys
+ enhanced_gemm_keys = ["Cijk", "gemm", "nvjet", "cublasLt", "C[a-z]{3}_A[a-z]{3}_B[a-z]{3}"]
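+                # Note: the last entry is a regex-style pattern; it only takes effect if
+                # GemmKeys are matched as regexes downstream. With plain substring
+                # matching it is inert (assumption about TraceLens internals).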
+
+ all_keys = list(set(original_gemm_keys + enhanced_gemm_keys))
+ tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys = all_keys
+
+ print(" [OK] Patched TraceEventUtils.JaxOpKeys (GEMM keys enhanced)")
+ except (ImportError, AttributeError) as e:
+ print(f" [WARN] Could not patch TraceEventUtils: {e}")
+
+ # Patch torch_op_mapping for better categorization
+ try:
+ from TraceLens.PerfModel import torch_op_mapping
+
+ original_categorize = torch_op_mapping.categorize_torch_op
+
+ def patched_categorize_torch_op(row):
+ """Enhanced categorization with better GEMM detection."""
+ result = original_categorize(row)
+
+ # If result is 'other', check for GEMM patterns in kernel names
+ if result == "other" and "kernel_details" in row and len(row["kernel_details"]) > 0:
+ kernel_name = row["kernel_details"][0]["name"]
+ pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$"
+ if re.match(pattern, kernel_name):
+ return "GEMM"
+
+ return result
+
+ torch_op_mapping.categorize_torch_op = patched_categorize_torch_op
+ print(" [OK] Patched torch_op_mapping (categorize_torch_op)")
+ except ImportError as e:
+ print(f" [WARN] Could not patch torch_op_mapping: {e}")
+
+ print("[OK] All GEMM patches applied successfully!\n")
+
+
+def main():
+ if len(sys.argv) < 2:
+ print("Usage: tracelens_with_gemm_patch.py [args...]")
+ print("")
+ print("Commands:")
+ print(" generate_perf_report - Generate individual performance report")
+ print(" generate_multi_rank_collective - Generate multi-rank collective report")
+ print(" compare_perf_reports - Compare performance reports")
+ sys.exit(1)
+
+ # Apply patches before importing TraceLens reporting modules
+ apply_gemm_patches()
+
+ # Import TraceLens after patches are applied
+ from TraceLens.Reporting.generate_perf_report_pytorch import main as generate_perf_report_main
+ from TraceLens.Reporting.generate_multi_rank_collective_report_pytorch import main as generate_multi_rank_collective_report_main
+ from TraceLens.Reporting.compare_perf_reports_pytorch import main as compare_perf_reports_main
+
+ command = sys.argv[1]
+
+ # Remove the command from argv so TraceLens sees only its args
+ sys.argv = [sys.argv[0]] + sys.argv[2:]
+
+ if command == "generate_perf_report":
+ generate_perf_report_main()
+ elif command == "generate_multi_rank_collective":
+ generate_multi_rank_collective_report_main()
+ elif command == "compare_perf_reports":
+ compare_perf_reports_main()
+ else:
+ print(f"Error: Unknown command '{command}'")
+ print("")
+ print("Available commands:")
+ print(" generate_perf_report")
+ print(" generate_multi_rank_collective")
+ print(" compare_perf_reports")
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/aorta/training/fsdp_trainer.py b/src/aorta/training/fsdp_trainer.py
index 0a6fbc5..d3a03e0 100644
--- a/src/aorta/training/fsdp_trainer.py
+++ b/src/aorta/training/fsdp_trainer.py
@@ -273,6 +273,7 @@ def build_ddp_model(
if device.type == "cuda":
device_ids = [device.index if device.index is not None else torch.cuda.current_device()]
+ print(f"===> {device_ids} {model}")
ddp_model = DDP(
model,
device_ids=device_ids,
@@ -735,6 +736,7 @@ def main(args: Optional[argparse.Namespace] = None, *, enable_rocm_metrics: bool
training_cfg.max_steps or training_cfg.epochs * len(dataloader),
)
+ print(f"Calling main trainer with device {env['device']}")
profiler = StreamProfiler(env["device"])
try: