diff --git a/docker/docker-compose.rocm70_9-1.yaml b/docker/docker-compose.rocm70_9-1.yaml index 04946c6..6179857 100644 --- a/docker/docker-compose.rocm70_9-1.yaml +++ b/docker/docker-compose.rocm70_9-1.yaml @@ -3,7 +3,7 @@ services: container_name: training-overlap-bugs-rocm70 build: context: . - dockerfile: Dockerfile.rocm70 + dockerfile: Dockerfile.rocm70_9-1 user: root privileged: true network_mode: host @@ -15,8 +15,6 @@ services: security_opt: - seccomp=unconfined environment: - - RCCL_FOLDER=/rccl - - LD_LIBRARY_PATH=/rccl/build/release:$LD_LIBRARY_PATH - TORCH_NCCL_HIGH_PRIORITY=1 volumes: diff --git a/scripts/tracelens_single_config/README.md b/scripts/tracelens_single_config/README.md new file mode 100644 index 0000000..2c7ca54 --- /dev/null +++ b/scripts/tracelens_single_config/README.md @@ -0,0 +1,166 @@ +# RCCL Warp Speed Performance Testing + +Test RCCL warp_speed_v1 branch from https://github.com/mustafabar/rccl.git + +## Prerequisites + +```bash +pip install pandas openpyxl matplotlib seaborn numpy +``` + +## Run Tests + +### Step 1: Start Container and Build RCCL + +```bash +cd docker +docker-compose -f docker-compose.rocm70_9-1.yaml build +docker-compose -f docker-compose.rocm70_9-1.yaml up -d +docker-compose -f docker-compose.rocm70_9-1.yaml exec torchenv-rocm70 bash + +# Inside container - build warp_speed_v1 (always rebuild) +# Note: Set --amdgpu_targets to match your GPU architecture +# Run 'rocminfo | grep gfx' to find your GPU target (e.g., gfx942, gfx950) +cd /opt +if [ -d "rccl" ]; then + cd rccl + git checkout warp_speed_v1 + git pull +else + git clone --recursive https://github.com/mustafabar/rccl.git + cd rccl + git checkout warp_speed_v1 +fi +./install.sh -l --amdgpu_targets=gfx950 + +cd /workspace/aorta +``` + +### Step 2: Run RCCL Tests + +```bash +# Default 3 configurations +./scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh + +# Custom configurations (CU_count,threads pairs) +./scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh -p "56,256 37,384 32,512" -c ./config/single_node/gemm_overlap_comm.yaml +``` + +Output structure: +``` +experiments/ + rccl_warp_speed_YYYYMMDD_HHMMSS/ + 56cu_256threads/ + torch_profiler/ # Raw profiler traces + run_output.log # Training output log + 37cu_384threads/ + 32cu_512threads/ + rccl_warp_speed_summary_YYYYMMDD_HHMMSS.txt +``` + +### Step 3: Generate Reports (Outside Container) + +```bash +# Exit container +exit + +# Run complete analysis +python scripts/tracelens_single_config/run_full_analysis.py \ + --baseline experiments/rccl_warp_speed_YYYYMMDD/56cu_256threads \ + --test experiments/rccl_warp_speed_YYYYMMDD/37cu_384threads \ + --output comparison_results \ + --all + +# Or skip TraceLens if already done +python scripts/tracelens_single_config/run_full_analysis.py \ + --baseline experiments/rccl_warp_speed_YYYYMMDD/56cu_256threads \ + --test experiments/rccl_warp_speed_YYYYMMDD/37cu_384threads \ + --output comparison_results \ + --all --skip-tracelens +``` + +## Generated Excel Reports + +### Individual TraceLens Reports (per configuration) +Each configuration generates: +- `tracelens_analysis/individual_reports/perf_rank*.xlsx` - Per-rank performance breakdown +- `tracelens_analysis/collective_reports/collective_all_ranks.xlsx` - Collective operations summary +- `tracelens_analysis/gpu_timeline_summary_mean.xlsx` - GPU timeline averages + +### Final Analysis Report (`final_analysis_report.xlsx`) + +Contains multiple sheets: + +**Summary Sheets:** +- `Summary_Dashboard` - High-level 
comparison metrics with percentage changes +- `Summary_Comparison` - Side-by-side summary comparison +- `GPU_ByRank_Comparison` - Detailed per-rank performance comparison +- `Comparison_By_Rank` - Rank-wise metric comparison with differences + +**GPU Timeline Sheets:** +- `All_Ranks_Combined` - Combined GPU timeline data from all ranks +- `Summary` - Aggregated GPU timeline summary +- `Rank_*` - Individual rank GPU timelines + +**Collective/NCCL Sheets:** +- `nccl_summary_implicit_sync` - NCCL operations with implicit synchronization +- `nccl_summary_long` - Long-running NCCL operations +- `nccl_summary_implicit_sync_comparison` - Comparison of implicit sync operations +- `nccl_summary_long_comparison` - Comparison of long operations + +**Raw Data Sheets (hidden by default):** +- `gpu_timeline_combined` - Raw combined GPU timeline data +- `gpu_timeline_comparison` - Raw GPU timeline comparison data +- `collective_combined` - Raw collective operations data +- `collective_comparison` - Raw collective comparison data + +### Comparison Reports + +- `gpu_timeline_combined.xlsx` - Baseline and test GPU metrics combined +- `gpu_timeline_comparison.xlsx` - GPU metrics with comparison analysis +- `collective_combined.xlsx` - Baseline and test collective operations combined +- `collective_comparison.xlsx` - Collective operations with comparison analysis + +## Generated Visualizations + +### HTML Report +- `performance_analysis_report.html` - Complete report with all embedded plots + +### Individual Plot Files (12 Total) +1. `plot1_percentage_change_overview.png` - Horizontal bar chart showing performance changes +2. `plot2_absolute_time_comparison.png` - Bar chart comparing absolute times +3. `plot3_performance_heatmap.png` - Heatmap of performance by rank +4. `plot4_total_execution_time.png` - Line plot of total execution time per rank +5. `plot5_computation_time.png` - Line plot of computation time across ranks +6. `plot6_communication_time.png` - Line plot of communication time across ranks +7. `plot7_idle_time.png` - Line plot of idle time across ranks +8. `plot8_percentage_difference_all_metrics.png` - Bar plot showing percentage differences for all metrics +9. `plot9_nccl_latency.png` - Line plot of latency vs message size +10. `plot10_algorithm_bandwidth.png` - Line plot of algorithm bandwidth vs message size +11. `plot11_bus_bandwidth.png` - Line plot of bus bandwidth vs message size +12. `plot12_nccl_summary.png` - Combined percentage summary and total latency + +## Key Metrics Analyzed + +**GPU Metrics:** +- `computation_time` - Time spent in computation +- `total_comm_time` - Total communication time +- `exposed_comm_time` - Non-overlapped communication time +- `idle_time` - GPU idle time +- `total_memcpy_time` - Memory copy time +- `exposed_memcpy_time` - Non-overlapped memory copy time +- `busy_time` - Total GPU busy time +- `total_time` - Total execution time + +**NCCL Metrics:** +- `comm_latency_mean` - Average communication latency +- `algo bw (GB/s)_mean` - Algorithm bandwidth +- `bus bw (GB/s)_mean` - Bus bandwidth +- `Total comm latency (ms)` - Total communication latency +- `count` - Number of operations + +## Convert to PDF + +1. Open `performance_analysis_report.html` in browser +2. Print to PDF (Ctrl+P or Cmd+P) +3. 
Choose landscape orientation for better plot visibility diff --git a/scripts/tracelens_single_config/add_collective_comparison.py b/scripts/tracelens_single_config/add_collective_comparison.py new file mode 100644 index 0000000..6f3f310 --- /dev/null +++ b/scripts/tracelens_single_config/add_collective_comparison.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +import pandas as pd +import argparse +from openpyxl.styles import Color +from openpyxl.formatting.rule import ColorScaleRule + + +def add_collective_comparison_sheets(input_path, output_path, baseline_label='baseline', test_label='test'): + print(f"Loading: {input_path}") + print(f" Baseline label: {baseline_label}") + print(f" Test label: {test_label}") + + xl = pd.ExcelFile(input_path) + + with pd.ExcelWriter(output_path, engine='openpyxl') as writer: + # Copy only summary sheets + for sheet_name in xl.sheet_names: + # Only keep sheets with 'summary' in the name + if 'summary' not in sheet_name.lower(): + print(f" Skip {sheet_name} (keeping only summary sheets)") + continue + df = pd.read_excel(input_path, sheet_name=sheet_name) + df.to_excel(writer, sheet_name=sheet_name, index=False) + print(f" Copied {sheet_name}") + + # Process summary sheets for comparison + for sheet_name in ['nccl_summary_implicit_sync', 'nccl_summary_long']: + if sheet_name not in xl.sheet_names: + continue + + df = pd.read_excel(input_path, sheet_name=sheet_name) + + # Get actual source values from the dataframe + sources = df['source'].unique() + # Determine which is baseline and which is test (baseline should be first) + if len(sources) >= 2: + actual_baseline = sources[0] + actual_test = sources[1] + else: + actual_baseline = baseline_label + actual_test = test_label + + # Separate baseline and test + baseline_df = df[df['source'] == actual_baseline].copy() + test_df = df[df['source'] == actual_test].copy() + + if len(baseline_df) == 0 or len(test_df) == 0: + print(f" Skip {sheet_name} - missing data") + continue + + # Create comparison dataframe + comparison = pd.DataFrame() + + # Identify key columns for grouping + group_cols = ['Collective name', 'dtype', 'In msg nelems'] + if not all(col in baseline_df.columns for col in group_cols): + group_cols = ['Collective name'] + + # Group and compare + baseline_grouped = baseline_df.groupby(group_cols, as_index=False) + test_grouped = test_df.groupby(group_cols, as_index=False) + + for name, base_group in baseline_grouped: + # Find matching test group + if isinstance(name, tuple): + mask = pd.Series([True] * len(test_df), index=test_df.index) + for col, val in zip(group_cols, name): + mask = mask & (test_df[col] == val) + else: + mask = (test_df[group_cols[0]] == name) + + test_group = test_df.loc[mask] + + if len(test_group) == 0: + continue + + # Create comparison row + comp_row = {} + + # Copy grouping columns + if isinstance(name, tuple): + for col, val in zip(group_cols, name): + comp_row[col] = val + else: + comp_row[group_cols[0]] = name + + # Compare numeric columns + numeric_cols = ['comm_latency_mean', 'algo bw (GB/s)_mean', 'bus bw (GB/s)_mean', + 'Total comm latency (ms)', 'count'] + + for col in numeric_cols: + if col not in base_group.columns or col not in test_group.columns: + continue + + base_val = base_group[col].values[0] + test_val = test_group[col].values[0] + + comp_row[f'{baseline_label}_{col}'] = base_val + comp_row[f'{test_label}_{col}'] = test_val + comp_row[f'diff_{col}'] = test_val - base_val + + # For latency/time: positive percent_change means faster (less time) + # For bandwidth: 
positive percent_change means better (more bandwidth) + if 'latency' in col.lower() or 'time' in col.lower(): + # Lower is better - positive when saleelk is faster + pct_change = (base_val - test_val) / base_val * 100 if base_val != 0 else 0 + comp_row[f'percent_change_{col}'] = pct_change + elif 'bw' in col.lower() or 'bandwidth' in col.lower(): + # Higher is better - positive when saleelk is better + pct_change = (test_val - base_val) / base_val * 100 if base_val != 0 else 0 + comp_row[f'percent_change_{col}'] = pct_change + + comp_row[f'ratio_{col}'] = test_val / base_val if base_val != 0 else 0 + + comparison = pd.concat([comparison, pd.DataFrame([comp_row])], ignore_index=True) + + # Write comparison sheet (shorten name to fit Excel's 31 char limit) + # Replace 'nccl_summary_' with 'nccl_' and '_comparison' with '_cmp' + comparison_sheet_name = sheet_name.replace('nccl_summary_', 'nccl_') + '_cmp' + comparison.to_excel(writer, sheet_name=comparison_sheet_name, index=False) + print(f" Added {comparison_sheet_name}") + + # Add conditional formatting to percent_change columns + print(f" Applying conditional formatting to {comparison_sheet_name}...") + + ws = writer.sheets[comparison_sheet_name] + + # Format all percent_change columns with color scale + for col_idx, col in enumerate(comparison.columns, start=1): + if 'percent_change' in col: + # Convert column index to Excel letter (A, B, C, ...) + if col_idx <= 26: + col_letter = chr(64 + col_idx) + else: + col_letter = chr(64 + (col_idx // 26)) + chr(64 + (col_idx % 26)) + + data_range = f'{col_letter}2:{col_letter}{len(comparison)+1}' + + # Color scale: red (min/negative) -> white (0) -> green (max/positive) + ws.conditional_formatting.add(data_range, + ColorScaleRule( + start_type='min', start_color='F8696B', # Red + mid_type='num', mid_value=0, mid_color='FFFFFF', # White + end_type='max', end_color='63BE7B' # Green + )) + + print(f" Formatted {col}") + + print(f"\nSaved: {output_path}") + print("\nNew comparison sheets added") + print("percent_change interpretation:") + print(" For latency/time: Positive = faster (less time)") + print(" For bandwidth: Positive = better (more bandwidth)") + return 0 + + +def main(): + parser = argparse.ArgumentParser(description='Add comparison sheets to combined collective reports') + parser.add_argument('--input', required=True, help='Input combined collective Excel file') + parser.add_argument('--output', required=True, help='Output Excel file with comparison sheets') + parser.add_argument('--baseline-label', default='baseline', help='Label for baseline data') + parser.add_argument('--test-label', default='test', help='Label for test data') + + args = parser.parse_args() + + return add_collective_comparison_sheets(args.input, args.output, args.baseline_label, args.test_label) + + +if __name__ == '__main__': + exit(main()) diff --git a/scripts/tracelens_single_config/add_comparison_sheets.py b/scripts/tracelens_single_config/add_comparison_sheets.py new file mode 100755 index 0000000..765f391 --- /dev/null +++ b/scripts/tracelens_single_config/add_comparison_sheets.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +import pandas as pd +import argparse +from openpyxl.styles import Color +from openpyxl.formatting.rule import ColorScaleRule + + +def add_comparison_sheets(input_path, output_path, baseline_label='baseline', test_label='test'): + print(f"Loading: {input_path}") + print(f" Baseline label: {baseline_label}") + print(f" Test label: {test_label}") + + xl = pd.ExcelFile(input_path) + + with 
pd.ExcelWriter(output_path, engine='openpyxl') as writer: + # Copy all original sheets + for sheet_name in xl.sheet_names: + df = pd.read_excel(input_path, sheet_name=sheet_name) + df.to_excel(writer, sheet_name=sheet_name, index=False) + print(f" Copied {sheet_name}") + + # Add comparison sheets + all_combined = pd.read_excel(input_path, sheet_name='All_Ranks_Combined') + + # Get actual source values from the dataframe + sources = all_combined['source'].unique() + # Determine which is baseline and which is test (baseline should be first) + if len(sources) >= 2: + actual_baseline = sources[0] + actual_test = sources[1] + else: + actual_baseline = baseline_label + actual_test = test_label + + # Comparison 1: Side-by-side by rank + baseline_data = all_combined[all_combined['source'] == actual_baseline] + test_data = all_combined[all_combined['source'] == actual_test] + + comparison_by_rank = pd.DataFrame() + for rank in sorted(baseline_data['rank'].unique()): + base_rank = baseline_data[baseline_data['rank'] == rank].set_index('type') + test_rank = test_data[test_data['rank'] == rank].set_index('type') + + for metric_type in base_rank.index: + if metric_type in test_rank.index: + base_time = base_rank.loc[metric_type, 'time ms'] + test_time = test_rank.loc[metric_type, 'time ms'] + ratio_val = test_time / base_time if base_time != 0 else 0 + # Percentage change: positive when test is faster (takes less time) + pct_change = (base_time - test_time) / base_time * 100 if base_time != 0 else 0 + + # Determine if better or worse + if pct_change > 1: + status = 'Better' + elif pct_change < -1: + status = 'Worse' + else: + status = 'Similar' + + comparison_by_rank = pd.concat([comparison_by_rank, pd.DataFrame({ + 'rank': [rank], + 'type': [metric_type], + f'{baseline_label}_time_ms': [base_time], + f'{test_label}_time_ms': [test_time], + 'diff_time_ms': [test_time - base_time], + 'percent_change': [pct_change], + 'status': [status], + 'ratio': [ratio_val], + f'{baseline_label}_percent': [base_rank.loc[metric_type, 'percent']], + f'{test_label}_percent': [test_rank.loc[metric_type, 'percent']], + 'diff_percent': [test_rank.loc[metric_type, 'percent'] - base_rank.loc[metric_type, 'percent']] + })], ignore_index=True) + + comparison_by_rank.to_excel(writer, sheet_name='Comparison_By_Rank', index=False) + print(f" Added Comparison_By_Rank") + + # Comparison 2: Summary comparison + summary = pd.read_excel(input_path, sheet_name='Summary') + baseline_summary = summary[summary['source'] == actual_baseline].set_index('type') + test_summary = summary[summary['source'] == actual_test].set_index('type') + + summary_comparison = pd.DataFrame() + for metric_type in baseline_summary.index: + if metric_type in test_summary.index: + base_time = baseline_summary.loc[metric_type, 'time ms'] + test_time = test_summary.loc[metric_type, 'time ms'] + ratio_val = test_time / base_time if base_time != 0 else 0 + # Percentage change: positive when test is faster (takes less time) + pct_change = (base_time - test_time) / base_time * 100 if base_time != 0 else 0 + + summary_comparison = pd.concat([summary_comparison, pd.DataFrame({ + 'type': [metric_type], + f'{baseline_label}_time_ms': [base_time], + f'{test_label}_time_ms': [test_time], + 'diff_time_ms': [test_time - base_time], + 'percent_change': [pct_change], + 'ratio': [ratio_val], + f'{baseline_label}_percent': [baseline_summary.loc[metric_type, 'percent']], + f'{test_label}_percent': [test_summary.loc[metric_type, 'percent']], + 'diff_percent': 
[test_summary.loc[metric_type, 'percent'] - baseline_summary.loc[metric_type, 'percent']] + })], ignore_index=True) + + summary_comparison.to_excel(writer, sheet_name='Summary_Comparison', index=False) + print(f" Added Summary_Comparison") + + # Add conditional formatting to percent_change columns + print("\n Applying conditional formatting...") + + # Create color scale: Red (negative) -> White (0) -> Green (positive) + + # Format Comparison_By_Rank + ws_rank = writer.sheets['Comparison_By_Rank'] + # Find percent_change column + for col_idx, col in enumerate(comparison_by_rank.columns, start=1): + if col == 'percent_change': + col_letter = chr(64 + col_idx) # Convert to Excel column letter + data_range = f'{col_letter}2:{col_letter}{len(comparison_by_rank)+1}' + # Color scale: red (min) -> white (0) -> green (max) + ws_rank.conditional_formatting.add(data_range, + ColorScaleRule( + start_type='min', start_color='F8696B', # Red + mid_type='num', mid_value=0, mid_color='FFFFFF', # White + end_type='max', end_color='63BE7B' # Green + )) + print(f" Formatted Comparison_By_Rank column {col}") + break + + # Format Summary_Comparison + ws_summary = writer.sheets['Summary_Comparison'] + for col_idx, col in enumerate(summary_comparison.columns, start=1): + if col == 'percent_change': + col_letter = chr(64 + col_idx) + data_range = f'{col_letter}2:{col_letter}{len(summary_comparison)+1}' + # Color scale: red (min) -> white (0) -> green (max) + ws_summary.conditional_formatting.add(data_range, + ColorScaleRule( + start_type='min', start_color='F8696B', # Red + mid_type='num', mid_value=0, mid_color='FFFFFF', # White + end_type='max', end_color='63BE7B' # Green + )) + print(f" Formatted Summary_Comparison column {col}") + break + + print(f"\nSaved: {output_path}") + print("\nNew sheets:") + print(" Comparison_By_Rank - Side-by-side comparison for each rank") + print(" Summary_Comparison - Overall comparison") + return 0 + + +def main(): + parser = argparse.ArgumentParser(description='Add comparison sheets to combined GPU timeline') + parser.add_argument('--input', required=True, help='Input combined Excel file') + parser.add_argument('--output', required=True, help='Output Excel file with comparison sheets') + parser.add_argument('--baseline-label', default='baseline', help='Label for baseline data') + parser.add_argument('--test-label', default='test', help='Label for test data') + + args = parser.parse_args() + + return add_comparison_sheets(args.input, args.output, args.baseline_label, args.test_label) + + +if __name__ == '__main__': + exit(main()) diff --git a/scripts/tracelens_single_config/combine_reports.py b/scripts/tracelens_single_config/combine_reports.py new file mode 100755 index 0000000..e5a4a95 --- /dev/null +++ b/scripts/tracelens_single_config/combine_reports.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +import pandas as pd +import argparse +from pathlib import Path + + +def combine_collective_reports(baseline_path, test_path, output_path): + # Extract folder names from paths for labels + baseline_label = Path(baseline_path).parent.parent.name # Get the config folder name + test_label = Path(test_path).parent.parent.name # Get the config folder name + + print(f"Loading baseline ({baseline_label}): {baseline_path}") + baseline_xl = pd.ExcelFile(baseline_path) + + print(f"Loading test ({test_label}): {test_path}") + test_xl = pd.ExcelFile(test_path) + + print(f"\nBaseline sheets: {baseline_xl.sheet_names}") + print(f"Test sheets: {test_xl.sheet_names}") + + with 
pd.ExcelWriter(output_path, engine='openpyxl') as writer: + for sheet_name in baseline_xl.sheet_names: + if sheet_name not in test_xl.sheet_names: + print(f" Skip {sheet_name} - not in test file") + continue + + baseline_df = pd.read_excel(baseline_path, sheet_name=sheet_name) + test_df = pd.read_excel(test_path, sheet_name=sheet_name) + + baseline_df['source'] = baseline_label + test_df['source'] = test_label + + combined = pd.concat([baseline_df, test_df], ignore_index=True) + + combined.to_excel(writer, sheet_name=sheet_name, index=False) + print(f" Combined {sheet_name}: {len(baseline_df)} + {len(test_df)} = {len(combined)} rows") + + print(f"\nSaved: {output_path}") + return 0 # Return success code + + +def main(): + parser = argparse.ArgumentParser(description='Combine two collective reports') + parser.add_argument('--baseline', required=True, help='Path to baseline collective_all_ranks.xlsx') + parser.add_argument('--test', required=True, help='Path to test collective_all_ranks.xlsx') + parser.add_argument('--output', required=True, help='Output path for combined Excel file') + + args = parser.parse_args() + + return combine_collective_reports(args.baseline, args.test, args.output) + + +if __name__ == '__main__': + exit(main()) diff --git a/scripts/tracelens_single_config/create_final_report.py b/scripts/tracelens_single_config/create_final_report.py new file mode 100755 index 0000000..16caac9 --- /dev/null +++ b/scripts/tracelens_single_config/create_final_report.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +""" +Create final comprehensive report with combined and comparison data. +Raw data sheets are hidden and all data is formatted as Excel tables. +""" +import pandas as pd +import argparse +from pathlib import Path +from openpyxl import load_workbook +from openpyxl.worksheet.table import Table, TableStyleInfo +from openpyxl.styles import Color +from openpyxl.formatting.rule import ColorScaleRule + + +def get_column_letter(col_num): + """Convert column number to Excel column letter.""" + result = "" + while col_num > 0: + col_num -= 1 + result = chr(65 + (col_num % 26)) + result + col_num //= 26 + return result + + +def add_excel_table(worksheet, table_name, start_row=1): + """Convert worksheet data to Excel table format.""" + # Find data range + max_row = worksheet.max_row + max_col = worksheet.max_column + + if max_row <= start_row: + return # No data + + # Ensure all column headers are strings + for col_idx in range(1, max_col + 1): + cell = worksheet.cell(row=start_row, column=col_idx) + if cell.value is not None and not isinstance(cell.value, str): + cell.value = str(cell.value) + + # Create table reference using proper column letter conversion + start_cell = f"A{start_row}" + end_col_letter = get_column_letter(max_col) + end_cell = f"{end_col_letter}{max_row}" + table_ref = f"{start_cell}:{end_cell}" + + # Create table with style + try: + tab = Table(displayName=table_name, ref=table_ref) + style = TableStyleInfo( + name="TableStyleMedium2", + showFirstColumn=False, + showLastColumn=False, + showRowStripes=True, + showColumnStripes=False + ) + tab.tableStyleInfo = style + + # Add table to worksheet + worksheet.add_table(tab) + except Exception as e: + print(f" Warning: Could not create table {table_name}: {e}") + + +def create_final_report(gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file, baseline_label='Baseline', test_label='Test'): + """Create comprehensive report with all data.""" + + print("Creating comprehensive final report...") + print(f" 
Output: {output_file}") + print(f" Baseline: {baseline_label}") + print(f" Test: {test_label}") + + # Track sheet info for hiding/organizing + raw_sheets = [] + comparison_sheets = [] + summary_sheets = [] + + with pd.ExcelWriter(output_file, engine='openpyxl') as writer: + + # === GPU TIMELINE SHEETS === + print("\nAdding GPU Timeline sheets...") + + # Read GPU combined (raw data) + gpu_comb_xl = pd.ExcelFile(gpu_combined) + sheet_mapping = { + 'Summary': 'GPU_Summary_Raw', + 'All_Ranks_Combined': 'GPU_AllRanks_Raw', + 'Per_Rank_Time_ms': 'GPU_Time_Raw', + 'Per_Rank_Percent': 'GPU_Pct_Raw' + } + for sheet_name in gpu_comb_xl.sheet_names: + df = pd.read_excel(gpu_combined, sheet_name=sheet_name) + new_name = sheet_mapping.get(sheet_name, f"GPU_{sheet_name}_Raw") + df.to_excel(writer, sheet_name=new_name, index=False) + raw_sheets.append(new_name) + print(f" Added {new_name} (will be hidden)") + + # Read GPU comparison + gpu_comp_xl = pd.ExcelFile(gpu_comparison) + comp_mapping = { + 'Summary_Comparison': 'GPU_Summary_Cmp', + 'Comparison_By_Rank': 'GPU_ByRank_Cmp' + } + for sheet_name in gpu_comp_xl.sheet_names: + if 'Comparison' in sheet_name: + df = pd.read_excel(gpu_comparison, sheet_name=sheet_name) + new_name = comp_mapping.get(sheet_name, f"GPU_{sheet_name}") + df.to_excel(writer, sheet_name=new_name, index=False) + comparison_sheets.append(new_name) + print(f" Added {new_name}") + + # === COLLECTIVE SHEETS === + print("\nAdding Collective/NCCL sheets...") + + # Read collective combined (raw data for hidden sheets) + coll_comb_xl = pd.ExcelFile(coll_combined) + coll_mapping = { + 'nccl_summary_implicit_sync': 'NCCL_ImplSync_Raw', + 'nccl_summary_long': 'NCCL_Long_Raw' + } + for sheet_name in coll_comb_xl.sheet_names: + if 'summary' in sheet_name.lower(): + df = pd.read_excel(coll_combined, sheet_name=sheet_name) + new_name = coll_mapping.get(sheet_name, f"NCCL_{sheet_name}_Raw") + df.to_excel(writer, sheet_name=new_name, index=False) + raw_sheets.append(new_name) + print(f" Added {new_name} (will be hidden)") + + # Read collective comparison - include ALL sheets + coll_comp_xl = pd.ExcelFile(coll_comparison) + for sheet_name in coll_comp_xl.sheet_names: + df = pd.read_excel(coll_comparison, sheet_name=sheet_name) + + # Determine appropriate naming + if 'nccl' in sheet_name.lower(): + if '_cmp' in sheet_name or 'comparison' in sheet_name.lower(): + new_name = f"NCCL_{sheet_name.replace('nccl_', '').title().replace('_', '')}" + else: + new_name = f"NCCL_{sheet_name}" + else: + new_name = sheet_name + + df.to_excel(writer, sheet_name=new_name, index=False) + + if '_cmp' in sheet_name.lower() or 'comparison' in sheet_name.lower(): + comparison_sheets.append(new_name) + else: + raw_sheets.append(new_name) + + print(f" Added {new_name}") + + # === CREATE SUMMARY DASHBOARD === + print("\nCreating Summary Dashboard...") + + # Read key metrics for dashboard + gpu_summary = pd.read_excel(gpu_comparison, sheet_name='Summary_Comparison') + + # Create dashboard data + dashboard_data = { + 'Metric': [], + baseline_label: [], + test_label: [], + 'Improvement (%)': [], + 'Status': [] + } + + # Add GPU metrics + # Find the actual column names (they may be config-specific like '32cu_512threads_time_ms') + time_cols = [col for col in gpu_summary.columns if 'time_ms' in col and 'diff' not in col and 'percent' not in col] + if len(time_cols) >= 2: + baseline_col = time_cols[0] + test_col = time_cols[1] + else: + # Fallback to default names + baseline_col = 'baseline_time_ms' if 'baseline_time_ms' in 
gpu_summary.columns else time_cols[0] if time_cols else None + test_col = 'test_time_ms' if 'test_time_ms' in gpu_summary.columns else time_cols[1] if len(time_cols) > 1 else None + + if baseline_col and test_col: + for _, row in gpu_summary.iterrows(): + metric_type = row['type'] + dashboard_data['Metric'].append(f"GPU_{metric_type}") + dashboard_data[baseline_label].append(round(row[baseline_col], 2)) + dashboard_data[test_label].append(round(row[test_col], 2)) + dashboard_data['Improvement (%)'].append(round(row['percent_change'], 2) if 'percent_change' in row else 0) + + pct_val = row['percent_change'] if 'percent_change' in row else 0 + dashboard_data['Status'].append('Better' if pct_val > 0 else 'Worse' if pct_val < -1 else 'Similar') + + dashboard_df = pd.DataFrame(dashboard_data) + dashboard_df.to_excel(writer, sheet_name='Summary_Dashboard', index=False) + summary_sheets.append('Summary_Dashboard') + print(f" Added Summary_Dashboard") + + # Now modify the workbook to hide sheets and add tables + print("\nApplying formatting...") + wb = load_workbook(output_file) + + # Hide raw data sheets + for sheet_name in raw_sheets: + if sheet_name in wb.sheetnames: + wb[sheet_name].sheet_state = 'hidden' + print(f" Hidden: {sheet_name}") + + # Convert all sheets to tables + for sheet_name in wb.sheetnames: + ws = wb[sheet_name] + + # Skip if sheet is empty + if ws.max_row <= 1: + continue + + # Create unique table name from sheet name (remove special chars) + table_name = sheet_name.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '') + # Ensure name starts with letter and is max 255 chars + if not table_name[0].isalpha(): + table_name = 'Tbl_' + table_name + table_name = table_name[:255] + + add_excel_table(ws, table_name) + print(f" Converted to table: {sheet_name}") + + # Add conditional formatting for percent_change columns + if 'Cmp' in sheet_name or 'Comparison' in sheet_name: + # Find percent_change columns + for col_idx in range(1, ws.max_column + 1): + cell_value = ws.cell(row=1, column=col_idx).value + if cell_value and 'percent_change' in str(cell_value): + col_letter = get_column_letter(col_idx) + data_range = f'{col_letter}2:{col_letter}{ws.max_row}' + + # Apply color scale: red (min/negative) -> white (0) -> green (max/positive) + try: + ws.conditional_formatting.add(data_range, + ColorScaleRule( + start_type='min', start_color='F8696B', # Red + mid_type='num', mid_value=0, mid_color='FFFFFF', # White + end_type='max', end_color='63BE7B' # Green + )) + print(f" Applied color scale to {sheet_name} column {cell_value}") + except Exception as e: + print(f" Warning: Could not apply formatting to {cell_value}: {e}") + + # Move Summary Dashboard to first position + if 'Summary_Dashboard' in wb.sheetnames: + dashboard_sheet = wb['Summary_Dashboard'] + wb.move_sheet(dashboard_sheet, offset=-(len(wb.sheetnames)-1)) + wb.active = 0 # Set dashboard as active sheet + print("\n Moved Summary_Dashboard to first position") + + # Save workbook + wb.save(output_file) + print(f"\nFinal report saved: {output_file}") + + # Report structure + print("\nReport Structure:") + print(" Visible Sheets (Analysis):") + print(f" - Summary_Dashboard") + for sheet in comparison_sheets: + print(f" - {sheet}") + print("\n Hidden Sheets (Raw Data):") + for sheet in raw_sheets: + print(f" - {sheet}") + print("\n All data formatted as Excel tables with filters") + print(" Percent change columns are color-coded (green=better, red=worse)") + print("\nUsers can unhide raw data sheets in Excel: 
Right-click any sheet tab → Unhide") + + +def main(): + parser = argparse.ArgumentParser( + description='Create final comprehensive report with all data', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Example: + python create_final_report.py \\ + --gpu-combined gpu_timeline_combined.xlsx \\ + --gpu-comparison gpu_timeline_comparison.xlsx \\ + --coll-combined collective_combined.xlsx \\ + --coll-comparison collective_comparison.xlsx \\ + --output final_analysis_report.xlsx + """ + ) + + parser.add_argument('--gpu-combined', required=True, + help='Path to GPU timeline combined file') + parser.add_argument('--gpu-comparison', required=True, + help='Path to GPU timeline comparison file') + parser.add_argument('--coll-combined', required=True, + help='Path to collective combined file') + parser.add_argument('--coll-comparison', required=True, + help='Path to collective comparison file') + parser.add_argument('--output', required=True, + help='Output path for final report') + parser.add_argument('--baseline-label', default='Baseline', + help='Label for baseline configuration') + parser.add_argument('--test-label', default='Test', + help='Label for test configuration') + + args = parser.parse_args() + + # Validate inputs + for file_arg in ['gpu_combined', 'gpu_comparison', 'coll_combined', 'coll_comparison']: + file_path = getattr(args, file_arg) + if not Path(file_path).exists(): + print(f"Error: File not found: {file_path}") + return 1 + + create_final_report( + args.gpu_combined, + args.gpu_comparison, + args.coll_combined, + args.coll_comparison, + args.output, + args.baseline_label.replace('_', ' '), + args.test_label.replace('_', ' ') + ) + + return 0 + + +if __name__ == '__main__': + exit(main()) diff --git a/scripts/tracelens_single_config/generate_enhanced_plots.py b/scripts/tracelens_single_config/generate_enhanced_plots.py new file mode 100755 index 0000000..226310d --- /dev/null +++ b/scripts/tracelens_single_config/generate_enhanced_plots.py @@ -0,0 +1,766 @@ +#!/usr/bin/env python3 +""" +Enhanced plot generation matching the PDF report style. +Generates exactly 12 plots as specified. 
+""" + +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +import argparse +from pathlib import Path +import warnings +import base64 +from datetime import datetime +warnings.filterwarnings('ignore') + +plt.style.use('seaborn-v0_8-whitegrid') +sns.set_palette("husl") + + +def plot1_percentage_change(summary_data, output_dir): + """Plot 1: Percentage Change Overview.""" + print("\nGenerating Plot 1: Percentage Change Overview") + + columns = summary_data.columns.tolist() + baseline_label = columns[1] if len(columns) > 1 else 'Baseline' + test_label = columns[2] if len(columns) > 2 else 'Test' + + if 'Improvement (%)' not in summary_data.columns: + print(" No improvement data found") + return + + metrics = summary_data['Metric'].values + values = summary_data['Improvement (%)'].values + + fig, ax = plt.subplots(figsize=(12, 8)) + colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values] + bars = ax.barh(metrics, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5) + + for bar, val in zip(bars, values): + x_pos = bar.get_width() + ax.text(x_pos + (0.5 if x_pos > 0 else -0.5), bar.get_y() + bar.get_height()/2, + f'{val:.1f}%', ha='left' if x_pos > 0 else 'right', va='center', fontweight='bold') + + ax.axvline(x=0, color='black', linestyle='-', linewidth=1) + ax.set_xlabel('Percent Change (%)', fontsize=12) + ax.set_title(f'GPU Metrics: Percent Change ({baseline_label} vs {test_label})\nPositive = Improvement ({test_label} Faster)', + fontsize=14, fontweight='bold') + ax.grid(True, alpha=0.3, axis='x') + + plt.tight_layout() + plt.savefig(output_dir / 'plot1_percentage_change_overview.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot1_percentage_change_overview.png") + + +def plot2_absolute_time_comparison(summary_data, output_dir): + """Plot 2: Absolute Time Comparison.""" + print("\nGenerating Plot 2: Absolute Time Comparison") + + columns = summary_data.columns.tolist() + baseline_label = columns[1] if len(columns) > 1 else 'Baseline' + test_label = columns[2] if len(columns) > 2 else 'Test' + + metrics = summary_data['Metric'].values + baseline_values = summary_data[baseline_label].values + test_values = summary_data[test_label].values + + fig, ax = plt.subplots(figsize=(14, 8)) + + x = np.arange(len(metrics)) + width = 0.35 + + bars1 = ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue') + bars2 = ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange') + + ax.set_xlabel('Metric Type', fontsize=12) + ax.set_ylabel('Time (ms)', fontsize=12) + ax.set_title('GPU Metrics: Absolute Time Comparison', fontsize=14, fontweight='bold') + ax.set_xticks(x) + ax.set_xticklabels(metrics, rotation=45, ha='right') + ax.legend() + ax.grid(True, alpha=0.3, axis='y') + + plt.tight_layout() + plt.savefig(output_dir / 'plot2_absolute_time_comparison.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot2_absolute_time_comparison.png") + + +def plot3_performance_heatmap(byrank_data, output_dir): + """Plot 3: Performance Heatmap by Rank.""" + print("\nGenerating Plot 3: Performance Heatmap by Rank") + + if byrank_data is None or byrank_data.empty: + print(" No by-rank data available") + return + + metrics = byrank_data['type'].unique() if 'type' in byrank_data.columns else [] + ranks = sorted(byrank_data['rank'].unique()) if 'rank' in byrank_data.columns else [] + + time_cols = [col for col in byrank_data.columns if 'time' in col.lower() 
and 'diff' not in col.lower()] + time_col = time_cols[-1] if len(time_cols) > 1 else time_cols[0] if time_cols else None + + if not time_col: + print(" No time column found") + return + + heatmap_data = byrank_data.pivot_table(index='type', columns='rank', values=time_col, aggfunc='mean') + + fig, ax = plt.subplots(figsize=(12, 8)) + sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='YlOrRd', cbar_kws={'label': 'Time (ms)'}, ax=ax) + + ax.set_title('Performance Heatmap by Rank (Time in ms)', fontsize=14, fontweight='bold') + ax.set_xlabel('Rank', fontsize=12) + ax.set_ylabel('Metric Type', fontsize=12) + + plt.tight_layout() + plt.savefig(output_dir / 'plot3_performance_heatmap.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot3_performance_heatmap.png") + + +def plot4_total_execution_time(byrank_data, output_dir): + """Plot 4: Total Execution Time by Rank (Line Plot).""" + print("\nGenerating Plot 4: Total Execution Time by Rank") + + if byrank_data is None or byrank_data.empty: + print(" No by-rank data available") + return + + total_time_data = byrank_data[byrank_data['type'] == 'total_time'] + if total_time_data.empty: + print(" No total_time data found") + return + + ranks = sorted(total_time_data['rank'].unique()) + time_cols = [col for col in total_time_data.columns if 'time' in col.lower() and 'diff' not in col.lower()] + + fig, ax = plt.subplots(figsize=(12, 6)) + + for col in time_cols[:2]: + times = [total_time_data[total_time_data['rank'] == r][col].values[0] if not total_time_data[total_time_data['rank'] == r].empty else 0 for r in ranks] + label = col.replace('_time_ms', '').replace('_', ' ') + ax.plot(ranks, times, marker='o', markersize=8, linewidth=2, label=label, alpha=0.8) + + ax.set_xlabel('Rank', fontsize=12) + ax.set_ylabel('Total Execution Time (ms)', fontsize=12) + ax.set_title('Total Execution Time by Rank', fontsize=14, fontweight='bold') + ax.legend() + ax.grid(True, alpha=0.3) + ax.set_xticks(ranks) + + plt.tight_layout() + plt.savefig(output_dir / 'plot4_total_execution_time.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot4_total_execution_time.png") + + +def plot5_computation_time(byrank_data, output_dir): + """Plot 5: Computation Time Across Ranks.""" + print("\nGenerating Plot 5: Computation Time Across Ranks") + + if byrank_data is None or byrank_data.empty: + print(" No by-rank data available") + return + + comp_data = byrank_data[byrank_data['type'] == 'computation_time'] + if comp_data.empty: + print(" No computation_time data found") + return + + ranks = sorted(comp_data['rank'].unique()) + time_cols = [col for col in comp_data.columns if 'time' in col.lower() and 'diff' not in col.lower()] + + fig, ax = plt.subplots(figsize=(12, 6)) + + for col in time_cols[:2]: + times = [comp_data[comp_data['rank'] == r][col].values[0] if not comp_data[comp_data['rank'] == r].empty else 0 for r in ranks] + label = col.replace('_time_ms', '').replace('_', ' ') + ax.plot(ranks, times, marker='s', markersize=8, linewidth=2, label=label, alpha=0.8) + + ax.set_xlabel('Rank', fontsize=12) + ax.set_ylabel('Computation Time (ms)', fontsize=12) + ax.set_title('Computation Time Across Ranks', fontsize=14, fontweight='bold') + ax.legend() + ax.grid(True, alpha=0.3) + ax.set_xticks(ranks) + + plt.tight_layout() + plt.savefig(output_dir / 'plot5_computation_time.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot5_computation_time.png") + + +def plot6_communication_time(byrank_data, output_dir): + """Plot 6: 
Total Communication Time Across Ranks.""" + print("\nGenerating Plot 6: Total Communication Time Across Ranks") + + if byrank_data is None or byrank_data.empty: + print(" No by-rank data available") + return + + comm_data = byrank_data[byrank_data['type'] == 'total_comm_time'] + if comm_data.empty: + print(" No total_comm_time data found") + return + + ranks = sorted(comm_data['rank'].unique()) + time_cols = [col for col in comm_data.columns if 'time' in col.lower() and 'diff' not in col.lower()] + + fig, ax = plt.subplots(figsize=(12, 6)) + + for col in time_cols[:2]: + times = [comm_data[comm_data['rank'] == r][col].values[0] if not comm_data[comm_data['rank'] == r].empty else 0 for r in ranks] + label = col.replace('_time_ms', '').replace('_', ' ') + ax.plot(ranks, times, marker='^', markersize=8, linewidth=2, label=label, alpha=0.8) + + ax.set_xlabel('Rank', fontsize=12) + ax.set_ylabel('Communication Time (ms)', fontsize=12) + ax.set_title('Total Communication Time Across Ranks', fontsize=14, fontweight='bold') + ax.legend() + ax.grid(True, alpha=0.3) + ax.set_xticks(ranks) + + plt.tight_layout() + plt.savefig(output_dir / 'plot6_communication_time.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot6_communication_time.png") + + +def plot7_idle_time(byrank_data, output_dir): + """Plot 7: Idle Time Across Ranks.""" + print("\nGenerating Plot 7: Idle Time Across Ranks") + + if byrank_data is None or byrank_data.empty: + print(" No by-rank data available") + return + + idle_data = byrank_data[byrank_data['type'] == 'idle_time'] + if idle_data.empty: + print(" No idle_time data found") + return + + ranks = sorted(idle_data['rank'].unique()) + time_cols = [col for col in idle_data.columns if 'time' in col.lower() and 'diff' not in col.lower()] + + fig, ax = plt.subplots(figsize=(12, 6)) + + for col in time_cols[:2]: + times = [idle_data[idle_data['rank'] == r][col].values[0] if not idle_data[idle_data['rank'] == r].empty else 0 for r in ranks] + label = col.replace('_time_ms', '').replace('_', ' ') + ax.plot(ranks, times, marker='D', markersize=8, linewidth=2, label=label, alpha=0.8) + + ax.set_xlabel('Rank', fontsize=12) + ax.set_ylabel('Idle Time (ms)', fontsize=12) + ax.set_title('Idle Time Across Ranks', fontsize=14, fontweight='bold') + ax.legend() + ax.grid(True, alpha=0.3) + ax.set_xticks(ranks) + + plt.tight_layout() + plt.savefig(output_dir / 'plot7_idle_time.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot7_idle_time.png") + + +def plot8_percentage_time_difference(byrank_data, output_dir): + """Plot 8: Percentage Time Difference Across Ranks (8 subplots in 2x4 grid).""" + print("\nGenerating Plot 8: Percentage Time Difference (8 subplots)") + + if byrank_data is None or byrank_data.empty: + print(" No by-rank data available") + return + + metrics = ['busy_time', 'computation_time', 'total_comm_time', 'exposed_comm_time', + 'idle_time', 'total_memcpy_time', 'exposed_memcpy_time', 'total_time'] + + pct_cols = [col for col in byrank_data.columns if 'percent_change' in col.lower()] + if not pct_cols: + print(" No percent_change column found") + return + + pct_col = pct_cols[0] + ranks = sorted(byrank_data['rank'].unique()) if 'rank' in byrank_data.columns else [] + + # Create 2x4 subplot grid + fig, axes = plt.subplots(2, 4, figsize=(20, 10)) + axes = axes.flatten() + + for idx, metric in enumerate(metrics): + ax = axes[idx] + metric_data = byrank_data[byrank_data['type'] == metric] + + if not metric_data.empty: + values = 
[metric_data[metric_data['rank'] == r][pct_col].values[0] if not metric_data[metric_data['rank'] == r].empty else 0 for r in ranks] + + colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values] + ax.bar(ranks, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5) + + ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5) + ax.set_xlabel('Rank', fontsize=10) + ax.set_ylabel('Percent Change (%)', fontsize=10) + ax.set_title(metric.replace('_', ' ').title(), fontsize=11, fontweight='bold') + ax.grid(True, alpha=0.3, axis='y') + ax.set_xticks(ranks) + + plt.suptitle('Percentage Time Difference Across Ranks (All Metrics)', fontsize=16, fontweight='bold') + plt.tight_layout() + plt.savefig(output_dir / 'plot8_percentage_difference_all_metrics.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot8_percentage_difference_all_metrics.png") + + +def plot9_nccl_latency(nccl_data, output_dir): + """Plot 9: Communication Latency Comparison per Message Size.""" + print("\nGenerating Plot 9: Communication Latency vs Message Size") + + if nccl_data is None or nccl_data.empty: + print(" No NCCL data available") + return + + if 'In msg nelems' not in nccl_data.columns: + print(" Required columns not found") + return + + latency_cols = [col for col in nccl_data.columns if 'comm_latency' in col.lower() or 'latency_mean' in col.lower()] + if not latency_cols: + print(" No latency columns found") + return + + fig, ax = plt.subplots(figsize=(14, 7)) + + nccl_sorted = nccl_data.sort_values('In msg nelems') + msg_sizes = nccl_sorted['In msg nelems'].values + + x = np.arange(len(msg_sizes)) + width = 0.35 + + if len(latency_cols) >= 2: + baseline_values = nccl_sorted[latency_cols[0]].values + test_values = nccl_sorted[latency_cols[1]].values + + baseline_label = latency_cols[0].replace('_comm_latency_mean', '').replace('_', ' ').title() + test_label = latency_cols[1].replace('_comm_latency_mean', '').replace('_', ' ').title() + + ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue') + ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange') + else: + ax.bar(x, nccl_sorted[latency_cols[0]].values, alpha=0.8, color='steelblue') + + ax.set_xlabel('Message Size (elements)', fontsize=12) + ax.set_ylabel('Communication Latency (ms)', fontsize=12) + ax.set_title('Communication Latency Comparison per Message Size', fontsize=14, fontweight='bold') + ax.set_xticks(x) + ax.set_xticklabels([f'{int(s):,}' for s in msg_sizes], rotation=45, ha='right') + ax.legend() + ax.grid(True, alpha=0.3, axis='y') + + plt.tight_layout() + plt.savefig(output_dir / 'plot9_nccl_latency.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot9_nccl_latency.png") + + +def plot10_algorithm_bandwidth(nccl_data, output_dir): + """Plot 10: Algorithm Bandwidth.""" + print("\nGenerating Plot 10: Algorithm Bandwidth") + + if nccl_data is None or nccl_data.empty: + print(" No NCCL data available") + return + + algo_bw_cols = [col for col in nccl_data.columns if 'algo bw' in col.lower()] + if not algo_bw_cols or 'In msg nelems' not in nccl_data.columns: + print(" Required columns not found") + return + + fig, ax = plt.subplots(figsize=(14, 7)) + + nccl_sorted = nccl_data.sort_values('In msg nelems') + msg_sizes = nccl_sorted['In msg nelems'].values + + x = np.arange(len(msg_sizes)) + width = 0.35 + + if len(algo_bw_cols) >= 2: + baseline_values = nccl_sorted[algo_bw_cols[0]].values + test_values = 
nccl_sorted[algo_bw_cols[1]].values + + baseline_label = algo_bw_cols[0].replace('_algo bw (GB/s)_mean', '').replace('_', ' ').title() + test_label = algo_bw_cols[1].replace('_algo bw (GB/s)_mean', '').replace('_', ' ').title() + + ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue') + ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange') + else: + ax.bar(x, nccl_sorted[algo_bw_cols[0]].values, alpha=0.8, color='steelblue') + + ax.set_xlabel('Message Size (elements)', fontsize=12) + ax.set_ylabel('Algorithm Bandwidth (GB/s)', fontsize=12) + ax.set_title('Algorithm Bandwidth Comparison per Message Size', fontsize=14, fontweight='bold') + ax.set_xticks(x) + ax.set_xticklabels([f'{int(s):,}' for s in msg_sizes], rotation=45, ha='right') + ax.legend() + ax.grid(True, alpha=0.3, axis='y') + + plt.tight_layout() + plt.savefig(output_dir / 'plot10_algorithm_bandwidth.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot10_algorithm_bandwidth.png") + + +def plot11_bus_bandwidth(nccl_data, output_dir): + """Plot 11: Bus Bandwidth.""" + print("\nGenerating Plot 11: Bus Bandwidth") + + if nccl_data is None or nccl_data.empty: + print(" No NCCL data available") + return + + bus_bw_cols = [col for col in nccl_data.columns if 'bus bw' in col.lower()] + if not bus_bw_cols or 'In msg nelems' not in nccl_data.columns: + print(" Required columns not found") + return + + fig, ax = plt.subplots(figsize=(14, 7)) + + nccl_sorted = nccl_data.sort_values('In msg nelems') + msg_sizes = nccl_sorted['In msg nelems'].values + + x = np.arange(len(msg_sizes)) + width = 0.35 + + if len(bus_bw_cols) >= 2: + baseline_values = nccl_sorted[bus_bw_cols[0]].values + test_values = nccl_sorted[bus_bw_cols[1]].values + + baseline_label = bus_bw_cols[0].replace('_bus bw (GB/s)_mean', '').replace('_', ' ').title() + test_label = bus_bw_cols[1].replace('_bus bw (GB/s)_mean', '').replace('_', ' ').title() + + ax.bar(x - width/2, baseline_values, width, label=baseline_label, alpha=0.8, color='steelblue') + ax.bar(x + width/2, test_values, width, label=test_label, alpha=0.8, color='darkorange') + else: + ax.bar(x, nccl_sorted[bus_bw_cols[0]].values, alpha=0.8, color='steelblue') + + ax.set_xlabel('Message Size (elements)', fontsize=12) + ax.set_ylabel('Bus Bandwidth (GB/s)', fontsize=12) + ax.set_title('Bus Bandwidth Comparison per Message Size', fontsize=14, fontweight='bold') + ax.set_xticks(x) + ax.set_xticklabels([f'{int(s):,}' for s in msg_sizes], rotation=45, ha='right') + ax.legend() + ax.grid(True, alpha=0.3, axis='y') + + plt.tight_layout() + plt.savefig(output_dir / 'plot11_bus_bandwidth.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot11_bus_bandwidth.png") + + +def plot12_nccl_summary(nccl_data, output_dir): + """Plot 12: NCCL Percentage Summary and Total Communication Latency.""" + print("\nGenerating Plot 12: NCCL Summary (Percentage & Total Latency)") + + if nccl_data is None or nccl_data.empty: + print(" No NCCL data available") + return + + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7)) + + # Left: Percentage change summary for key metrics + pct_cols = [col for col in nccl_data.columns if 'percent_change' in col.lower()] + if pct_cols and len(pct_cols) > 0: + metrics = [] + values = [] + + for col in pct_cols: + metric_name = col.replace('percent_change_', '').replace('_', ' ').title() + metrics.append(metric_name) + avg_value = nccl_data[col].mean() + values.append(avg_value) + + if metrics: 
+ colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values] + bars = ax1.barh(metrics, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5) + + for bar, val in zip(bars, values): + x_pos = bar.get_width() + ax1.text(x_pos + (1 if x_pos > 0 else -1), bar.get_y() + bar.get_height()/2, + f'{val:.1f}%', ha='left' if x_pos > 0 else 'right', va='center', fontweight='bold') + + ax1.axvline(x=0, color='black', linestyle='-', linewidth=1) + ax1.set_xlabel('Percent Change (%)', fontsize=12) + ax1.set_title('NCCL Metrics: Average Percent Change', fontsize=13, fontweight='bold') + ax1.grid(True, alpha=0.3) + else: + ax1.text(0.5, 0.5, 'No percentage change data available', + ha='center', va='center', transform=ax1.transAxes, fontsize=12) + + # Right: Total communication latency comparison + total_latency_cols = [col for col in nccl_data.columns if ('Total comm latency' in col or 'total_latency' in col.lower()) and 'percent' not in col.lower()] + + if total_latency_cols and len(total_latency_cols) >= 1: + labels = [] + totals = [] + + for col in total_latency_cols[:2]: + label = col.replace('_Total comm latency (ms)', '').replace('_total_latency', '').replace('_', ' ').strip().title() + if not label: + label = 'Total' + total = nccl_data[col].sum() + labels.append(label) + totals.append(total) + + if totals: + colors = ['steelblue', 'darkorange'] if len(totals) > 1 else ['steelblue'] + bars = ax2.bar(labels, totals, color=colors[:len(totals)], alpha=0.8, edgecolor='black', linewidth=1) + + for bar, val in zip(bars, totals): + ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height(), + f'{val:.1f} ms', ha='center', va='bottom', fontsize=12, fontweight='bold') + + if len(totals) == 2 and totals[0] > 0: + improvement = (totals[0] - totals[1]) / totals[0] * 100 + y_pos = max(totals) * 0.6 + ax2.text(0.5, y_pos, f'Improvement: {improvement:.1f}%', + ha='center', fontsize=13, fontweight='bold', + bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.6, edgecolor='black')) + + ax2.set_ylabel('Total Communication Latency (ms)', fontsize=12) + ax2.set_title('Total Communication Latency Comparison', fontsize=13, fontweight='bold') + ax2.grid(True, alpha=0.3, axis='y') + else: + ax2.text(0.5, 0.5, 'No total latency data available', + ha='center', va='center', transform=ax2.transAxes, fontsize=12) + + plt.tight_layout() + plt.savefig(output_dir / 'plot12_nccl_summary.png', dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved: plot12_nccl_summary.png") + + +def generate_html_report(input_path, output_dir, baseline_label='Baseline', test_label='Test'): + """Generate HTML report with all plots embedded.""" + print("\nGenerating HTML Report...") + + plot_files = sorted(output_dir.glob('plot*.png')) + + html_content = f""" + + + + + RCCL Performance Analysis: {baseline_label} vs {test_label} + + + +

+    <h1>RCCL Performance Analysis Report</h1>
+    <p>Comparing: {baseline_label} vs {test_label}</p>
+    <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
+
+    <h2>GPU Performance Metrics</h2>

+""" + + plot_titles = { + 'plot1': 'Percentage Change Overview', + 'plot2': 'Absolute Time Comparison', + 'plot3': 'Performance Heatmap by Rank', + 'plot4': 'Total Execution Time by Rank', + 'plot5': 'Computation Time Across Ranks', + 'plot6': 'Communication Time Across Ranks', + 'plot7': 'Idle Time Across Ranks', + 'plot8': 'Percentage Time Difference (All Metrics)', + 'plot9': 'NCCL Communication Latency', + 'plot10': 'NCCL Algorithm Bandwidth', + 'plot11': 'NCCL Bus Bandwidth', + 'plot12': 'NCCL Summary' + } + + # Add GPU plots first (plot1-plot8) + for plot_file in plot_files: + plot_num = plot_file.stem.split('_')[0] + if plot_num not in ['plot1', 'plot2', 'plot3', 'plot4', 'plot5', 'plot6', 'plot7', 'plot8']: + continue + + title = plot_titles.get(plot_num, plot_file.stem.replace('_', ' ').title()) + + with open(plot_file, 'rb') as f: + img_data = base64.b64encode(f.read()).decode() + + html_content += f""" +
+        <h3>{title}</h3>
+        <img src="data:image/png;base64,{img_data}" alt="{title}">
+""" + + # Add NCCL section + html_content += "\n

<h2>NCCL/Collective Performance</h2>

\n" + + # Add NCCL plots (plot9-plot12) + for plot_file in plot_files: + plot_num = plot_file.stem.split('_')[0] + if plot_num not in ['plot9', 'plot10', 'plot11', 'plot12']: + continue + + title = plot_titles.get(plot_num, plot_file.stem.replace('_', ' ').title()) + + with open(plot_file, 'rb') as f: + img_data = base64.b64encode(f.read()).decode() + + html_content += f""" +
+        <h3>{title}</h3>
+        <img src="data:image/png;base64,{img_data}" alt="{title}">
+""" + + html_content += """ +

+    <p>Generated by TraceLens Analysis Pipeline</p>
+</body>
+</html>

+ + +""" + + html_path = output_dir / 'performance_analysis_report.html' + with open(html_path, 'w', encoding='utf-8') as f: + f.write(html_content) + + print(f" HTML report saved to: {html_path}") + return html_path + + +def main(): + parser = argparse.ArgumentParser(description='Generate 12 analysis plots') + parser.add_argument('--input', required=True, help='Path to final_analysis_report.xlsx') + parser.add_argument('--output', default='plots', help='Output directory for plots') + + args = parser.parse_args() + + input_path = Path(args.input) + output_dir = Path(args.output) + output_dir.mkdir(parents=True, exist_ok=True) + + if not input_path.exists(): + print(f"Error: Input file not found: {input_path}") + return 1 + + sheets = pd.read_excel(input_path, sheet_name=None) + + print(f"\nGenerating 12 plots from {input_path.name}...") + + # Extract baseline and test labels from Summary_Dashboard + baseline_label = 'Baseline' + test_label = 'Test' + + summary_sheet = sheets.get('Summary_Dashboard') + if summary_sheet is not None: + columns = summary_sheet.columns.tolist() + if len(columns) >= 3: + baseline_label = columns[1] + test_label = columns[2] + + plot1_percentage_change(summary_sheet, output_dir) + plot2_absolute_time_comparison(summary_sheet, output_dir) + + # GPU by-rank data + byrank_sheet = None + for name in ['GPU_ByRank_Cmp', 'GPU_ByRank_Comparison', 'Comparison_By_Rank']: + if name in sheets: + byrank_sheet = sheets[name] + break + + if byrank_sheet is not None: + plot3_performance_heatmap(byrank_sheet, output_dir) + plot4_total_execution_time(byrank_sheet, output_dir) + plot5_computation_time(byrank_sheet, output_dir) + plot6_communication_time(byrank_sheet, output_dir) + plot7_idle_time(byrank_sheet, output_dir) + plot8_percentage_time_difference(byrank_sheet, output_dir) + + # NCCL data + nccl_sheet = None + for name in sheets: + if 'nccl' in name.lower() and ('cmp' in name.lower() or 'comparison' in name.lower()): + nccl_sheet = sheets[name] + break + + # Try to get the actual NCCL data sheets (not just comparison) + if not nccl_sheet or nccl_sheet.empty: + for name in sheets: + if 'nccl' in name.lower() and 'summary' in name.lower(): + nccl_sheet = sheets[name] + break + + if nccl_sheet is not None and not nccl_sheet.empty: + plot9_nccl_latency(nccl_sheet, output_dir) + plot10_algorithm_bandwidth(nccl_sheet, output_dir) + plot11_bus_bandwidth(nccl_sheet, output_dir) + plot12_nccl_summary(nccl_sheet, output_dir) + + # Generate HTML report with configuration labels + html_path = generate_html_report(input_path, output_dir, baseline_label, test_label) + + print(f"\n{'='*60}") + print(f"All 12 plots generated successfully!") + print(f"Output directory: {output_dir}") + print(f"\nHTML Report: {html_path}") + print(" - Open in browser to view all plots") + print(" - Print to PDF: Ctrl+P or Cmd+P") + print(f"{'='*60}") + + return 0 + + +if __name__ == '__main__': + import sys + sys.exit(main()) diff --git a/scripts/tracelens_single_config/generate_merged_html.py b/scripts/tracelens_single_config/generate_merged_html.py new file mode 100644 index 0000000..1bb89f6 --- /dev/null +++ b/scripts/tracelens_single_config/generate_merged_html.py @@ -0,0 +1,182 @@ +import os +from pathlib import Path +import base64 +import argparse + + +def get_image_data(image_path): + try: + with open(image_path, "rb") as f: + return base64.b64encode(f.read()).decode("utf-8") + except Exception as e: + print(f"Error getting image data: {e}") + return None + + +def create_final_html(plot_file_path, 
output_path): + html_header = """ + + + + + Performance Analysis Report + + + + +

Performance Analysis Report

+ +
+ +

Executive Summary

+ + Comparison of GPU performance metrics + implementations across 8 ranks. + """ + + summary_section = f""" +

1. Overall GPU Metrics Comparison

+ """ + + summary_chart = get_image_data(plot_file_path / "improvement_chart.png") + if summary_chart is not None: + summary_section += f""" +

Percentage Change Overview

+ Summary Chart + Overall performance change across key GPU metrics. Negative values indicate improvement. + """ + absolute_time_chart = get_image_data(plot_file_path / "abs_time_comparison.png") + if absolute_time_chart is not None: + summary_section += f""" +

Absolute Time Comparison

+ Absolute Time Comparison + Side-by-side comparison of absolute execution times for all GPU metrics. + """ + + cross_rank_comparison_section = f""" +

2. Cross-Rank Performance Comparison

+ """ + gpu_time_heatmap = get_image_data(plot_file_path / "gpu_time_heatmap.png") + if gpu_time_heatmap is not None: + cross_rank_comparison_section += f""" +

Performance Heatmap by Rank

+ GPU Metric Percentage Change by Rank (HeatMap) + Comprehensive heatmap showing percent change for all metrics across all ranks. Green indicates better performance (positive % change). + """ + + item_list = { + "total_time": { + "name": "Total Time", + "description": "Total execution time comparison across all ranks, showing end-to-end performance characteristics.", + "chart_path": plot_file_path / "total_time_by_rank.png", + }, + "computation_time": { + "name": "Computation Time", + "description": "Pure computation time excluding communication overhead, analyzed per rank.", + "chart_path": plot_file_path / "computation_time_by_rank.png", + }, + "total_comm_time": { + "name": "Communication Time", + "description": "Total time spent in collective communication operations across ranks.", + "chart_path": plot_file_path / "total_comm_time_by_rank.png", + }, + "idle_time": { + "name": "Idle Time", + "description": "GPU idle time comparison showing resource utilization efficiency per rank.", + "chart_path": plot_file_path / "idle_time_by_rank.png", + }, + "gpu_time_change_percentage_summaryby_rank": { + "name": "Detailed Percentage Change by Metric", + "description": "Detailed breakdown of percent change for each metric type across all ranks.", + "chart_path": plot_file_path + / "gpu_time_change_percentage_summaryby_rank.png", + }, + } + for item in item_list.keys(): + cross_rank_comparison_chart = get_image_data(item_list[item]["chart_path"]) + if cross_rank_comparison_chart is not None: + cross_rank_comparison_section += f""" +

{item_list[item]['name']}

+ {item} by Rank + {item_list[item]['description']} + """ + + summary_section += cross_rank_comparison_section + + nccl_charst_section = f""" +

3. NCCL Collective Operations Analysis

+ """ + nccl_chart_item_list = { + "NCCL Communication Latency": "Mean communication latency for NCCL allreduce operations across different message sizes", + "NCCL Algorithm Bandwidth": "Algorithm bandwidth achieved for different message sizes in NCCL collective operations.", + "NCCL Bus Bandwidth": "Bus bandwidth utilization across NCCL operations and message sizes.", + "NCCL Performance Percentage Change": "Percent change in communication latency and bandwidth metrics for each message sizec configuration", + "NCCL Total Communication Latency": "Aggregate communication latency summed across all operations for each message size.", + } + for item in nccl_chart_item_list.keys(): + nccl_image_data = get_image_data( + plot_file_path / f'{item.replace(" ", "_")}_comparison.png' + ) + if nccl_image_data is not None: + nccl_charst_section += f""" +

{item}

+ {item} Comparison + {nccl_chart_item_list[item]} + """ + + summary_section += nccl_charst_section + + footer_section = f""" + + + + """ + summary_section += footer_section + + final_html = html_header + summary_section + with open(output_path, "w") as f: + f.write(final_html) + print(f"Final HTML file created at: {output_path}") + + +def main(): + parser = argparse.ArgumentParser( + description="Create a final HTML file for the analysis report." + ) + parser.add_argument( + "-p", + "--plot-files-directory", + type=Path, + required=True, + help="Path to the plot files direcotry.", + ) + parser.add_argument( + "-o", "--output-html", type=None, default=None, help="Path to the output file." + ) + args = parser.parse_args() + output_path = ( + args.output_html + if args.output_html + else args.plot_files_directory.parent / "final_analysis_report.html" + ) + create_final_html(args.plot_files_directory, output_path) + + +if __name__ == "__main__": + main() diff --git a/scripts/tracelens_single_config/merge_tracelens_analysis.py b/scripts/tracelens_single_config/merge_tracelens_analysis.py new file mode 100644 index 0000000..fade0d5 --- /dev/null +++ b/scripts/tracelens_single_config/merge_tracelens_analysis.py @@ -0,0 +1,465 @@ +import argparse +from pathlib import Path +import subprocess +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + + +def run_command(cmd, description): + """Execute a command and handle errors.""" + print(f"\n{'='*80}") + print(f"{description}") + print(f"{'='*80}") + print(f"Command: {' '.join(cmd)}") + + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + print(f"Error: {description} failed!") + print(f"Stderr: {result.stderr}") + return False + + print(result.stdout) + return True + + +def plot_nccl_data_per_msg(df, labels, output_dir: Path): + """ + Plot comm_latency_mean for each message size from NCCL data. 
+ """ + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Get unique index values (Collective_MsgSize) + indices = df["index"].values + + x = np.arange(len(indices)) + width = 0.8 / len(labels) + # Vibrant color palette + vibrant_colors = [ + "#E63946", + "#2A9D8F", + "#E9C46A", + "#264653", + "#F4A261", + "#8338EC", + "#06D6A0", + "#FF006E", + ] + + plot_items = { + "NCCL Communication Latency": { + "x_label": "Collective Operation (Message Size)", + "y_label": "Communication Latency (ms)", + "y_col": "comm_latency_mean", + }, + "NCCL Algorithm Bandwidth": { + "x_label": "Collective Operation (Message Size)", + "y_label": "Algorithm Bandwidth (GB/s)", + "y_col": "algo bw (GB/s)_mean", + }, + "NCCL Bus Bandwidth": { + "x_label": "Collective Operation (Message Size)", + "y_label": "Bus Bandwidth (GB/s)", + "y_col": "bus bw (GB/s)_mean", + }, + "NCCL Total Communication Latency": { + "x_label": "Collective Operation (Message Size)", + "y_label": "Total Communication Latency (ms)", + "y_col": "Total comm latency (ms)", + }, + } + + for plot_item in plot_items.keys(): + fig, ax = plt.subplots(figsize=(14, 7)) + for i, label in enumerate(labels): + col_name = f"{plot_items[plot_item]['y_col']}_{label}" + print(f"Plotting {col_name}") + if col_name in df.columns: + values = df[col_name].values + color = vibrant_colors[i % len(vibrant_colors)] + offset = (i - len(labels) / 2 + 0.5) * width + ax.bar( + x + offset, + values, + width, + label=label, + color=color, + alpha=0.85, + edgecolor="black", + linewidth=0.5, + ) + else: + print(f"Column {col_name} not found in dataframe") + + ax.set_xlabel(plot_items[plot_item]["x_label"], fontsize=12, fontweight="bold") + ax.set_ylabel(plot_items[plot_item]["y_label"], fontsize=12, fontweight="bold") + ax.set_title(f"{plot_item} per Message Size", fontsize=14, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels(indices, rotation=45, ha="right", fontsize=9) + ax.legend(loc="upper left") + ax.grid(True, alpha=0.3, axis="y") + + plt.tight_layout() + output_file = output_path / f'{plot_item.replace(" ", "_")}_comparison.png' + plt.savefig(output_file, dpi=150, bbox_inches="tight") + plt.close() + print(f"Saved: {output_file}") + print("Completed plotting NCCL data per message size") + + +def plot_all_types_per_rank(df, labels, output_dir: Path): + """ + Plot data for every rank, where every unique type is a different file. + + Parameters: + ----------- + df : DataFrame + Merged gpu_time_per_rank_df with columns like 'type', 'rank0_label1', 'rank0_label2', etc. + labels : list + List of configuration labels (e.g., ['32cu_512threads', '37cu_384threads']) + output_dir : str + Directory to save plots + """ + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + unique_types = df["type"].unique() + + # Find rank columns (extract rank numbers from column names) + # Columns are like: rank0_32cu_512threads, rank1_32cu_512threads, etc. 
+ sample_label = labels[0] + rank_cols = [ + col for col in df.columns if col.endswith(f"_{sample_label}") and col != "type" + ] + ranks = [col.replace(f"_{sample_label}", "") for col in rank_cols] + + print(f"Found ranks: {ranks}") + print(f"Found types: {unique_types}") + + for metric_type in unique_types: + type_data = df[df["type"] == metric_type] + + if type_data.empty: + continue + + fig, ax = plt.subplots(figsize=(12, 6)) + + x = np.arange(len(ranks)) + # Vibrant color palette + vibrant_colors = [ + "#E63946", + "#2A9D8F", + "#E9C46A", + "#264653", + "#F4A261", + "#8338EC", + "#06D6A0", + "#FF006E", + ] + markers = ["o", "s", "^", "D", "v", "p", "h", "*"] + + for i, label in enumerate(labels): + values = [] + for rank in ranks: + col_name = f"{rank}_{label}" + if col_name in type_data.columns: + val = type_data[col_name].values[0] + values.append(val if pd.notna(val) else 0) + else: + values.append(0) + + color = vibrant_colors[i % len(vibrant_colors)] + marker = markers[i % len(markers)] + ax.plot( + x, + values, + label=label, + color=color, + marker=marker, + markersize=8, + linewidth=2, + alpha=0.85, + ) + + ax.set_xlabel("Rank", fontsize=12, fontweight="bold") + ax.set_ylabel("Time (ms)", fontsize=12, fontweight="bold") + ax.set_title(f"{metric_type} - Time per Rank", fontsize=14, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels(ranks) + ax.legend(loc="upper right") + ax.grid(True, alpha=0.3) + + plt.tight_layout() + + # Save with sanitized filename + safe_name = metric_type.replace("/", "_").replace(" ", "_").replace(":", "_") + output_file = output_path / f"{safe_name}_by_rank.png" + plt.savefig(output_file, dpi=150, bbox_inches="tight") + plt.close() + print(f"Saved: {output_file}") + + +def plot_gpu_time_summary(df, labels, output_dir: Path): + + types = df["type"].values + values = [] + + for label in labels: + values.append(df[f"time ms_{label}"].values) + + fig, ax = plt.subplots(figsize=(10, 5)) + + x = np.arange(len(types)) + width = 0.15 + for i, value in enumerate(values): + offset = (i - len(labels) / 2 + 0.5) * width + bars = ax.bar(x + offset, value, width, label=labels[i]) + + ax.set_xlabel("Type") + ax.set_ylabel("Time (ms)") + ax.set_title("GPU Time Summary by Rank") + ax.set_xticks(x) + ax.set_xticklabels(types, rotation=45, ha="right") + ax.legend() + ax.grid(True, alpha=0.3, axis="y") + + plt.tight_layout() + plt.savefig(output_dir / "abs_time_comparison.png") + plt.close() + + +""" +def plot_improvement_chart(df, output_path): + fig, ax = plt.subplots(figsize=(10, 6)) + + # Color bars based on positive/negative values + colors = ['#2ecc71' if val > 0 else '#e74c3c' for val in df['Improvement (%)']] + + bars = ax.barh(df['Metric'], df['Improvement (%)'], color=colors) + ax.yaxis.grid(True, linestyle='--', alpha=0.7, color='gray') + ax.set_axisbelow(True) + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['bottom'].set_visible(False) + ax.spines['left'].set_visible(False) + + # Customize the chart + ax.set_ylabel('Metric', fontsize=12) + ax.set_xlabel('Change (%)', fontsize=12) + ax.set_title( + 'GPU Metrics Percentage Change (Test vs Baseline)\n(Positive = Test is better)', + fontsize=14, fontweight='bold' + ) + + plt.tight_layout() + plt.savefig(output_path / 'improvement_chart.png', dpi=150) + plt.close() + +""" + + +def plot_gpu_time_percentage_change(df, labels, output_dir: Path): + """ + Create separate horizontal bar charts for each label comparing against baseline (labels[0]). 
+ """ + types = df["type"].values + base_label = labels[0] + + # Vibrant color palette + vibrant_colors = [ + "#E63946", + "#2A9D8F", + "#E9C46A", + "#264653", + "#F4A261", + "#8338EC", + "#06D6A0", + "#FF006E", + ] + + fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, max(8, len(types) * 0.5))) + for i, label in enumerate(labels[1:]): + ax = axes[i] + col_name = f"percentage_change_{label}" + if col_name not in df.columns: + print(f"Column {col_name} not found, skipping") + continue + + values = df[col_name].values + + # Create 1x2 subplot figure + + # Color bars based on positive/negative values (green = improvement, red = regression) + colors = ["#2ecc71" if val < 0 else "#e74c3c" for val in values] + + # Horizontal bar chart + y = np.arange(len(types)) + bars = ax.barh( + y, values, color=colors, alpha=0.85, edgecolor="black", linewidth=0.5 + ) + + # Add vertical line at 0 + ax.axvline(x=0, color="black", linestyle="-", linewidth=1) + + ax.set_yticks(y) + ax.set_yticklabels(types, fontsize=10) + ax.set_xlabel("Percentage Change (%)", fontsize=12, fontweight="bold") + ax.set_ylabel("Type", fontsize=12, fontweight="bold") + ax.set_title( + f"GPU Time Percentage Change: {label} vs {base_label}\n(Negative = Improvement)", + fontsize=14, + fontweight="bold", + ) + ax.grid(True, alpha=0.3, axis="x") + + plt.tight_layout() + + output_file = output_dir / f"improvement_chart.png" + plt.savefig(output_file, dpi=150, bbox_inches="tight") + plt.close() + print(f"Saved: {output_file}") + + +def calculate_gpu_timepercentage_change(df, labels): + base_label = labels[0] + for label in labels[1:]: + df[f"percentage_change_{label}"] = ( + (df[f"time ms_{label}"] - df[f"time ms_{base_label}"]) + / df[f"time ms_{base_label}"] + * 100 + ) + return df + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--inputs", + type=str, + nargs="+", + required=True, + help="List of directories containing gpu_timeline_summary_mean.xlsx", + ) + + parser.add_argument( + "--output", type=Path, default="./output.xlsx", help="Output xls file name" + ) + args = parser.parse_args() + + labels = [] + summary_dfs = [] + gpu_time_per_rank_dfs = [] + nccl_dfs = [] + + for directory in args.inputs: + dir_path = Path(directory) + label = dir_path.stem + + if not dir_path.exists(): + print(f"Directory not found: {dir_path}") + continue + + input_excel_file = ( + dir_path / "tracelens_analysis" / "gpu_timeline_summary_mean.xlsx" + ) + nccl_excel_file = ( + dir_path + / "tracelens_analysis" + / "collective_reports" + / "collective_all_ranks.xlsx" + ) + if not input_excel_file.exists(): + print(f"Excel file not found: {input_excel_file}") + continue + + labels.append(label) + + # Read and rename columns with label suffix + summary = pd.read_excel(input_excel_file, sheet_name="Summary") + gpu_time = pd.read_excel(input_excel_file, sheet_name="Per_Rank_Time_ms") + + # Rename non-key columns with label suffix + summary = summary.rename( + columns={col: f"{col}_{label}" for col in summary.columns if col != "type"} + ) + gpu_time = gpu_time.rename( + columns={col: f"{col}_{label}" for col in gpu_time.columns if col != "type"} + ) + + summary_dfs.append(summary) + gpu_time_per_rank_dfs.append(gpu_time) + print(f"Loaded: {label}") + + if nccl_excel_file.exists(): + nccl_df = pd.read_excel( + nccl_excel_file, sheet_name="nccl_summary_implicit_sync" + ) + + # Create index column by appending "Collective name" and "In msg nelems" + nccl_df["index"] = ( + nccl_df["Collective name"].astype(str) + + "_" + + 
nccl_df["In msg nelems"].astype(str) + ) + + # Rename non-key columns with label suffix (exclude 'index' as it's the merge key) + nccl_df = nccl_df.rename( + columns={ + col: f"{col}_{label}" for col in nccl_df.columns if col != "index" + } + ) + nccl_dfs.append(nccl_df) + print(f"Loaded: {label} NCCL") + else: + print(f"NCCL file not found: {nccl_excel_file}") + + # Merge all DataFrames on 'type' + summary_df = summary_dfs[0] + gpu_time_per_rank_df = gpu_time_per_rank_dfs[0] + nccl_df = nccl_dfs[0] + + for i in range(1, len(summary_dfs)): + summary_df = pd.merge(summary_df, summary_dfs[i], on="type", how="outer") + gpu_time_per_rank_df = pd.merge( + gpu_time_per_rank_df, gpu_time_per_rank_dfs[i], on="type", how="outer" + ) + nccl_df = pd.merge(nccl_df, nccl_dfs[i], on="index", how="outer") + + summary_df = calculate_gpu_timepercentage_change(summary_df, labels) + + with pd.ExcelWriter(args.output, engine="openpyxl") as writer: + summary_df.to_excel(writer, sheet_name="Summary", index=False) + gpu_time_per_rank_df.to_excel( + writer, sheet_name="Per_Rank_Time_ms", index=False + ) + nccl_df.to_excel(writer, sheet_name="NCCL_Summary", index=False) + + output_dir = Path(args.output).parent / "plots" + output_dir.mkdir(parents=True, exist_ok=True) + plot_gpu_time_percentage_change(summary_df, labels, output_dir) + plot_gpu_time_summary(summary_df, labels, output_dir) + plot_all_types_per_rank(gpu_time_per_rank_df, labels, output_dir) + plot_nccl_data_per_msg(nccl_df, labels, output_dir) + + html_script_path = Path(__file__).parent / "generate_merged_html.py" + cmd = [ + "python3", + str(html_script_path), + "--plot-files-directory", + str(output_dir), + "--output-html", + str(args.output.parent / "final_analysis_report.html"), + ] + if run_command(cmd, "Creating final HTML"): + print( + f"Final HTML file created at: {args.output.parent / 'final_analysis_report.html'}" + ) + else: + print("Failed to create final HTML file") + + +if __name__ == "__main__": + main() diff --git a/scripts/tracelens_single_config/process_gpu_timeline.py b/scripts/tracelens_single_config/process_gpu_timeline.py new file mode 100755 index 0000000..7fd00b7 --- /dev/null +++ b/scripts/tracelens_single_config/process_gpu_timeline.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import argparse +from pathlib import Path + + +def geometric_mean(values): + values = np.array(values) + values = np.where(values == 0, 1e-10, values) + return np.exp(np.mean(np.log(values))) + + +def process_gpu_timeline(reports_dir, use_geo_mean=False): + reports_path = Path(reports_dir) + + if not reports_path.exists(): + print(f"Error: Directory not found: {reports_dir}") + return 1 + + print(f"Processing GPU timeline from: {reports_dir}") + print(f"Aggregation: {'Geometric Mean' if use_geo_mean else 'Arithmetic Mean'}") + + perf_files = sorted(reports_path.glob('perf_rank*.xlsx')) + + if not perf_files: + print("Error: No perf_rank*.xlsx files found") + return 1 + + print(f"Found {len(perf_files)} rank files") + + rank_data = [] + for file_path in perf_files: + rank_num = int(file_path.stem.replace('perf_rank', '')) + try: + df = pd.read_excel(file_path, sheet_name='gpu_timeline') + df['rank'] = rank_num + rank_data.append(df) + print(f" Rank {rank_num}: OK") + except Exception as e: + print(f" Rank {rank_num}: Error - {e}") + + if not rank_data: + print("Error: No valid data loaded") + return 1 + + combined = pd.concat(rank_data, ignore_index=True) + + agg_func = geometric_mean if use_geo_mean else 
'mean' + aggregated = combined.groupby('type').agg({ + 'time ms': agg_func, + 'percent': agg_func + }).reset_index() + + aggregated['num_ranks'] = len(perf_files) + + method_suffix = 'geomean' if use_geo_mean else 'mean' + output_path = reports_path.parent / f'gpu_timeline_summary_{method_suffix}.xlsx' + + with pd.ExcelWriter(output_path, engine='openpyxl') as writer: + aggregated.to_excel(writer, sheet_name='Summary', index=False) + + combined_sorted = combined.sort_values(['rank', 'type']) + combined_sorted.to_excel(writer, sheet_name='All_Ranks_Combined', index=False) + + per_rank = combined.pivot_table( + values='time ms', + index='type', + columns='rank', + aggfunc='first' + ) + per_rank.to_excel(writer, sheet_name='Per_Rank_Time_ms') + + per_rank_pct = combined.pivot_table( + values='percent', + index='type', + columns='rank', + aggfunc='first' + ) + per_rank_pct.to_excel(writer, sheet_name='Per_Rank_Percent') + + print(f"\nSaved: {output_path}") + print("\nSummary:") + print(aggregated.to_string(index=False)) + + return 0 + + +def main(): + parser = argparse.ArgumentParser(description='Aggregate GPU timeline across ranks') + parser.add_argument('--reports-dir', required=True, help='Path to individual_reports directory') + parser.add_argument('--geo-mean', action='store_true', help='Use geometric mean') + + args = parser.parse_args() + + return process_gpu_timeline(args.reports_dir, args.geo_mean) + + +if __name__ == '__main__': + exit(main()) diff --git a/scripts/tracelens_single_config/run_full_analysis.py b/scripts/tracelens_single_config/run_full_analysis.py new file mode 100755 index 0000000..ec38581 --- /dev/null +++ b/scripts/tracelens_single_config/run_full_analysis.py @@ -0,0 +1,392 @@ +#!/usr/bin/env python3 +""" +Master script for complete TraceLens analysis pipeline. +Runs analysis on baseline and test traces, then performs all comparisons. 
+""" +import argparse +import subprocess +import os +import sys +from pathlib import Path + + +def run_command(cmd, description): + """Execute a command and handle errors.""" + print(f"\n{'='*80}") + print(f"{description}") + print(f"{'='*80}") + print(f"Command: {' '.join(cmd)}") + + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + print(f"Error: {description} failed!") + print(f"Stderr: {result.stderr}") + return False + + print(result.stdout) + return True + + +def run_tracelens_analysis(trace_dir, output_name, individual_only=False, collective_only=False): + """Run TraceLens analysis on a single trace directory.""" + print(f"\nAnalyzing: {trace_dir}") + + # Build command + script_path = Path(__file__).parent / "run_tracelens_single_config.sh" + cmd = ["bash", str(script_path), trace_dir] + + if individual_only: + cmd.append("--individual-only") + elif collective_only: + cmd.append("--collective-only") + + return run_command(cmd, f"TraceLens analysis for {output_name}") + + +def process_gpu_timeline(reports_dir): + """Process GPU timeline from individual reports.""" + script_path = Path(__file__).parent / "process_gpu_timeline.py" + cmd = ["python3", str(script_path), "--reports-dir", reports_dir] + + return run_command(cmd, "Processing GPU timeline") + + +def combine_reports(baseline_file, test_file, output_file): + """Combine baseline and test reports.""" + script_path = Path(__file__).parent / "combine_reports.py" + cmd = ["python3", str(script_path), + "--baseline", baseline_file, + "--test", test_file, + "--output", output_file] + + return run_command(cmd, f"Combining reports to {output_file}") + + +def add_comparison_sheets(input_file, output_file, baseline_label=None, test_label=None): + """Add comparison sheets for GPU timeline.""" + script_path = Path(__file__).parent / "add_comparison_sheets.py" + cmd = ["python3", str(script_path), + "--input", input_file, + "--output", output_file] + if baseline_label: + cmd.extend(["--baseline-label", baseline_label]) + if test_label: + cmd.extend(["--test-label", test_label]) + + return run_command(cmd, "Adding GPU timeline comparison sheets") + + +def add_collective_comparison(input_file, output_file, baseline_label=None, test_label=None): + """Add comparison sheets for collective operations.""" + script_path = Path(__file__).parent / "add_collective_comparison.py" + cmd = ["python3", str(script_path), + "--input", input_file, + "--output", output_file] + if baseline_label: + cmd.extend(["--baseline-label", baseline_label]) + if test_label: + cmd.extend(["--test-label", test_label]) + + return run_command(cmd, "Adding collective comparison sheets") + + +def create_final_report(gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file, baseline_label=None, test_label=None): + """Create comprehensive final report with all data.""" + script_path = Path(__file__).parent / "create_final_report.py" + cmd = ["python3", str(script_path), + "--gpu-combined", gpu_combined, + "--gpu-comparison", gpu_comparison, + "--coll-combined", coll_combined, + "--coll-comparison", coll_comparison, + "--output", output_file] + + if baseline_label: + cmd.extend(["--baseline-label", baseline_label]) + if test_label: + cmd.extend(["--test-label", test_label]) + + return run_command(cmd, "Creating comprehensive final report") + + +def main(): + parser = argparse.ArgumentParser( + description='Complete TraceLens analysis pipeline with comparisons', + formatter_class=argparse.RawDescriptionHelpFormatter, + 
epilog=""" +Examples: + # Full analysis with everything including final report + python run_full_analysis.py \\ + --baseline /path/to/baseline/traces \\ + --test /path/to/test/traces \\ + --output /path/to/output \\ + --all + + # Only GPU timeline comparison + python run_full_analysis.py \\ + --baseline /path/to/baseline \\ + --test /path/to/test \\ + --output /path/to/output \\ + --gpu-timeline + + # Create final report (skip TraceLens if already done) + python run_full_analysis.py \\ + --baseline /path/to/baseline \\ + --test /path/to/test \\ + --output /path/to/output \\ + --gpu-timeline --collective --final-report \\ + --skip-tracelens + + """ + ) + + # Required arguments + parser.add_argument('--baseline', required=True, + help='Path to baseline trace directory') + parser.add_argument('--test', required=True, + help='Path to test trace directory') + parser.add_argument('--output', required=True, + help='Output directory for comparison results') + + # Analysis options + parser.add_argument('--skip-tracelens', action='store_true', + help='Skip TraceLens report generation (if already done)') + parser.add_argument('--individual-only', action='store_true', + help='Generate only individual reports') + parser.add_argument('--collective-only', action='store_true', + help='Generate only collective reports') + + # Comparison options + parser.add_argument('--gpu-timeline', action='store_true', + help='Perform GPU timeline comparison') + parser.add_argument('--collective', action='store_true', + help='Perform collective/NCCL comparison') + parser.add_argument('--final-report', action='store_true', + help='Create comprehensive final report with tables and hidden raw data') + parser.add_argument('--generate-plots', action='store_true', + help='Generate visualization plots and HTML report from final report') + parser.add_argument('--all', action='store_true', + help='Perform all analyses and comparisons including final report, plots, and HTML report') + + args = parser.parse_args() + + # Handle --all flag + if args.all: + args.gpu_timeline = True + args.collective = True + args.final_report = True + args.generate_plots = True + + # Validate inputs + baseline_path = Path(args.baseline) + test_path = Path(args.test) + output_path = Path(args.output) + + if not baseline_path.exists(): + print(f"Error: Baseline path not found: {args.baseline}") + return 1 + + if not test_path.exists(): + print(f"Error: Test path not found: {args.test}") + return 1 + + # Create output directory + output_path.mkdir(parents=True, exist_ok=True) + + print("\n" + "="*80) + print("TRACELENS FULL ANALYSIS PIPELINE") + print("="*80) + print(f"Baseline: {args.baseline}") + print(f"Test: {args.test}") + print(f"Output: {args.output}") + print(f"Options:") + print(f" Skip TraceLens: {args.skip_tracelens}") + print(f" GPU timeline: {args.gpu_timeline}") + print(f" Collective: {args.collective}") + print(f" Final report: {args.final_report}") + + # Step 1: Run TraceLens analysis on both directories + if not args.skip_tracelens: + print("\n" + "="*80) + print("STEP 1: Running TraceLens Analysis") + print("="*80) + + if not run_tracelens_analysis(args.baseline, "baseline", + args.individual_only, args.collective_only): + return 1 + + if not run_tracelens_analysis(args.test, "test", + args.individual_only, args.collective_only): + return 1 + else: + print("\nSkipping TraceLens report generation (--skip-tracelens flag)") + + # Determine analysis directories + baseline_analysis = baseline_path / "tracelens_analysis" + test_analysis = 
test_path / "tracelens_analysis" + + if not baseline_analysis.exists(): + print(f"Error: Baseline analysis not found: {baseline_analysis}") + print("Run without --skip-tracelens flag first") + return 1 + + if not test_analysis.exists(): + print(f"Error: Test analysis not found: {test_analysis}") + print("Run without --skip-tracelens flag first") + return 1 + + # Extract config labels from paths + baseline_label = baseline_path.name # e.g., "56cu_256threads" + test_label = test_path.name # e.g., "37cu_384threads" + + # Step 2: GPU Timeline Comparison + if args.gpu_timeline: + print("\n" + "="*80) + print("STEP 2: GPU Timeline Comparison") + print(f" Baseline: {baseline_label}") + print(f" Test: {test_label}") + print("="*80) + + # Process GPU timelines + baseline_reports = baseline_analysis / "individual_reports" + test_reports = test_analysis / "individual_reports" + + if not baseline_reports.exists() or not test_reports.exists(): + print("Error: Individual reports not found. Run without --individual-only flag") + return 1 + + print(f"\nProcessing baseline GPU timeline ({baseline_label})...") + if not process_gpu_timeline(str(baseline_reports)): + return 1 + + print(f"\nProcessing test GPU timeline ({test_label})...") + if not process_gpu_timeline(str(test_reports)): + return 1 + + # Combine GPU timeline summaries + baseline_gpu = baseline_analysis / "gpu_timeline_summary_mean.xlsx" + test_gpu = test_analysis / "gpu_timeline_summary_mean.xlsx" + combined_gpu = output_path / "gpu_timeline_combined.xlsx" + + if not combine_reports(str(baseline_gpu), str(test_gpu), str(combined_gpu)): + return 1 + + # Add comparison sheets + gpu_comparison = output_path / "gpu_timeline_comparison.xlsx" + if not add_comparison_sheets(str(combined_gpu), str(gpu_comparison), baseline_label, test_label): + return 1 + + print(f"\nGPU timeline comparison saved to: {gpu_comparison}") + + # Step 3: Collective Comparison + if args.collective: + print("\n" + "="*80) + print("STEP 3: Collective/NCCL Comparison") + print(f" Baseline: {baseline_label}") + print(f" Test: {test_label}") + print("="*80) + + baseline_collective = baseline_analysis / "collective_reports" / "collective_all_ranks.xlsx" + test_collective = test_analysis / "collective_reports" / "collective_all_ranks.xlsx" + + if not baseline_collective.exists() or not test_collective.exists(): + print("Error: Collective reports not found. 
Run without --collective-only flag") + return 1 + + # Combine collective reports + combined_collective = output_path / "collective_combined.xlsx" + if not combine_reports(str(baseline_collective), str(test_collective), + str(combined_collective)): + return 1 + + # Add collective comparison + collective_comparison = output_path / "collective_comparison.xlsx" + if not add_collective_comparison(str(combined_collective), + str(collective_comparison), baseline_label, test_label): + return 1 + + print(f"\nCollective comparison saved to: {collective_comparison}") + + # Step 4: Create final comprehensive report + if args.final_report and args.gpu_timeline and args.collective: + print("\n" + "="*80) + print("STEP 4: Creating Final Comprehensive Report") + print("="*80) + + gpu_combined = output_path / "gpu_timeline_combined.xlsx" + gpu_comparison = output_path / "gpu_timeline_comparison.xlsx" + collective_combined = output_path / "collective_combined.xlsx" + collective_comparison = output_path / "collective_comparison.xlsx" + final_report = output_path / "final_analysis_report.xlsx" + + if not create_final_report(str(gpu_combined), str(gpu_comparison), + str(collective_combined), str(collective_comparison), + str(final_report), baseline_label, test_label): + return 1 + + print(f"\nFinal comprehensive report saved to: {final_report}") + print(" - Summary Dashboard as first sheet") + print(" - All comparison sheets visible") + print(" - Raw data sheets hidden (can be unhidden in Excel)") + print(" - All data formatted as Excel tables with filters") + print(" - Color coding applied (green=better, red=worse)") + + # Step 5: Generate visualization plots + if args.generate_plots and args.final_report: + print("\n" + "="*80) + print("STEP 5: Generating Visualization Plots") + print("="*80) + + final_report = output_path / "final_analysis_report.xlsx" + plots_dir = output_path / "plots" + + if final_report.exists(): + script_path = Path(__file__).parent / "generate_enhanced_plots.py" + cmd = ["python3", str(script_path), + "--input", str(final_report), + "--output", str(plots_dir)] + + # The script generates HTML report by default + if run_command(cmd, "Generating visualization plots and HTML report"): + print(f"\nOutput saved to: {plots_dir}/") + print("\n Generated plots:") + print(" - Percentage Change Overview") + print(" - Absolute Time Comparison") + print(" - Performance Heatmap by Rank") + print(" - Total Execution Time by Rank") + print(" - Time Breakdown by Rank") + print(" - Percentage Breakdown by Rank") + print(" - NCCL/Collective Metrics") + print("\n HTML Report: plots/performance_analysis_report.html") + print(" - Open in browser to view complete report") + print(" - Print to PDF: Ctrl+P (or Cmd+P on Mac)") + else: + print(" Final report not found, skipping plot generation") + + # Summary + print("\n" + "="*80) + print("ANALYSIS COMPLETE!") + print("="*80) + print(f"\nResults saved to: {output_path}") + + files = list(output_path.glob("*.xlsx")) + if files: + print("\nGenerated Excel files:") + for f in sorted(files): + print(f" - {f.name}") + + if args.generate_plots: + plots_dir = output_path / "plots" + if plots_dir.exists(): + plot_files = list(plots_dir.glob("*.png")) + if plot_files: + print("\nGenerated plots:") + for f in sorted(plot_files): + print(f" - plots/{f.name}") + + print("\nAnalysis pipeline completed successfully!") + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh 
b/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh new file mode 100755 index 0000000..b30ac09 --- /dev/null +++ b/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh @@ -0,0 +1,302 @@ +#!/bin/bash + +# Compare specific RCCL Warp Speed configurations +# Usage: ./run_rccl_warp_speed_comparison.sh [OPTIONS] +# -c CONFIG_FILE Config file (default: config/distributed.yaml) +# -p PAIRS CU,threads pairs (e.g., "56,256 37,384 32,512") +# -h Show help +# +# Examples: +# # Use default 3 configurations +# ./run_rccl_warp_speed_comparison.sh +# +# # Custom configurations +# ./run_rccl_warp_speed_comparison.sh -p "56,256 37,384 32,512" +# +# # Different config file with custom pairs +# ./run_rccl_warp_speed_comparison.sh -c myconfig.yaml -p "40,256 30,512" + +CONFIG_FILE="config/distributed.yaml" +CUSTOM_PAIRS="" + +# Parse command line arguments +while getopts "c:p:h" opt; do + case $opt in + c) + CONFIG_FILE="$OPTARG" + ;; + p) + CUSTOM_PAIRS="$OPTARG" + ;; + h) + echo "Usage: $0 [OPTIONS]" + echo " -c CONFIG_FILE Config file (default: config/single_node/gemm_overlap_comm.yaml)" + echo " -p PAIRS CU,threads pairs (e.g., \"56,256 37,384 32,512\")" + echo " -h Show help" + echo "" + echo "Examples:" + echo " # Use default 3 configurations" + echo " $0" + echo "" + echo " # Custom configurations" + echo " $0 -p \"56,256 37,384 32,512\"" + echo "" + echo " # Different config file with custom pairs" + echo " $0 -c myconfig.yaml -p \"40,256 30,512\"" + exit 0 + ;; + \?) + echo "Invalid option: -$OPTARG" + exit 1 + ;; + esac +done +BASE_CMD="torchrun --nproc_per_node 8 train.py --config ${CONFIG_FILE}" +BASE_OVERRIDES="--override training.max_steps=100 --override profiling.tensorboard=false" + +# Base output directory +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +BASE_OUTPUT_DIR="experiments/rccl_warp_speed_${TIMESTAMP}" + +# Create base output directory +mkdir -p "${BASE_OUTPUT_DIR}" + +# Log file +SWEEP_LOG="${BASE_OUTPUT_DIR}/rccl_warp_speed_comparison_${TIMESTAMP}.log" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# Function to log with timestamp +log() { + local message="$1" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[${timestamp}] ${message}" | tee -a "${SWEEP_LOG}" +} + +# Cleanup function for Ctrl+C +cleanup() { + echo "" + echo -e "${RED}=== Caught interrupt signal (Ctrl+C) ===${NC}" | tee -a "${SWEEP_LOG}" + log "Cleaning up all training processes..." + sudo pkill -9 -f "train.py" 2>/dev/null || true + sudo pkill -9 -f "torchrun" 2>/dev/null || true + log "Cleanup complete. Exiting." 
+ exit 130 +} + +trap cleanup SIGINT SIGTERM + +echo -e "${GREEN}=== RCCL Warp Speed Configuration Comparison ===${NC}" | tee "${SWEEP_LOG}" +log "Config file: ${CONFIG_FILE}" +log "Results directory: ${BASE_OUTPUT_DIR}" +echo "" + +# Check RCCL version and configuration +echo -e "${BLUE}=== RCCL Version Check ===${NC}" | tee -a "${SWEEP_LOG}" + +# Check if custom RCCL is available +if [ -d "/opt/rccl/build/release" ]; then + echo -e "${GREEN}[OK] Custom RCCL found at /opt/rccl/build/release${NC}" | tee -a "${SWEEP_LOG}" + + # Check branch and commit + if [ -d "/opt/rccl/.git" ]; then + cd /opt/rccl + RCCL_BRANCH=$(git branch --show-current 2>/dev/null) + RCCL_COMMIT=$(git log --oneline -1 2>/dev/null) + cd - > /dev/null + + echo " Branch: ${RCCL_BRANCH}" | tee -a "${SWEEP_LOG}" + echo " Commit: ${RCCL_COMMIT}" | tee -a "${SWEEP_LOG}" + + # Verify it's warp_speed_v1 + if [[ "${RCCL_BRANCH}" == "warp_speed_v1" ]]; then + echo -e " ${GREEN}[OK] Using warp_speed_v1 branch${NC}" | tee -a "${SWEEP_LOG}" + else + echo -e " ${YELLOW}[WARNING] Not on warp_speed_v1 branch (current: ${RCCL_BRANCH})${NC}" | tee -a "${SWEEP_LOG}" + fi + fi + + # Check library size to verify it's built + RCCL_LIB_SIZE=$(ls -lh /opt/rccl/build/release/librccl.so.1.0 2>/dev/null | awk '{print $5}') + echo " Library size: ${RCCL_LIB_SIZE}" | tee -a "${SWEEP_LOG}" +else + echo -e "${YELLOW}[WARNING] Custom RCCL not found, will use PyTorch bundled version${NC}" | tee -a "${SWEEP_LOG}" + echo " PyTorch's bundled RCCL may not have warp_speed features!" | tee -a "${SWEEP_LOG}" +fi + +# Test if RCCL responds to warp_speed environment variables +echo "" | tee -a "${SWEEP_LOG}" +echo "Testing warp_speed environment variable response..." | tee -a "${SWEEP_LOG}" +export RCCL_WARP_SPEED_ENABLE=1 +export RCCL_WARP_SPEED_CU_COUNT=56 +export NCCL_DEBUG=VERSION + +python -c " +import torch +print('PyTorch version:', torch.__version__) +if torch.cuda.is_available(): + print('ROCm/CUDA available:', True) + print('Device count:', torch.cuda.device_count()) +" 2>&1 | tee -a "${SWEEP_LOG}" + +# Clean up test variables +unset RCCL_WARP_SPEED_CU_COUNT +unset NCCL_DEBUG + +echo -e "${BLUE}===========================${NC}" | tee -a "${SWEEP_LOG}" +echo "" + +# Define configurations to test +# Format: "NAME|CU_COUNT|THREADS_PER_BLOCK" +if [ -n "$CUSTOM_PAIRS" ]; then + # Parse custom pairs + CONFIGS=() + for pair in $CUSTOM_PAIRS; do + IFS=',' read -r cu threads <<< "$pair" + CONFIGS+=("${cu}cu_${threads}threads|${cu}|${threads}") + done + log "Using custom configurations: ${CUSTOM_PAIRS}" +else + # Use default configurations + CONFIGS=( + "56cu_256threads|56|256" + "37cu_384threads|37|384" + "32cu_512threads|32|512" + ) + log "Using default RCCL Warp Speed configurations" +fi + +# Track results +declare -A RUN_STATUS +declare -A RUN_TIMES + +# Run each configuration +for config in "${CONFIGS[@]}"; do + IFS='|' read -r NAME CU_COUNT THREADS <<< "$config" + + OUTPUT_DIR="${BASE_OUTPUT_DIR}/${NAME}" + + echo -e "${YELLOW}========================================${NC}" | tee -a "${SWEEP_LOG}" + log "Running configuration: ${NAME}" + log " RCCL_WARP_SPEED_CU_COUNT=${CU_COUNT}" + log " RCCL_THREADS_PER_BLOCK=${THREADS}" + log " Output directory: ${OUTPUT_DIR}" + echo -e "${YELLOW}========================================${NC}" | tee -a "${SWEEP_LOG}" + + # Create output directory + mkdir -p "${OUTPUT_DIR}" + + # Record start time + START_TIME=$(date +%s) + + # Export environment variables so child processes inherit them + export RCCL_WARP_SPEED_ENABLE=1 
+ export RCCL_UNROLL_FACTOR=1 + export RCCL_WARP_SPEED_CU_COUNT=${CU_COUNT} + export RCCL_THREADS_PER_BLOCK=${THREADS} + export HSA_ENABLE_SDMA=0 + export PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1 + + # Use custom RCCL if available + if [ -d "/opt/rccl/build/release" ]; then + export LD_LIBRARY_PATH=/opt/rccl/build/release:${LD_LIBRARY_PATH:-} + log " Using custom RCCL from /opt/rccl/build/release" + fi + + # Run the command + ${BASE_CMD} ${BASE_OVERRIDES} \ + --override training.output_dir=${OUTPUT_DIR} \ + 2>&1 | tee "${OUTPUT_DIR}/run_output.log" + + EXIT_CODE=${PIPESTATUS[0]} + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + + # Unset environment variables to avoid affecting next run + unset RCCL_WARP_SPEED_CU_COUNT + unset RCCL_THREADS_PER_BLOCK + + RUN_TIMES[${NAME}]=${DURATION} + + if [ $EXIT_CODE -eq 0 ]; then + log "[OK] Completed ${NAME} (duration: ${DURATION}s)" + RUN_STATUS[${NAME}]="SUCCESS" + else + log "[ERROR] Failed ${NAME} (exit code: $EXIT_CODE, duration: ${DURATION}s)" + RUN_STATUS[${NAME}]="FAILED" + fi + + # Fix permissions if running as root in container + if [ "$EUID" -eq 0 ]; then + chmod -R 755 "${OUTPUT_DIR}" 2>/dev/null || true + fi + + echo "" + log "Waiting 5 seconds before next run..." + sleep 5 +done + +# Generate summary report +echo -e "${BLUE}========================================${NC}" | tee -a "${SWEEP_LOG}" +echo -e "${BLUE}SUMMARY REPORT${NC}" | tee -a "${SWEEP_LOG}" +echo -e "${BLUE}========================================${NC}" | tee -a "${SWEEP_LOG}" + +SUMMARY_FILE="${BASE_OUTPUT_DIR}/rccl_warp_speed_summary_${TIMESTAMP}.txt" +{ + echo "RCCL Warp Speed Configuration Comparison" + echo "Generated: $(date)" + echo "" + printf "%-20s %-10s %-15s %-10s\n" "CONFIGURATION" "CU_COUNT" "THREADS" "STATUS" + echo "----------------------------------------------------------------" + + for config in "${CONFIGS[@]}"; do + IFS='|' read -r NAME CU_COUNT THREADS <<< "$config" + STATUS="${RUN_STATUS[${NAME}]:-UNKNOWN}" + DURATION="${RUN_TIMES[${NAME}]:-N/A}" + printf "%-20s %-10s %-15s %-10s (duration: %ss)\n" "${NAME}" "${CU_COUNT}" "${THREADS}" "${STATUS}" "${DURATION}" + done + + echo "" + echo "Output directories:" + for config in "${CONFIGS[@]}"; do + IFS='|' read -r NAME CU_COUNT THREADS <<< "$config" + echo " ${NAME}: ${BASE_OUTPUT_DIR}/${NAME}/" + done + + echo "" + echo "Trace files for each configuration:" + for config in "${CONFIGS[@]}"; do + IFS='|' read -r NAME CU_COUNT THREADS <<< "$config" + echo " ${NAME}: ${BASE_OUTPUT_DIR}/${NAME}/torch_profiler/" + done +} | tee "${SUMMARY_FILE}" + +log "Summary saved to: ${SUMMARY_FILE}" + +# Fix permissions for the entire output directory if running as root +if [ "$EUID" -eq 0 ]; then + echo "Fixing permissions for output directory..." | tee -a "${SWEEP_LOG}" + chmod -R 755 "${BASE_OUTPUT_DIR}" 2>/dev/null || true +fi + +echo "" +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Next Steps: Run TraceLens Analysis${NC}" +echo -e "${GREEN}========================================${NC}" +echo "" +echo "To analyze and compare these configurations:" +echo "" +echo "./scripts/tracelens_single_config/run_tracelens_analysis.sh ${BASE_OUTPUT_DIR}" +echo "" +echo "This will generate:" +echo " - Individual reports for each rank (all 3 configs)" +echo " - Collective reports (all 3 configs)" +echo " - Comparison reports across the 3 configurations" +echo "" + +log "All runs completed! Run TraceLens analysis next." 
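For reference, each configuration in the sweep above reduces to exporting a handful of RCCL environment variables and launching `torchrun` once. Below is a minimal Python sketch of a single manual run, assuming the same variable names used by the script; the config path and output directory are illustrative, not fixed by the script.

```python
import os
import subprocess

# Environment for one warp-speed configuration (56 CUs, 256 threads/block),
# mirroring the exports in run_rccl_warp_speed_comparison.sh.
env = os.environ.copy()
env.update({
    "RCCL_WARP_SPEED_ENABLE": "1",
    "RCCL_UNROLL_FACTOR": "1",
    "RCCL_WARP_SPEED_CU_COUNT": "56",
    "RCCL_THREADS_PER_BLOCK": "256",
    "HSA_ENABLE_SDMA": "0",
    "PYTORCH_ROCM_PROFILER_ENABLE_TRACING": "1",
    # Prefer the custom RCCL build when it exists.
    "LD_LIBRARY_PATH": "/opt/rccl/build/release:" + env.get("LD_LIBRARY_PATH", ""),
})

cmd = [
    "torchrun", "--nproc_per_node", "8", "train.py",
    "--config", "config/single_node/gemm_overlap_comm.yaml",   # example config
    "--override", "training.max_steps=100",
    "--override", "profiling.tensorboard=false",
    "--override", "training.output_dir=experiments/manual_56cu_256threads",  # illustrative
]
subprocess.run(cmd, env=env, check=True)
```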
diff --git a/scripts/tracelens_single_config/run_tracelens_single_config.sh b/scripts/tracelens_single_config/run_tracelens_single_config.sh new file mode 100755 index 0000000..96831ff --- /dev/null +++ b/scripts/tracelens_single_config/run_tracelens_single_config.sh @@ -0,0 +1,266 @@ +#!/bin/bash +# TraceLens Analysis for Single Configuration (No Sweep) +# Usage: ./run_tracelens_single_config.sh +# +# The script accepts either: +# - Path to parent directory containing torch_profiler/ +# - Path to torch_profiler/ directory directly +# +# Examples: +# ./run_tracelens_single_config.sh /path/to/traces +# ./run_tracelens_single_config.sh /path/to/traces/torch_profiler +# +# Note: Uses GEMM-patched TraceLens wrapper to recognize ROCm Tensile kernels + +set -e + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Use patched TraceLens wrapper for GEMM recognition +TRACELENS_WRAPPER="python $SCRIPT_DIR/../tracelens_with_gemm_patch.py" + +# Parse options +RUN_INDIVIDUAL=true +RUN_COLLECTIVE=true + +while [[ $# -gt 0 ]]; do + case $1 in + --individual-only) + RUN_COLLECTIVE=false + shift + ;; + --collective-only) + RUN_INDIVIDUAL=false + shift + ;; + *) + INPUT_DIR="$1" + shift + ;; + esac +done + +# Check if directory provided +if [ -z "$INPUT_DIR" ]; then + echo "Error: Please provide trace directory" + echo "" + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --individual-only Generate only individual reports" + echo " --collective-only Generate only collective report" + echo "" + echo "Examples:" + echo " $0 /path/to/traces" + echo " $0 /path/to/traces --individual-only" + echo " $0 /path/to/traces --collective-only" + echo "" + exit 1 +fi + +# Verify directory exists +if [ ! -d "$INPUT_DIR" ]; then + echo "Error: Directory not found: $INPUT_DIR" + exit 1 +fi + +# Auto-detect structure: is this torch_profiler/ or its parent? +TORCH_PROF_DIR="" +BASE_DIR="" + +# Check if INPUT_DIR contains rank directories (i.e., it IS torch_profiler/) +if find "$INPUT_DIR" -maxdepth 1 -type d -name "rank*" | grep -q .; then + TORCH_PROF_DIR="$INPUT_DIR" + BASE_DIR=$(dirname "$INPUT_DIR") + echo "Detected torch_profiler directory: $TORCH_PROF_DIR" +# Check if INPUT_DIR contains torch_profiler/ subdirectory +elif [ -d "$INPUT_DIR/torch_profiler" ]; then + TORCH_PROF_DIR="$INPUT_DIR/torch_profiler" + BASE_DIR="$INPUT_DIR" + echo "Found torch_profiler subdirectory: $TORCH_PROF_DIR" +else + echo "Error: Cannot find rank directories in expected structure" + echo "" + echo "Expected one of:" + echo " 1. Directory with rank0/, rank1/, ... subdirectories (torch_profiler/)" + echo " 2. Parent directory containing torch_profiler/rank0/, rank1/, ..." 
+ echo "" + echo "Provided: $INPUT_DIR" + exit 1 +fi + +echo "════════════════════════════════════════════════════════════════" +echo " TraceLens Analysis - Single Configuration" +echo "════════════════════════════════════════════════════════════════" +echo "" +echo "Input directory: $INPUT_DIR" +echo "Torch profiler traces: $TORCH_PROF_DIR" +echo "" + +# Create output directory in the base directory +OUTPUT_DIR="${BASE_DIR}/tracelens_analysis" +mkdir -p "$OUTPUT_DIR" +mkdir -p "$OUTPUT_DIR/individual_reports" +mkdir -p "$OUTPUT_DIR/collective_reports" + +# Detect number of ranks +NUM_RANKS=$(find "$TORCH_PROF_DIR" -maxdepth 1 -type d -name "rank*" | wc -l) + +if [ $NUM_RANKS -eq 0 ]; then + echo "Error: No rank directories found in $TORCH_PROF_DIR" + exit 1 +fi + +echo "Detected $NUM_RANKS ranks" + +# Show sample trace files +echo "" +echo "Sample trace files:" +for rank_dir in $(find "$TORCH_PROF_DIR" -maxdepth 1 -type d -name "rank*" | sort | head -3); do + rank_name=$(basename "$rank_dir") + trace_file=$(find "$rank_dir" -name "*.json" | head -1) + if [ -n "$trace_file" ]; then + echo " $rank_name: $(basename "$trace_file")" + fi +done +if [ "$RUN_INDIVIDUAL" = true ]; then + echo "" + echo "════════════════════════════════════════════════════════════════" + echo "Step 1: Generating Individual Performance Reports" + echo "════════════════════════════════════════════════════════════════" + echo "" + +# Process each rank +for rank_idx in $(seq 0 $((NUM_RANKS - 1))); do + # Try multiple directory naming patterns + RANK_DIR="" + if [ -d "$TORCH_PROF_DIR/rank${rank_idx}" ]; then + RANK_DIR="$TORCH_PROF_DIR/rank${rank_idx}" + elif [ -d "$TORCH_PROF_DIR/rank_${rank_idx}" ]; then + RANK_DIR="$TORCH_PROF_DIR/rank_${rank_idx}" + elif [ -d "$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)" ]; then + RANK_DIR="$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)" + fi + + if [ -z "$RANK_DIR" ] || [ ! -d "$RANK_DIR" ]; then + echo " Skip rank ${rank_idx} - directory not found" + continue + fi + + # Find trace file + TRACE=$(find "$RANK_DIR" -name "*.json" -type f | head -1) + + if [ -z "$TRACE" ]; then + echo "⚠️ Skip rank ${rank_idx} - no trace file found" + continue + fi + + OUTPUT="$OUTPUT_DIR/individual_reports/perf_rank${rank_idx}.xlsx" + + echo "Processing rank ${rank_idx}..." + echo " Trace: $(basename "$TRACE")" + + $TRACELENS_WRAPPER generate_perf_report \ + --profile_json_path "$TRACE" \ + --output_xlsx_path "$OUTPUT" \ + --include_unlinked_kernels \ + --short_kernel_study \ + --short_kernel_threshold_us 50 \ + --topk_ops 100 \ + --topk_roofline_ops 100 + + echo " Done: $OUTPUT" + echo "" +done + +fi + +if [ "$RUN_COLLECTIVE" = true ]; then + echo "" + echo "════════════════════════════════════════════════════════════════" + echo "Step 2: Generating Multi-Rank Collective Report" + echo "════════════════════════════════════════════════════════════════" + echo "" + +# Find a sample trace file to get the filename pattern +SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank0" -name "*.json" -type f | head -1) +if [ -z "$SAMPLE_TRACE" ]; then + # Try alternative rank naming + SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank_0" -name "*.json" -type f | head -1) +fi + +if [ -z "$SAMPLE_TRACE" ]; then + # Try rank_00 + SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank_00" -name "*.json" -type f | head -1) +fi + +if [ -n "$SAMPLE_TRACE" ]; then + OUTPUT="$OUTPUT_DIR/collective_reports/collective_all_ranks.xlsx" + + echo "Generating collective report for all $NUM_RANKS ranks..." 
+ + # Create symlinks with consistent names for collective report + for rank_idx in $(seq 0 $((NUM_RANKS - 1))); do + RANK_DIR="$TORCH_PROF_DIR/rank${rank_idx}" + if [ -d "$RANK_DIR" ]; then + TRACE=$(find "$RANK_DIR" -name "*.json" -type f | head -1) + if [ -n "$TRACE" ]; then + ln -sf "$(basename "$TRACE")" "$RANK_DIR/trace.json" + fi + fi + done + + echo " Trace pattern: rank*/trace.json" + + $TRACELENS_WRAPPER generate_multi_rank_collective \ + --trace_pattern "$TORCH_PROF_DIR/rank*/trace.json" \ + --world_size $NUM_RANKS \ + --output_xlsx_path "$OUTPUT" \ + --detailed_analysis \ + --use_multiprocessing + + echo " Done: $OUTPUT" +else + echo " Could not generate collective report - no trace files found" +fi + +fi + +echo "" +echo "════════════════════════════════════════════════════════════════" +echo "Analysis Complete!" +echo "════════════════════════════════════════════════════════════════" +echo "" +echo "📁 Results saved to:" +echo " $OUTPUT_DIR/" +echo "" + +# Count generated reports +INDIV_COUNT=$(find "$OUTPUT_DIR/individual_reports" -name "*.xlsx" 2>/dev/null | wc -l) +COLL_COUNT=$(find "$OUTPUT_DIR/collective_reports" -name "*.xlsx" 2>/dev/null | wc -l) + +echo "Generated reports:" +echo " Individual reports (per rank): $INDIV_COUNT" +echo " Collective reports (all ranks): $COLL_COUNT" +echo "" + +echo "📊 Report Files:" +echo "" +echo "Individual Performance Reports:" +if [ $INDIV_COUNT -gt 0 ]; then + find "$OUTPUT_DIR/individual_reports" -name "*.xlsx" | sort | sed 's/^/ /' +else + echo " (none generated)" +fi +echo "" + +echo "Collective Reports:" +if [ $COLL_COUNT -gt 0 ]; then + find "$OUTPUT_DIR/collective_reports" -name "*.xlsx" | sed 's/^/ /' +else + echo " (none generated)" +fi + +echo "" +echo "Done!" diff --git a/scripts/tracelens_with_gemm_patch.py b/scripts/tracelens_with_gemm_patch.py new file mode 100755 index 0000000..6a200d9 --- /dev/null +++ b/scripts/tracelens_with_gemm_patch.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +""" +TraceLens with GEMM Recognition Patches + +This script applies GEMM recognition patches and runs TraceLens commands. + +Usage: + python tracelens_with_gemm_patch.py generate_perf_report [args...] + python tracelens_with_gemm_patch.py generate_multi_rank_collective [args...] + python tracelens_with_gemm_patch.py compare_perf_reports [args...] +""" + +import re +import sys + + +def apply_gemm_patches(): + """Apply all GEMM recognition patches to TraceLens.""" + + print("Applying TraceLens GEMM recognition patches...") + + # Patch kernel_name_parser for enhanced ROCm GEMM recognition + try: + from TraceLens.PerfModel import kernel_name_parser + + def patched_is_rocm_gemm(kernel_name): + """ + Enhanced ROCm GEMM pattern matching for Tensile kernels. + Recognizes: Cijk_Alik_Bljk_... and variants with arbitrary prefixes. 
+ """ + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + return bool(re.match(pattern, kernel_name)) + + def patched_parse_rocm_gemm(kernel_name): + """Parse ROCm GEMM kernel details.""" + # Parse transpose flags + trans_a, trans_b = None, None + if "_Ailk_" in kernel_name: + trans_a = False + elif "_Alik_" in kernel_name: + trans_a = True + if "_Bljk_" in kernel_name: + trans_b = False + elif "_Bjlk_" in kernel_name: + trans_b = True + + # Parse macro tile size (MT64x16x64) + macro_tile_match = re.search(r"MT(\d+)x(\d+)x(\d+)", kernel_name) + if macro_tile_match: + mt_m = int(macro_tile_match.group(1)) + mt_n = int(macro_tile_match.group(2)) + depth_u = int(macro_tile_match.group(3)) + else: + mt_m, mt_n, depth_u = None, None, None + + return { + "transpose": (trans_a, trans_b), + "mt_m": mt_m, + "mt_n": mt_n, + "depth_u": depth_u, + } + + def patched_gemm_name_parser(kernel_name): + """Enhanced GEMM name parser with better ROCm support.""" + if patched_is_rocm_gemm(kernel_name): + return patched_parse_rocm_gemm(kernel_name) + elif kernel_name_parser.is_cuda_gemm(kernel_name): + return kernel_name_parser.parse_cuda_gemm(kernel_name) + return None + + kernel_name_parser.is_rocm_gemm = patched_is_rocm_gemm + kernel_name_parser.parse_rocm_gemm = patched_parse_rocm_gemm + kernel_name_parser.gemm_name_parser = patched_gemm_name_parser + + print(" [OK] Patched kernel_name_parser (ROCm GEMM recognition)") + except ImportError as e: + print(f" [WARN] Could not patch kernel_name_parser: {e}") + + # Patch Trace2Tree util for is_gemm_kernel function + try: + from TraceLens.Trace2Tree import util as trace_util + + def patched_is_gemm_kernel(kernel_event: dict) -> bool: + """Enhanced GEMM kernel detection.""" + assert kernel_event["cat"] == "kernel" + kernel_name = kernel_event["name"] + + # ROCm Tensile GEMM pattern: C[xyz]_A[xyz]_B[xyz] + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + is_rocm_gemm = bool(re.match(pattern, kernel_name)) + + # CUDA GEMM pattern + is_cuda_gemm = kernel_name.startswith("nvjet") or "cublasLt" in kernel_name + + return is_rocm_gemm or is_cuda_gemm + + trace_util.is_gemm_kernel = patched_is_gemm_kernel + print(" [OK] Patched Trace2Tree.util (is_gemm_kernel)") + except ImportError as e: + print(f" [WARN] Could not patch Trace2Tree.util: {e}") + + # Patch TraceEventUtils to enhance GEMM keys + try: + from TraceLens import util as tracelens_util + + if hasattr(tracelens_util, 'TraceEventUtils'): + if hasattr(tracelens_util.TraceEventUtils, 'JaxOpKeys'): + original_gemm_keys = tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys + enhanced_gemm_keys = ["Cijk", "gemm", "nvjet", "cublasLt", "C[a-z]{3}_A[a-z]{3}_B[a-z]{3}"] + + all_keys = list(set(original_gemm_keys + enhanced_gemm_keys)) + tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys = all_keys + + print(" [OK] Patched TraceEventUtils.JaxOpKeys (GEMM keys enhanced)") + except (ImportError, AttributeError) as e: + print(f" [WARN] Could not patch TraceEventUtils: {e}") + + # Patch torch_op_mapping for better categorization + try: + from TraceLens.PerfModel import torch_op_mapping + + original_categorize = torch_op_mapping.categorize_torch_op + + def patched_categorize_torch_op(row): + """Enhanced categorization with better GEMM detection.""" + result = original_categorize(row) + + # If result is 'other', check for GEMM patterns in kernel names + if result == "other" and "kernel_details" in row and len(row["kernel_details"]) > 0: + kernel_name = row["kernel_details"][0]["name"] + pattern = 
r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + if re.match(pattern, kernel_name): + return "GEMM" + + return result + + torch_op_mapping.categorize_torch_op = patched_categorize_torch_op + print(" [OK] Patched torch_op_mapping (categorize_torch_op)") + except ImportError as e: + print(f" [WARN] Could not patch torch_op_mapping: {e}") + + print("[OK] All GEMM patches applied successfully!\n") + + +def main(): + if len(sys.argv) < 2: + print("Usage: tracelens_with_gemm_patch.py [args...]") + print("") + print("Commands:") + print(" generate_perf_report - Generate individual performance report") + print(" generate_multi_rank_collective - Generate multi-rank collective report") + print(" compare_perf_reports - Compare performance reports") + sys.exit(1) + + # Apply patches before importing TraceLens reporting modules + apply_gemm_patches() + + # Import TraceLens after patches are applied + from TraceLens.Reporting.generate_perf_report_pytorch import main as generate_perf_report_main + from TraceLens.Reporting.generate_multi_rank_collective_report_pytorch import main as generate_multi_rank_collective_report_main + from TraceLens.Reporting.compare_perf_reports_pytorch import main as compare_perf_reports_main + + command = sys.argv[1] + + # Remove the command from argv so TraceLens sees only its args + sys.argv = [sys.argv[0]] + sys.argv[2:] + + if command == "generate_perf_report": + generate_perf_report_main() + elif command == "generate_multi_rank_collective": + generate_multi_rank_collective_report_main() + elif command == "compare_perf_reports": + compare_perf_reports_main() + else: + print(f"Error: Unknown command '{command}'") + print("") + print("Available commands:") + print(" generate_perf_report") + print(" generate_multi_rank_collective") + print(" compare_perf_reports") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/aorta/training/fsdp_trainer.py b/src/aorta/training/fsdp_trainer.py index 0a6fbc5..d3a03e0 100644 --- a/src/aorta/training/fsdp_trainer.py +++ b/src/aorta/training/fsdp_trainer.py @@ -273,6 +273,7 @@ def build_ddp_model( if device.type == "cuda": device_ids = [device.index if device.index is not None else torch.cuda.current_device()] + print(f"===> {device_ids} {model}") ddp_model = DDP( model, device_ids=device_ids, @@ -735,6 +736,7 @@ def main(args: Optional[argparse.Namespace] = None, *, enable_rocm_metrics: bool training_cfg.max_steps or training_cfg.epochs * len(dataloader), ) + print(f"Calling main trainer with device {env['device']}") profiler = StreamProfiler(env["device"]) try: