diff --git a/scripts/tracelens_single_config/README.md b/scripts/tracelens_single_config/README.md
new file mode 100644
index 0000000..b5c1321
--- /dev/null
+++ b/scripts/tracelens_single_config/README.md
@@ -0,0 +1,99 @@
+# TraceLens Single Configuration
+
+Analyze PyTorch profiler traces from one training run.
+
+For multiple configurations, see [../gemm_analysis/README.md](../gemm_analysis/README.md).
+
+## Quick Start
+
+```bash
+# Complete analysis
+python scripts/tracelens_single_config/run_full_analysis.py \
+ --baseline /path/to/baseline/traces \
+ --test /path/to/test/traces \
+ --output /path/to/output \
+ --all
+
+# Skip TraceLens if already done
+python scripts/tracelens_single_config/run_full_analysis.py \
+ --baseline /path/to/baseline \
+ --test /path/to/test \
+ --output /path/to/output \
+ --all --skip-tracelens
+```
+
+### Flags:
+- `--all` - Run everything including final report
+- `--gpu-timeline` - GPU timeline comparison
+- `--collective` - NCCL collective comparison
+- `--final-report` - Create comprehensive Excel report
+- `--skip-tracelens` - Skip TraceLens report generation if already done
+
+### Output:
+- `final_analysis_report.xlsx` - All comparisons with tables and color scale
+  - Color scale on percent_change: Red (worst) -> White (neutral) -> Green (best)
+- `final_analysis_report.html` and `plots/` - Standalone HTML report with the comparison charts embedded
+
+### Using --skip-tracelens
+
+Pass the same `--baseline` and `--test` paths as in the original run; the script looks for a `tracelens_analysis` subdirectory inside each:
+
+```bash
+# Expected structure when using --skip-tracelens
+baseline/
+└── tracelens_analysis/ # From previous run
+ ├── individual_reports/
+ └── collective_reports/
+
+test/
+└── tracelens_analysis/ # From previous run
+ ├── individual_reports/
+ └── collective_reports/
+```
+
+Example:
+```bash
+# Use same paths, script finds tracelens_analysis inside
+python run_full_analysis.py \
+ --baseline ~/data/baseline_traces \
+ --test ~/data/test_traces \
+ --output ~/results \
+ --all --skip-tracelens
+```
+
+
+## Expected Structure
+
+```
+traces/
+└── torch_profiler/
+ ├── rank0/
+ │ └── trace.json
+ ├── rank1/
+ │ └── trace.json
+ └── ...
+```
+
+## What the Master Script Does
+
+The `run_full_analysis.py` script automatically handles all steps:
+
+1. Runs TraceLens on baseline and test traces
+2. Processes GPU timelines using `process_gpu_timeline.py`
+3. Combines reports using `combine_reports.py`
+4. Adds comparison sheets using `add_comparison_sheets.py` and `add_collective_comparison.py`
+5. Creates the final report using `create_final_report.py`, then renders charts with `create_final_plots.py` and a standalone HTML report with `create_final_html.py`
+
+All post-processing is handled automatically - no need to run individual scripts.
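+
+If a single stage needs to be rerun by hand (for example while debugging one step), the
+post-processing CLIs can also be invoked directly. A rough sketch of the GPU-timeline half
+of the pipeline, with placeholder paths:
+
+```bash
+# 1. Aggregate per-rank GPU timelines for each run
+python scripts/tracelens_single_config/process_gpu_timeline.py \
+    --reports-dir /path/to/baseline/tracelens_analysis/individual_reports
+
+# 2. Combine the baseline and test summaries into one workbook
+python scripts/tracelens_single_config/combine_reports.py \
+    --baseline /path/to/baseline/tracelens_analysis/gpu_timeline_summary_mean.xlsx \
+    --test /path/to/test/tracelens_analysis/gpu_timeline_summary_mean.xlsx \
+    --output /path/to/output/gpu_timeline_combined.xlsx
+
+# 3. Add comparison sheets with the percent_change color scale
+python scripts/tracelens_single_config/add_comparison_sheets.py \
+    --input /path/to/output/gpu_timeline_combined.xlsx \
+    --output /path/to/output/gpu_timeline_comparison.xlsx
+```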
+
+
+## Scripts
+
+```
+run_full_analysis.py - Master script for complete pipeline
+create_final_report.py - Create comprehensive Excel report
+run_tracelens_single_config.sh - Main TraceLens report generation
+process_gpu_timeline.py - Aggregate GPU timeline across ranks
+combine_reports.py - Combine two runs
+add_comparison_sheets.py - Add GPU timeline comparison sheets
+add_collective_comparison.py - Add collective/NCCL comparison sheets
+create_final_plots.py - Generate comparison charts (PNG) from the final report
+create_final_html.py - Build a standalone HTML report embedding the charts
+html_report_config.py - HTML header/footer and chart configuration constants
+```
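+
+`run_full_analysis.py --all` also renders charts and a standalone HTML report after the
+final Excel report is written. To regenerate just those artifacts, the two scripts can be
+run on their own; a sketch with placeholder paths:
+
+```bash
+# Generate PNG charts next to the report (written to a plots/ subdirectory by default)
+python scripts/tracelens_single_config/create_final_plots.py \
+    --report-path /path/to/output/final_analysis_report.xlsx
+
+# Embed the charts into a single self-contained HTML report
+python scripts/tracelens_single_config/create_final_html.py \
+    --plot-files-directory /path/to/output/plots
+```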
diff --git a/scripts/tracelens_single_config/add_collective_comparison.py b/scripts/tracelens_single_config/add_collective_comparison.py
new file mode 100644
index 0000000..ee54d46
--- /dev/null
+++ b/scripts/tracelens_single_config/add_collective_comparison.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+import pandas as pd
+import argparse
+from openpyxl.formatting.rule import ColorScaleRule
+
+
+def add_collective_comparison_sheets(input_path, output_path):
+ """
+    Add comparison sheets to the combined collective report.
+    For each NCCL summary sheet, baseline and saleelk rows are matched on the
+    grouping columns and compared side by side (diff, percent change, ratio).
+    TODO: generalize to n runs and remove the hardcoded source labels.
+ """
+ print(f"Loading: {input_path}")
+
+ xl = pd.ExcelFile(input_path)
+
+ with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
+ # Copy only summary sheets
+ for sheet_name in xl.sheet_names:
+ # Only keep sheets with 'summary' in the name
+ if "summary" not in sheet_name.lower():
+ print(f" Skip {sheet_name} (keeping only summary sheets)")
+ continue
+ df = pd.read_excel(input_path, sheet_name=sheet_name)
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
+ print(f" Copied {sheet_name}")
+
+ # Process summary sheets for comparison
+ for sheet_name in ["nccl_summary_implicit_sync", "nccl_summary_long"]:
+ if sheet_name not in xl.sheet_names:
+ continue
+
+ df = pd.read_excel(input_path, sheet_name=sheet_name)
+
+ # Separate baseline and saleelk
+ baseline_df = df[df["source"] == "baseline"].copy()
+ saleelk_df = df[df["source"] == "saleelk"].copy()
+
+ if len(baseline_df) == 0 or len(saleelk_df) == 0:
+ print(f" Skip {sheet_name} - missing data")
+ continue
+
+ # Create comparison dataframe
+ comparison = pd.DataFrame()
+
+ # Identify key columns for grouping
+ group_cols = ["Collective name", "dtype", "In msg nelems"]
+ if not all(col in baseline_df.columns for col in group_cols):
+ group_cols = ["Collective name"]
+
+ # Group and compare
+ baseline_grouped = baseline_df.groupby(group_cols, as_index=False)
+ saleelk_grouped = saleelk_df.groupby(group_cols, as_index=False)
+
+ for name, base_group in baseline_grouped:
+ # Find matching saleelk group
+ if isinstance(name, tuple):
+ mask = pd.Series([True] * len(saleelk_df), index=saleelk_df.index)
+ for col, val in zip(group_cols, name):
+ mask = mask & (saleelk_df[col] == val)
+ else:
+ mask = saleelk_df[group_cols[0]] == name
+
+ sale_group = saleelk_df.loc[mask]
+
+ if len(sale_group) == 0:
+ continue
+
+ # Create comparison row
+ comp_row = {}
+
+ # Copy grouping columns
+ if isinstance(name, tuple):
+ for col, val in zip(group_cols, name):
+ comp_row[col] = val
+ else:
+ comp_row[group_cols[0]] = name
+
+ # Compare numeric columns
+ numeric_cols = [
+ "comm_latency_mean",
+ "algo bw (GB/s)_mean",
+ "bus bw (GB/s)_mean",
+ "Total comm latency (ms)",
+ "count",
+ ]
+
+ for col in numeric_cols:
+ if col not in base_group.columns or col not in sale_group.columns:
+ continue
+
+ base_val = base_group[col].values[0]
+ sale_val = sale_group[col].values[0]
+
+ comp_row[f"baseline_{col}"] = base_val
+ comp_row[f"saleelk_{col}"] = sale_val
+ comp_row[f"diff_{col}"] = sale_val - base_val
+
+ # For latency/time: positive percent_change means faster (less time)
+ # For bandwidth: positive percent_change means better (more bandwidth)
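+                    # e.g. a 10 ms -> 8 ms latency is reported as +20% (faster), and a
+                    # 100 GB/s -> 120 GB/s bandwidth is also +20% (better).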
+ if "latency" in col.lower() or "time" in col.lower():
+ # Lower is better - positive when saleelk is faster
+ pct_change = (
+ (base_val - sale_val) / base_val * 100
+ if base_val != 0
+ else 0
+ )
+ comp_row[f"percent_change_{col}"] = pct_change
+ elif "bw" in col.lower() or "bandwidth" in col.lower():
+ # Higher is better - positive when saleelk is better
+ pct_change = (
+ (sale_val - base_val) / base_val * 100
+ if base_val != 0
+ else 0
+ )
+ comp_row[f"percent_change_{col}"] = pct_change
+
+ comp_row[f"ratio_{col}"] = (
+ sale_val / base_val if base_val != 0 else 0
+ )
+
+ comparison = pd.concat(
+ [comparison, pd.DataFrame([comp_row])], ignore_index=True
+ )
+
+ # Write comparison sheet (shorten name to fit Excel's 31 char limit)
+            # Shorten the name: replace the 'nccl_summary_' prefix with 'nccl_' and append '_cmp'
+ comparison_sheet_name = (
+ sheet_name.replace("nccl_summary_", "nccl_") + "_cmp"
+ )
+ comparison.to_excel(writer, sheet_name=comparison_sheet_name, index=False)
+ print(f" Added {comparison_sheet_name}")
+
+ # Add conditional formatting to percent_change columns
+ print(f" Applying conditional formatting to {comparison_sheet_name}...")
+
+ ws = writer.sheets[comparison_sheet_name]
+
+ # Format all percent_change columns with color scale
+ for col_idx, col in enumerate(comparison.columns, start=1):
+ if "percent_change" in col:
+ # Convert column index to Excel letter (A, B, C, ...)
+                    if col_idx <= 26:
+                        col_letter = chr(64 + col_idx)
+                    else:
+                        # Two-letter columns (AA, AB, ...); 0-based arithmetic avoids
+                        # the off-by-one at multiples of 26 (e.g. column 52 -> "AZ")
+                        col_letter = chr(64 + (col_idx - 1) // 26) + chr(
+                            65 + (col_idx - 1) % 26
+                        )
+
+ data_range = f"{col_letter}2:{col_letter}{len(comparison)+1}"
+
+ # Color scale: red (min/negative) -> white (0) -> green (max/positive)
+ ws.conditional_formatting.add(
+ data_range,
+ ColorScaleRule(
+ start_type="min",
+ start_color="F8696B", # Red
+ mid_type="num",
+ mid_value=0,
+ mid_color="FFFFFF", # White
+ end_type="max",
+ end_color="63BE7B", # Green
+ ),
+ )
+
+ print(f" Formatted {col}")
+
+ print(f"\nSaved: {output_path}")
+ print("\nNew comparison sheets added")
+ print("percent_change interpretation:")
+ print(" For latency/time: Positive = faster (less time)")
+ print(" For bandwidth: Positive = better (more bandwidth)")
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Add comparison sheets to combined collective reports"
+ )
+ parser.add_argument(
+ "--input", required=True, help="Input combined collective Excel file"
+ )
+ parser.add_argument(
+ "--output", required=True, help="Output Excel file with comparison sheets"
+ )
+
+ args = parser.parse_args()
+
+ return add_collective_comparison_sheets(args.input, args.output)
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/scripts/tracelens_single_config/add_comparison_sheets.py b/scripts/tracelens_single_config/add_comparison_sheets.py
new file mode 100644
index 0000000..a50114a
--- /dev/null
+++ b/scripts/tracelens_single_config/add_comparison_sheets.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+import pandas as pd
+import argparse
+from openpyxl.formatting.rule import ColorScaleRule
+
+
+def add_comparison_sheets(input_path, output_path):
+ """
+    Create per-rank and summary comparison sheets for the combined GPU timeline Excel file.
+ """
+ print(f"Loading: {input_path}")
+
+ xl = pd.ExcelFile(input_path)
+
+ with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
+ # Copy all original sheets
+ for sheet_name in xl.sheet_names:
+ df = pd.read_excel(input_path, sheet_name=sheet_name)
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
+ print(f" Copied {sheet_name}")
+
+ # Add comparison sheets
+ all_combined = pd.read_excel(input_path, sheet_name="All_Ranks_Combined")
+
+ # Comparison 1: Side-by-side by rank
+ baseline_data = all_combined[all_combined["source"] == "baseline"]
+ saleelk_data = all_combined[all_combined["source"] == "saleelk"]
+
+ comparison_by_rank = pd.DataFrame()
+ for rank in sorted(baseline_data["rank"].unique()):
+ base_rank = baseline_data[baseline_data["rank"] == rank].set_index("type")
+ sale_rank = saleelk_data[saleelk_data["rank"] == rank].set_index("type")
+
+ for metric_type in base_rank.index:
+ if metric_type in sale_rank.index:
+ base_time = base_rank.loc[metric_type, "time ms"]
+ sale_time = sale_rank.loc[metric_type, "time ms"]
+ ratio_val = sale_time / base_time if base_time != 0 else 0
+ # Percentage change: positive when saleelk is faster (takes less time)
+ pct_change = (
+ (base_time - sale_time) / base_time * 100
+ if base_time != 0
+ else 0
+ )
+
+ # Determine if better or worse
+ if pct_change > 1:
+ status = "Better"
+ elif pct_change < -1:
+ status = "Worse"
+ else:
+ status = "Similar"
+
+ comparison_by_rank = pd.concat(
+ [
+ comparison_by_rank,
+ pd.DataFrame(
+ {
+ "rank": [rank],
+ "type": [metric_type],
+ "baseline_time_ms": [base_time],
+ "saleelk_time_ms": [sale_time],
+ "diff_time_ms": [sale_time - base_time],
+ "percent_change": [pct_change],
+ "status": [status],
+ "ratio": [ratio_val],
+ "baseline_percent": [
+ base_rank.loc[metric_type, "percent"]
+ ],
+ "saleelk_percent": [
+ sale_rank.loc[metric_type, "percent"]
+ ],
+ "diff_percent": [
+ sale_rank.loc[metric_type, "percent"]
+ - base_rank.loc[metric_type, "percent"]
+ ],
+ }
+ ),
+ ],
+ ignore_index=True,
+ )
+
+ comparison_by_rank.to_excel(
+ writer, sheet_name="Comparison_By_Rank", index=False
+ )
+ print(f" Added Comparison_By_Rank")
+
+ # Comparison 2: Summary comparison
+ summary = pd.read_excel(input_path, sheet_name="Summary")
+ baseline_summary = summary[summary["source"] == "baseline"].set_index("type")
+ saleelk_summary = summary[summary["source"] == "saleelk"].set_index("type")
+
+ summary_comparison = pd.DataFrame()
+ for metric_type in baseline_summary.index:
+ if metric_type in saleelk_summary.index:
+ base_time = baseline_summary.loc[metric_type, "time ms"]
+ sale_time = saleelk_summary.loc[metric_type, "time ms"]
+ ratio_val = sale_time / base_time if base_time != 0 else 0
+ # Percentage change: positive when saleelk is faster (takes less time)
+ pct_change = (
+ (base_time - sale_time) / base_time * 100 if base_time != 0 else 0
+ )
+
+ summary_comparison = pd.concat(
+ [
+ summary_comparison,
+ pd.DataFrame(
+ {
+ "type": [metric_type],
+ "baseline_time_ms": [base_time],
+ "saleelk_time_ms": [sale_time],
+ "diff_time_ms": [sale_time - base_time],
+ "percent_change": [pct_change],
+ "ratio": [ratio_val],
+ "baseline_percent": [
+ baseline_summary.loc[metric_type, "percent"]
+ ],
+ "saleelk_percent": [
+ saleelk_summary.loc[metric_type, "percent"]
+ ],
+ "diff_percent": [
+ saleelk_summary.loc[metric_type, "percent"]
+ - baseline_summary.loc[metric_type, "percent"]
+ ],
+ }
+ ),
+ ],
+ ignore_index=True,
+ )
+
+ summary_comparison.to_excel(
+ writer, sheet_name="Summary_Comparison", index=False
+ )
+ print(f" Added Summary_Comparison")
+
+ # Add conditional formatting to percent_change columns
+ print("\n Applying conditional formatting...")
+
+ # Create color scale: Red (negative) -> White (0) -> Green (positive)
+
+ # Format Comparison_By_Rank
+ ws_rank = writer.sheets["Comparison_By_Rank"]
+ # Find percent_change column
+ for col_idx, col in enumerate(comparison_by_rank.columns, start=1):
+ if col == "percent_change":
+ col_letter = chr(64 + col_idx) # Convert to Excel column letter
+ data_range = f"{col_letter}2:{col_letter}{len(comparison_by_rank)+1}"
+ # Color scale: red (min) -> white (0) -> green (max)
+ ws_rank.conditional_formatting.add(
+ data_range,
+ ColorScaleRule(
+ start_type="min",
+ start_color="F8696B", # Red
+ mid_type="num",
+ mid_value=0,
+ mid_color="FFFFFF", # White
+ end_type="max",
+ end_color="63BE7B", # Green
+ ),
+ )
+ print(f" Formatted Comparison_By_Rank column {col}")
+ break
+
+ # Format Summary_Comparison
+ ws_summary = writer.sheets["Summary_Comparison"]
+ for col_idx, col in enumerate(summary_comparison.columns, start=1):
+ if col == "percent_change":
+ col_letter = chr(64 + col_idx)
+ data_range = f"{col_letter}2:{col_letter}{len(summary_comparison)+1}"
+ # Color scale: red (min) -> white (0) -> green (max)
+ ws_summary.conditional_formatting.add(
+ data_range,
+ ColorScaleRule(
+ start_type="min",
+ start_color="F8696B", # Red
+ mid_type="num",
+ mid_value=0,
+ mid_color="FFFFFF", # White
+ end_type="max",
+ end_color="63BE7B", # Green
+ ),
+ )
+ print(f" Formatted Summary_Comparison column {col}")
+ break
+
+ print(f"\nSaved: {output_path}")
+ print("\nNew sheets:")
+ print(" Comparison_By_Rank - Side-by-side comparison for each rank")
+ print(" Summary_Comparison - Overall comparison")
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Add comparison sheets to combined GPU timeline"
+ )
+ parser.add_argument("--input", required=True, help="Input combined Excel file")
+ parser.add_argument(
+ "--output", required=True, help="Output Excel file with comparison sheets"
+ )
+
+ args = parser.parse_args()
+
+ return add_comparison_sheets(args.input, args.output)
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/scripts/tracelens_single_config/combine_reports.py b/scripts/tracelens_single_config/combine_reports.py
new file mode 100644
index 0000000..5d8bfb1
--- /dev/null
+++ b/scripts/tracelens_single_config/combine_reports.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+import pandas as pd
+import argparse
+from pathlib import Path
+
+
+def combine_collective_reports(baseline_path, test_path, output_path):
+ """
+    Combine two Excel reports (baseline and test) into a single file, sheet by sheet,
+    tagging each row with a 'source' column. Used for both the GPU timeline summaries
+    and the collective reports.
+ """
+
+ print(f"Loading baseline: {baseline_path}")
+ baseline_xl = pd.ExcelFile(baseline_path)
+
+ print(f"Loading test: {test_path}")
+ test_xl = pd.ExcelFile(test_path)
+
+ print(f"\nBaseline sheets: {baseline_xl.sheet_names}")
+ print(f"Test sheets: {test_xl.sheet_names}")
+
+ with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
+ for sheet_name in baseline_xl.sheet_names:
+ if sheet_name not in test_xl.sheet_names:
+ print(f" Skip {sheet_name} - not in test file")
+ continue
+
+ baseline_df = pd.read_excel(baseline_path, sheet_name=sheet_name)
+ test_df = pd.read_excel(test_path, sheet_name=sheet_name)
+
+ baseline_df["source"] = "baseline"
+ test_df["source"] = "saleelk"
+
+ combined = pd.concat([baseline_df, test_df], ignore_index=True)
+
+ combined.to_excel(writer, sheet_name=sheet_name, index=False)
+ print(
+ f" Combined {sheet_name}: {len(baseline_df)} + {len(test_df)} = {len(combined)} rows"
+ )
+
+ print(f"\nSaved: {output_path}")
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Combine two collective reports")
+ parser.add_argument(
+ "--baseline", required=True, help="Path to baseline collective_all_ranks.xlsx"
+ )
+ parser.add_argument(
+ "--test", required=True, help="Path to test collective_all_ranks.xlsx"
+ )
+ parser.add_argument(
+ "--output", required=True, help="Output path for combined Excel file"
+ )
+
+ args = parser.parse_args()
+
+ return combine_collective_reports(args.baseline, args.test, args.output)
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/scripts/tracelens_single_config/create_final_html.py b/scripts/tracelens_single_config/create_final_html.py
new file mode 100644
index 0000000..74ea0c3
--- /dev/null
+++ b/scripts/tracelens_single_config/create_final_html.py
@@ -0,0 +1,103 @@
+from pathlib import Path
+import base64
+import argparse
+
+from html_report_config import (
+ HTML_HEADER,
+ HTML_FOOTER,
+ OVERALL_GPU_CHARTS,
+ CROSS_RANK_CHARTS,
+ NCCL_CHARTS,
+)
+
+
+def get_image_base64(image_path):
+ """Read an image file and return its base64-encoded string."""
+ try:
+ with open(image_path, "rb") as f:
+ return base64.b64encode(f.read()).decode("utf-8")
+ except Exception as e:
+ print(f"Error getting image data from {image_path}: {e}")
+ return None
+
+
+def create_chart_html(plot_dir, chart_config):
+ """Generate HTML for a single chart with title, image, and description."""
+ image_data = get_image_base64(plot_dir / chart_config["file"])
+ if image_data is None:
+ return ""
+    return f"""
+    <div>
+        <h3>{chart_config['name']}</h3>
+        <img src="data:image/png;base64,{image_data}" alt="{chart_config['alt']}">
+        <p>{chart_config['description']}</p>
+    </div>
+    """
+
+
+def create_section_html(title, plot_dir, charts):
+ """Generate HTML for a complete section with multiple charts."""
+ section_html = f"""
+    <h2>{title}</h2>
+ """
+ for chart in charts:
+ section_html += create_chart_html(plot_dir, chart)
+ return section_html
+
+
+def create_final_html(plot_file_path, output_path):
+    html_body = """
+    <h1>Performance Analysis Report</h1>
+
+    <h2>Executive Summary</h2>
+    <p>Comparison of GPU performance metrics between baseline and Saleelk
+    implementations across 8 ranks.</p>
+"""
+
+ # Build all sections
+ sections = [
+ create_section_html(
+ "1. Overall GPU Metrics Comparison", plot_file_path, OVERALL_GPU_CHARTS
+ ),
+ create_section_html(
+ "2. Cross-Rank Performance Comparison", plot_file_path, CROSS_RANK_CHARTS
+ ),
+ create_section_html(
+ "3. NCCL Collective Operations Analysis", plot_file_path, NCCL_CHARTS
+ ),
+ ]
+
+ final_html = HTML_HEADER + html_body + "".join(sections) + HTML_FOOTER
+ with open(output_path, "w") as f:
+ f.write(final_html)
+ print(f"Final HTML file created at: {output_path}")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Create a final HTML file for the analysis report."
+ )
+ parser.add_argument(
+ "-p",
+ "--plot-files-directory",
+ type=Path,
+ required=True,
+        help="Path to the plot files directory.",
+ )
+ parser.add_argument(
+        "-o", "--output-html", type=Path, default=None, help="Path to the output HTML file."
+ )
+ args = parser.parse_args()
+ output_path = (
+ args.output_html
+ if args.output_html
+ else args.plot_files_directory.parent / "final_analysis_report.html"
+ )
+ create_final_html(args.plot_files_directory, output_path)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/tracelens_single_config/create_final_plots.py b/scripts/tracelens_single_config/create_final_plots.py
new file mode 100644
index 0000000..214ec5c
--- /dev/null
+++ b/scripts/tracelens_single_config/create_final_plots.py
@@ -0,0 +1,348 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+from pathlib import Path
+import seaborn as sns
+
+
+def plot_improvement_chart(df, output_path):
+ fig, ax = plt.subplots(figsize=(10, 6))
+
+ # Color bars based on positive/negative values
+ colors = ["#2ecc71" if val > 0 else "#e74c3c" for val in df["Improvement (%)"]]
+
+ bars = ax.barh(df["Metric"], df["Improvement (%)"], color=colors)
+ ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray")
+ ax.set_axisbelow(True)
+
+ ax.spines["top"].set_visible(False)
+ ax.spines["right"].set_visible(False)
+ ax.spines["bottom"].set_visible(False)
+ ax.spines["left"].set_visible(False)
+
+ # Customize the chart
+ ax.set_ylabel("Metric", fontsize=12)
+ ax.set_xlabel("Change (%)", fontsize=12)
+ ax.set_title(
+ "GPU Metrics Percentage Change (Test vs Baseline)\n(Positive = Test is better)",
+ fontsize=14,
+ fontweight="bold",
+ )
+
+ plt.tight_layout()
+ plt.savefig(output_path / "improvement_chart.png", dpi=150)
+ plt.close()
+
+
+def plot_abs_time_comparison(df, output_path):
+
+ fig, ax = plt.subplots(figsize=(10, 6))
+
+ # Set up bar positions
+ x = range(len(df))
+ width = 0.35
+
+ # Create bars for Baseline and Test
+ bars1 = ax.bar(
+ [i - width / 2 for i in x],
+ df["Baseline"],
+ width,
+ label="Baseline",
+ color="#3498db",
+ )
+ bars2 = ax.bar(
+ [i + width / 2 for i in x], df["Test"], width, label="Test", color="#e67e22"
+ )
+
+ # Add horizontal grid lines only
+    ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray")
+ ax.set_axisbelow(True)
+
+ # Remove border/spines
+ ax.spines["top"].set_visible(False)
+ ax.spines["right"].set_visible(False)
+ ax.spines["bottom"].set_visible(False)
+ ax.spines["left"].set_visible(False)
+
+ # Customize the chart
+ ax.set_xlabel("Metric Type", fontsize=12)
+ ax.set_ylabel("Time (ms)", fontsize=12)
+ ax.set_title(
+ "GPU Metrics Absolute Time Comparison ", fontsize=14, fontweight="bold"
+ )
+ ax.set_xticks(x)
+ ax.set_xticklabels(df["Metric"], rotation=45, ha="right")
+ ax.legend()
+
+ plt.tight_layout()
+ plt.savefig(output_path / "abs_time_comparison.png", dpi=150)
+ plt.close()
+
+
+def create_summary_charts(excel_path, output_path):
+
+ # Read the Summary_Dashboard sheet
+ df = pd.read_excel(excel_path, sheet_name="Summary_Dashboard")
+
+ plot_improvement_chart(df, output_path)
+ plot_abs_time_comparison(df, output_path)
+
+
+def plot_gpu_type_by_rank(total_time_df, output_path, title):
+ # Create the line plot
+ fig, ax = plt.subplots(figsize=(12, 6))
+
+ # Plot baseline total_time by rank
+ ax.plot(
+ total_time_df["rank"],
+ total_time_df["baseline_time_ms"],
+ marker="o",
+ linewidth=2,
+ markersize=8,
+ color="#3498db",
+ label="Baseline",
+ )
+
+ # Plot Saleelk (test) total_time by rank
+ ax.plot(
+ total_time_df["rank"],
+ total_time_df["saleelk_time_ms"],
+ marker="s",
+ linewidth=2,
+ markersize=8,
+ color="#e67e22",
+ label="Test",
+ )
+
+ # Add horizontal grid lines only
+ ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray")
+ ax.set_axisbelow(True)
+
+ # Customize the chart
+ ax.set_xlabel("Rank", fontsize=12)
+ ax.set_ylabel("Total Time (ms)", fontsize=12)
+ ax.set_title(f"{title} Comparison across all ranks", fontsize=14, fontweight="bold")
+ ax.legend()
+
+ plt.tight_layout()
+ plt.savefig(output_path, dpi=150)
+ plt.close()
+
+
+def create_gpu_time_accross_all_ranks(excel_path, output_path):
+ # Read the GPU_ByRank_Cmp sheet
+ df = pd.read_excel(excel_path, sheet_name="GPU_ByRank_Cmp")
+
+    # Plot each key metric type across ranks
+ for type in ["total_time", "computation_time", "total_comm_time", "idle_time"]:
+ total_time_df = df[df["type"] == type]
+ plot_gpu_type_by_rank(total_time_df, output_path / f"{type}_by_rank.png", type)
+
+
+def plot_gpu_time_change_percentage_summaryby_rank(df, ax):
+ colors = ["#2ecc71" if val > 0 else "#e74c3c" for val in df["percent_change"]]
+ bars = ax.bar(df["rank"].astype(str), df["percent_change"], color=colors)
+ # Add horizontal line at 0
+ ax.axhline(y=0, color="black", linestyle="-", linewidth=0.5)
+
+ # Add horizontal grid lines only
+ ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray")
+ ax.set_axisbelow(True)
+ ax.set_xlabel("Rank")
+ ax.set_ylabel("Percent Change (%)")
+
+
+def create_gpu_time_change_percentage_summaryby_rank(excel_path, output_path):
+ # Read the GPU_ByRank_Cmp sheet
+ df = pd.read_excel(excel_path, sheet_name="GPU_ByRank_Cmp")
+
+ fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(12, 6))
+
+ row_types = [
+ "busy_time",
+ "computation_time",
+ "exposed_comm_time",
+ "exposed_memcpy_time",
+ "idle_time",
+ "total_comm_time",
+ "total_memcpy_time",
+ "total_time",
+ ]
+    # Plot each metric type in its own subplot
+ for i, type in enumerate(row_types):
+ type_df = df[df["type"] == type]
+ plot_gpu_time_change_percentage_summaryby_rank(type_df, ax[i // 4, i % 4])
+ ax[i // 4, i % 4].set_title(f"{type}")
+ plt.tight_layout()
+ plt.savefig(output_path / "gpu_time_change_percentage_summary_by_rank.png", dpi=150)
+ plt.close()
+
+
+def create_nccl_charts(excel_path, output_path):
+    # Read the NCCL_ImplSync_Cmp sheet
+ df = pd.read_excel(excel_path, sheet_name="NCCL_ImplSync_Cmp")
+ df["label"] = df["Collective name"] + "\n" + df["In msg nelems"].astype(str)
+ x = range(len(df))
+
+ plot_item = {
+ "NCCL Communication Latency": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Communication Latency (ms)",
+ "y_col_names": ["baseline_comm_latency_mean", "saleelk_comm_latency_mean"],
+ },
+ "NCCL Algorithm Bandwidth": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Algorithm Bandwidth (GB/s)",
+ "y_col_names": [
+ "baseline_algo bw (GB/s)_mean",
+ "saleelk_algo bw (GB/s)_mean",
+ ],
+ },
+ "NCCL Bus Bandwidth": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Bus Bandwidth (GB/s)",
+ "y_col_names": [
+ "baseline_bus bw (GB/s)_mean",
+ "saleelk_bus bw (GB/s)_mean",
+ ],
+ },
+ "NCCL Total Communication Latency": {
+ "x_label": "Collective Operation (Message Size)",
+ "y_label": "Total Communication Latency (ms)",
+ "y_col_names": [
+ "baseline_Total comm latency (ms)",
+ "saleelk_Total comm latency (ms)",
+ ],
+ },
+ }
+ for item in plot_item.keys():
+ fig, ax = plt.subplots(figsize=(14, 6))
+ width = 0.35
+ bars1 = ax.bar(
+ [i - width / 2 for i in x],
+ df[plot_item[item]["y_col_names"][0]],
+ width,
+ label="Baseline",
+ color="#3498db",
+ )
+ bars2 = ax.bar(
+ [i + width / 2 for i in x],
+ df[plot_item[item]["y_col_names"][1]],
+ width,
+ label="Test",
+ color="#e67e22",
+ )
+ ax.yaxis.grid(True, linestyle="--", alpha=0.7, color="gray")
+ ax.set_axisbelow(True)
+ ax.set_xticks(x)
+ ax.set_xticklabels(df["label"], rotation=45, ha="right", fontsize=8)
+ ax.set_xlabel(plot_item[item]["x_label"], fontsize=12)
+ ax.set_ylabel(plot_item[item]["y_label"], fontsize=12)
+ ax.set_title(f"{item} Comparison", fontsize=14, fontweight="bold")
+ ax.legend()
+ plt.tight_layout()
+ plt.savefig(output_path / f'{item.replace(" ", "_")}_comparison.png', dpi=150)
+ plt.close()
+
+ percentage_chart_item = {
+ "Comm Latency": "percent_change_comm_latency_mean",
+ "Algo BW": "percent_change_algo bw (GB/s)_mean",
+ "Bus BW": "percent_change_bus bw (GB/s)_mean",
+ }
+ fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))
+ plot_item_index = 0
+ for item in percentage_chart_item.keys():
+ colors = [
+ "#2ecc71" if val > 0 else "#e74c3c"
+ for val in df[percentage_chart_item[item]]
+ ]
+ bars = ax[plot_item_index].barh(
+ df["In msg nelems"].astype(str),
+ df[percentage_chart_item[item]],
+ color=colors,
+ )
+ ax[plot_item_index].yaxis.grid(True, linestyle="--", alpha=0.7, color="gray")
+ ax[plot_item_index].set_axisbelow(True)
+ ax[plot_item_index].set_xlabel("Percent Change (%)")
+ ax[plot_item_index].set_title(f"{item} \n Percent Change (Positive = better)")
+ plot_item_index += 1
+ fig.suptitle(
+ "NCCL Performance Percentage Change By Message Size",
+ fontsize=16,
+ fontweight="bold",
+ )
+ plt.tight_layout()
+ plt.savefig(
+ output_path / f"NCCL_Performance_Percentage_Change_comparison.png", dpi=150
+ )
+ plt.close()
+
+
+def create_gpu_time_heatmap(excel_path, output_path):
+ # Read the GPU_ByRank_Cmp sheet
+ df = pd.read_excel(excel_path, sheet_name="GPU_ByRank_Cmp")
+ # Plot the GPU time heatmap
+ pivot_df = df.pivot(index="type", columns="rank", values="percent_change")
+
+ # Create heatmap
+ fig, ax = plt.subplots(figsize=(12, 8))
+
+ sns.heatmap(
+ pivot_df,
+ annot=True, # Show values in cells
+ fmt=".1f", # Format as 1 decimal
+ cmap="RdYlGn", # Red-Yellow-Green colormap (red=bad, green=good)
+ center=0, # Center colormap at 0
+ linewidths=0.5, # Add gridlines
+ cbar_kws={"label": "Percent Change (%)"},
+ )
+
+ ax.set_title(
+ "GPU Metric Percentage Change by Rank (HeatMap) \n (Positive = Better Test)",
+ fontsize=14,
+ fontweight="bold",
+ )
+ ax.set_xlabel("Rank", fontsize=12)
+ ax.set_ylabel("Metric Type", fontsize=12)
+
+ plt.tight_layout()
+ plt.savefig(output_path / "gpu_time_heatmap.png", dpi=150)
+    plt.close()
+
+
+def main():
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description="Generate improvement chart from generated reports"
+ )
+ parser.add_argument(
+ "--report-path",
+ type=Path,
+ default="~/aorta/aorta_single_config/aorta/expt_compare/final_analysis_report.xlsx",
+ help="Path to the input Excel file (should have Summary_Dashboard sheet)",
+ )
+ parser.add_argument(
+ "--output",
+ type=Path,
+ default=None,
+ help="Path to the output directory to save PNG files",
+ )
+
+ args = parser.parse_args()
+ output_path = args.output if args.output else args.report_path.parent / "plots"
+ output_path.mkdir(exist_ok=True, parents=True)
+ create_summary_charts(args.report_path, output_path)
+    print(f"Summary charts saved to: {output_path}")
+ create_gpu_time_heatmap(args.report_path, output_path)
+ print(f"GPU time heatmap saved to: {output_path}")
+ create_gpu_time_accross_all_ranks(args.report_path, output_path)
+    print(f"GPU time across all ranks saved to: {output_path}")
+ create_gpu_time_change_percentage_summaryby_rank(args.report_path, output_path)
+ print(f"GPU time change percentage summary by rank saved to: {output_path}")
+ create_nccl_charts(args.report_path, output_path)
+ print(f"NCCL communication charts saved to: {output_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/tracelens_single_config/create_final_report.py b/scripts/tracelens_single_config/create_final_report.py
new file mode 100644
index 0000000..588edb3
--- /dev/null
+++ b/scripts/tracelens_single_config/create_final_report.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+"""
+Create final comprehensive report with combined and comparison data.
+Raw data sheets are hidden and all data is formatted as Excel tables.
+"""
+import pandas as pd
+import argparse
+from pathlib import Path
+from openpyxl import load_workbook
+from openpyxl.worksheet.table import Table, TableStyleInfo
+from openpyxl.formatting.rule import ColorScaleRule
+
+
+def get_column_letter(col_num):
+ """Convert column number to Excel column letter."""
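+    # e.g. 1 -> "A", 26 -> "Z", 27 -> "AA", 703 -> "AAA"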
+ result = ""
+ while col_num > 0:
+ col_num -= 1
+ result = chr(65 + (col_num % 26)) + result
+ col_num //= 26
+ return result
+
+
+def add_excel_table(worksheet, table_name, start_row=1):
+ """Convert worksheet data to Excel table format."""
+ # Find data range
+ max_row = worksheet.max_row
+ max_col = worksheet.max_column
+
+ if max_row <= start_row:
+ return # No data
+
+ # Ensure all column headers are strings
+ for col_idx in range(1, max_col + 1):
+ cell = worksheet.cell(row=start_row, column=col_idx)
+ if cell.value is not None and not isinstance(cell.value, str):
+ cell.value = str(cell.value)
+
+ # Create table reference using proper column letter conversion
+ start_cell = f"A{start_row}"
+ end_col_letter = get_column_letter(max_col)
+ end_cell = f"{end_col_letter}{max_row}"
+ table_ref = f"{start_cell}:{end_cell}"
+
+ # Create table with style
+ try:
+ tab = Table(displayName=table_name, ref=table_ref)
+ style = TableStyleInfo(
+ name="TableStyleMedium2",
+ showFirstColumn=False,
+ showLastColumn=False,
+ showRowStripes=True,
+ showColumnStripes=False,
+ )
+ tab.tableStyleInfo = style
+
+ # Add table to worksheet
+ worksheet.add_table(tab)
+ except Exception as e:
+ print(f" Warning: Could not create table {table_name}: {e}")
+
+
+def create_final_report(
+ gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file
+):
+ """Create comprehensive report with all data."""
+
+ print("Creating comprehensive final report...")
+ print(f" Output: {output_file}")
+
+ # Track sheet info for hiding/organizing
+ raw_sheets = []
+ comparison_sheets = []
+ summary_sheets = []
+
+ with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
+
+ # === GPU TIMELINE SHEETS ===
+ print("\nAdding GPU Timeline sheets...")
+
+ # Read GPU combined (raw data)
+ gpu_comb_xl = pd.ExcelFile(gpu_combined)
+ sheet_mapping = {
+ "Summary": "GPU_Summary_Raw",
+ "All_Ranks_Combined": "GPU_AllRanks_Raw",
+ "Per_Rank_Time_ms": "GPU_Time_Raw",
+ "Per_Rank_Percent": "GPU_Pct_Raw",
+ }
+ for sheet_name in gpu_comb_xl.sheet_names:
+ df = pd.read_excel(gpu_combined, sheet_name=sheet_name)
+ new_name = sheet_mapping.get(sheet_name, f"GPU_{sheet_name}_Raw")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ raw_sheets.append(new_name)
+ print(f" Added {new_name} (will be hidden)")
+
+ # Read GPU comparison
+ gpu_comp_xl = pd.ExcelFile(gpu_comparison)
+ comp_mapping = {
+ "Summary_Comparison": "GPU_Summary_Cmp",
+ "Comparison_By_Rank": "GPU_ByRank_Cmp",
+ }
+ for sheet_name in gpu_comp_xl.sheet_names:
+ if "Comparison" in sheet_name:
+ df = pd.read_excel(gpu_comparison, sheet_name=sheet_name)
+ new_name = comp_mapping.get(sheet_name, f"GPU_{sheet_name}")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ comparison_sheets.append(new_name)
+ print(f" Added {new_name}")
+
+ # === COLLECTIVE SHEETS ===
+ print("\nAdding Collective/NCCL sheets...")
+
+ # Read collective combined (raw data for hidden sheets)
+ coll_comb_xl = pd.ExcelFile(coll_combined)
+ coll_mapping = {
+ "nccl_summary_implicit_sync": "NCCL_ImplSync_Raw",
+ "nccl_summary_long": "NCCL_Long_Raw",
+ }
+ for sheet_name in coll_comb_xl.sheet_names:
+ if "summary" in sheet_name.lower():
+ df = pd.read_excel(coll_combined, sheet_name=sheet_name)
+ new_name = coll_mapping.get(sheet_name, f"NCCL_{sheet_name}_Raw")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ raw_sheets.append(new_name)
+ print(f" Added {new_name} (will be hidden)")
+
+ # Read collective comparison
+ coll_comp_xl = pd.ExcelFile(coll_comparison)
+ coll_cmp_mapping = {
+ "nccl_implicit_sync_cmp": "NCCL_ImplSync_Cmp",
+ "nccl_long_cmp": "NCCL_Long_Cmp",
+ }
+ for sheet_name in coll_comp_xl.sheet_names:
+ if "_cmp" in sheet_name:
+ df = pd.read_excel(coll_comparison, sheet_name=sheet_name)
+ new_name = coll_cmp_mapping.get(sheet_name, f"NCCL_{sheet_name}")
+ df.to_excel(writer, sheet_name=new_name, index=False)
+ comparison_sheets.append(new_name)
+ print(f" Added {new_name}")
+
+ # === CREATE SUMMARY DASHBOARD ===
+ print("\nCreating Summary Dashboard...")
+
+ # Read key metrics for dashboard
+ gpu_summary = pd.read_excel(gpu_comparison, sheet_name="Summary_Comparison")
+
+ # Create dashboard data
+ dashboard_data = {
+ "Metric": [],
+ "Baseline": [],
+ "Test": [],
+ "Improvement (%)": [],
+ "Status": [],
+ }
+
+ # Add GPU metrics
+ for _, row in gpu_summary.iterrows():
+ metric_type = row["type"]
+ dashboard_data["Metric"].append(f"GPU_{metric_type}")
+ dashboard_data["Baseline"].append(round(row["baseline_time_ms"], 2))
+ dashboard_data["Test"].append(round(row["saleelk_time_ms"], 2))
+ dashboard_data["Improvement (%)"].append(round(row["percent_change"], 2))
+            dashboard_data["Status"].append(
+                "Better"
+                if row["percent_change"] > 1
+                else "Worse" if row["percent_change"] < -1 else "Similar"
+            )
+
+ dashboard_df = pd.DataFrame(dashboard_data)
+ dashboard_df.to_excel(writer, sheet_name="Summary_Dashboard", index=False)
+ summary_sheets.append("Summary_Dashboard")
+ print(f" Added Summary_Dashboard")
+
+ # Now modify the workbook to hide sheets and add tables
+ print("\nApplying formatting...")
+ wb = load_workbook(output_file)
+
+ # Hide raw data sheets
+ for sheet_name in raw_sheets:
+ if sheet_name in wb.sheetnames:
+ wb[sheet_name].sheet_state = "hidden"
+ print(f" Hidden: {sheet_name}")
+
+ # Convert all sheets to tables
+ for sheet_name in wb.sheetnames:
+ ws = wb[sheet_name]
+
+ # Skip if sheet is empty
+ if ws.max_row <= 1:
+ continue
+
+ # Create unique table name from sheet name (remove special chars)
+ table_name = (
+ sheet_name.replace(" ", "_")
+ .replace("-", "_")
+ .replace("(", "")
+ .replace(")", "")
+ )
+ # Ensure name starts with letter and is max 255 chars
+ if not table_name[0].isalpha():
+ table_name = "Tbl_" + table_name
+ table_name = table_name[:255]
+
+ add_excel_table(ws, table_name)
+ print(f" Converted to table: {sheet_name}")
+
+ # Add conditional formatting for percent_change columns
+ if "Cmp" in sheet_name or "Comparison" in sheet_name:
+ # Find percent_change columns
+ for col_idx in range(1, ws.max_column + 1):
+ cell_value = ws.cell(row=1, column=col_idx).value
+ if cell_value and "percent_change" in str(cell_value):
+ col_letter = get_column_letter(col_idx)
+ data_range = f"{col_letter}2:{col_letter}{ws.max_row}"
+
+ # Apply color scale: red (min/negative) -> white (0) -> green (max/positive)
+ try:
+ ws.conditional_formatting.add(
+ data_range,
+ ColorScaleRule(
+ start_type="min",
+ start_color="F8696B", # Red
+ mid_type="num",
+ mid_value=0,
+ mid_color="FFFFFF", # White
+ end_type="max",
+ end_color="63BE7B", # Green
+ ),
+ )
+ print(
+ f" Applied color scale to {sheet_name} column {cell_value}"
+ )
+ except Exception as e:
+ print(
+ f" Warning: Could not apply formatting to {cell_value}: {e}"
+ )
+
+ # Move Summary Dashboard to first position
+ if "Summary_Dashboard" in wb.sheetnames:
+ dashboard_sheet = wb["Summary_Dashboard"]
+ wb.move_sheet(dashboard_sheet, offset=-(len(wb.sheetnames) - 1))
+ wb.active = 0 # Set dashboard as active sheet
+ print("\n Moved Summary_Dashboard to first position")
+
+ # Save workbook
+ wb.save(output_file)
+ print(f"\nFinal report saved: {output_file}")
+
+ # Report structure
+ print("\nReport Structure:")
+ print(" Visible Sheets (Analysis):")
+ print(f" - Summary_Dashboard")
+ for sheet in comparison_sheets:
+ print(f" - {sheet}")
+ print("\n Hidden Sheets (Raw Data):")
+ for sheet in raw_sheets:
+ print(f" - {sheet}")
+ print("\n All data formatted as Excel tables with filters")
+ print(" Percent change columns are color-coded (green=better, red=worse)")
+ print(
+ "\nUsers can unhide raw data sheets in Excel: Right-click any sheet tab → Unhide"
+ )
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Create final comprehensive report with all data",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Example:
+ python create_final_report.py \\
+ --gpu-combined gpu_timeline_combined.xlsx \\
+ --gpu-comparison gpu_timeline_comparison.xlsx \\
+ --coll-combined collective_combined.xlsx \\
+ --coll-comparison collective_comparison.xlsx \\
+ --output final_analysis_report.xlsx
+ """,
+ )
+
+ parser.add_argument(
+ "--gpu-combined", required=True, help="Path to GPU timeline combined file"
+ )
+ parser.add_argument(
+ "--gpu-comparison", required=True, help="Path to GPU timeline comparison file"
+ )
+ parser.add_argument(
+ "--coll-combined", required=True, help="Path to collective combined file"
+ )
+ parser.add_argument(
+ "--coll-comparison", required=True, help="Path to collective comparison file"
+ )
+ parser.add_argument("--output", required=True, help="Output path for final report")
+
+ args = parser.parse_args()
+
+ # Validate inputs
+ for file_arg in [
+ "gpu_combined",
+ "gpu_comparison",
+ "coll_combined",
+ "coll_comparison",
+ ]:
+ file_path = getattr(args, file_arg)
+ if not Path(file_path).exists():
+ print(f"Error: File not found: {file_path}")
+ return 1
+
+ create_final_report(
+ args.gpu_combined,
+ args.gpu_comparison,
+ args.coll_combined,
+ args.coll_comparison,
+ args.output,
+ )
+
+ return 0
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/scripts/tracelens_single_config/html_report_config.py b/scripts/tracelens_single_config/html_report_config.py
new file mode 100644
index 0000000..5f14d2c
--- /dev/null
+++ b/scripts/tracelens_single_config/html_report_config.py
@@ -0,0 +1,119 @@
+"""Configuration constants for HTML report generation."""
+
+HTML_HEADER = """<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>Performance Analysis Report</title>
+</head>
+<body>
+"""
+
+HTML_FOOTER = """
+</body>
+</html>
+"""
+
+# Chart configuration for each section
+OVERALL_GPU_CHARTS = [
+ {
+ "name": "Percentage Change Overview",
+ "file": "improvement_chart.png",
+ "alt": "Summary Chart",
+ "description": "Overall performance change across key GPU metrics. Positive values indicate improvement (Test is faster/better).",
+ },
+ {
+ "name": "Absolute Time Comparison",
+ "file": "abs_time_comparison.png",
+ "alt": "Absolute Time Comparison",
+ "description": "Side-by-side comparison of absolute execution times for all GPU metrics.",
+ },
+]
+
+CROSS_RANK_CHARTS = [
+ {
+ "name": "Performance Heatmap by Rank",
+ "file": "gpu_time_heatmap.png",
+ "alt": "GPU Metric Percentage Change by Rank (HeatMap)",
+ "description": "Comprehensive heatmap showing percent change for all metrics across all ranks. Green indicates better performance (positive % change).",
+ },
+ {
+ "name": "Total Time",
+ "file": "total_time_by_rank.png",
+ "alt": "total_time by Rank",
+ "description": "Total execution time comparison across all ranks, showing end-to-end performance characteristics.",
+ },
+ {
+ "name": "Computation Time",
+ "file": "computation_time_by_rank.png",
+ "alt": "computation_time by Rank",
+ "description": "Pure computation time excluding communication overhead, analyzed per rank.",
+ },
+ {
+ "name": "Communication Time",
+ "file": "total_comm_time_by_rank.png",
+ "alt": "total_comm_time by Rank",
+ "description": "Total time spent in collective communication operations across ranks.",
+ },
+ {
+ "name": "Idle Time",
+ "file": "idle_time_by_rank.png",
+ "alt": "idle_time by Rank",
+ "description": "GPU idle time comparison showing resource utilization efficiency per rank.",
+ },
+ {
+ "name": "Detailed Percentage Change by Metric",
+        "file": "gpu_time_change_percentage_summary_by_rank.png",
+ "alt": "gpu_time_change_percentage_summaryby_rank by Rank",
+ "description": "Detailed breakdown of percent change for each metric type across all ranks.",
+ },
+]
+
+NCCL_CHARTS = [
+ {
+ "name": "NCCL Communication Latency",
+ "file": "NCCL_Communication_Latency_comparison.png",
+ "alt": "NCCL Communication Latency Comparison",
+ "description": "Mean communication latency for NCCL allreduce operations across different message sizes",
+ },
+ {
+ "name": "NCCL Algorithm Bandwidth",
+ "file": "NCCL_Algorithm_Bandwidth_comparison.png",
+ "alt": "NCCL Algorithm Bandwidth Comparison",
+ "description": "Algorithm bandwidth achieved for different message sizes in NCCL collective operations.",
+ },
+ {
+ "name": "NCCL Bus Bandwidth",
+ "file": "NCCL_Bus_Bandwidth_comparison.png",
+ "alt": "NCCL Bus Bandwidth Comparison",
+ "description": "Bus bandwidth utilization across NCCL operations and message sizes.",
+ },
+ {
+ "name": "NCCL Performance Percentage Change",
+ "file": "NCCL_Performance_Percentage_Change_comparison.png",
+ "alt": "NCCL Performance Percentage Change Comparison",
+        "description": "Percent change in communication latency and bandwidth metrics for each message size configuration.",
+ },
+ {
+ "name": "NCCL Total Communication Latency",
+ "file": "NCCL_Total_Communication_Latency_comparison.png",
+ "alt": "NCCL Total Communication Latency Comparison",
+ "description": "Aggregate communication latency summed across all operations for each message size.",
+ },
+]
diff --git a/scripts/tracelens_single_config/process_gpu_timeline.py b/scripts/tracelens_single_config/process_gpu_timeline.py
new file mode 100644
index 0000000..145f817
--- /dev/null
+++ b/scripts/tracelens_single_config/process_gpu_timeline.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import argparse
+from pathlib import Path
+
+
+def geometric_mean(values):
+ values = np.array(values)
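+    # Replace exact zeros with a tiny epsilon so np.log() stays finite;
+    # e.g. geometric_mean([2, 8]) -> 4.0, geometric_mean([1, 10, 100]) -> ~10.0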
+ values = np.where(values == 0, 1e-10, values)
+ return np.exp(np.mean(np.log(values)))
+
+
+def process_gpu_timeline(reports_dir, use_geo_mean=False):
+ """
+    Aggregate the per-rank GPU timelines (arithmetic or geometric mean) from a TraceLens individual_reports directory.
+ """
+ reports_path = Path(reports_dir)
+
+ if not reports_path.exists():
+ print(f"Error: Directory not found: {reports_dir}")
+ return 1
+
+ print(f"Processing GPU timeline from: {reports_dir}")
+ print(f"Aggregation: {'Geometric Mean' if use_geo_mean else 'Arithmetic Mean'}")
+
+ perf_files = sorted(reports_path.glob("perf_rank*.xlsx"))
+
+ if not perf_files:
+ print("Error: No perf_rank*.xlsx files found")
+ return 1
+
+ print(f"Found {len(perf_files)} rank files")
+
+ rank_data = []
+ for file_path in perf_files:
+ rank_num = int(file_path.stem.replace("perf_rank", ""))
+ try:
+ df = pd.read_excel(file_path, sheet_name="gpu_timeline")
+ df["rank"] = rank_num
+ rank_data.append(df)
+ print(f" Rank {rank_num}: OK")
+ except Exception as e:
+ print(f" Rank {rank_num}: Error - {e}")
+
+ if not rank_data:
+ print("Error: No valid data loaded")
+ return 1
+
+ combined = pd.concat(rank_data, ignore_index=True)
+
+ agg_func = geometric_mean if use_geo_mean else "mean"
+ aggregated = (
+ combined.groupby("type")
+ .agg({"time ms": agg_func, "percent": agg_func})
+ .reset_index()
+ )
+
+ aggregated["num_ranks"] = len(perf_files)
+
+ method_suffix = "geomean" if use_geo_mean else "mean"
+ output_path = reports_path.parent / f"gpu_timeline_summary_{method_suffix}.xlsx"
+
+ with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
+ aggregated.to_excel(writer, sheet_name="Summary", index=False)
+
+ combined_sorted = combined.sort_values(["rank", "type"])
+ combined_sorted.to_excel(writer, sheet_name="All_Ranks_Combined", index=False)
+
+ per_rank = combined.pivot_table(
+ values="time ms", index="type", columns="rank", aggfunc="first"
+ )
+ per_rank.to_excel(writer, sheet_name="Per_Rank_Time_ms")
+
+ per_rank_pct = combined.pivot_table(
+ values="percent", index="type", columns="rank", aggfunc="first"
+ )
+ per_rank_pct.to_excel(writer, sheet_name="Per_Rank_Percent")
+
+ print(f"\nSaved: {output_path}")
+ print("\nSummary:")
+ print(aggregated.to_string(index=False))
+
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Aggregate GPU timeline across ranks")
+ parser.add_argument(
+ "--reports-dir", required=True, help="Path to individual_reports directory"
+ )
+ parser.add_argument("--geo-mean", action="store_true", help="Use geometric mean")
+
+ args = parser.parse_args()
+
+ return process_gpu_timeline(args.reports_dir, args.geo_mean)
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/scripts/tracelens_single_config/run_full_analysis.py b/scripts/tracelens_single_config/run_full_analysis.py
new file mode 100644
index 0000000..5385ec4
--- /dev/null
+++ b/scripts/tracelens_single_config/run_full_analysis.py
@@ -0,0 +1,387 @@
+#!/usr/bin/env python3
+"""
+Master script for complete TraceLens analysis pipeline.
+Runs analysis on baseline and test traces, then performs all comparisons.
+"""
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+
+def run_command(cmd, description):
+ """Execute a command and handle errors."""
+ print(f"\n{'='*80}")
+ print(f"{description}")
+ print(f"{'='*80}")
+ print(f"Command: {' '.join(cmd)}")
+
+ result = subprocess.run(cmd, capture_output=True, text=True)
+
+ if result.returncode != 0:
+ print(f"Error: {description} failed!")
+ print(f"Stderr: {result.stderr}")
+ return False
+
+ print(result.stdout)
+ return True
+
+
+def run_tracelens_analysis(
+ trace_dir, output_name, individual_only=False, collective_only=False
+):
+ """Run TraceLens analysis on a single trace directory."""
+ print(f"\nAnalyzing: {trace_dir}")
+
+ # Build command
+ script_path = Path(__file__).parent / "run_tracelens_single_config.sh"
+ cmd = ["bash", str(script_path), trace_dir]
+
+ if individual_only:
+ cmd.append("--individual-only")
+ elif collective_only:
+ cmd.append("--collective-only")
+
+ return run_command(cmd, f"TraceLens analysis for {output_name}")
+
+
+def process_gpu_timeline(reports_dir):
+ """Process GPU timeline from individual reports."""
+ script_path = Path(__file__).parent / "process_gpu_timeline.py"
+ cmd = ["python3", str(script_path), "--reports-dir", reports_dir]
+
+ return run_command(cmd, "Processing GPU timeline")
+
+
+def combine_reports(baseline_file, test_file, output_file):
+ """Combine baseline and test reports."""
+ script_path = Path(__file__).parent / "combine_reports.py"
+ cmd = [
+ "python3",
+ str(script_path),
+ "--baseline",
+ baseline_file,
+ "--test",
+ test_file,
+ "--output",
+ output_file,
+ ]
+
+ return run_command(cmd, f"Combining reports to {output_file}")
+
+
+def add_comparison_sheets(input_file, output_file):
+ """Add comparison sheets for GPU timeline."""
+ script_path = Path(__file__).parent / "add_comparison_sheets.py"
+ cmd = ["python3", str(script_path), "--input", input_file, "--output", output_file]
+
+ return run_command(cmd, "Adding GPU timeline comparison sheets")
+
+
+def add_collective_comparison(input_file, output_file):
+ """Add comparison sheets for collective operations."""
+ script_path = Path(__file__).parent / "add_collective_comparison.py"
+ cmd = ["python3", str(script_path), "--input", input_file, "--output", output_file]
+
+ return run_command(cmd, "Adding collective comparison sheets")
+
+
+def create_final_report(
+ gpu_combined, gpu_comparison, coll_combined, coll_comparison, output_file
+):
+ """Create comprehensive final report with all data."""
+ script_path = Path(__file__).parent / "create_final_report.py"
+ cmd = [
+ "python3",
+ str(script_path),
+ "--gpu-combined",
+ gpu_combined,
+ "--gpu-comparison",
+ gpu_comparison,
+ "--coll-combined",
+ coll_combined,
+ "--coll-comparison",
+ coll_comparison,
+ "--output",
+ output_file,
+ ]
+
+ if run_command(cmd, "Creating comprehensive final report"):
+ plot_script_path = Path(__file__).parent / "create_final_plots.py"
+ cmd = ["python3", str(plot_script_path), "--report-path", output_file]
+ if run_command(cmd, "Creating final plots"):
+ html_script_path = Path(__file__).parent / "create_final_html.py"
+ cmd = [
+ "python3",
+ str(html_script_path),
+ "--plot-files-directory",
+ str(Path(output_file).parent / "plots"),
+ ]
+ if run_command(cmd, "Creating final HTML"):
+ return True
+ return False
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Complete TraceLens analysis pipeline with comparisons",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ # Full analysis with everything including final report
+ python run_full_analysis.py \\
+ --baseline /path/to/baseline/traces \\
+ --test /path/to/test/traces \\
+ --output /path/to/output \\
+ --all
+
+ # Only GPU timeline comparison
+ python run_full_analysis.py \\
+ --baseline /path/to/baseline \\
+ --test /path/to/test \\
+ --output /path/to/output \\
+ --gpu-timeline
+
+ # Create final report (skip TraceLens if already done)
+ python run_full_analysis.py \\
+ --baseline /path/to/baseline \\
+ --test /path/to/test \\
+ --output /path/to/output \\
+ --gpu-timeline --collective --final-report \\
+ --skip-tracelens
+ """,
+ )
+
+ # Required arguments
+ parser.add_argument(
+ "--baseline", required=True, help="Path to baseline trace directory"
+ )
+ parser.add_argument("--test", required=True, help="Path to test trace directory")
+ parser.add_argument(
+ "--output", required=True, help="Output directory for comparison results"
+ )
+
+ # Analysis options
+ parser.add_argument(
+ "--skip-tracelens",
+ action="store_true",
+ help="Skip TraceLens report generation (if already done)",
+ )
+ parser.add_argument(
+ "--individual-only",
+ action="store_true",
+ help="Generate only individual reports",
+ )
+ parser.add_argument(
+ "--collective-only",
+ action="store_true",
+ help="Generate only collective reports",
+ )
+
+ # Comparison options
+ parser.add_argument(
+ "--gpu-timeline", action="store_true", help="Perform GPU timeline comparison"
+ )
+ parser.add_argument(
+ "--collective", action="store_true", help="Perform collective/NCCL comparison"
+ )
+ parser.add_argument(
+ "--final-report",
+ action="store_true",
+ help="Create comprehensive final report with tables and hidden raw data",
+ )
+ parser.add_argument(
+ "--all",
+ action="store_true",
+ help="Perform all analyses and comparisons including final report",
+ )
+
+ args = parser.parse_args()
+
+ # Handle --all flag
+ if args.all:
+ args.gpu_timeline = True
+ args.collective = True
+ args.final_report = True
+
+ # Validate inputs
+ baseline_path = Path(args.baseline)
+ test_path = Path(args.test)
+ output_path = Path(args.output)
+
+ if not baseline_path.exists():
+ print(f"Error: Baseline path not found: {args.baseline}")
+ return 1
+
+ if not test_path.exists():
+ print(f"Error: Test path not found: {args.test}")
+ return 1
+
+ # Create output directory
+ output_path.mkdir(parents=True, exist_ok=True)
+
+ print("\n" + "=" * 80)
+ print("TRACELENS FULL ANALYSIS PIPELINE")
+ print("=" * 80)
+ print(f"Baseline: {args.baseline}")
+ print(f"Test: {args.test}")
+ print(f"Output: {args.output}")
+ print(f"Options:")
+ print(f" Skip TraceLens: {args.skip_tracelens}")
+ print(f" GPU timeline: {args.gpu_timeline}")
+ print(f" Collective: {args.collective}")
+ print(f" Final report: {args.final_report}")
+
+ # Step 1: Run TraceLens analysis on both directories
+ if not args.skip_tracelens:
+ print("\n" + "=" * 80)
+ print("STEP 1: Running TraceLens Analysis")
+ print("=" * 80)
+
+ if not run_tracelens_analysis(
+ args.baseline, "baseline", args.individual_only, args.collective_only
+ ):
+ return 1
+
+ if not run_tracelens_analysis(
+ args.test, "test", args.individual_only, args.collective_only
+ ):
+ return 1
+ else:
+ print("\nSkipping TraceLens report generation (--skip-tracelens flag)")
+
+ # Determine analysis directories
+ baseline_analysis = baseline_path / "tracelens_analysis"
+ test_analysis = test_path / "tracelens_analysis"
+
+ if not baseline_analysis.exists():
+ print(f"Error: Baseline analysis not found: {baseline_analysis}")
+ print("Run without --skip-tracelens flag first")
+ return 1
+
+ if not test_analysis.exists():
+ print(f"Error: Test analysis not found: {test_analysis}")
+ print("Run without --skip-tracelens flag first")
+ return 1
+
+ # Step 2: GPU Timeline Comparison
+ if args.gpu_timeline:
+ print("\n" + "=" * 80)
+ print("STEP 2: GPU Timeline Comparison")
+ print("=" * 80)
+
+ # Process GPU timelines
+ baseline_reports = baseline_analysis / "individual_reports"
+ test_reports = test_analysis / "individual_reports"
+
+ if not baseline_reports.exists() or not test_reports.exists():
+            print(
+                "Error: Individual reports not found. Run without the --collective-only flag"
+            )
+ return 1
+
+ print("\nProcessing baseline GPU timeline...")
+ if not process_gpu_timeline(str(baseline_reports)):
+ return 1
+
+ print("\nProcessing test GPU timeline...")
+ if not process_gpu_timeline(str(test_reports)):
+ return 1
+
+ # Combine GPU timeline summaries
+ baseline_gpu = baseline_analysis / "gpu_timeline_summary_mean.xlsx"
+ test_gpu = test_analysis / "gpu_timeline_summary_mean.xlsx"
+ combined_gpu = output_path / "gpu_timeline_combined.xlsx"
+
+ if not combine_reports(str(baseline_gpu), str(test_gpu), str(combined_gpu)):
+ return 1
+
+ # Add comparison sheets
+ gpu_comparison = output_path / "gpu_timeline_comparison.xlsx"
+ if not add_comparison_sheets(str(combined_gpu), str(gpu_comparison)):
+ return 1
+
+ print(f"\nGPU timeline comparison saved to: {gpu_comparison}")
+
+ # Step 3: Collective Comparison
+ if args.collective:
+ print("\n" + "=" * 80)
+ print("STEP 3: Collective/NCCL Comparison")
+ print("=" * 80)
+
+ baseline_collective = (
+ baseline_analysis / "collective_reports" / "collective_all_ranks.xlsx"
+ )
+ test_collective = (
+ test_analysis / "collective_reports" / "collective_all_ranks.xlsx"
+ )
+
+        if not baseline_collective.exists() or not test_collective.exists():
+            print(
+                "Error: Collective reports not found. Run without --individual-only flag"
+            )
+            return 1
+
+ # Combine collective reports
+ combined_collective = output_path / "collective_combined.xlsx"
+ if not combine_reports(
+ str(baseline_collective), str(test_collective), str(combined_collective)
+ ):
+ return 1
+
+ # Add collective comparison
+ collective_comparison = output_path / "collective_comparison.xlsx"
+ if not add_collective_comparison(
+ str(combined_collective), str(collective_comparison)
+ ):
+ return 1
+
+ print(f"\nCollective comparison saved to: {collective_comparison}")
+
+ # Step 4: Create final comprehensive report
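+    # The final report pulls from both comparison outputs above, so it only runs
+    # when the GPU timeline and collective steps ran in this same invocation.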
+ if args.final_report and args.gpu_timeline and args.collective:
+ print("\n" + "=" * 80)
+ print("STEP 4: Creating Final Comprehensive Report")
+ print("=" * 80)
+
+ gpu_combined = output_path / "gpu_timeline_combined.xlsx"
+ gpu_comparison = output_path / "gpu_timeline_comparison.xlsx"
+ collective_combined = output_path / "collective_combined.xlsx"
+ collective_comparison = output_path / "collective_comparison.xlsx"
+ final_report = output_path / "final_analysis_report.xlsx"
+
+ if not create_final_report(
+ str(gpu_combined),
+ str(gpu_comparison),
+ str(collective_combined),
+ str(collective_comparison),
+ str(final_report),
+ ):
+ return 1
+
+ print(f"\nFinal comprehensive report saved to: {final_report}")
+ print(" - Summary Dashboard as first sheet")
+ print(" - All comparison sheets visible")
+ print(" - Raw data sheets hidden (can be unhidden in Excel)")
+ print(" - All data formatted as Excel tables with filters")
+ print(" - Color coding applied (green=better, red=worse)")
+
+ # Summary
+ print("\n" + "=" * 80)
+ print("ANALYSIS COMPLETE!")
+ print("=" * 80)
+ print(f"\nResults saved to: {output_path}")
+
+ files = list(output_path.glob("*.xlsx"))
+ if files:
+ print("\nGenerated files:")
+ for f in sorted(files):
+ print(f" - {f.name}")
+
+ print("\nAnalysis pipeline completed successfully!")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/scripts/tracelens_single_config/run_tracelens_single_config.sh b/scripts/tracelens_single_config/run_tracelens_single_config.sh
new file mode 100644
index 0000000..96831ff
--- /dev/null
+++ b/scripts/tracelens_single_config/run_tracelens_single_config.sh
@@ -0,0 +1,266 @@
+#!/bin/bash
+# TraceLens Analysis for Single Configuration (No Sweep)
+# Usage: ./run_tracelens_single_config.sh <trace_directory> [--individual-only|--collective-only]
+#
+# The script accepts either:
+# - Path to parent directory containing torch_profiler/
+# - Path to torch_profiler/ directory directly
+#
+# Examples:
+# ./run_tracelens_single_config.sh /path/to/traces
+# ./run_tracelens_single_config.sh /path/to/traces/torch_profiler
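+#   ./run_tracelens_single_config.sh /path/to/traces --individual-only
+#   ./run_tracelens_single_config.sh /path/to/traces --collective-only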
+#
+# Note: Uses GEMM-patched TraceLens wrapper to recognize ROCm Tensile kernels
+
+set -e
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Use patched TraceLens wrapper for GEMM recognition
+TRACELENS_WRAPPER="python $SCRIPT_DIR/../tracelens_with_gemm_patch.py"
+
+# Parse options
+RUN_INDIVIDUAL=true
+RUN_COLLECTIVE=true
+
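+# Flags select which reports to generate; any other argument is taken as the trace directory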
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --individual-only)
+ RUN_COLLECTIVE=false
+ shift
+ ;;
+ --collective-only)
+ RUN_INDIVIDUAL=false
+ shift
+ ;;
+ *)
+ INPUT_DIR="$1"
+ shift
+ ;;
+ esac
+done
+
+# Check if directory provided
+if [ -z "$INPUT_DIR" ]; then
+ echo "Error: Please provide trace directory"
+ echo ""
+    echo "Usage: $0 <trace_directory> [options]"
+ echo ""
+ echo "Options:"
+ echo " --individual-only Generate only individual reports"
+ echo " --collective-only Generate only collective report"
+ echo ""
+ echo "Examples:"
+ echo " $0 /path/to/traces"
+ echo " $0 /path/to/traces --individual-only"
+ echo " $0 /path/to/traces --collective-only"
+ echo ""
+ exit 1
+fi
+
+# Verify directory exists
+if [ ! -d "$INPUT_DIR" ]; then
+ echo "Error: Directory not found: $INPUT_DIR"
+ exit 1
+fi
+
+# Auto-detect structure: is this torch_profiler/ or its parent?
+TORCH_PROF_DIR=""
+BASE_DIR=""
+
+# Check if INPUT_DIR contains rank directories (i.e., it IS torch_profiler/)
+if find "$INPUT_DIR" -maxdepth 1 -type d -name "rank*" | grep -q .; then
+ TORCH_PROF_DIR="$INPUT_DIR"
+ BASE_DIR=$(dirname "$INPUT_DIR")
+ echo "Detected torch_profiler directory: $TORCH_PROF_DIR"
+# Check if INPUT_DIR contains torch_profiler/ subdirectory
+elif [ -d "$INPUT_DIR/torch_profiler" ]; then
+ TORCH_PROF_DIR="$INPUT_DIR/torch_profiler"
+ BASE_DIR="$INPUT_DIR"
+ echo "Found torch_profiler subdirectory: $TORCH_PROF_DIR"
+else
+ echo "Error: Cannot find rank directories in expected structure"
+ echo ""
+ echo "Expected one of:"
+ echo " 1. Directory with rank0/, rank1/, ... subdirectories (torch_profiler/)"
+ echo " 2. Parent directory containing torch_profiler/rank0/, rank1/, ..."
+ echo ""
+ echo "Provided: $INPUT_DIR"
+ exit 1
+fi
+
+echo "════════════════════════════════════════════════════════════════"
+echo " TraceLens Analysis - Single Configuration"
+echo "════════════════════════════════════════════════════════════════"
+echo ""
+echo "Input directory: $INPUT_DIR"
+echo "Torch profiler traces: $TORCH_PROF_DIR"
+echo ""
+
+# Create output directory in the base directory
+OUTPUT_DIR="${BASE_DIR}/tracelens_analysis"
+mkdir -p "$OUTPUT_DIR"
+mkdir -p "$OUTPUT_DIR/individual_reports"
+mkdir -p "$OUTPUT_DIR/collective_reports"
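+# run_full_analysis.py expects this tracelens_analysis/ layout when run with --skip-tracelens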
+
+# Detect number of ranks
+NUM_RANKS=$(find "$TORCH_PROF_DIR" -maxdepth 1 -type d -name "rank*" | wc -l)
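+# Assumes ranks are numbered contiguously from 0 (rank0/, rank_0/, or rank_00/ style)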
+
+if [ $NUM_RANKS -eq 0 ]; then
+ echo "Error: No rank directories found in $TORCH_PROF_DIR"
+ exit 1
+fi
+
+echo "Detected $NUM_RANKS ranks"
+
+# Show sample trace files
+echo ""
+echo "Sample trace files:"
+for rank_dir in $(find "$TORCH_PROF_DIR" -maxdepth 1 -type d -name "rank*" | sort | head -3); do
+ rank_name=$(basename "$rank_dir")
+ trace_file=$(find "$rank_dir" -name "*.json" | head -1)
+ if [ -n "$trace_file" ]; then
+ echo " $rank_name: $(basename "$trace_file")"
+ fi
+done
+if [ "$RUN_INDIVIDUAL" = true ]; then
+ echo ""
+ echo "════════════════════════════════════════════════════════════════"
+ echo "Step 1: Generating Individual Performance Reports"
+ echo "════════════════════════════════════════════════════════════════"
+ echo ""
+
+    # Process each rank
+    for rank_idx in $(seq 0 $((NUM_RANKS - 1))); do
+        # Try multiple directory naming patterns
+        RANK_DIR=""
+        if [ -d "$TORCH_PROF_DIR/rank${rank_idx}" ]; then
+            RANK_DIR="$TORCH_PROF_DIR/rank${rank_idx}"
+        elif [ -d "$TORCH_PROF_DIR/rank_${rank_idx}" ]; then
+            RANK_DIR="$TORCH_PROF_DIR/rank_${rank_idx}"
+        elif [ -d "$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)" ]; then
+            RANK_DIR="$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)"
+        fi
+
+        if [ -z "$RANK_DIR" ] || [ ! -d "$RANK_DIR" ]; then
+            echo "⚠️  Skip rank ${rank_idx} - directory not found"
+            continue
+        fi
+
+        # Find trace file
+        TRACE=$(find "$RANK_DIR" -name "*.json" -type f | head -1)
+
+        if [ -z "$TRACE" ]; then
+            echo "⚠️  Skip rank ${rank_idx} - no trace file found"
+            continue
+        fi
+
+        OUTPUT="$OUTPUT_DIR/individual_reports/perf_rank${rank_idx}.xlsx"
+
+        echo "Processing rank ${rank_idx}..."
+        echo "  Trace: $(basename "$TRACE")"
+
+        $TRACELENS_WRAPPER generate_perf_report \
+            --profile_json_path "$TRACE" \
+            --output_xlsx_path "$OUTPUT" \
+            --include_unlinked_kernels \
+            --short_kernel_study \
+            --short_kernel_threshold_us 50 \
+            --topk_ops 100 \
+            --topk_roofline_ops 100
+
+        echo "  Done: $OUTPUT"
+        echo ""
+    done
+
+fi
+
+if [ "$RUN_COLLECTIVE" = true ]; then
+ echo ""
+ echo "════════════════════════════════════════════════════════════════"
+ echo "Step 2: Generating Multi-Rank Collective Report"
+ echo "════════════════════════════════════════════════════════════════"
+ echo ""
+
+    # Find a sample trace file to confirm that traces exist
+    SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank0" -name "*.json" -type f 2>/dev/null | head -1)
+    if [ -z "$SAMPLE_TRACE" ]; then
+        # Try alternative rank naming
+        SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank_0" -name "*.json" -type f 2>/dev/null | head -1)
+    fi
+
+    if [ -z "$SAMPLE_TRACE" ]; then
+        # Try rank_00
+        SAMPLE_TRACE=$(find "$TORCH_PROF_DIR/rank_00" -name "*.json" -type f 2>/dev/null | head -1)
+    fi
+
+    if [ -n "$SAMPLE_TRACE" ]; then
+        OUTPUT="$OUTPUT_DIR/collective_reports/collective_all_ranks.xlsx"
+
+        echo "Generating collective report for all $NUM_RANKS ranks..."
+
+        # Create symlinks named trace.json so a single glob pattern covers every rank
+        for rank_idx in $(seq 0 $((NUM_RANKS - 1))); do
+            # Resolve the same directory naming patterns as the individual-report loop
+            RANK_DIR=""
+            if [ -d "$TORCH_PROF_DIR/rank${rank_idx}" ]; then
+                RANK_DIR="$TORCH_PROF_DIR/rank${rank_idx}"
+            elif [ -d "$TORCH_PROF_DIR/rank_${rank_idx}" ]; then
+                RANK_DIR="$TORCH_PROF_DIR/rank_${rank_idx}"
+            elif [ -d "$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)" ]; then
+                RANK_DIR="$TORCH_PROF_DIR/rank_$(printf "%02d" $rank_idx)"
+            fi
+            if [ -n "$RANK_DIR" ]; then
+                TRACE=$(find "$RANK_DIR" -name "*.json" -type f | head -1)
+                if [ -n "$TRACE" ]; then
+                    # Relative target keeps the link valid inside the rank directory
+                    ln -sf "$(basename "$TRACE")" "$RANK_DIR/trace.json"
+                fi
+            fi
+        done
+
+        echo "  Trace pattern: rank*/trace.json"
+
+        $TRACELENS_WRAPPER generate_multi_rank_collective \
+            --trace_pattern "$TORCH_PROF_DIR/rank*/trace.json" \
+            --world_size $NUM_RANKS \
+            --output_xlsx_path "$OUTPUT" \
+            --detailed_analysis \
+            --use_multiprocessing
+
+        echo "  Done: $OUTPUT"
+    else
+        echo "  Could not generate collective report - no trace files found"
+    fi
+
+fi
+
+echo ""
+echo "════════════════════════════════════════════════════════════════"
+echo "Analysis Complete!"
+echo "════════════════════════════════════════════════════════════════"
+echo ""
+echo "📁 Results saved to:"
+echo " $OUTPUT_DIR/"
+echo ""
+
+# Count generated reports
+INDIV_COUNT=$(find "$OUTPUT_DIR/individual_reports" -name "*.xlsx" 2>/dev/null | wc -l)
+COLL_COUNT=$(find "$OUTPUT_DIR/collective_reports" -name "*.xlsx" 2>/dev/null | wc -l)
+
+echo "Generated reports:"
+echo " Individual reports (per rank): $INDIV_COUNT"
+echo " Collective reports (all ranks): $COLL_COUNT"
+echo ""
+
+echo "📊 Report Files:"
+echo ""
+echo "Individual Performance Reports:"
+if [ $INDIV_COUNT -gt 0 ]; then
+ find "$OUTPUT_DIR/individual_reports" -name "*.xlsx" | sort | sed 's/^/ /'
+else
+ echo " (none generated)"
+fi
+echo ""
+
+echo "Collective Reports:"
+if [ $COLL_COUNT -gt 0 ]; then
+ find "$OUTPUT_DIR/collective_reports" -name "*.xlsx" | sed 's/^/ /'
+else
+ echo " (none generated)"
+fi
+
+echo ""
+echo "Done!"