99 changes: 99 additions & 0 deletions scripts/tracelens_single_config/README.md
@@ -0,0 +1,99 @@
# TraceLens Single Configuration

Analyze PyTorch profiler traces from one training run.

For multi-config analysis, see [../gemm_analysis/README.md](../gemm_analysis/README.md).

## Quick Start

```bash
# Complete analysis
python scripts/tracelens_single_config/run_full_analysis.py \
--baseline /path/to/baseline/traces \
--test /path/to/test/traces \
--output /path/to/output \
--all

# Skip TraceLens if already done
python scripts/tracelens_single_config/run_full_analysis.py \
--baseline /path/to/baseline \
--test /path/to/test \
--output /path/to/output \
--all --skip-tracelens
```

### Flags:
- `--all` - Run everything including final report
- `--gpu-timeline` - GPU timeline comparison
- `--collective` - NCCL collective comparison
- `--final-report` - Create comprehensive Excel report
- `--skip-tracelens` - Skip TraceLens report generation if already done

### Output:
- `final_analysis_report.xlsx` - All comparisons, with tables and a color scale
- Color scale on `percent_change` columns: red (worst) -> white (neutral) -> green (best); see the sketch below
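
The sign convention matches `add_collective_comparison.py` (see the Scripts section below): for latency/time metrics lower is better, for bandwidth metrics higher is better, so a positive `percent_change` always means the test run improved on the baseline. A minimal illustration of that convention:

```python
def percent_change(baseline, test, metric):
    """Positive always means the test run improved on the baseline."""
    if baseline == 0:
        return 0.0
    if "latency" in metric.lower() or "time" in metric.lower():
        # Lower is better: positive when the test run takes less time
        return (baseline - test) / baseline * 100
    if "bw" in metric.lower() or "bandwidth" in metric.lower():
        # Higher is better: positive when the test run moves more data
        return (test - baseline) / baseline * 100
    return None  # other metrics get no percent_change column


print(percent_change(10.0, 8.0, "comm_latency_mean"))      # 20.0 -> 20% faster
print(percent_change(100.0, 120.0, "bus bw (GB/s)_mean"))  # 20.0 -> 20% more bandwidth
```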

### Using `--skip-tracelens`

Pass the same `--baseline` and `--test` paths as in the original run. The script looks for a `tracelens_analysis` subdirectory under each path:

```bash
# Expected structure when using --skip-tracelens
baseline/
└── tracelens_analysis/ # From previous run
├── individual_reports/
└── collective_reports/

test/
└── tracelens_analysis/ # From previous run
├── individual_reports/
└── collective_reports/
```

Example:
```bash
# Use same paths, script finds tracelens_analysis inside
python run_full_analysis.py \
--baseline ~/data/baseline_traces \
--test ~/data/test_traces \
--output ~/results \
--all --skip-tracelens
```


## Expected Structure

```
traces/
└── torch_profiler/
├── rank0/
│ └── trace.json
├── rank1/
│ └── trace.json
└── ...
```
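
Before kicking off a long run, a quick sanity check that the directory matches this layout can help. A minimal sketch (the `traces` path is a placeholder for your actual trace directory):

```python
from pathlib import Path

traces = Path("traces/torch_profiler")  # placeholder path
ranks = sorted(traces.glob("rank*/trace.json"))
if not ranks:
    raise SystemExit(f"no per-rank traces found under {traces}")
print(f"found {len(ranks)} rank traces, e.g. {ranks[0]}")
```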

## What the Master Script Does

The `run_full_analysis.py` script automatically handles all steps:

1. Runs TraceLens on baseline and test traces
2. Processes GPU timelines using `process_gpu_timeline.py`
3. Combines reports using `combine_reports.py`
4. Adds comparison sheets using `add_comparison_sheets.py` and `add_collective_comparison.py`
5. Creates final report using `create_final_report.py`

All post-processing is handled automatically; there is no need to run the individual scripts yourself (see the sketch below for a rough manual equivalent).
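
Only `add_collective_comparison.py`'s `--input`/`--output` flags are confirmed by the code in this PR; every other flag in this sketch is a hypothetical placeholder, so check each script's `--help` for the real interface:

```python
# Hypothetical manual pipeline; most flag names below are placeholders.
import subprocess

steps = [
    # 2. Aggregate GPU timelines across ranks (flags assumed)
    ["python", "process_gpu_timeline.py", "--traces", "baseline/torch_profiler"],
    # 3. Combine the two runs into one workbook (flags assumed)
    ["python", "combine_reports.py", "--baseline", "baseline.xlsx", "--test", "test.xlsx"],
    # 4. Add comparison sheets (--input/--output are real for
    #    add_collective_comparison.py; assumed for the other script)
    ["python", "add_comparison_sheets.py", "--input", "combined.xlsx", "--output", "gpu_cmp.xlsx"],
    ["python", "add_collective_comparison.py", "--input", "collective.xlsx", "--output", "collective_cmp.xlsx"],
    # 5. Build the final Excel report (flags assumed)
    ["python", "create_final_report.py", "--output", "final_analysis_report.xlsx"],
]
for cmd in steps:
    subprocess.run(cmd, check=True)  # stop at the first failing step
```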


## Scripts

```
run_full_analysis.py - Master script for complete pipeline
create_final_report.py - Create comprehensive Excel report
run_tracelens_single_config.sh - Main TraceLens report generation
process_gpu_timeline.py - Aggregate GPU timeline across ranks
combine_reports.py - Combine two runs
add_comparison_sheets.py - Add GPU timeline comparison sheets
add_collective_comparison.py - Add collective/NCCL comparison sheets
```
195 changes: 195 additions & 0 deletions scripts/tracelens_single_config/add_collective_comparison.py
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
import argparse
import sys

import pandas as pd
from openpyxl.formatting.rule import ColorScaleRule
from openpyxl.utils import get_column_letter


def add_collective_comparison_sheets(input_path, output_path):
"""
Add comparison sheets to the combined collective reports.
This function will create comparison sheets for the combined collective reports.
The comparison sheets will contain the comparison of the baseline and saleelk data.
TODO : Later we need to generalize for n runs and get rid of hardcoded data labels
"""
print(f"Loading: {input_path}")

xl = pd.ExcelFile(input_path)

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
# Copy only summary sheets
for sheet_name in xl.sheet_names:
# Only keep sheets with 'summary' in the name
if "summary" not in sheet_name.lower():
print(f" Skip {sheet_name} (keeping only summary sheets)")
continue
df = pd.read_excel(input_path, sheet_name=sheet_name)
df.to_excel(writer, sheet_name=sheet_name, index=False)
print(f" Copied {sheet_name}")

# Process summary sheets for comparison
for sheet_name in ["nccl_summary_implicit_sync", "nccl_summary_long"]:
if sheet_name not in xl.sheet_names:
continue

df = pd.read_excel(input_path, sheet_name=sheet_name)

# Separate baseline and saleelk
baseline_df = df[df["source"] == "baseline"].copy()
saleelk_df = df[df["source"] == "saleelk"].copy()

if len(baseline_df) == 0 or len(saleelk_df) == 0:
print(f" Skip {sheet_name} - missing data")
continue

# Create comparison dataframe
comparison = pd.DataFrame()

# Identify key columns for grouping
group_cols = ["Collective name", "dtype", "In msg nelems"]
if not all(col in baseline_df.columns for col in group_cols):
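                # Fall back to the collective name alone when detail columns are missing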
group_cols = ["Collective name"]

            # Group the baseline; matching saleelk rows are selected per group below
            baseline_grouped = baseline_df.groupby(group_cols, as_index=False)

for name, base_group in baseline_grouped:
# Find matching saleelk group
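                # groupby keys are tuples for multi-column keys, scalars for a single column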
if isinstance(name, tuple):
mask = pd.Series([True] * len(saleelk_df), index=saleelk_df.index)
for col, val in zip(group_cols, name):
mask = mask & (saleelk_df[col] == val)
else:
mask = saleelk_df[group_cols[0]] == name

sale_group = saleelk_df.loc[mask]

if len(sale_group) == 0:
continue

# Create comparison row
comp_row = {}

# Copy grouping columns
if isinstance(name, tuple):
for col, val in zip(group_cols, name):
comp_row[col] = val
else:
comp_row[group_cols[0]] = name

# Compare numeric columns
numeric_cols = [
"comm_latency_mean",
"algo bw (GB/s)_mean",
"bus bw (GB/s)_mean",
"Total comm latency (ms)",
"count",
]

for col in numeric_cols:
if col not in base_group.columns or col not in sale_group.columns:
continue

                    # Summary sheets hold one row per group; take that row's values
                    base_val = base_group[col].values[0]
                    sale_val = sale_group[col].values[0]

comp_row[f"baseline_{col}"] = base_val
comp_row[f"saleelk_{col}"] = sale_val
comp_row[f"diff_{col}"] = sale_val - base_val

# For latency/time: positive percent_change means faster (less time)
# For bandwidth: positive percent_change means better (more bandwidth)
if "latency" in col.lower() or "time" in col.lower():
# Lower is better - positive when saleelk is faster
pct_change = (
(base_val - sale_val) / base_val * 100
if base_val != 0
else 0
)
comp_row[f"percent_change_{col}"] = pct_change
elif "bw" in col.lower() or "bandwidth" in col.lower():
# Higher is better - positive when saleelk is better
pct_change = (
(sale_val - base_val) / base_val * 100
if base_val != 0
else 0
)
comp_row[f"percent_change_{col}"] = pct_change

comp_row[f"ratio_{col}"] = (
sale_val / base_val if base_val != 0 else 0
)

comparison = pd.concat(
[comparison, pd.DataFrame([comp_row])], ignore_index=True
)

# Write comparison sheet (shorten name to fit Excel's 31 char limit)
# Replace 'nccl_summary_' with 'nccl_' and '_comparison' with '_cmp'
comparison_sheet_name = (
sheet_name.replace("nccl_summary_", "nccl_") + "_cmp"
)
comparison.to_excel(writer, sheet_name=comparison_sheet_name, index=False)
print(f" Added {comparison_sheet_name}")

# Add conditional formatting to percent_change columns
print(f" Applying conditional formatting to {comparison_sheet_name}...")

ws = writer.sheets[comparison_sheet_name]

# Format all percent_change columns with color scale
for col_idx, col in enumerate(comparison.columns, start=1):
if "percent_change" in col:
                    # get_column_letter handles multi-letter columns (AA, AB, ...)
                    col_letter = get_column_letter(col_idx)

data_range = f"{col_letter}2:{col_letter}{len(comparison)+1}"

# Color scale: red (min/negative) -> white (0) -> green (max/positive)
ws.conditional_formatting.add(
data_range,
ColorScaleRule(
start_type="min",
start_color="F8696B", # Red
mid_type="num",
mid_value=0,
mid_color="FFFFFF", # White
end_type="max",
end_color="63BE7B", # Green
),
)

print(f" Formatted {col}")

print(f"\nSaved: {output_path}")
print("\nNew comparison sheets added")
print("percent_change interpretation:")
print(" For latency/time: Positive = faster (less time)")
print(" For bandwidth: Positive = better (more bandwidth)")
return 0


def main():
parser = argparse.ArgumentParser(
description="Add comparison sheets to combined collective reports"
)
parser.add_argument(
"--input", required=True, help="Input combined collective Excel file"
)
parser.add_argument(
"--output", required=True, help="Output Excel file with comparison sheets"
)

args = parser.parse_args()

return add_collective_comparison_sheets(args.input, args.output)


if __name__ == "__main__":
    sys.exit(main())