Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions scripts/tracelens_single_config/combine_reports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is combine_reports related to process_gpu_timeline? Should they be separate PR's?

import pandas as pd
import argparse
from pathlib import Path


def combine_collective_reports(baseline_path, test_path, output_path):
    """
    Combine two collective reports into a single Excel file.

    Every sheet present in BOTH workbooks is concatenated row-wise, with a
    ``source`` column ("baseline" / "test") identifying which report each row
    came from. Sheets that exist only in the baseline file are skipped with a
    message.

    Args:
        baseline_path: Path to the baseline collective_all_ranks.xlsx.
        test_path: Path to the test collective_all_ranks.xlsx.
        output_path: Destination path for the combined workbook.

    Returns:
        0 on success (CLI-style exit code; exceptions propagate to the caller).
    """

    print(f"Loading baseline: {baseline_path}")
    baseline_xl = pd.ExcelFile(baseline_path)

    print(f"Loading test: {test_path}")
    test_xl = pd.ExcelFile(test_path)

    print(f"\nBaseline sheets: {baseline_xl.sheet_names}")
    print(f"Test sheets: {test_xl.sheet_names}")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for sheet_name in baseline_xl.sheet_names:
            if sheet_name not in test_xl.sheet_names:
                print(f"  Skip {sheet_name} - not in test file")
                continue

            # Parse from the already-open ExcelFile handles instead of
            # re-reading each workbook from disk for every sheet.
            baseline_df = baseline_xl.parse(sheet_name)
            test_df = test_xl.parse(sheet_name)

            # Tag each row with its origin. Fix: test rows were previously
            # labelled with a hard-coded username ("saleelk") instead of a
            # generic "test" label.
            baseline_df["source"] = "baseline"
            test_df["source"] = "test"

            combined = pd.concat([baseline_df, test_df], ignore_index=True)

            combined.to_excel(writer, sheet_name=sheet_name, index=False)
            print(
                f"  Combined {sheet_name}: {len(baseline_df)} + {len(test_df)} = {len(combined)} rows"
            )

    print(f"\nSaved: {output_path}")
    return 0


def main():
    """CLI entry point: parse arguments and combine the two reports."""
    arg_parser = argparse.ArgumentParser(description="Combine two collective reports")

    # All three arguments are required path options; declare them in one pass.
    for flag, help_text in (
        ("--baseline", "Path to baseline collective_all_ranks.xlsx"),
        ("--test", "Path to test collective_all_ranks.xlsx"),
        ("--output", "Output path for combined Excel file"),
    ):
        arg_parser.add_argument(flag, required=True, help=help_text)

    opts = arg_parser.parse_args()

    return combine_collective_reports(opts.baseline, opts.test, opts.output)


if __name__ == "__main__":
    exit(main())
100 changes: 100 additions & 0 deletions scripts/tracelens_single_config/process_gpu_timeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import argparse
from pathlib import Path


def geometric_mean(values):
    """Return the geometric mean of *values*.

    Zeros are clamped to 1e-10 before taking logs so that a single zero
    entry does not drive the log to -inf and collapse the result to 0.
    """
    arr = np.asarray(values)
    safe = np.where(arr == 0, 1e-10, arr)
    return np.exp(np.log(safe).mean())


def process_gpu_timeline(reports_dir, use_geo_mean=False):
    """
    Create a mean/geometric-mean aggregated GPU timeline across all ranks
    inside a TraceLens analysis directory.

    Reads the ``gpu_timeline`` sheet of every ``perf_rank*.xlsx`` file in
    *reports_dir*, aggregates ``time ms`` and ``percent`` per ``type`` across
    ranks, and writes ``gpu_timeline_summary_<mean|geomean>.xlsx`` one level
    above *reports_dir* with Summary, All_Ranks_Combined, and per-rank pivot
    sheets.

    TODO(review): unify with the sweep-analysis process_gpu_timeline_data —
    abstract the shared "read excel, group by type, mean/geomean" core and
    pass the directory layout / grouping columns as arguments.

    Args:
        reports_dir: Path to the individual_reports directory.
        use_geo_mean: Use the geometric mean instead of the arithmetic mean.

    Returns:
        0 on success, 1 on error (CLI-style exit codes).
    """
    reports_path = Path(reports_dir)

    if not reports_path.exists():
        print(f"Error: Directory not found: {reports_dir}")
        return 1

    print(f"Processing GPU timeline from: {reports_dir}")
    print(f"Aggregation: {'Geometric Mean' if use_geo_mean else 'Arithmetic Mean'}")

    # Sort numerically by rank: a plain lexicographic sort would place
    # perf_rank10 before perf_rank2 once there are 10 or more ranks.
    perf_files = sorted(
        reports_path.glob("perf_rank*.xlsx"),
        key=lambda p: int(p.stem.replace("perf_rank", "")),
    )

    if not perf_files:
        print("Error: No perf_rank*.xlsx files found")
        return 1

    print(f"Found {len(perf_files)} rank files")

    rank_data = []
    for file_path in perf_files:
        rank_num = int(file_path.stem.replace("perf_rank", ""))
        try:
            df = pd.read_excel(file_path, sheet_name="gpu_timeline")
            df["rank"] = rank_num
            rank_data.append(df)
            print(f"  Rank {rank_num}: OK")
        except Exception as e:
            # Best-effort: a corrupt file or missing sheet skips that rank
            # only; the remaining ranks are still aggregated.
            print(f"  Rank {rank_num}: Error - {e}")

    if not rank_data:
        print("Error: No valid data loaded")
        return 1

    combined = pd.concat(rank_data, ignore_index=True)

    # groupby.agg accepts either a callable (geometric_mean) or the
    # built-in "mean" reduction by name.
    agg_func = geometric_mean if use_geo_mean else "mean"
    aggregated = (
        combined.groupby("type")
        .agg({"time ms": agg_func, "percent": agg_func})
        .reset_index()
    )

    # Count only the ranks that actually loaded; len(perf_files) would
    # overstate the sample size when some files failed to parse.
    aggregated["num_ranks"] = len(rank_data)

    method_suffix = "geomean" if use_geo_mean else "mean"
    output_path = reports_path.parent / f"gpu_timeline_summary_{method_suffix}.xlsx"

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        aggregated.to_excel(writer, sheet_name="Summary", index=False)

        combined_sorted = combined.sort_values(["rank", "type"])
        combined_sorted.to_excel(writer, sheet_name="All_Ranks_Combined", index=False)

        # One column per rank; aggfunc="first" is exact because each
        # (type, rank) pair occurs at most once per sheet.
        per_rank = combined.pivot_table(
            values="time ms", index="type", columns="rank", aggfunc="first"
        )
        per_rank.to_excel(writer, sheet_name="Per_Rank_Time_ms")

        per_rank_pct = combined.pivot_table(
            values="percent", index="type", columns="rank", aggfunc="first"
        )
        per_rank_pct.to_excel(writer, sheet_name="Per_Rank_Percent")

    print(f"\nSaved: {output_path}")
    print("\nSummary:")
    print(aggregated.to_string(index=False))

    return 0


def main():
    """CLI entry point: aggregate the per-rank GPU timelines into one report."""
    cli = argparse.ArgumentParser(description="Aggregate GPU timeline across ranks")
    cli.add_argument(
        "--reports-dir", required=True, help="Path to individual_reports directory"
    )
    cli.add_argument("--geo-mean", action="store_true", help="Use geometric mean")

    opts = cli.parse_args()

    return process_gpu_timeline(opts.reports_dir, opts.geo_mean)


if __name__ == "__main__":
    exit(main())