diff --git a/scripts/gemm_analysis/process_gpu_timeline.py b/scripts/gemm_analysis/process_gpu_timeline.py index 2f5ddc5..93a750e 100644 --- a/scripts/gemm_analysis/process_gpu_timeline.py +++ b/scripts/gemm_analysis/process_gpu_timeline.py @@ -13,11 +13,22 @@ """ import pandas as pd -import numpy as np import os import glob import argparse from pathlib import Path +import sys + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from utils.gpu_timeline_utils import ( + read_gpu_timeline_from_excel, + aggregate_gpu_timeline, + get_method_suffix, + get_aggregation_description, + print_section, +) # ============================================================================= @@ -25,21 +36,6 @@ # ============================================================================= -def geometric_mean(values): - """Calculate geometric mean, handling zeros.""" - values = np.array(values) - # Replace zeros with small value to avoid log(0) - values = np.where(values == 0, 1e-10, values) - return np.exp(np.mean(np.log(values))) - - -def print_section(title, char="=", width=80): - """Print a formatted section header.""" - print(f"\n{char * width}") - print(title) - print(char * width) - - def parse_perf_filename(filename): """ Parse performance filename to extract channel config and rank. @@ -95,49 +91,32 @@ def read_rank_data(rank_files): """ rank_data = [] for rank, file_path in rank_files: - try: - df = pd.read_excel(file_path, sheet_name="gpu_timeline") - df["rank"] = rank + df, success = read_gpu_timeline_from_excel(Path(file_path), rank=rank) + if success: rank_data.append(df) - except Exception as e: - print(f" Warning: Could not read {os.path.basename(file_path)}: {e}") return rank_data -def aggregate_rank_data( - rank_data, thread_config, channel_config, num_ranks, use_geo_mean -): +def add_config_metadata(df, thread_config, channel_config, num_ranks): """ - Aggregate data across ranks and add metadata. 
+ Add configuration metadata columns to the dataframe. Args: - rank_data: List of DataFrames + df: Source DataFrame thread_config: Thread configuration string (e.g., '256thread') channel_config: Channel configuration string (e.g., '28ch') num_ranks: Number of ranks - use_geo_mean: Whether to use geometric mean Returns: - DataFrame: Aggregated data with metadata + DataFrame: DataFrame with added metadata columns """ - combined = pd.concat(rank_data, ignore_index=True) - - agg_func = geometric_mean if use_geo_mean else "mean" - aggregated = ( - combined.groupby("type") - .agg({"time ms": agg_func, "percent": agg_func}) - .reset_index() - ) - - # Add metadata - aggregated["thread_config"] = thread_config - aggregated["threads_num"] = int(thread_config.replace("thread", "")) - aggregated["channel_config"] = channel_config - aggregated["channels_num"] = int(channel_config.replace("ch", "")) - aggregated["full_config"] = f"{thread_config}_{channel_config}" - aggregated["num_ranks"] = num_ranks - - return aggregated + df["thread_config"] = thread_config + df["threads_num"] = int(thread_config.replace("thread", "")) + df["channel_config"] = channel_config + df["channels_num"] = int(channel_config.replace("ch", "")) + df["full_config"] = f"{thread_config}_{channel_config}" + df["num_ranks"] = num_ranks + return df def process_channel_config(channel_config, channel_groups, use_geo_mean, thread_config): @@ -164,9 +143,8 @@ def process_channel_config(channel_config, channel_groups, use_geo_mean, thread_ print(f" No valid data for {channel_config}") return None - aggregated = aggregate_rank_data( - rank_data, thread_config, channel_config, num_ranks, use_geo_mean - ) + aggregated = aggregate_gpu_timeline(rank_data, use_geo_mean) + aggregated = add_config_metadata(aggregated, thread_config, channel_config, num_ranks) print(f" [OK] Aggregated across {num_ranks} ranks") return aggregated @@ -203,9 +181,7 @@ def process_thread_config(thread_config, tracelens_dir, use_geo_mean): 
results = [] # Process each channel configuration (sorted by channel number) - sorted_channels = sorted( - channel_groups.keys(), key=lambda x: int(x.replace("ch", "")) - ) + sorted_channels = sorted(channel_groups.keys(), key=lambda x: int(x.replace("ch", ""))) for channel_config in sorted_channels: aggregated = process_channel_config( channel_config, channel_groups, use_geo_mean, thread_config @@ -232,9 +208,7 @@ def create_pivot_sheet(df, value_col): Returns: DataFrame: Pivot table """ - return df.pivot_table( - values=value_col, index="type", columns="full_config", aggfunc="first" - ) + return df.pivot_table(values=value_col, index="type", columns="full_config", aggfunc="first") def create_summary_sheet(df): @@ -278,15 +252,9 @@ def save_excel_output(final_df, output_path): """ with pd.ExcelWriter(output_path, engine="openpyxl") as writer: final_df.to_excel(writer, sheet_name="All_Data", index=False) - create_pivot_sheet(final_df, "time ms").to_excel( - writer, sheet_name="Pivot_Time_ms" - ) - create_pivot_sheet(final_df, "percent").to_excel( - writer, sheet_name="Pivot_Percent" - ) - create_summary_sheet(final_df).to_excel( - writer, sheet_name="Summary_By_Config", index=False - ) + create_pivot_sheet(final_df, "time ms").to_excel(writer, sheet_name="Pivot_Time_ms") + create_pivot_sheet(final_df, "percent").to_excel(writer, sheet_name="Pivot_Percent") + create_summary_sheet(final_df).to_excel(writer, sheet_name="Summary_By_Config", index=False) print(f"[SAVED] {output_path}") print(" Sheets created:") @@ -310,9 +278,9 @@ def print_metric_comparison(df, metric_type, description): metric_type: Type of metric to filter description: Description to print """ - metric_data = df[df["type"] == metric_type][ - ["full_config", "time ms", "percent"] - ].sort_values("time ms") + metric_data = df[df["type"] == metric_type][["full_config", "time ms", "percent"]].sort_values( + "time ms" + ) print(f"\n{description}:") print(metric_data.to_string(index=False)) @@ -356,16 
#!/usr/bin/env python3
"""Combine two TraceLens collective reports into one Excel workbook.

Every sheet present in BOTH input workbooks is concatenated row-wise,
with a ``source`` column identifying which input each row came from.
Sheets missing from the test file are skipped with a notice.
"""
import argparse
import sys

import pandas as pd


def combine_collective_reports(
    baseline_path,
    test_path,
    output_path,
    baseline_label="baseline",
    test_label="saleelk",
):
    """
    Combine two collective reports into a single Excel file by adding a
    source column to the data.

    Args:
        baseline_path: Path to the baseline collective_all_ranks.xlsx
        test_path: Path to the test collective_all_ranks.xlsx
        output_path: Destination path for the combined workbook
        baseline_label: Value written to the ``source`` column for baseline
            rows (default "baseline")
        test_label: Value written to the ``source`` column for test rows.
            NOTE(review): default kept as "saleelk" for backward
            compatibility — looks like a leftover username; confirm whether
            "test" was intended.

    Returns:
        int: 0 on success (sheets absent from the test file are skipped,
        not treated as errors)
    """
    print(f"Loading baseline: {baseline_path}")
    baseline_xl = pd.ExcelFile(baseline_path)

    print(f"Loading test: {test_path}")
    test_xl = pd.ExcelFile(test_path)

    print(f"\nBaseline sheets: {baseline_xl.sheet_names}")
    print(f"Test sheets: {test_xl.sheet_names}")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for sheet_name in baseline_xl.sheet_names:
            # Only sheets present in BOTH files can be combined.
            if sheet_name not in test_xl.sheet_names:
                print(f" Skip {sheet_name} - not in test file")
                continue

            baseline_df = pd.read_excel(baseline_path, sheet_name=sheet_name)
            test_df = pd.read_excel(test_path, sheet_name=sheet_name)

            # Tag each row with its origin before stacking.
            baseline_df["source"] = baseline_label
            test_df["source"] = test_label

            combined = pd.concat([baseline_df, test_df], ignore_index=True)

            combined.to_excel(writer, sheet_name=sheet_name, index=False)
            print(
                f" Combined {sheet_name}: {len(baseline_df)} + {len(test_df)} = {len(combined)} rows"
            )

    print(f"\nSaved: {output_path}")
    return 0


def main():
    """CLI entry point: parse arguments and combine the two reports."""
    parser = argparse.ArgumentParser(description="Combine two collective reports")
    parser.add_argument(
        "--baseline", required=True, help="Path to baseline collective_all_ranks.xlsx"
    )
    parser.add_argument(
        "--test", required=True, help="Path to test collective_all_ranks.xlsx"
    )
    parser.add_argument(
        "--output", required=True, help="Output path for combined Excel file"
    )

    args = parser.parse_args()

    return combine_collective_reports(args.baseline, args.test, args.output)


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/env python3
"""
GPU Timeline Processing for Single Configuration.

Aggregates gpu_timeline data across all ranks in a tracelens analysis directory.

Usage:
    python process_gpu_timeline.py --reports-dir /path/to/individual_reports [--geo-mean]
"""

import pandas as pd
import argparse
from pathlib import Path
import sys

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from utils.gpu_timeline_utils import (
    read_gpu_timeline_from_excel,
    aggregate_gpu_timeline,
    get_method_suffix,
    get_aggregation_description,
)


def process_gpu_timeline(reports_dir: str, use_geo_mean: bool = False) -> int:
    """
    Create mean/geometric mean aggregated GPU timeline across all ranks.

    Reads every ``perf_rank*.xlsx`` in *reports_dir*, aggregates the
    gpu_timeline sheet across ranks, and writes a summary workbook
    (``gpu_timeline_summary_<method>.xlsx``) next to the reports directory
    with Summary, All_Ranks_Combined, and per-rank pivot sheets.

    Args:
        reports_dir: Path to directory containing perf_rank*.xlsx files
        use_geo_mean: If True, use geometric mean; otherwise arithmetic mean

    Returns:
        int: Exit code (0 for success, 1 for error)
    """
    reports_path = Path(reports_dir)

    if not reports_path.exists():
        print(f"Error: Directory not found: {reports_dir}")
        return 1

    print(f"Processing GPU timeline from: {reports_dir}")
    print(f"Aggregation: {get_aggregation_description(use_geo_mean)}")

    perf_files = sorted(reports_path.glob("perf_rank*.xlsx"))

    if not perf_files:
        print("Error: No perf_rank*.xlsx files found")
        return 1

    print(f"Found {len(perf_files)} rank files")

    rank_data = []
    for file_path in perf_files:
        # Rank number is encoded in the filename, e.g. perf_rank3.xlsx -> 3.
        rank_num = int(file_path.stem.replace("perf_rank", ""))
        df, success = read_gpu_timeline_from_excel(file_path, rank=rank_num)
        if success:
            rank_data.append(df)
            print(f" Rank {rank_num}: OK")
        else:
            print(f" Rank {rank_num}: Error")

    if not rank_data:
        print("Error: No valid data loaded")
        return 1

    combined = pd.concat(rank_data, ignore_index=True)
    aggregated = aggregate_gpu_timeline(rank_data, use_geo_mean)
    # BUGFIX: record the number of ranks actually aggregated, not the number
    # of files found — files that failed to load are excluded above.
    aggregated["num_ranks"] = len(rank_data)

    method_suffix = get_method_suffix(use_geo_mean)
    output_path = reports_path.parent / f"gpu_timeline_summary_{method_suffix}.xlsx"

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        aggregated.to_excel(writer, sheet_name="Summary", index=False)

        combined_sorted = combined.sort_values(["rank", "type"])
        combined_sorted.to_excel(writer, sheet_name="All_Ranks_Combined", index=False)

        # One column per rank so outlier ranks are easy to spot.
        per_rank = combined.pivot_table(
            values="time ms", index="type", columns="rank", aggfunc="first"
        )
        per_rank.to_excel(writer, sheet_name="Per_Rank_Time_ms")

        per_rank_pct = combined.pivot_table(
            values="percent", index="type", columns="rank", aggfunc="first"
        )
        per_rank_pct.to_excel(writer, sheet_name="Per_Rank_Percent")

    print(f"\nSaved: {output_path}")
    print("\nSummary:")
    print(aggregated.to_string(index=False))

    return 0


def main():
    """CLI entry point: parse arguments and run the aggregation."""
    parser = argparse.ArgumentParser(description="Aggregate GPU timeline across ranks")
    parser.add_argument("--reports-dir", required=True, help="Path to individual_reports directory")
    parser.add_argument("--geo-mean", action="store_true", help="Use geometric mean")

    args = parser.parse_args()

    return process_gpu_timeline(args.reports_dir, args.geo_mean)


if __name__ == "__main__":
    sys.exit(main())
def geometric_mean(values):
    """
    Calculate geometric mean, handling zeros.

    Args:
        values: Array-like of numeric values

    Returns:
        float: Geometric mean of the values

    NOTE(review): zeros are substituted with 1e-10 before taking logs so the
    result stays finite, which makes results scale-dependent when zeros are
    present; negative inputs yield NaN. Confirm inputs are non-negative.
    """
    values = np.array(values)
    values = np.where(values == 0, 1e-10, values)
    return np.exp(np.mean(np.log(values)))


def get_aggregation_func(use_geo_mean: bool) -> Union[Callable, str]:
    """
    Get the appropriate aggregation function.

    Args:
        use_geo_mean: If True, return geometric_mean function; otherwise "mean"

    Returns:
        Aggregation function or string for pandas
    """
    return geometric_mean if use_geo_mean else "mean"


def read_gpu_timeline_from_excel(
    file_path: Path, rank: Union[int, None] = None
) -> Tuple[Union[pd.DataFrame, None], bool]:
    """
    Read gpu_timeline sheet from an Excel file.

    Args:
        file_path: Path to the Excel file
        rank: Optional rank number to add as a "rank" column

    Returns:
        Tuple of (DataFrame, success_bool); the DataFrame is None when the
        read fails (the failure is reported as a warning, not raised).
    """
    try:
        df = pd.read_excel(file_path, sheet_name="gpu_timeline")
        if rank is not None:
            df["rank"] = rank
        return df, True
    except Exception as e:
        # Best-effort read: report and let the caller skip this rank.
        print(f" Warning: Could not read {file_path.name}: {e}")
        return None, False


def aggregate_gpu_timeline(
    rank_data: List[pd.DataFrame], use_geo_mean: bool = False
) -> pd.DataFrame:
    """
    Aggregate GPU timeline data across multiple ranks.

    Args:
        rank_data: List of DataFrames with gpu_timeline data (each must have
            "type", "time ms", and "percent" columns)
        use_geo_mean: If True, use geometric mean; otherwise arithmetic mean

    Returns:
        DataFrame: Aggregated data grouped by 'type', with mean
        "time ms" and "percent" per type
    """
    combined = pd.concat(rank_data, ignore_index=True)

    agg_func = get_aggregation_func(use_geo_mean)
    aggregated = (
        combined.groupby("type").agg({"time ms": agg_func, "percent": agg_func}).reset_index()
    )

    return aggregated


def print_section(title: str, char: str = "=", width: int = 80):
    """
    Print a formatted section header.

    Args:
        title: Section title to display
        char: Character to use for the separator line
        width: Width of the separator line
    """
    print(f"\n{char * width}")
    print(title)
    print(char * width)


def get_method_suffix(use_geo_mean: bool) -> str:
    """
    Get the file suffix based on aggregation method.

    Args:
        use_geo_mean: Whether geometric mean is used

    Returns:
        str: "geomean" or "mean"
    """
    return "geomean" if use_geo_mean else "mean"


def get_aggregation_description(use_geo_mean: bool) -> str:
    """
    Get a human-readable description of the aggregation method.

    Args:
        use_geo_mean: Whether geometric mean is used

    Returns:
        str: "Geometric Mean" or "Arithmetic Mean"
    """
    return "Geometric Mean" if use_geo_mean else "Arithmetic Mean"