diff --git a/scripts/gemm_analysis/README.md b/scripts/gemm_analysis/README.md index ee949be..d77288a 100644 --- a/scripts/gemm_analysis/README.md +++ b/scripts/gemm_analysis/README.md @@ -111,6 +111,36 @@ python scripts/gemm_analysis/analyze_gemm_reports.py \ This generates `top5_gemm_kernels_time_variance.csv` with the kernels showing highest time variance across runs. +### 5. Generate rocprof YAML for Targeted Profiling + +After identifying the top GEMM kernels, generate a rocprof configuration that profiles only those specific kernels: + +```bash +python scripts/gemm_analysis/generate_rocprof_yaml_from_csv.py \ + --input-csv experiments/sweep_20251212_141317/tracelens_analysis/top5_gemm_kernels_time_variance.csv \ + --output-yaml scripts/gemm_analysis/rocprof_top5_kernels.yaml +``` + +This script: +- Extracts all unique kernel names from the CSV +- Creates a rocprof YAML with exact kernel name matching (using regex anchors) +- Includes the same performance counters as `rocprof_cu_only.yaml` (CU utilization, waves, etc.) +- Generates a companion `_kernel_list.txt` file with all kernel names for reference + +**Usage with profiling:** +```bash +bash scripts/gemm_analysis/run_train_various_channels.sh \ + --rocprof \ + --rocprof-input scripts/gemm_analysis/rocprof_top5_kernels.yaml \ + --channels 28,42,56 --threads 256,512 \ + --config config/single_node/gemm_overlap_comm.yaml +``` + +**Benefits:** +- Reduces profiling overhead by focusing only on high-variance kernels +- Speeds up trace collection and analysis +- Enables deeper analysis with more performance counters on fewer kernels + ## Output Structure ``` @@ -150,13 +180,6 @@ bash scripts/gemm_analysis/run_train_various_channels.sh \ --channels 28,42,56,70 \ --threads 256,512 -# Run with rocprof using CU-only YAML (recommended) -bash scripts/gemm_analysis/run_train_various_channels.sh \ - --rocprof --stats \ - --rocprof-input scripts/gemm_analysis/rocprof_cu_only.yaml \ - --channels 28,42,56,70 \ - --threads 256,512 - # Generate TraceLens reports bash scripts/gemm_analysis/run_tracelens_analysis.sh experiments/sweep_YYYYMMDD_HHMMSS @@ -164,6 +187,27 @@ bash scripts/gemm_analysis/run_tracelens_analysis.sh experiments/sweep_YYYYMMDD_ python scripts/gemm_analysis/analyze_gemm_reports.py \ --base-path experiments/sweep_YYYYMMDD_HHMMSS/tracelens_analysis \ --threads 256 512 --channels 28 42 56 70 --top-k 5 + +# Generate rocprof YAML for targeted profiling +python scripts/gemm_analysis/generate_rocprof_yaml_from_csv.py \ + --input-csv experiments/sweep_YYYYMMDD_HHMMSS/tracelens_analysis/top5_gemm_kernels_time_variance.csv \ + --output-yaml scripts/gemm_analysis/rocprof_top5_kernels.yaml + +# Run with rocprof using top kernels YAML (recommended) +bash scripts/gemm_analysis/run_train_various_channels.sh \ + --rocprof --stats \ + --rocprof-input scripts/gemm_analysis/rocprof_top5_kernels.yaml \ + --channels 28,42,56,70 \ + --threads 256,512 + +# OR + +# Run with rocprof using CU-only YAML +bash scripts/gemm_analysis/run_train_various_channels.sh \ + --rocprof --stats \ + --rocprof-input scripts/gemm_analysis/rocprof_cu_only.yaml \ + --channels 28,42,56,70 \ + --threads 256,512 ``` # GEMM Visualization and Reporting diff --git a/scripts/gemm_analysis/generate_rocprof_yaml_from_csv.py b/scripts/gemm_analysis/generate_rocprof_yaml_from_csv.py new file mode 100755 index 0000000..e24c53f --- /dev/null +++ b/scripts/gemm_analysis/generate_rocprof_yaml_from_csv.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +""" +Generate a rocprof yaml configuration file from top5_gemm_kernels_time_variance.csv +This will create a yaml file that profiles only the kernels mentioned in the CSV. +""" + +import argparse +import csv +import re +from pathlib import Path + + +def truncate_kernel_name(kernel_name): + """ + Extract the shortest meaningful kernel name using delimiters. + + Logic: + 1. Remove 'void ' prefix if present + 2. Remove template parameters first (split by '<' and take first part) + 3. Remove function parameters (split by '(' and take first part) + 4. Split by '::' and take the last part (actual kernel/function name) + + Examples: + - 'void at::native::elementwise_kernel_manual_unroll<...>' -> 'elementwise_kernel_manual_unroll' + - 'Cijk_Ailk_Bjlk_BBS_BH_...' -> 'Cijk_Ailk_Bjlk_BBS_BH_...' (unchanged) + """ + name = kernel_name.strip() + + # Remove 'void ' prefix if present (common for function kernels) + if name.startswith('void '): + name = name[5:] # len('void ') = 5 + + # Remove template parameters first (everything after '<') + # This must be done before splitting by :: to avoid issues with nested templates + if '<' in name: + name = name.split('<')[0] + + # Remove function parameters (everything after '(') + if '(' in name: + name = name.split('(')[0] + + # Now split by '::' and take the last part (the actual function/kernel name) + if '::' in name: + name = name.split('::')[-1] + + return name.strip() + + +def escape_regex_special_chars(kernel_name): + """Escape special regex characters in kernel name.""" + # Characters that have special meaning in regex + special_chars = r'\.[]{}()*+?^$|' + escaped = kernel_name + for char in special_chars: + escaped = escaped.replace(char, '\\' + char) + return escaped + + +def generate_rocprof_yaml(csv_path, output_yaml_path): + """Generate rocprof yaml from CSV file.""" + + # Read CSV and extract unique kernel names + kernel_names = set() + kernel_mapping = {} # Track original -> processed names + + with open(csv_path, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + kernel_name_orig = row['kernel_name'].strip() + if kernel_name_orig: + # Apply generic truncation logic using delimiters + kernel_name = truncate_kernel_name(kernel_name_orig) + + # Track if truncation occurred + if kernel_name != kernel_name_orig: + kernel_mapping[kernel_name] = kernel_name_orig + + kernel_names.add(kernel_name) + + print(f"Found {len(kernel_names)} unique kernel patterns in CSV") + if kernel_mapping: + print(f"Truncated {len(kernel_mapping)} kernel name(s) using delimiter-based extraction") + + # Escape special regex characters and create regex pattern + # No anchors - allow substring matching + escaped_kernels = [escape_regex_special_chars(name) for name in sorted(kernel_names)] + + # Create regex pattern - join with OR operator + # For rocprof, we want to match any of these kernel names + kernel_regex = '(' + '|'.join(escaped_kernels) + ')' + + # Generate YAML content + # Use single quotes for the regex to avoid YAML escape sequence issues + yaml_content = f"""# Auto-generated rocprof configuration for top GEMM kernels +# Generated from: {csv_path} +# Number of unique kernel patterns: {len(kernel_names)} +# +# This configuration profiles only the specific kernels identified in the variance analysis. +# Kernel names are matched as substrings (no anchors) to allow flexible matching. +# Note: Kernel names are extracted using delimiters (::, <, space) for brevity. +# Use this with rocprofv3 for targeted profiling of high-variance GEMM kernels. + +jobs: + - kernel_include_regex: '{kernel_regex}' + kernel_trace: true + output_format: [json, csv] + pmc: + - SQ_BUSY_CU_CYCLES # CU utilization (most important) + - SQ_WAVES # Active waves (occupancy indicator) + - SQ_WAVE_CYCLES # Total wave cycles + - SQ_INSTS_MFMA # Matrix ops (GEMM-specific) +""" + + # Write YAML file + with open(output_yaml_path, 'w') as f: + f.write(yaml_content) + + print(f"Generated rocprof yaml: {output_yaml_path}") + print(f"Kernel regex length: {len(kernel_regex)} characters") + + # Also save a text file with the list of kernels for reference + kernel_list_path = str(output_yaml_path).replace('.yaml', '_kernel_list.txt') + with open(kernel_list_path, 'w') as f: + f.write("List of kernel patterns included in rocprof configuration:\n") + f.write("=" * 80 + "\n") + f.write("Note: Kernel names are matched as substrings (no regex anchors).\n") + f.write("Note: Names extracted using delimiters (::, <, (), 'void' prefix).\n") + if kernel_mapping: + f.write(f"Note: {len(kernel_mapping)} kernel name(s) were truncated for brevity.\n") + f.write("\n") + for i, kernel in enumerate(sorted(kernel_names), 1): + f.write(f"{i}. {kernel}\n") + if kernel in kernel_mapping: + # Show first 100 chars of original, indicate if longer + orig = kernel_mapping[kernel] + if len(orig) > 100: + f.write(f" (Original: {orig[:100]}...)\n") + else: + f.write(f" (Original: {orig})\n") + + print(f"Saved kernel list to: {kernel_list_path}") + + +def main(): + parser = argparse.ArgumentParser( + description="Generate rocprof yaml configuration from top5_gemm_kernels_time_variance.csv" + ) + parser.add_argument( + '--input-csv', + type=str, + required=True, + help='Path to top5_gemm_kernels_time_variance.csv file' + ) + parser.add_argument( + '--output-yaml', + type=str, + default=None, + help='Output yaml file path (default: rocprof_top5_kernels.yaml in same directory as CSV)' + ) + + args = parser.parse_args() + + csv_path = Path(args.input_csv) + + if not csv_path.exists(): + print(f"Error: CSV file not found: {csv_path}") + return 1 + + # Determine output path + if args.output_yaml: + output_yaml_path = Path(args.output_yaml) + else: + output_yaml_path = csv_path.parent / 'rocprof_top5_kernels.yaml' + + # Generate the yaml + generate_rocprof_yaml(csv_path, output_yaml_path) + + return 0 + + +if __name__ == '__main__': + exit(main()) diff --git a/scripts/gemm_analysis/rocprof_cu_only.yaml b/scripts/gemm_analysis/rocprof_cu_only.yaml index bb503d1..1a9e292 100644 --- a/scripts/gemm_analysis/rocprof_cu_only.yaml +++ b/scripts/gemm_analysis/rocprof_cu_only.yaml @@ -3,7 +3,7 @@ jobs: - kernel_include_regex: "(gemm|Cijk_.*)" kernel_trace: true - output_format: [csv] + output_format: [json] pmc: - SQ_BUSY_CU_CYCLES # CU utilization (most important) - SQ_WAVES # Active waves (occupancy indicator) diff --git a/scripts/gemm_analysis/run_train_various_channels.sh b/scripts/gemm_analysis/run_train_various_channels.sh index d665148..976fb7f 100755 --- a/scripts/gemm_analysis/run_train_various_channels.sh +++ b/scripts/gemm_analysis/run_train_various_channels.sh @@ -231,10 +231,11 @@ for THREADS in "${THREADS_TO_RUN[@]}"; do if [ -n "${ROCPROF_INPUT}" ]; then log "[INFO] Using rocprofv3 input file: ${ROCPROF_INPUT}" log "[INFO] Kernel filtering/stats should be set inside the input file" - rocprofv3 -i "${ROCPROF_INPUT}" -d "${ROCPROF_DIR}" -- \ - ${BASE_CMD} ${BASE_OVERRIDES} \ - --override training.output_dir=${OUTPUT_DIR} \ - 2>&1 | tee "${OUTPUT_DIR}/run_output.log" + rocprof_cmd="rocprofv3 -i ${ROCPROF_INPUT} -d ${ROCPROF_DIR} -- " + rocprof_cmd+="${BASE_CMD} ${BASE_OVERRIDES} " + rocprof_cmd+="--override training.output_dir=${OUTPUT_DIR}" + log "[INFO] Rocprof command: ${rocprof_cmd}" + ${rocprof_cmd} 2>&1 | tee "${OUTPUT_DIR}/run_output.log" else # Current rocprofv3 build does not support --kernel-names; run unfiltered # to avoid argument errors. Expect larger traces.