Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 221 additions & 0 deletions .github/workflows/gemm-sweep-analysis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
name: GEMM Sweep Profiling and Analysis

on:
  schedule:
    # Run every other day at midnight UTC
    - cron: '0 0 */2 * *'
  workflow_dispatch:
    inputs:
      channels:
        description: 'Comma-separated NCCL channel values'
        required: true
        default: '28,56'
      threads:
        description: 'Comma-separated thread values'
        required: true
        default: '256,512'
      config:
        description: 'Path to GEMM config YAML'
        required: false
        default: 'config/single_node/gemm_overlap_comm.yaml'
      top_k:
        description: 'Number of top GEMM kernels to extract'
        required: false
        default: '5'
  push:
    branches:
      - main
    paths:
      - 'scripts/gemm_analysis/**'
      - 'config/gemm_overlap/**'

env:
  DOCKER_COMPOSE_FILE: docker/docker-compose.rocm70_9-1.yaml
  CONTAINER_NAME: training-overlap-bugs-rocm70_9-1
  # Resolve dispatch inputs (falling back to defaults for schedule/push
  # triggers) ONCE into environment variables. run: scripts below reference
  # the shell variables instead of interpolating ${{ github.event.inputs.* }}
  # directly, which avoids shell-injection via crafted input values and keeps
  # the defaults in a single place.
  SWEEP_CHANNELS: ${{ github.event.inputs.channels || '28,56' }}
  SWEEP_THREADS: ${{ github.event.inputs.threads || '256,512' }}
  SWEEP_CONFIG: ${{ github.event.inputs.config || 'config/single_node/gemm_overlap_comm.yaml' }}
  SWEEP_TOP_K: ${{ github.event.inputs.top_k || '5' }}

jobs:
  gemm-sweep:
    name: Run GEMM Sweep Profiling
    runs-on: [self-hosted, gpu, rocm]
    # NOTE(review): 180 min was flagged as high in review; tighten once a
    # typical sweep duration is known.
    timeout-minutes: 180
    outputs:
      # Exported so downstream jobs place artifacts in the same directory tree.
      sweep_dir: ${{ steps.setup.outputs.sweep_dir }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up experiment directory
        id: setup
        run: |
          SWEEP_DIR="experiments/sweep_$(date +%Y%m%d_%H%M%S)"
          echo "sweep_dir=$SWEEP_DIR" >> "$GITHUB_OUTPUT"
          mkdir -p "$SWEEP_DIR"

      - name: Build Docker container
        # No working-directory here: DOCKER_COMPOSE_FILE is already a
        # repo-root-relative path (the cleanup step uses it from the root too;
        # the previous `working-directory: docker` would have resolved it to
        # docker/docker/...).
        run: |
          docker compose -f "$DOCKER_COMPOSE_FILE" build
          docker compose -f "$DOCKER_COMPOSE_FILE" up -d

      - name: Run training sweep
        run: |
          docker exec "$CONTAINER_NAME" bash -c "
            bash scripts/gemm_analysis/run_train_various_channels.sh \
              --output-dir ${{ steps.setup.outputs.sweep_dir }} \
              --channels $SWEEP_CHANNELS \
              --threads $SWEEP_THREADS \
              --config $SWEEP_CONFIG
          "

      - name: Generate TraceLens reports
        run: |
          docker exec "$CONTAINER_NAME" bash -c "
            pip install -r requirements.txt && \
            bash scripts/gemm_analysis/run_tracelens_analysis.sh ${{ steps.setup.outputs.sweep_dir }}
          "

      - name: Extract top GEMM kernels
        run: |
          # The analysis script takes space-separated lists; convert from the
          # comma-separated input format. Expansion happens on the host shell,
          # so the values are baked into the command passed to the container.
          CHANNELS=$(echo "$SWEEP_CHANNELS" | tr ',' ' ')
          THREADS=$(echo "$SWEEP_THREADS" | tr ',' ' ')

          docker exec "$CONTAINER_NAME" bash -c "
            python scripts/gemm_analysis/analyze_gemm_reports.py \
              --base-path ${{ steps.setup.outputs.sweep_dir }}/tracelens_analysis \
              --threads $THREADS \
              --channels $CHANNELS \
              --ranks 0 1 2 3 4 5 6 7 \
              --top-k $SWEEP_TOP_K
          "

      - name: Upload sweep results
        uses: actions/upload-artifact@v4
        with:
          name: gemm-sweep-results
          path: ${{ steps.setup.outputs.sweep_dir }}
          retention-days: 30

      - name: Cleanup Docker container
        if: always()
        run: |
          docker compose -f "$DOCKER_COMPOSE_FILE" down || true

  visualization:
    name: Generate Visualizations and Reports
    needs: gemm-sweep
    runs-on: [self-hosted, gpu, rocm]
    timeout-minutes: 60
    env:
      SWEEP_DIR: ${{ needs.gemm-sweep.outputs.sweep_dir }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Download sweep results
        uses: actions/download-artifact@v4
        with:
          name: gemm-sweep-results
          path: ${{ env.SWEEP_DIR }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'

      - name: Install Python dependencies
        run: |
          pip install -r requirements.txt

      - name: Generate variance plots
        run: |
          python scripts/gemm_analysis/plot_gemm_variance.py \
            --csv-path "$SWEEP_DIR/tracelens_analysis/top5_gemm_kernels_time_variance.csv" \
            --output-dir "$SWEEP_DIR/tracelens_analysis/plots"

      - name: Add timestamp information
        run: |
          python scripts/gemm_analysis/enhance_gemm_variance_with_timestamps.py \
            --input-csv "$SWEEP_DIR/tracelens_analysis/top5_gemm_kernels_time_variance.csv" \
            --base-path "$SWEEP_DIR"

      - name: Analyze collective overlap
        run: |
          python scripts/gemm_analysis/gemm_report_with_collective_overlap.py \
            --input-csv "$SWEEP_DIR/tracelens_analysis/top5_gemm_kernels_time_variance_with_timestamps.csv" \
            --tracelens-path "$SWEEP_DIR/tracelens_analysis"

      - name: Process GPU timeline
        run: |
          python scripts/gemm_analysis/process_gpu_timeline.py \
            --sweep-dir "$SWEEP_DIR"

      - name: Process NCCL communication data
        run: |
          python scripts/gemm_analysis/process_comms.py \
            --sweep-dir "$SWEEP_DIR"

      - name: Upload analysis results
        uses: actions/upload-artifact@v4
        with:
          name: gemm-analysis-results
          path: |
            ${{ env.SWEEP_DIR }}/tracelens_analysis/plots/
            ${{ env.SWEEP_DIR }}/tracelens_analysis/*.csv
            ${{ env.SWEEP_DIR }}/tracelens_analysis/*.xlsx
          retention-days: 30

  comparison-report:
    name: Generate Comparison Report
    needs: [gemm-sweep, visualization]
    runs-on: ubuntu-latest
    # Only produce the human-facing summary for manually dispatched runs.
    if: github.event_name == 'workflow_dispatch'
    env:
      SWEEP_DIR: ${{ needs.gemm-sweep.outputs.sweep_dir }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Download analysis results
        uses: actions/download-artifact@v4
        with:
          name: gemm-analysis-results
          path: ${{ env.SWEEP_DIR }}/tracelens_analysis

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'

      - name: Install Python dependencies
        run: |
          pip install -r requirements.txt

      - name: Generate summary report
        run: |
          {
            echo "## GEMM Sweep Analysis Summary"
            echo ""
            echo "### Configuration"
            echo "- **Sweep Directory**: $SWEEP_DIR"
            echo "- **Channels**: $SWEEP_CHANNELS"
            echo "- **Threads**: $SWEEP_THREADS"
            echo "- **Top-K Kernels**: $SWEEP_TOP_K"
            echo ""
            echo "### Generated Artifacts"
            echo "- Variance plots (box plots, violin plots)"
            echo "- GEMM kernels with timestamps"
            echo "- Collective overlap analysis"
            echo "- GPU timeline data"
            echo "- NCCL communication data"
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Upload final report
        uses: actions/upload-artifact@v4
        with:
          name: gemm-final-report
          path: ${{ env.SWEEP_DIR }}/
          retention-days: 90
8 changes: 7 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,17 @@ pyyaml>=6.0

# For analysis scripts
matplotlib>=3.7.0
numpy>=1.20.0
pandas>=1.3.0
openpyxl>=3.0.0
seaborn>=0.12.0

# For GEMM analysis (scripts/gemm_analysis/)
git+https://github.com/AMD-AGI/TraceLens.git

# For hw_queue_eval (optional - install with: pip install -e ".[hw-queue]")
# click>=8.0.0
# numpy>=1.20.0
# pandas>=1.3.0
# tabulate>=0.9.0

# For trace processing (merge_gpu_trace_ranks.py, etc.)
Expand Down
15 changes: 13 additions & 2 deletions scripts/gemm_analysis/run_train_various_channels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ NPROC_PER_NODE=$DEFAULT_NPROC
ENABLE_ROCPROF=false
ROCPROF_STATS=false
ROCPROF_INPUT=""
OUTPUT_DIR_ARG=""

usage() {
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " -c, --channels CHANNELS Comma-separated list of channels (default: 38,42,56,70)"
echo " -t, --threads THREADS Comma-separated list of threads per block (default: 256,512)"
echo " -o, --output-dir DIR Base output directory (default: experiments/sweep_<timestamp>)"
echo " -f, --config CONFIG Config file path (default: config/distributed.yaml)"
echo " -p, --nproc NPROC Number of processes per node (default: 8)"
echo " -r, --rocprof Enable rocprofv3 tracing"
Expand All @@ -38,6 +40,7 @@ usage() {
echo ""
echo "Examples:"
echo " $0 --channels 28,42,56 --threads 256,512 --skip-existing"
echo " $0 --output-dir experiments/my_sweep --channels 28,42"
echo " $0 --rocprof --channels 28,42,56 # Trace all kernels"
echo " $0 --rocprof --stats --channels 28 # Add CU stats"
echo " $0 --rocprof --rocprof-input path/to/rocprof.yaml # Use yaml to filter kernels"
Expand All @@ -51,6 +54,7 @@ for arg in "$@"; do
case "$arg" in
--channels) set -- "$@" "-c" ;;
--threads) set -- "$@" "-t" ;;
--output-dir) set -- "$@" "-o" ;;
--config) set -- "$@" "-f" ;;
--nproc) set -- "$@" "-p" ;;
--rocprof) set -- "$@" "-r" ;;
Expand All @@ -63,14 +67,17 @@ for arg in "$@"; do
esac
done

while getopts "c:t:f:p:snrmh-:" opt; do
while getopts "c:t:o:f:p:snrmh-:" opt; do
case $opt in
c)
IFS=',' read -ra CHANNELS_TO_RUN <<< "$OPTARG"
;;
t)
IFS=',' read -ra THREADS_TO_RUN <<< "$OPTARG"
;;
o)
OUTPUT_DIR_ARG="$OPTARG"
;;
f)
CONFIG_FILE="$OPTARG"
;;
Expand Down Expand Up @@ -127,7 +134,11 @@ BASE_OVERRIDES="--override profiling.tensorboard=false"

# Base output directory
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BASE_OUTPUT_DIR="experiments/sweep_${TIMESTAMP}"
if [ -n "${OUTPUT_DIR_ARG}" ]; then
BASE_OUTPUT_DIR="${OUTPUT_DIR_ARG}"
else
BASE_OUTPUT_DIR="experiments/sweep_${TIMESTAMP}"
fi

# Create base output directory first
mkdir -p "${BASE_OUTPUT_DIR}"
Expand Down