diff --git a/.github/workflows/gemm-sweep-analysis.yml b/.github/workflows/gemm-sweep-analysis.yml
new file mode 100644
index 0000000..1a58d89
--- /dev/null
+++ b/.github/workflows/gemm-sweep-analysis.yml
@@ -0,0 +1,247 @@
+name: GEMM Sweep Profiling and Analysis
+
+on:
+  schedule:
+    # '*/2' in the day-of-month field fires on odd days (1, 3, ..., 31), so
+    # runs happen roughly every other day at midnight UTC; after a 31-day
+    # month the 31st and the 1st run back to back.
+    - cron: '0 0 */2 * *'
+  workflow_dispatch:
+    inputs:
+      channels:
+        description: 'Comma-separated NCCL channel values'
+        required: true
+        default: '28,56'
+      threads:
+        description: 'Comma-separated thread values'
+        required: true
+        default: '256,512'
+      config:
+        description: 'Path to GEMM config YAML'
+        required: false
+        default: 'config/single_node/gemm_overlap_comm.yaml'
+      top_k:
+        description: 'Number of top GEMM kernels to extract'
+        required: false
+        default: '5'
+  push:
+    branches:
+      - main
+    paths:
+      - 'scripts/gemm_analysis/**'
+      - 'config/gemm_overlap/**'
+
+# The jobs only read the repository; artifact uploads and the step summary
+# do not need additional GITHUB_TOKEN scopes.
+permissions:
+  contents: read
+
+env:
+  DOCKER_COMPOSE_FILE: docker/docker-compose.rocm70_9-1.yaml
+  CONTAINER_NAME: training-overlap-bugs-rocm70_9-1
+  # Sweep parameters are resolved once here (manual input, else the default
+  # used by schedule/push runs) and read by steps as shell variables, so
+  # user-supplied text is never pasted into a script body.
+  CHANNELS: ${{ github.event.inputs.channels || '28,56' }}
+  THREADS: ${{ github.event.inputs.threads || '256,512' }}
+  GEMM_CONFIG: ${{ github.event.inputs.config || 'config/single_node/gemm_overlap_comm.yaml' }}
+  TOP_K: ${{ github.event.inputs.top_k || '5' }}
+
+jobs:
+  # Runs the training sweep inside the container, generates TraceLens
+  # reports and extracts the top GEMM kernels from them.
+  gemm-sweep:
+    name: Run GEMM Sweep Profiling
+    runs-on: [self-hosted, gpu, rocm]
+    timeout-minutes: 180
+    outputs:
+      sweep_dir: ${{ steps.setup.outputs.sweep_dir }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up experiment directory
+        id: setup
+        run: |
+          sweep_dir="experiments/sweep_$(date +%Y%m%d_%H%M%S)"
+          mkdir -p "$sweep_dir"
+          # Step output feeds downstream jobs; SWEEP_DIR feeds later steps here.
+          echo "sweep_dir=$sweep_dir" >> "$GITHUB_OUTPUT"
+          echo "SWEEP_DIR=$sweep_dir" >> "$GITHUB_ENV"
+
+      # Runs from the repository root because DOCKER_COMPOSE_FILE is a
+      # root-relative path (the cleanup step uses it the same way).
+      - name: Build Docker container
+        run: |
+          docker compose -f "$DOCKER_COMPOSE_FILE" build
+          docker compose -f "$DOCKER_COMPOSE_FILE" up -d
+
+      - name: Run training sweep
+        run: |
+          docker exec "$CONTAINER_NAME" \
+            bash scripts/gemm_analysis/run_train_various_channels.sh \
+            --output-dir "$SWEEP_DIR" \
+            --channels "$CHANNELS" \
+            --threads "$THREADS" \
+            --config "$GEMM_CONFIG"
+
+      - name: Generate TraceLens reports
+        run: |
+          docker exec "$CONTAINER_NAME" pip install -r requirements.txt
+          docker exec "$CONTAINER_NAME" \
+            bash scripts/gemm_analysis/run_tracelens_analysis.sh "$SWEEP_DIR"
+
+      - name: Extract top GEMM kernels
+        shell: bash
+        run: |
+          # Split the comma-separated lists so each value is passed as its own
+          # argument to the analysis script.
+          IFS=', ' read -ra channel_list <<< "$CHANNELS"
+          IFS=', ' read -ra thread_list <<< "$THREADS"
+
+          docker exec "$CONTAINER_NAME" \
+            python scripts/gemm_analysis/analyze_gemm_reports.py \
+            --base-path "$SWEEP_DIR/tracelens_analysis" \
+            --threads "${thread_list[@]}" \
+            --channels "${channel_list[@]}" \
+            --ranks 0 1 2 3 4 5 6 7 \
+            --top-k "$TOP_K"
+
+      - name: Upload sweep results
+        uses: actions/upload-artifact@v4
+        with:
+          name: gemm-sweep-results
+          path: ${{ steps.setup.outputs.sweep_dir }}
+          retention-days: 30
+
+      - name: Cleanup Docker container
+        if: always()
+        run: |
+          docker compose -f "$DOCKER_COMPOSE_FILE" down || true
+
+  # Builds plots and derived reports from the uploaded sweep results.
+  visualization:
+    name: Generate Visualizations and Reports
+    needs: gemm-sweep
+    runs-on: [self-hosted, gpu, rocm]
+    timeout-minutes: 60
+    env:
+      SWEEP_DIR: ${{ needs.gemm-sweep.outputs.sweep_dir }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Download sweep results
+        uses: actions/download-artifact@v4
+        with:
+          name: gemm-sweep-results
+          path: ${{ env.SWEEP_DIR }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+
+      - name: Install Python dependencies
+        run: |
+          pip install -r requirements.txt
+
+      # NOTE(review): the CSV names below hard-code "top5" although --top-k is
+      # configurable; confirm what analyze_gemm_reports.py names its output
+      # when TOP_K is not 5.
+      - name: Generate variance plots
+        run: |
+          python scripts/gemm_analysis/plot_gemm_variance.py \
+            --csv-path "$SWEEP_DIR/tracelens_analysis/top5_gemm_kernels_time_variance.csv" \
+            --output-dir "$SWEEP_DIR/tracelens_analysis/plots"
+
+      - name: Add timestamp information
+        run: |
+          python scripts/gemm_analysis/enhance_gemm_variance_with_timestamps.py \
+            --input-csv "$SWEEP_DIR/tracelens_analysis/top5_gemm_kernels_time_variance.csv" \
+            --base-path "$SWEEP_DIR"
+
+      - name: Analyze collective overlap
+        run: |
+          python scripts/gemm_analysis/gemm_report_with_collective_overlap.py \
+            --input-csv "$SWEEP_DIR/tracelens_analysis/top5_gemm_kernels_time_variance_with_timestamps.csv" \
+            --tracelens-path "$SWEEP_DIR/tracelens_analysis"
+
+      - name: Process GPU timeline
+        run: |
+          python scripts/gemm_analysis/process_gpu_timeline.py \
+            --sweep-dir "$SWEEP_DIR"
+
+      - name: Process NCCL communication data
+        run: |
+          python scripts/gemm_analysis/process_comms.py \
+            --sweep-dir "$SWEEP_DIR"
+
+      - name: Upload analysis results
+        uses: actions/upload-artifact@v4
+        with:
+          name: gemm-analysis-results
+          path: |
+            ${{ env.SWEEP_DIR }}/tracelens_analysis/plots/
+            ${{ env.SWEEP_DIR }}/tracelens_analysis/*.csv
+            ${{ env.SWEEP_DIR }}/tracelens_analysis/*.xlsx
+          retention-days: 30
+
+  # Manual runs only: writes the run configuration to the step summary and
+  # re-uploads the analysis results with a longer retention period.
+  comparison-report:
+    name: Generate Comparison Report
+    needs: [gemm-sweep, visualization]
+    runs-on: ubuntu-latest
+    if: github.event_name == 'workflow_dispatch'
+    env:
+      SWEEP_DIR: ${{ needs.gemm-sweep.outputs.sweep_dir }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Download analysis results
+        uses: actions/download-artifact@v4
+        with:
+          name: gemm-analysis-results
+          path: ${{ env.SWEEP_DIR }}/tracelens_analysis
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+
+      - name: Install Python dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Generate summary report
+        run: |
+          {
+            echo "## GEMM Sweep Analysis Summary"
+            echo ""
+            echo "### Configuration"
+            echo "- **Sweep Directory**: $SWEEP_DIR"
+            echo "- **Channels**: $CHANNELS"
+            echo "- **Threads**: $THREADS"
+            echo "- **Top-K Kernels**: $TOP_K"
+            echo ""
+            echo "### Generated Artifacts"
+            echo "- Variance plots (box plots, violin plots)"
+            echo "- GEMM kernels with timestamps"
+            echo "- Collective overlap analysis"
+            echo "- GPU timeline data"
+            echo "- NCCL communication data"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload final report
+        uses: actions/upload-artifact@v4
+        with:
+          name: gemm-final-report
+          path: ${{ env.SWEEP_DIR }}/
+          retention-days: 90
diff --git a/requirements.txt b/requirements.txt
index 985670f..1e16fe4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,11 +14,17 @@ pyyaml>=6.0
 
 # For analysis scripts
 matplotlib>=3.7.0
+numpy>=1.20.0
+pandas>=1.3.0
+openpyxl>=3.0.0
+seaborn>=0.12.0
+
+# For GEMM analysis (scripts/gemm_analysis/)
+git+https://github.com/AMD-AGI/TraceLens.git
 
 # For hw_queue_eval (optional - install with: pip install -e ".[hw-queue]")
 # click>=8.0.0
 # numpy>=1.20.0
-# pandas>=1.3.0
 # tabulate>=0.9.0
 
 # For trace processing (merge_gpu_trace_ranks.py, etc.)
diff --git a/scripts/gemm_analysis/run_train_various_channels.sh b/scripts/gemm_analysis/run_train_various_channels.sh
index d665148..07a746a 100755
--- a/scripts/gemm_analysis/run_train_various_channels.sh
+++ b/scripts/gemm_analysis/run_train_various_channels.sh
@@ -21,12 +21,15 @@ NPROC_PER_NODE=$DEFAULT_NPROC
 ENABLE_ROCPROF=false
 ROCPROF_STATS=false
 ROCPROF_INPUT=""
+# Base output directory from -o/--output-dir; empty selects the timestamped default.
+OUTPUT_DIR_ARG=""
 
 usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "Options:"
     echo " -c, --channels CHANNELS Comma-separated list of channels (default: 38,42,56,70)"
     echo " -t, --threads THREADS Comma-separated list of threads per block (default: 256,512)"
+    echo " -o, --output-dir DIR Base output directory (default: experiments/sweep_<timestamp>)"
     echo " -f, --config CONFIG Config file path (default: config/distributed.yaml)"
     echo " -p, --nproc NPROC Number of processes per node (default: 8)"
     echo " -r, --rocprof Enable rocprofv3 tracing"
@@ -38,6 +41,7 @@ usage() {
     echo ""
     echo "Examples:"
     echo " $0 --channels 28,42,56 --threads 256,512 --skip-existing"
+    echo " $0 --output-dir experiments/my_sweep --channels 28,42"
     echo " $0 --rocprof --channels 28,42,56 # Trace all kernels"
     echo " $0 --rocprof --stats --channels 28 # Add CU stats"
     echo " $0 --rocprof --rocprof-input path/to/rocprof.yaml # Use yaml to filter kernels"
@@ -51,6 +55,7 @@ for arg in "$@"; do
     case "$arg" in
         --channels) set -- "$@" "-c" ;;
         --threads) set -- "$@" "-t" ;;
+        --output-dir) set -- "$@" "-o" ;;
         --config) set -- "$@" "-f" ;;
         --nproc) set -- "$@" "-p" ;;
         --rocprof) set -- "$@" "-r" ;;
@@ -63,7 +68,7 @@ for arg in "$@"; do
     esac
 done
 
-while getopts "c:t:f:p:snrmh-:" opt; do
+while getopts "c:t:o:f:p:snrmh-:" opt; do
     case $opt in
         c)
             IFS=',' read -ra CHANNELS_TO_RUN <<< "$OPTARG"
@@ -71,6 +76,9 @@ while getopts "c:t:f:p:snrmh-:" opt; do
         t)
             IFS=',' read -ra THREADS_TO_RUN <<< "$OPTARG"
             ;;
+        o)
+            OUTPUT_DIR_ARG="$OPTARG"
+            ;;
         f)
             CONFIG_FILE="$OPTARG"
             ;;
@@ -127,7 +135,12 @@ BASE_OVERRIDES="--override profiling.tensorboard=false"
 
 # Base output directory
 TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-BASE_OUTPUT_DIR="experiments/sweep_${TIMESTAMP}"
+# An explicit -o/--output-dir wins; otherwise fall back to a timestamped directory.
+if [ -n "${OUTPUT_DIR_ARG}" ]; then
+    BASE_OUTPUT_DIR="${OUTPUT_DIR_ARG}"
+else
+    BASE_OUTPUT_DIR="experiments/sweep_${TIMESTAMP}"
+fi
 
 # Create base output directory first
 mkdir -p "${BASE_OUTPUT_DIR}"