diff --git a/.github/workflows/gemm-sweep-analysis.yml b/.github/workflows/gemm-sweep-analysis.yml index 1a58d89..380dfd9 100644 --- a/.github/workflows/gemm-sweep-analysis.yml +++ b/.github/workflows/gemm-sweep-analysis.yml @@ -22,12 +22,7 @@ on: description: 'Number of top GEMM kernels to extract' required: false default: '5' - push: - branches: - - main - paths: - - 'scripts/gemm_analysis/**' - - 'config/gemm_overlap/**' + env: DOCKER_COMPOSE_FILE: docker/docker-compose.rocm70_9-1.yaml @@ -36,29 +31,35 @@ env: jobs: gemm-sweep: name: Run GEMM Sweep Profiling - runs-on: [self-hosted, gpu, rocm] + runs-on: self-hosted timeout-minutes: 180 - outputs: - sweep_dir: ${{ steps.setup.outputs.sweep_dir }} steps: - - name: Checkout repository + - name: Checkout AORTA repository uses: actions/checkout@v4 + with: + repository: ROCm/aorta + ref : prosenj_gh_action + path: aorta - name: Set up experiment directory id: setup + working-directory: aorta run: | SWEEP_DIR="experiments/sweep_$(date +%Y%m%d_%H%M%S)" echo "sweep_dir=$SWEEP_DIR" >> $GITHUB_OUTPUT mkdir -p $SWEEP_DIR - name: Build Docker container - working-directory: docker + working-directory: aorta run: | + docker compose version + docker login -u rocmshared -p ${{ secrets.ROCM_SHARED_KEY }} docker compose -f ${{ env.DOCKER_COMPOSE_FILE }} build docker compose -f ${{ env.DOCKER_COMPOSE_FILE }} up -d - name: Run training sweep + working-directory: aorta run: | docker exec ${{ env.CONTAINER_NAME }} bash -c " bash scripts/gemm_analysis/run_train_various_channels.sh \ @@ -69,6 +70,7 @@ jobs: " - name: Generate TraceLens reports + working-directory: aorta run: | docker exec ${{ env.CONTAINER_NAME }} bash -c " pip install -r requirements.txt && \ @@ -76,6 +78,7 @@ jobs: " - name: Extract top GEMM kernels + working-directory: aorta run: | # Parse channels and threads into space-separated format CHANNELS=$(echo "${{ github.event.inputs.channels || '28,56' }}" | tr ',' ' ') @@ -94,32 +97,9 @@ jobs: uses: 
actions/upload-artifact@v4 with: name: gemm-sweep-results - path: ${{ steps.setup.outputs.sweep_dir }} + path: aorta/${{ steps.setup.outputs.sweep_dir }} retention-days: 30 - - name: Cleanup Docker container - if: always() - run: | - docker compose -f ${{ env.DOCKER_COMPOSE_FILE }} down || true - - visualization: - name: Generate Visualizations and Reports - needs: gemm-sweep - runs-on: [self-hosted, gpu, rocm] - timeout-minutes: 60 - env: - SWEEP_DIR: ${{ needs.gemm-sweep.outputs.sweep_dir }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Download sweep results - uses: actions/download-artifact@v4 - with: - name: gemm-sweep-results - path: ${{ env.SWEEP_DIR }} - - name: Set up Python uses: actions/setup-python@v5 with: @@ -127,95 +107,86 @@ jobs: cache: 'pip' - name: Install Python dependencies + working-directory: aorta run: | - pip install -r requirements.txt + docker exec ${{ env.CONTAINER_NAME }} bash -c "pip install -r requirements.txt" - name: Generate variance plots + working-directory: aorta run: | + docker exec ${{ env.CONTAINER_NAME }} bash -c " python scripts/gemm_analysis/plot_gemm_variance.py \ - --csv-path ${{ env.SWEEP_DIR }}/tracelens_analysis/top5_gemm_kernels_time_variance.csv \ - --output-dir ${{ env.SWEEP_DIR }}/tracelens_analysis/plots + --csv-path ${{ steps.setup.outputs.sweep_dir }}/tracelens_analysis/top5_gemm_kernels_time_variance.csv \ + --output-dir ${{ steps.setup.outputs.sweep_dir }}/tracelens_analysis/plots" - name: Add timestamp information + working-directory: aorta run: | + docker exec ${{ env.CONTAINER_NAME }} bash -c " python scripts/gemm_analysis/enhance_gemm_variance_with_timestamps.py \ - --input-csv ${{ env.SWEEP_DIR }}/tracelens_analysis/top5_gemm_kernels_time_variance.csv \ - --base-path ${{ env.SWEEP_DIR }} + --input-csv ${{ steps.setup.outputs.sweep_dir }}/tracelens_analysis/top5_gemm_kernels_time_variance.csv \ + --base-path ${{ steps.setup.outputs.sweep_dir }}" - name: Analyze 
collective overlap + working-directory: aorta run: | + docker exec ${{ env.CONTAINER_NAME }} bash -c " python scripts/gemm_analysis/gemm_report_with_collective_overlap.py \ - --input-csv ${{ env.SWEEP_DIR }}/tracelens_analysis/top5_gemm_kernels_time_variance_with_timestamps.csv \ - --tracelens-path ${{ env.SWEEP_DIR }}/tracelens_analysis + --input-csv ${{ steps.setup.outputs.sweep_dir }}/tracelens_analysis/top5_gemm_kernels_time_variance_with_timestamps.csv \ + --tracelens-path ${{ steps.setup.outputs.sweep_dir }}/tracelens_analysis" - name: Process GPU timeline + working-directory: aorta run: | + docker exec ${{ env.CONTAINER_NAME }} bash -c " python scripts/gemm_analysis/process_gpu_timeline.py \ - --sweep-dir ${{ env.SWEEP_DIR }} + --sweep-dir ${{ steps.setup.outputs.sweep_dir }}" - name: Process NCCL communication data + working-directory: aorta run: | + docker exec ${{ env.CONTAINER_NAME }} bash -c " python scripts/gemm_analysis/process_comms.py \ - --sweep-dir ${{ env.SWEEP_DIR }} + --sweep-dir ${{ steps.setup.outputs.sweep_dir }}" + + - name: Stop Docker container + if: always() + working-directory: aorta + run: | + docker compose -f ${{ env.DOCKER_COMPOSE_FILE }} down - name: Upload analysis results uses: actions/upload-artifact@v4 with: name: gemm-analysis-results path: | - ${{ env.SWEEP_DIR }}/tracelens_analysis/plots/ - ${{ env.SWEEP_DIR }}/tracelens_analysis/*.csv - ${{ env.SWEEP_DIR }}/tracelens_analysis/*.xlsx + aorta/${{ steps.setup.outputs.sweep_dir }}/tracelens_analysis/plots/ + aorta/${{ steps.setup.outputs.sweep_dir }}/tracelens_analysis/*.csv + aorta/${{ steps.setup.outputs.sweep_dir }}/tracelens_analysis/*.xlsx retention-days: 30 - comparison-report: - name: Generate Comparison Report - needs: [gemm-sweep, visualization] - runs-on: ubuntu-latest - if: github.event_name == 'workflow_dispatch' - env: - SWEEP_DIR: ${{ needs.gemm-sweep.outputs.sweep_dir }} - steps: - - name: Checkout repository + - name: Checkout aorta-report repository uses: 
actions/checkout@v4 - - - name: Download analysis results - uses: actions/download-artifact@v4 - with: - name: gemm-analysis-results - path: ${{ env.SWEEP_DIR }}/tracelens_analysis - - - name: Set up Python - uses: actions/setup-python@v5 with: - python-version: '3.10' - cache: 'pip' + repository: ROCm/aorta-report + ref: main + token: ${{ secrets.AORTA_REPORT_GITHUB_TOKEN }} + path: aorta-report - - name: Install Python dependencies + - name: Create date directory and copy sweep results run: | - pip install -r requirements.txt + date=$(date '+%Y-%m-%d') + mkdir -p aorta-report/${date}/gemm-sweep + cp -r aorta/${{ steps.setup.outputs.sweep_dir }}/* aorta-report/${date}/gemm-sweep/ - - name: Generate summary report + - name: Push results to aorta-report + working-directory: aorta-report run: | - echo "## GEMM Sweep Analysis Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Configuration" >> $GITHUB_STEP_SUMMARY - echo "- **Sweep Directory**: ${{ env.SWEEP_DIR }}" >> $GITHUB_STEP_SUMMARY - echo "- **Channels**: ${{ github.event.inputs.channels || '28,56' }}" >> $GITHUB_STEP_SUMMARY - echo "- **Threads**: ${{ github.event.inputs.threads || '256,512' }}" >> $GITHUB_STEP_SUMMARY - echo "- **Top-K Kernels**: ${{ github.event.inputs.top_k || '5' }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Generated Artifacts" >> $GITHUB_STEP_SUMMARY - echo "- Variance plots (box plots, violin plots)" >> $GITHUB_STEP_SUMMARY - echo "- GEMM kernels with timestamps" >> $GITHUB_STEP_SUMMARY - echo "- Collective overlap analysis" >> $GITHUB_STEP_SUMMARY - echo "- GPU timeline data" >> $GITHUB_STEP_SUMMARY - echo "- NCCL communication data" >> $GITHUB_STEP_SUMMARY - - - name: Upload final report - uses: actions/upload-artifact@v4 - with: - name: gemm-final-report - path: ${{ env.SWEEP_DIR }}/ - retention-days: 90 + git config user.name "GitHub Actions Bot" + git config user.email "<>" + git pull --rebase origin main + date=$(date 
'+%Y-%m-%d') + git add ${date} + git commit -m "Add GEMM sweep results for ${date}" + git push origin main diff --git a/.github/workflows/rccl-warp-speed-analysis.yml b/.github/workflows/rccl-warp-speed-analysis.yml new file mode 100644 index 0000000..15e4bdb --- /dev/null +++ b/.github/workflows/rccl-warp-speed-analysis.yml @@ -0,0 +1,297 @@ +name: RCCL Warp Speed Performance Testing + +on: + schedule: + # Run every other day at midnight UTC + - cron: '0 0 */2 * *' + workflow_dispatch: + inputs: + config_pairs: + description: 'Space-separated CU_count,threads pairs (e.g., "56,256 37,384 32,512")' + required: true + default: '56,256 37,384 32,512' + baseline_config: + description: 'Baseline configuration (CU,threads format, e.g., "56,256")' + required: true + default: '56,256' + training_config: + description: 'Path to training config YAML' + required: false + default: 'config/single_node/gemm_overlap_comm.yaml' + gpu_target: + description: 'GPU architecture target (run "rocminfo | grep gfx" to find)' + required: true + default: 'gfx950' + rccl_branch: + description: 'RCCL branch to test' + required: false + default: 'warp_speed_v1' + + +env: + DOCKER_COMPOSE_FILE: docker/rccl_test/docker-compose.rocm70_9-1.yaml + CONTAINER_NAME: training-overlap-bugs-rocm70_9-1 + +jobs: + rccl-warp-speed-test: + name: Run RCCL Warp Speed Performance Tests and Analysis + runs-on: self-hosted + timeout-minutes: 300 + + steps: + - name: Checkout AORTA repository + uses: actions/checkout@v4 + with: + repository: ROCm/aorta + ref: ${{ github.ref }} + path: aorta + + - name: Cleanup existing container + working-directory: aorta + run: | + # Stop and remove any existing container with the same name + docker stop ${{ env.CONTAINER_NAME }} 2>/dev/null || true + docker rm ${{ env.CONTAINER_NAME }} 2>/dev/null || true + # Also try docker compose down in case it was started via compose + docker compose -f ${{ env.DOCKER_COMPOSE_FILE }} down 2>/dev/null || true + + - name: Build Docker 
container + working-directory: aorta + run: | + docker compose version + docker login -u rocmshared -p ${{ secrets.ROCM_SHARED_KEY }} + docker compose -f ${{ env.DOCKER_COMPOSE_FILE }} build + docker compose -f ${{ env.DOCKER_COMPOSE_FILE }} up -d + + - name: Clone and build RCCL warp_speed branch + working-directory: aorta + run: | + docker exec ${{ env.CONTAINER_NAME }} bash -c " + mkdir -p /rccl && cd /rccl + + if [ -d 'rccl' ]; then + cd rccl + git fetch origin + git checkout ${{ github.event.inputs.rccl_branch || 'warp_speed_v1' }} + git pull + else + git clone --recursive https://github.com/mustafabar/rccl.git + cd rccl + git checkout ${{ github.event.inputs.rccl_branch || 'warp_speed_v1' }} + fi + + echo 'Building RCCL with GPU target: ${{ github.event.inputs.gpu_target || 'gfx950' }}' + ./install.sh -l --amdgpu_targets=${{ github.event.inputs.gpu_target || 'gfx950' }} + + # Verify build + echo 'RCCL build completed. Library location:' + ls -la /rccl/rccl/build/release/ || echo 'Build directory not found' + " + + - name: Install Python dependencies + working-directory: aorta + run: | + docker exec ${{ env.CONTAINER_NAME }} bash -c " + pip install -r requirements.txt + pip install pandas openpyxl matplotlib seaborn numpy + " + + - name: Run RCCL warp speed comparison tests + working-directory: aorta + run: | + CONFIG_PAIRS="${{ github.event.inputs.config_pairs || '56,256 37,384 32,512' }}" + TRAINING_CONFIG="${{ github.event.inputs.training_config || 'config/single_node/gemm_overlap_comm.yaml' }}" + + docker exec ${{ env.CONTAINER_NAME }} bash -c " + # Set RCCL library path + export LD_LIBRARY_PATH=/rccl/rccl/build/release:\$LD_LIBRARY_PATH + + # Run the RCCL warp speed comparison script + bash ./scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh \ + -p \"$CONFIG_PAIRS\" \ + -c $TRAINING_CONFIG + " + + - name: Find experiment directory + id: find_experiment + working-directory: aorta + run: | + # Find the most recently created rccl_warp_speed 
experiment directory + EXPERIMENT_DIR=$(ls -td experiments/rccl_warp_speed_* 2>/dev/null | head -1) + if [ -z "$EXPERIMENT_DIR" ]; then + echo "Error: No experiment directory found" + exit 1 + fi + echo "Found experiment directory: $EXPERIMENT_DIR" + echo "experiment_dir=$EXPERIMENT_DIR" >> $GITHUB_OUTPUT + + - name: Run pairwise comparison analysis + working-directory: aorta + run: | + BASELINE_CONFIG="${{ github.event.inputs.baseline_config || '56,256' }}" + CONFIG_PAIRS="${{ github.event.inputs.config_pairs || '56,256 37,384 32,512' }}" + EXPERIMENT_DIR="${{ steps.find_experiment.outputs.experiment_dir }}" + + # Parse baseline + BASELINE_CU=$(echo $BASELINE_CONFIG | cut -d',' -f1) + BASELINE_THREADS=$(echo $BASELINE_CONFIG | cut -d',' -f2) + BASELINE_DIR="${EXPERIMENT_DIR}/${BASELINE_CU}cu_${BASELINE_THREADS}threads" + + docker exec ${{ env.CONTAINER_NAME }} bash -c " + OUTPUT_DIR=\"${EXPERIMENT_DIR}/comparison_results\" + mkdir -p \"\$OUTPUT_DIR\" + + # Run comparison for each non-baseline configuration + for pair in $CONFIG_PAIRS; do + CU_COUNT=\$(echo \$pair | cut -d',' -f1) + THREADS=\$(echo \$pair | cut -d',' -f2) + + # Skip if this is the baseline + if [ \"\$CU_COUNT\" = \"$BASELINE_CU\" ] && [ \"\$THREADS\" = \"$BASELINE_THREADS\" ]; then + continue + fi + + TEST_DIR=\"${EXPERIMENT_DIR}/\${CU_COUNT}cu_\${THREADS}threads\" + COMPARISON_OUTPUT=\"\$OUTPUT_DIR/baseline_vs_\${CU_COUNT}cu_\${THREADS}threads\" + + echo '========================================' + echo \"Comparing baseline ($BASELINE_CU cu, $BASELINE_THREADS threads) vs test (\$CU_COUNT cu, \$THREADS threads)\" + echo '========================================' + + python scripts/tracelens_single_config/run_full_analysis.py \ + --baseline \"$BASELINE_DIR\" \ + --test \"\$TEST_DIR\" \ + --output \"\$COMPARISON_OUTPUT\" \ + --all + done + " + + - name: Run compare-all-runs analysis + working-directory: aorta + run: | + BASELINE_CONFIG="${{ github.event.inputs.baseline_config || '56,256' }}" + 
CONFIG_PAIRS="${{ github.event.inputs.config_pairs || '56,256 37,384 32,512' }}" + EXPERIMENT_DIR="${{ steps.find_experiment.outputs.experiment_dir }}" + + # Parse baseline + BASELINE_CU=$(echo $BASELINE_CONFIG | cut -d',' -f1) + BASELINE_THREADS=$(echo $BASELINE_CONFIG | cut -d',' -f2) + BASELINE_DIR="${EXPERIMENT_DIR}/${BASELINE_CU}cu_${BASELINE_THREADS}threads" + + # Build list of all test directories (excluding baseline) + TEST_DIRS="" + for pair in $CONFIG_PAIRS; do + CU_COUNT=$(echo $pair | cut -d',' -f1) + THREADS=$(echo $pair | cut -d',' -f2) + + # Skip if this is the baseline + if [ "$CU_COUNT" = "$BASELINE_CU" ] && [ "$THREADS" = "$BASELINE_THREADS" ]; then + continue + fi + + TEST_DIRS="$TEST_DIRS ${EXPERIMENT_DIR}/${CU_COUNT}cu_${THREADS}threads" + done + + echo "========================================" + echo "Comparing all runs together" + echo " Baseline: $BASELINE_DIR" + echo " Test directories: $TEST_DIRS" + echo "========================================" + + docker exec ${{ env.CONTAINER_NAME }} bash -c " + OUTPUT_DIR=\"${EXPERIMENT_DIR}/compare_all_runs\" + mkdir -p \"\$OUTPUT_DIR\" + + python scripts/tracelens_single_config/run_full_analysis.py \ + --baseline \"$BASELINE_DIR\" \ + --test $TEST_DIRS \ + --output \"\$OUTPUT_DIR\" \ + --skip-tracelens \ + --compare-all-runs + " + + - name: Generate GitHub Step Summary + run: | + BASELINE_CONFIG="${{ github.event.inputs.baseline_config || '56,256' }}" + CONFIG_PAIRS="${{ github.event.inputs.config_pairs || '56,256 37,384 32,512' }}" + EXPERIMENT_DIR="${{ steps.find_experiment.outputs.experiment_dir }}" + + echo "## RCCL Warp Speed Performance Analysis Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Configuration" >> $GITHUB_STEP_SUMMARY + echo "- **Experiment Directory**: $EXPERIMENT_DIR" >> $GITHUB_STEP_SUMMARY + echo "- **Baseline**: $BASELINE_CONFIG (CU,Threads)" >> $GITHUB_STEP_SUMMARY + echo "- **RCCL Branch**: ${{ github.event.inputs.rccl_branch || 
'warp_speed_v1' }}" >> $GITHUB_STEP_SUMMARY + echo "- **GPU Target**: ${{ github.event.inputs.gpu_target || 'gfx950' }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Tested Configurations" >> $GITHUB_STEP_SUMMARY + for pair in $CONFIG_PAIRS; do + CU=$(echo $pair | cut -d',' -f1) + THREADS=$(echo $pair | cut -d',' -f2) + echo "- CU=$CU, Threads=$THREADS" >> $GITHUB_STEP_SUMMARY + done + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Generated Artifacts" >> $GITHUB_STEP_SUMMARY + echo "- TraceLens individual reports (per configuration)" >> $GITHUB_STEP_SUMMARY + echo "- TraceLens collective reports (all ranks)" >> $GITHUB_STEP_SUMMARY + echo "- Pairwise comparison reports (baseline vs each test)" >> $GITHUB_STEP_SUMMARY + echo "- Compare-all-runs merged report (all configurations)" >> $GITHUB_STEP_SUMMARY + echo "- GPU timeline comparison" >> $GITHUB_STEP_SUMMARY + echo "- NCCL/Collective comparison" >> $GITHUB_STEP_SUMMARY + echo "- Final analysis report (Excel)" >> $GITHUB_STEP_SUMMARY + echo "- Performance visualization plots" >> $GITHUB_STEP_SUMMARY + echo "- HTML performance report" >> $GITHUB_STEP_SUMMARY + + - name: Upload test results + uses: actions/upload-artifact@v4 + with: + name: rccl-warp-speed-results + path: aorta/${{ steps.find_experiment.outputs.experiment_dir }} + retention-days: 30 + + - name: Upload comparison results + uses: actions/upload-artifact@v4 + with: + name: rccl-comparison-results + path: | + aorta/${{ steps.find_experiment.outputs.experiment_dir }}/comparison_results/ + aorta/${{ steps.find_experiment.outputs.experiment_dir }}/compare_all_runs/ + retention-days: 30 + + - name: Upload final report + uses: actions/upload-artifact@v4 + with: + name: rccl-final-report + path: aorta/${{ steps.find_experiment.outputs.experiment_dir }}/ + retention-days: 90 + + - name: Cleanup Docker container + if: always() + working-directory: aorta + run: | + docker compose -f ${{ env.DOCKER_COMPOSE_FILE }} down || true + + - name: 
Checkout aorta-report repository + uses: actions/checkout@v4 + with: + repository: ROCm/aorta-report + ref: main + token: ${{ secrets.AORTA_REPORT_GITHUB_TOKEN }} + path: aorta-report + + - name: Create date directory and copy experiment results + run: | + date=$(date '+%Y-%m-%d') + mkdir -p aorta-report/${date}/rccl-warp-speed + cp -r aorta/${{ steps.find_experiment.outputs.experiment_dir }}/* aorta-report/${date}/rccl-warp-speed/ + + - name: Push results to aorta-report + working-directory: aorta-report + run: | + git config user.name "GitHub Actions Bot" + git config user.email "<>" + git pull --rebase origin main + date=$(date '+%Y-%m-%d') + git add ${date} + git commit -m "Add RCCL warp speed results for ${date}" + git push origin main diff --git a/docker/docker-compose.rocm70_9-1.yaml b/docker/docker-compose.rocm70_9-1.yaml index c4704ae..f87cbc6 100644 --- a/docker/docker-compose.rocm70_9-1.yaml +++ b/docker/docker-compose.rocm70_9-1.yaml @@ -20,8 +20,9 @@ services: - TORCH_NCCL_HIGH_PRIORITY=1 volumes: - - /home/manrao:/manrao - - /home/oyazdanb/aorta:/workspace/aorta + # Mount parent directory (aorta/) to /workspace/aorta + # Use AORTA_WORKSPACE env var if set, otherwise default to parent dir (..) 
+ - ${AORTA_WORKSPACE:-..}:/workspace/aorta devices: - /dev/kfd - /dev/dri diff --git a/docker/rccl_test/Dockerfile.rocm70_9-1 b/docker/rccl_test/Dockerfile.rocm70_9-1 new file mode 100644 index 0000000..1111ec3 --- /dev/null +++ b/docker/rccl_test/Dockerfile.rocm70_9-1 @@ -0,0 +1,41 @@ +# Start from the existing PyTorch ROCm image +FROM rocm/pytorch-private:20251030_rocm_e2e_phantom_mi350_genai_nightly + +# Switch to root to install packages +USER root + +# Install wget if not available (try both yum and apt-get) +RUN yum install -y wget 2>/dev/null || apt-get update && apt-get install -y wget || true + +# Download and install amdgpu-install package for RHEL +RUN wget https://artifactory-cdn.amd.com/artifactory/list/amdgpu-rpm/rhel/amdgpu-install-internal-7.0_9-1.noarch.rpm && \ + yum install -y ./amdgpu-install-internal-7.0_9-1.noarch.rpm || rpm -ivh ./amdgpu-install-internal-7.0_9-1.noarch.rpm && \ + rm amdgpu-install-internal-7.0_9-1.noarch.rpm + +# Update amdgpu-repo with specific builds +RUN amdgpu-repo --amdgpu-build=2247890 --rocm-build=compute-rocm-rel-7.0-meta/7 + +# Since base image already has ROCm, just update the key runtime components +# instead of doing a full reinstall which causes dependency conflicts +RUN yum update -y --skip-broken \ + rocm-hip \ + rocm-libs \ + rocm-hip-libraries \ + rocm-hip-runtime-devel \ + hip-base \ + hip-dev \ + hip-runtime-amd \ + rocm-core || echo "Updated available ROCm packages" + +RUN python3.10 -m pip install git+https://github.com/AMD-AGI/TraceLens.git +RUN python3.10 -m pip install openpyxl seaborn + +# Update environment variables to ensure new ROCm is used +ENV ROCM_HOME=/opt/rocm +ENV PATH=$ROCM_HOME/bin:$PATH +ENV LD_LIBRARY_PATH=$ROCM_HOME/lib:$LD_LIBRARY_PATH + +# Set working directory +WORKDIR /workspace/aorta + +CMD ["/bin/bash"] diff --git a/docker/rccl_test/docker-compose.rocm70_9-1.yaml b/docker/rccl_test/docker-compose.rocm70_9-1.yaml new file mode 100644 index 0000000..e3071bc --- /dev/null +++ 
b/docker/rccl_test/docker-compose.rocm70_9-1.yaml @@ -0,0 +1,39 @@ +services: + torchenv-rocm70: + container_name: training-overlap-bugs-rocm70_9-1 + build: + context: . + dockerfile: Dockerfile.rocm70_9-1 + user: root + privileged: true + network_mode: host + group_add: + - video + ipc: host + cap_add: + - SYS_PTRACE + security_opt: + - seccomp=unconfined + environment: + - RCCL_FOLDER=/rccl/rccl + - LD_LIBRARY_PATH=/rccl/rccl/build/release:${LD_LIBRARY_PATH:-} + - TORCH_NCCL_HIGH_PRIORITY=1 + + volumes: + # Mount aorta/ directory to /workspace/aorta + # From docker/rccl_test/, go up two levels to reach aorta/ + # Use AORTA_WORKSPACE env var if set, otherwise default to ../.. + - ${AORTA_WORKSPACE:-../..}:/workspace/aorta + # RCCL build directory - use relative path for bind mount + # Use RCCL_WORKSPACE env var if set, otherwise default to ./rccl_build + - ${RCCL_WORKSPACE:-./rccl_build}:/rccl + devices: + - /dev/kfd + - /dev/dri + working_dir: /workspace/aorta + shm_size: 17G + ulimits: + memlock: -1 + stack: 67108864 + stdin_open: true + tty: true diff --git a/scripts/tracelens_single_config/run_full_analysis.py b/scripts/tracelens_single_config/run_full_analysis.py index 8d4a416..75591e4 100644 --- a/scripts/tracelens_single_config/run_full_analysis.py +++ b/scripts/tracelens_single_config/run_full_analysis.py @@ -292,7 +292,7 @@ def main(): temp_test_paths.append(baseline_path) for test_path in temp_test_paths: if not run_tracelens_analysis( - test_path, test_path.name, args.individual_only, args.collective_only + str(test_path), test_path.name, args.individual_only, args.collective_only ): return 1 else: diff --git a/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh b/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh new file mode 100644 index 0000000..7df1dd3 --- /dev/null +++ b/scripts/tracelens_single_config/run_rccl_warp_speed_comparison.sh @@ -0,0 +1,334 @@ +#!/bin/bash + +# Compare specific RCCL Warp Speed configurations 
+# Usage: ./run_rccl_warp_speed_comparison.sh [OPTIONS] +# -c CONFIG_FILE Config file (default: config/distributed.yaml) +# -p PAIRS CU,threads pairs (e.g., "56,256 37,384 32,512") +# -h Show help +# +# Examples: +# # Use default 3 configurations +# ./run_rccl_warp_speed_comparison.sh +# +# # Custom configurations +# ./run_rccl_warp_speed_comparison.sh -p "56,256 37,384 32,512" +# +# # Different config file with custom pairs +# ./run_rccl_warp_speed_comparison.sh -c myconfig.yaml -p "40,256 30,512" + +CONFIG_FILE="config/distributed.yaml" +CUSTOM_PAIRS="" + +# Parse command line arguments +while getopts "c:p:h" opt; do + case $opt in + c) + CONFIG_FILE="$OPTARG" + ;; + p) + CUSTOM_PAIRS="$OPTARG" + ;; + h) + echo "Usage: $0 [OPTIONS]" + echo " -c CONFIG_FILE Config file (default: config/single_node/gemm_overlap_comm.yaml)" + echo " -p PAIRS CU,threads pairs (e.g., \"56,256 37,384 32,512\")" + echo " -h Show help" + echo "" + echo "Examples:" + echo " # Use default 3 configurations" + echo " $0" + echo "" + echo " # Custom configurations" + echo " $0 -p \"56,256 37,384 32,512\"" + echo "" + echo " # Different config file with custom pairs" + echo " $0 -c myconfig.yaml -p \"40,256 30,512\"" + exit 0 + ;; + \?) 
+ echo "Invalid option: -$OPTARG" + exit 1 + ;; + esac +done +BASE_CMD="torchrun --nproc_per_node 8 train.py --config ${CONFIG_FILE}" +BASE_OVERRIDES="--override training.max_steps=100 --override profiling.tensorboard=false" + +# Base output directory +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +BASE_OUTPUT_DIR="experiments/rccl_warp_speed_${TIMESTAMP}" + +# Create base output directory +mkdir -p "${BASE_OUTPUT_DIR}" + +# Log file +SWEEP_LOG="${BASE_OUTPUT_DIR}/rccl_warp_speed_comparison_${TIMESTAMP}.log" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# Function to log with timestamp +log() { + local message="$1" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[${timestamp}] ${message}" | tee -a "${SWEEP_LOG}" +} + +# Cleanup function for Ctrl+C +cleanup() { + echo "" + echo -e "${RED}=== Caught interrupt signal (Ctrl+C) ===${NC}" | tee -a "${SWEEP_LOG}" + log "Cleaning up all training processes..." + sudo pkill -9 -f "train.py" 2>/dev/null || true + sudo pkill -9 -f "torchrun" 2>/dev/null || true + log "Cleanup complete. Exiting." 
+ exit 130 +} + +trap cleanup SIGINT SIGTERM + +echo -e "${GREEN}=== RCCL Warp Speed Configuration Comparison ===${NC}" | tee "${SWEEP_LOG}" +log "Config file: ${CONFIG_FILE}" +log "Results directory: ${BASE_OUTPUT_DIR}" +echo "" + +# Check RCCL version and configuration +echo -e "${BLUE}=== RCCL Version Check ===${NC}" | tee -a "${SWEEP_LOG}" + +# Check if custom RCCL is available +if [ -d "/opt/rccl/build/release" ]; then + echo -e "${GREEN}[OK] Custom RCCL found at /opt/rccl/build/release${NC}" | tee -a "${SWEEP_LOG}" + + # Check branch and commit + if [ -d "/opt/rccl/.git" ]; then + cd /opt/rccl + RCCL_BRANCH=$(git branch --show-current 2>/dev/null) + RCCL_COMMIT=$(git log --oneline -1 2>/dev/null) + cd - > /dev/null + + echo " Branch: ${RCCL_BRANCH}" | tee -a "${SWEEP_LOG}" + echo " Commit: ${RCCL_COMMIT}" | tee -a "${SWEEP_LOG}" + + # Verify it's warp_speed_v1 + if [[ "${RCCL_BRANCH}" == "warp_speed_v1" ]]; then + echo -e " ${GREEN}[OK] Using warp_speed_v1 branch${NC}" | tee -a "${SWEEP_LOG}" + else + echo -e " ${YELLOW}[WARNING] Not on warp_speed_v1 branch (current: ${RCCL_BRANCH})${NC}" | tee -a "${SWEEP_LOG}" + fi + fi + + # Check library size to verify it's built + RCCL_LIB_SIZE=$(ls -lh /opt/rccl/build/release/librccl.so.1.0 2>/dev/null | awk '{print $5}') + echo " Library size: ${RCCL_LIB_SIZE}" | tee -a "${SWEEP_LOG}" +else + echo -e "${YELLOW}[WARNING] Custom RCCL not found, will use PyTorch bundled version${NC}" | tee -a "${SWEEP_LOG}" + echo " PyTorch's bundled RCCL may not have warp_speed features!" | tee -a "${SWEEP_LOG}" +fi + +# Test if RCCL responds to warp_speed environment variables +echo "" | tee -a "${SWEEP_LOG}" +echo "Testing warp_speed environment variable response..." 
| tee -a "${SWEEP_LOG}" +export RCCL_WARP_SPEED_ENABLE=1 +export RCCL_WARP_SPEED_CU_COUNT=56 +export NCCL_DEBUG=VERSION + +python -c " +import torch +print('PyTorch version:', torch.__version__) +if torch.cuda.is_available(): + print('ROCm/CUDA available:', True) + print('Device count:', torch.cuda.device_count()) +" 2>&1 | tee -a "${SWEEP_LOG}" + +# Clean up test variables +unset RCCL_WARP_SPEED_CU_COUNT +unset NCCL_DEBUG + +echo -e "${BLUE}===========================${NC}" | tee -a "${SWEEP_LOG}" +echo "" + +# Define configurations to test +# Format: "NAME|CU_COUNT|THREADS_PER_BLOCK" +if [ -n "$CUSTOM_PAIRS" ]; then + # Parse custom pairs + CONFIGS=() + for pair in $CUSTOM_PAIRS; do + IFS=',' read -r cu threads <<< "$pair" + CONFIGS+=("${cu}cu_${threads}threads|${cu}|${threads}") + done + log "Using custom configurations: ${CUSTOM_PAIRS}" +else + # Use default configurations + CONFIGS=( + "56cu_256threads|56|256" + "37cu_384threads|37|384" + "32cu_512threads|32|512" + ) + log "Using default RCCL Warp Speed configurations" +fi + +# Track results +declare -A RUN_STATUS +declare -A RUN_TIMES + +# Run each configuration +for config in "${CONFIGS[@]}"; do + IFS='|' read -r NAME CU_COUNT THREADS <<< "$config" + + OUTPUT_DIR="${BASE_OUTPUT_DIR}/${NAME}" + + echo -e "${YELLOW}========================================${NC}" | tee -a "${SWEEP_LOG}" + log "Running configuration: ${NAME}" + log " RCCL_WARP_SPEED_CU_COUNT=${CU_COUNT}" + log " RCCL_THREADS_PER_BLOCK=${THREADS}" + log " Output directory: ${OUTPUT_DIR}" + echo -e "${YELLOW}========================================${NC}" | tee -a "${SWEEP_LOG}" + + # Create output directory + mkdir -p "${OUTPUT_DIR}" + + # Record start time + START_TIME=$(date +%s) + + # [FIX] Disable GPU core dumps to prevent @gpucore files + ulimit -c 0 # Disable core dumps entirely + export HSA_ENABLE_COREDUMP=0 # Disable AMD GPU core dumps + + # [FIX] Reduce NCCL timeout to fail faster instead of waiting 10 minutes + export NCCL_TIMEOUT=60 # 
Timeout after 60 seconds instead of 600 seconds + + # Export environment variables so child processes inherit them + export RCCL_WARP_SPEED_ENABLE=1 + export RCCL_UNROLL_FACTOR=1 + export RCCL_WARP_SPEED_CU_COUNT=${CU_COUNT} + export RCCL_THREADS_PER_BLOCK=${THREADS} + export HSA_ENABLE_SDMA=0 + + # Add debugging to catch errors better + export AMD_SERIALIZE_KERNEL=3 # Better error reporting + export HIP_LAUNCH_BLOCKING=1 # Synchronous kernel launches + + # [FIX] ROCm Profiler Configuration + # When enabled, redirect output to experiment directory instead of /tmp + export PYTORCH_ROCM_PROFILER_ENABLE_TRACING=0 # Set to 1 only when needed for profiling + + # If profiling is enabled, redirect output files away from /tmp + if [[ "${PYTORCH_ROCM_PROFILER_ENABLE_TRACING}" == "1" ]]; then + export ROCPROFILER_OUTPUT_PATH="${OUTPUT_DIR}/rocprofiler" + export ROCPROFILER_LOG_PATH="${OUTPUT_DIR}/rocprofiler" + export HSA_TOOLS_LIB_PATH="${OUTPUT_DIR}/rocprofiler" + mkdir -p "${OUTPUT_DIR}/rocprofiler" + log " [WARN] ROCm profiling enabled - output redirected to ${OUTPUT_DIR}/rocprofiler" + log " [WARN] This will generate large trace files (several GB per rank)" + fi + + # [FIX] Disable WarpSpeed Auto Mode - causes non-deterministic behavior + #export RCCL_WARP_SPEED_AUTO_MODE=0 + + # Use custom RCCL if available + if [ -d "/opt/rccl/build/release" ]; then + export LD_LIBRARY_PATH=/opt/rccl/build/release:${LD_LIBRARY_PATH:-} + log " Using custom RCCL from /opt/rccl/build/release" + fi + + # Run the command + ${BASE_CMD} ${BASE_OVERRIDES} \ + --override training.output_dir=${OUTPUT_DIR} \ + 2>&1 | tee "${OUTPUT_DIR}/run_output.log" + + EXIT_CODE=${PIPESTATUS[0]} + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + + # Unset environment variables to avoid affecting next run + unset RCCL_WARP_SPEED_CU_COUNT + unset RCCL_THREADS_PER_BLOCK + + RUN_TIMES[${NAME}]=${DURATION} + + if [ $EXIT_CODE -eq 0 ]; then + log "[OK] Completed ${NAME} (duration: ${DURATION}s)" + 
RUN_STATUS[${NAME}]="SUCCESS" + else + log "[ERROR] Failed ${NAME} (exit code: $EXIT_CODE, duration: ${DURATION}s)" + RUN_STATUS[${NAME}]="FAILED" + + # [FIX] Clean up any hanging GPU processes after failure + log " Cleaning up any remaining GPU processes..." + pkill -9 -f "train.py" 2>/dev/null || true + sleep 2 + fi + + # Fix permissions if running as root in container + if [ "$EUID" -eq 0 ]; then + chmod -R 755 "${OUTPUT_DIR}" 2>/dev/null || true + fi + + echo "" + log "Waiting 5 seconds before next run..." + sleep 5 +done + +# Generate summary report +echo -e "${BLUE}========================================${NC}" | tee -a "${SWEEP_LOG}" +echo -e "${BLUE}SUMMARY REPORT${NC}" | tee -a "${SWEEP_LOG}" +echo -e "${BLUE}========================================${NC}" | tee -a "${SWEEP_LOG}" + +SUMMARY_FILE="${BASE_OUTPUT_DIR}/rccl_warp_speed_summary_${TIMESTAMP}.txt" +{ + echo "RCCL Warp Speed Configuration Comparison" + echo "Generated: $(date)" + echo "" + printf "%-20s %-10s %-15s %-10s\n" "CONFIGURATION" "CU_COUNT" "THREADS" "STATUS" + echo "----------------------------------------------------------------" + + for config in "${CONFIGS[@]}"; do + IFS='|' read -r NAME CU_COUNT THREADS <<< "$config" + STATUS="${RUN_STATUS[${NAME}]:-UNKNOWN}" + DURATION="${RUN_TIMES[${NAME}]:-N/A}" + printf "%-20s %-10s %-15s %-10s (duration: %ss)\n" "${NAME}" "${CU_COUNT}" "${THREADS}" "${STATUS}" "${DURATION}" + done + + echo "" + echo "Output directories:" + for config in "${CONFIGS[@]}"; do + IFS='|' read -r NAME CU_COUNT THREADS <<< "$config" + echo " ${NAME}: ${BASE_OUTPUT_DIR}/${NAME}/" + done + + echo "" + echo "Trace files for each configuration:" + for config in "${CONFIGS[@]}"; do + IFS='|' read -r NAME CU_COUNT THREADS <<< "$config" + echo " ${NAME}: ${BASE_OUTPUT_DIR}/${NAME}/torch_profiler/" + done +} | tee "${SUMMARY_FILE}" + +log "Summary saved to: ${SUMMARY_FILE}" + +# Fix permissions for the entire output directory if running as root +if [ "$EUID" -eq 0 ]; then + 
echo "Fixing permissions for output directory..." | tee -a "${SWEEP_LOG}" + chmod -R 755 "${BASE_OUTPUT_DIR}" 2>/dev/null || true +fi + +echo "" +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Next Steps: Run TraceLens Analysis${NC}" +echo -e "${GREEN}========================================${NC}" +echo "" +echo "To analyze and compare these configurations:" +echo "" +echo "./scripts/tracelens_single_config/run_tracelens_analysis.sh ${BASE_OUTPUT_DIR}" +echo "" +echo "This will generate:" +echo " - Individual reports for each rank (all 3 configs)" +echo " - Collective reports (all 3 configs)" +echo " - Comparison reports across the 3 configurations" +echo "" + +log "All runs completed! Run TraceLens analysis next." diff --git a/scripts/tracelens_with_gemm_patch.py b/scripts/tracelens_with_gemm_patch.py new file mode 100644 index 0000000..80227db --- /dev/null +++ b/scripts/tracelens_with_gemm_patch.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +""" +AUTHOR: oyazdanb +TraceLens with GEMM Recognition Patches + +This script applies GEMM recognition patches and runs TraceLens commands. + +Usage: + python tracelens_with_gemm_patch.py generate_perf_report [args...] + python tracelens_with_gemm_patch.py generate_multi_rank_collective [args...] + python tracelens_with_gemm_patch.py compare_perf_reports [args...] +""" + +import argparse +import re +import sys + + +def apply_gemm_patches(): + """Apply all GEMM recognition patches to TraceLens.""" + + print("Applying TraceLens GEMM recognition patches...") + + # Patch kernel_name_parser for enhanced ROCm GEMM recognition + try: + from TraceLens.PerfModel import kernel_name_parser + + def patched_is_rocm_gemm(kernel_name): + """ + Enhanced ROCm GEMM pattern matching for Tensile kernels. + Recognizes: Cijk_Alik_Bljk_... and variants with arbitrary prefixes. 
+ """ + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + return bool(re.match(pattern, kernel_name)) + + def patched_parse_rocm_gemm(kernel_name): + """Parse ROCm GEMM kernel details.""" + # Parse transpose flags + trans_a, trans_b = None, None + if "_Ailk_" in kernel_name: + trans_a = False + elif "_Alik_" in kernel_name: + trans_a = True + if "_Bljk_" in kernel_name: + trans_b = False + elif "_Bjlk_" in kernel_name: + trans_b = True + + # Parse macro tile size (MT64x16x64) + macro_tile_match = re.search(r"MT(\d+)x(\d+)x(\d+)", kernel_name) + if macro_tile_match: + mt_m = int(macro_tile_match.group(1)) + mt_n = int(macro_tile_match.group(2)) + depth_u = int(macro_tile_match.group(3)) + else: + mt_m, mt_n, depth_u = None, None, None + + return { + "transpose": (trans_a, trans_b), + "mt_m": mt_m, + "mt_n": mt_n, + "depth_u": depth_u, + } + + def patched_gemm_name_parser(kernel_name): + """Enhanced GEMM name parser with better ROCm support.""" + if patched_is_rocm_gemm(kernel_name): + return patched_parse_rocm_gemm(kernel_name) + elif kernel_name_parser.is_cuda_gemm(kernel_name): + return kernel_name_parser.parse_cuda_gemm(kernel_name) + return None + + kernel_name_parser.is_rocm_gemm = patched_is_rocm_gemm + kernel_name_parser.parse_rocm_gemm = patched_parse_rocm_gemm + kernel_name_parser.gemm_name_parser = patched_gemm_name_parser + + print(" [OK] Patched kernel_name_parser (ROCm GEMM recognition)") + except ImportError as e: + print(f" [WARN] Could not patch kernel_name_parser: {e}") + + # Patch Trace2Tree util for is_gemm_kernel function + try: + from TraceLens.Trace2Tree import util as trace_util + + def patched_is_gemm_kernel(kernel_event: dict) -> bool: + """Enhanced GEMM kernel detection.""" + assert kernel_event["cat"] == "kernel" + kernel_name = kernel_event["name"] + + # ROCm Tensile GEMM pattern: C[xyz]_A[xyz]_B[xyz] + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + is_rocm_gemm = bool(re.match(pattern, kernel_name)) + + # CUDA GEMM pattern + 
is_cuda_gemm = kernel_name.startswith("nvjet") or "cublasLt" in kernel_name + + return is_rocm_gemm or is_cuda_gemm + + trace_util.is_gemm_kernel = patched_is_gemm_kernel + print(" [OK] Patched Trace2Tree.util (is_gemm_kernel)") + except ImportError as e: + print(f" [WARN] Could not patch Trace2Tree.util: {e}") + + # Patch TraceEventUtils to enhance GEMM keys + try: + from TraceLens import util as tracelens_util + + if hasattr(tracelens_util, "TraceEventUtils"): + if hasattr(tracelens_util.TraceEventUtils, "JaxOpKeys"): + original_gemm_keys = tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys + enhanced_gemm_keys = [ + "Cijk", + "gemm", + "nvjet", + "cublasLt", + "C[a-z]{3}_A[a-z]{3}_B[a-z]{3}", + ] + + all_keys = list(set(original_gemm_keys + enhanced_gemm_keys)) + tracelens_util.TraceEventUtils.JaxOpKeys.GemmKeys = all_keys + + print(" [OK] Patched TraceEventUtils.JaxOpKeys (GEMM keys enhanced)") + except (ImportError, AttributeError) as e: + print(f" [WARN] Could not patch TraceEventUtils: {e}") + + # Patch torch_op_mapping for better categorization + try: + from TraceLens.PerfModel import torch_op_mapping + + original_categorize = torch_op_mapping.categorize_torch_op + + def patched_categorize_torch_op(row): + """Enhanced categorization with better GEMM detection.""" + result = original_categorize(row) + + # If result is 'other', check for GEMM patterns in kernel names + if result == "other" and "kernel_details" in row and len(row["kernel_details"]) > 0: + kernel_name = row["kernel_details"][0]["name"] + pattern = r"^.*C[a-z]{3}_A[a-z]{3}_B[a-z]{3}.*$" + if re.match(pattern, kernel_name): + return "GEMM" + + return result + + torch_op_mapping.categorize_torch_op = patched_categorize_torch_op + print(" [OK] Patched torch_op_mapping (categorize_torch_op)") + except ImportError as e: + print(f" [WARN] Could not patch torch_op_mapping: {e}") + + print("[OK] All GEMM patches applied successfully!\n") + + +def create_parser(): + """Create argument parser with 
subcommands."""
+    parser = argparse.ArgumentParser(
+        prog="tracelens_with_gemm_patch.py",
+        description="TraceLens with GEMM Recognition Patches - Apply GEMM recognition patches and run TraceLens commands.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python tracelens_with_gemm_patch.py generate_perf_report --trace-dir /path/to/traces --output report.xlsx
+  python tracelens_with_gemm_patch.py generate_multi_rank_collective --trace-dir /path/to/traces
+  python tracelens_with_gemm_patch.py compare_perf_reports --baseline base.xlsx --test test.xlsx
+        """,
+    )
+
+    subparsers = parser.add_subparsers(
+        dest="command",
+        title="commands",
+        description="Available TraceLens commands",
+        help="Command to run (use '<command> --help' for command-specific help)",
+    )
+
+    # Subparser for generate_perf_report
+    subparsers.add_parser(
+        "generate_perf_report",
+        help="Generate individual performance report from trace data",
+        add_help=False,  # Let TraceLens handle its own --help
+    )
+
+    # Subparser for generate_multi_rank_collective
+    subparsers.add_parser(
+        "generate_multi_rank_collective",
+        help="Generate multi-rank collective report from trace data",
+        add_help=False,  # Let TraceLens handle its own --help
+    )
+
+    # Subparser for compare_perf_reports
+    subparsers.add_parser(
+        "compare_perf_reports",
+        help="Compare two performance reports",
+        add_help=False,  # Let TraceLens handle its own --help
+    )
+
+    return parser
+
+
+def main():
+    parser = create_parser()
+
+    # Parse only the command, let TraceLens handle the rest
+    args, remaining_args = parser.parse_known_args()
+
+    if args.command is None:
+        parser.print_help()
+        sys.exit(1)
+
+    # Apply patches before importing TraceLens reporting modules
+    apply_gemm_patches()
+
+    # Import TraceLens after patches are applied
+    from TraceLens.Reporting.generate_perf_report_pytorch import main as generate_perf_report_main
+    from TraceLens.Reporting.generate_multi_rank_collective_report_pytorch import (
main as generate_multi_rank_collective_report_main, + ) + from TraceLens.Reporting.compare_perf_reports_pytorch import main as compare_perf_reports_main + + # Update sys.argv so TraceLens sees only its arguments + sys.argv = [sys.argv[0]] + remaining_args + + if args.command == "generate_perf_report": + generate_perf_report_main() + elif args.command == "generate_multi_rank_collective": + generate_multi_rank_collective_report_main() + elif args.command == "compare_perf_reports": + compare_perf_reports_main() + + +if __name__ == "__main__": + main()