name: Flash Attention Benchmark
# To remotely trigger a FA Benchmarking run, use the following:
# curl -L -X POST -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" -H "Authorization: Bearer $TOKEN" https://api.github.com/repos/pytorch/pytorch-integration-testing/dispatches -d '{"event_type": "benchmark_flash_attention"}'
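# Alternatively (assuming the GitHub CLI is installed and authenticated), the same
# dispatch event can be sent with:
#   gh api repos/pytorch/pytorch-integration-testing/dispatches -f event_type=benchmark_flash_attention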
on:
  schedule:
    - cron: "0 6 * * *"  # Run every day at 6AM UTC (Actions cron schedules use UTC)
  push:
    paths:
      - .github/workflows/flash_attention.yml
  repository_dispatch:
    types: [benchmark_flash_attention]
  workflow_dispatch:
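# workflow_dispatch additionally allows manual runs from the repository's Actions
# tab; repository_dispatch is what the curl/gh commands above trigger.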
jobs:
  benchmark-flash-attn:
    name: Flash Attention CuTe DSL Benchmark
    runs-on: linux.dgx.b200.8
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout Flash Attention repository
        uses: actions/checkout@v4
        with:
          repository: Dao-AILab/flash-attention
          path: fa4
          submodules: recursive
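      # The second checkout places Dao-AILab/flash-attention under
      # ${GITHUB_WORKSPACE}/fa4; the Docker step below mounts the workspace and
      # builds from that path.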
      - name: Setup GPU flags for docker run
        run: |
          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
      - name: Run Flash Attention benchmark in Docker
        env:
          DOCKER_IMAGE: nvcr.io/nvidia/pytorch:25.06-py3
        run: |
          set -eux
          container_name=$(docker run \
            ${GPU_FLAG} \
            --ipc=host \
            --ulimit memlock=-1 \
            --ulimit stack=67108864 \
            --tty \
            --detach \
            --security-opt seccomp=unconfined \
            --shm-size=4g \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}"
          )
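          # The container runs detached with the workspace mounted at /tmp/workspace,
          # so the docker exec calls below reuse it via ${container_name}. While
          # debugging, one could inspect it with, for example:
          #   docker ps --filter "id=${container_name}"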
          # Install CuTe DSL
          docker exec -t "${container_name}" bash -c "
            set -x
            echo 'Installing nvidia-cutlass-dsl'
            pip install nvidia-cutlass-dsl==4.1.0
          "
          # Build and run FlashAttention CuTe DSL
          docker exec -t "${container_name}" bash -c "
            set -x
            pushd fa4
            python setup.py install
            echo '<h1>B200 1000W</h1>' >> /tmp/workspace/fa4_output.txt
            nvidia-smi
            export PYTHONPATH=\$(pwd)
            python benchmarks/benchmark_attn.py >> /tmp/workspace/fa4_output.txt
            popd
          "
          # Display results in the GitHub step summary
          if [ -f fa4_output.txt ]; then
            cat fa4_output.txt >> "${GITHUB_STEP_SUMMARY}"
          fi
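          # ${GITHUB_STEP_SUMMARY} is rendered as GitHub-flavored Markdown on the
          # run's summary page, which is why the heading above is written as an
          # inline <h1> tag.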