
Commit 913e9c0
Committed Jan 27, 2025

Add Flux transformer benchmarking

Also adds some general functionality for checking benchmark results against baseline results. This requires the Google Benchmark compare.py tool, which is not part of the pip package; that is why the google/benchmark repo is added as a git submodule. The tool does a statistical comparison between benchmark runs with a proper p-value calculation, so there is no need to roll our own. Also adds a new nightly CI job meant to hold nightly tests and benchmarks that do not have a dedicated workflow of their own, the way Llama currently does.
1 parent: 3cecd77
16 files changed: +1032 −24 lines
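The baseline check described in the commit message relies on compare.py from the google/benchmark repository, vendored below as the third_party/benchmark submodule (it is not shipped in the google-benchmark pip package added to the test requirements). Below is a minimal sketch of how a nightly result could be checked against a stored baseline; the wrapper function and file paths are hypothetical, and only the `compare.py benchmarks <baseline> <contender>` invocation comes from upstream Google Benchmark.

```python
# Hypothetical helper (not part of this commit): run Google Benchmark's
# compare.py, vendored via the third_party/benchmark submodule, to compare a
# nightly benchmark JSON against a stored baseline. compare.py prints a
# per-benchmark statistical comparison (with p-values) between the two runs.
import subprocess
import sys
from pathlib import Path

# Assumed repository layout: this script runs from the repo root.
COMPARE_PY = Path("third_party/benchmark/tools/compare.py")


def compare_to_baseline(baseline_json: Path, contender_json: Path) -> str:
    result = subprocess.run(
        [sys.executable, str(COMPARE_PY), "benchmarks",
         str(baseline_json), str(contender_json)],
        check=True,        # only fails if compare.py itself errors out
        capture_output=True,
        text=True,
    )
    return result.stdout   # the printed comparison table


if __name__ == "__main__":
    # File names are illustrative only.
    print(compare_to_baseline(
        Path("baseline/flux_transformer.json"),
        Path("out/benchmark/flux_transformer.json"),
    ))
```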
 
New file, +87 lines (GitHub Actions workflow):

# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

name: Sharktank Nightly Tests

on:
  workflow_dispatch:
  schedule:
    # Weekdays at 10:00 AM UTC = 02:00 AM PST / 03:00 AM PDT
    - cron: "0 10 * * 1-5"

concurrency:
  # A PR number if a pull request and otherwise the commit hash. This cancels
  # queued and in-progress runs for the same PR (presubmit) or commit
  # (postsubmit). The workflow name is prepended to avoid conflicts between
  # different workflows.
  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
  cancel-in-progress: true

jobs:
  nightly-mi300x:
    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
    name: "Nightly tests and benchmarks"
    strategy:
      matrix:
        version: [3.11]
      fail-fast: false
    runs-on: llama-mi300x-3
    defaults:
      run:
        shell: bash
    env:
      VENV_DIR: ${{ github.workspace }}/.venv
      HF_HOME: "/data/huggingface"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Get Current Date
        id: date
        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"

      - name: "Setting up Python"
        id: setup_python
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{matrix.version}}
      - name: Create Python venv
        run: python -m venv ${VENV_DIR}

      - name: Install pip deps
        run: |
          source ${VENV_DIR}/bin/activate
          python -m pip install --no-compile --upgrade pip

          # Note: We install in three steps in order to satisfy requirements
          # from non default locations first.
          pip install --no-compile -r pytorch-cpu-requirements.txt
          pip install -r requirements-iree-unpinned.txt
          pip install --no-compile \
            -r sharktank/requirements-tests.txt \
            -e sharktank/

          pip freeze

      - name: Run benchmarks
        run: |
          source ${VENV_DIR}/bin/activate
          pytest \
            --verbose \
            --capture=no \
            --iree-hip-target=gfx942 \
            --iree-device=hip://6 \
            --with-flux-data \
            -m="benchmark and expensive" \
            --html=out/benchmark/index.html \
            sharktank/tests

      - name: Deploy to GitHub Pages
        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
        with:
          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
          publish_dir: ./out/benchmark
          destination_dir: ./benchmark
          keep_files: true
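The benchmark step above enables FLUX tests with `--with-flux-data` and selects tests via `-m="benchmark and expensive"`. The conftest change that registers this flag is not part of this diff; the fragment below is a hypothetical sketch of how such a pytest option is typically declared, with everything other than the `--with-flux-data` flag itself being an assumption.

```python
# Hypothetical conftest.py fragment (not included in this diff): registers the
# --with-flux-data flag that the nightly workflow passes and that the skipif
# marker added to testing.py (further down) reads via
# config.getoption("with_flux_data").
def pytest_addoption(parser):
    parser.addoption(
        "--with-flux-data",
        action="store_true",
        default=False,
        help="Run tests that require FLUX model data from Hugging Face.",
    )
```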

‎.github/workflows/ci-sharktank.yml

+5 −1 lines

@@ -85,7 +85,11 @@ jobs:
       - name: Run sharktank tests
         if: ${{ !cancelled() }}
         run: |
-          pytest -n 4 sharktank/ --durations=10
+          pytest \
+            -n 4 \
+            --durations=10 \
+            -m "not expensive" \
+            sharktank/
 
 
   test_with_data:

‎.gitmodules

+3 lines (new file)

[submodule "third_party/benchmark"]
    path = third_party/benchmark
    url = https://github.com/google/benchmark

‎sharktank/pyproject.toml

+1 line

@@ -44,6 +44,7 @@ addopts = [
     "-m=unit",
 ]
 markers = [
+    "benchmark: model benchmarks",
     "expensive: tests that are very expensive",
     "export: tests that require export from torch",
     "golden: tests that compare to some golden values",

‎sharktank/requirements-tests.txt

+1 line

@@ -6,6 +6,7 @@ accelerate
 
 datasets==3.0.0
 diffusers
+google-benchmark
 parameterized
 protobuf
 pytest==8.0.0
New file, +92 lines (Flux transformer IREE benchmark module):

# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from pathlib import Path
import iree.compiler
import iree.runtime
import os
from iree.turbine.support.tools import iree_tool_prepare_input_args

from .export import (
    export_flux_transformer_from_hugging_face,
    flux_transformer_default_batch_sizes,
    iree_compile_flags,
)
from ...types import Dataset
from .flux import FluxModelV1, FluxParams
from ...utils.export_artifacts import ExportArtifacts
from ...utils.iree import flatten_for_iree_signature
from ...utils.benchmark import iree_benchmark_module


def iree_benchmark_flux_dev_transformer(
    artifacts_dir: Path,
    iree_device: str,
    json_result_output_path: Path,
    caching: bool = False,
) -> str:
    mlir_path = artifacts_dir / "model.mlir"
    parameters_path = artifacts_dir / "parameters.irpa"
    if (
        not caching
        or not os.path.exists(mlir_path)
        or not os.path.exists(parameters_path)
    ):
        export_flux_transformer_from_hugging_face(
            "black-forest-labs/FLUX.1-dev/black-forest-labs-transformer",
            mlir_output_path=mlir_path,
            parameters_output_path=parameters_path,
        )
    return iree_benchmark_flux_transformer(
        mlir_path=mlir_path,
        parameters_path=parameters_path,
        artifacts_dir=artifacts_dir,
        iree_device=iree_device,
        json_result_output_path=json_result_output_path,
        caching=caching,
    )


def iree_benchmark_flux_transformer(
    artifacts_dir: Path,
    mlir_path: Path,
    parameters_path: Path,
    iree_device: str,
    json_result_output_path: Path,
    caching: bool = False,
) -> str:
    dataset = Dataset.load(parameters_path)
    model = FluxModelV1(
        theta=dataset.root_theta,
        params=FluxParams.from_hugging_face_properties(dataset.properties),
    )
    input_args = flatten_for_iree_signature(
        model.sample_inputs(batch_size=flux_transformer_default_batch_sizes[0])
    )
    cli_input_args = iree_tool_prepare_input_args(
        input_args, file_path_prefix=f"{artifacts_dir / 'arg'}"
    )
    cli_input_args = [f"--input={v}" for v in cli_input_args]

    iree_module_path = artifacts_dir / "model.vmfb"
    if not caching or not os.path.exists(iree_module_path):
        iree.compiler.compile_file(
            mlir_path,
            output_file=iree_module_path,
            extra_args=iree_compile_flags,
        )

    iree_benchmark_args = [
        f"--device={iree_device}",
        f"--module={iree_module_path}",
        f"--parameters=model={parameters_path}",
        f"--function=forward_bs{flux_transformer_default_batch_sizes[0]}",
        "--benchmark_repetitions=30",
        "--benchmark_min_warmup_time=1.0",
        "--benchmark_out_format=json",
        f"--benchmark_out={json_result_output_path}",
    ] + cli_input_args
    return iree_benchmark_module(iree_benchmark_args)
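A sketch of how the entry point above might be driven, for example from a test. The new module's import path is not shown in this view, so the benchmark function is passed in rather than imported; the artifact directory, result file name, and helper name are assumptions, and the device string matches the one used in the nightly workflow.

```python
# Illustrative driver (not part of this commit) for the benchmark entry point
# above; the function is passed in because the new module's path is not shown
# in this diff view.
from pathlib import Path
from typing import Callable


def run_flux_dev_benchmark(
    benchmark_fn: Callable[..., str],  # e.g. iree_benchmark_flux_dev_transformer
    artifacts_dir: Path,
) -> str:
    result_json = artifacts_dir / "flux_transformer_benchmark.json"
    # Returns the captured iree-benchmark-module output; result_json can later
    # be fed to the compare.py baseline check sketched near the commit message.
    return benchmark_fn(
        artifacts_dir=artifacts_dir,
        iree_device="hip://6",  # matches the device targeted by the nightly workflow
        json_result_output_path=result_json,
        caching=True,  # reuse exported MLIR/IRPA and the compiled VMFB when present
    )
```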

‎sharktank/sharktank/models/flux/export.py

+20 lines

@@ -17,6 +17,26 @@
 
 flux_transformer_default_batch_sizes = [1]
 
+iree_compile_flags = [
+    "--iree-hal-target-device=hip",
+    "--iree-hip-target=gfx942",
+    "--iree-opt-const-eval=false",
+    "--iree-opt-strip-assertions=true",
+    "--iree-global-opt-propagate-transposes=true",
+    "--iree-dispatch-creation-enable-fuse-horizontal-contractions=true",
+    "--iree-dispatch-creation-enable-aggressive-fusion=true",
+    "--iree-opt-aggressively-propagate-transposes=true",
+    "--iree-opt-outer-dim-concat=true",
+    "--iree-vm-target-truncate-unsupported-floats",
+    "--iree-llvmgpu-enable-prefetch=true",
+    "--iree-opt-data-tiling=false",
+    "--iree-codegen-gpu-native-math-precision=true",
+    "--iree-codegen-llvmgpu-use-vector-distribution",
+    "--iree-hip-waves-per-eu=2",
+    "--iree-execution-model=async-external",
+    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline,iree-preprocessing-pad-to-intrinsics)",
+]
+
 
 def export_flux_transformer_model_mlir(
     model: FluxModelV1,
‎sharktank/sharktank/models/flux/testing.py

+3 lines

@@ -7,6 +7,7 @@
 import torch
 from os import PathLike
 from collections import OrderedDict
+import pytest
 
 from .flux import FluxParams, FluxModelV1
 from .export import export_flux_transformer, flux_transformer_default_batch_sizes
@@ -17,6 +18,8 @@
     make_mmdit_single_block_random_theta,
 )
 
+with_flux_data = pytest.mark.skipif("not config.getoption('with_flux_data')")
+
 
 def convert_flux_transformer_input_for_hugging_face_model(
     img: torch.Tensor,
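The `with_flux_data` decorator added above skips a test unless pytest was invoked with `--with-flux-data`, as the nightly workflow does. A brief illustrative use follows; the test itself is not part of this commit.

```python
# Illustrative only: a test gated on the --with-flux-data pytest option via the
# with_flux_data marker defined above.
@with_flux_data
def test_flux_transformer_against_hugging_face_reference():
    ...  # would exercise the exported FLUX transformer against the HF model
```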
