iree-org
diff --git a/‎.github/workflows/test_sharktank_models.yml
+2-1 b/‎.github/workflows/test_sharktank_models.yml
+2-1
diff --git a/‎sharktank_models/clip/test_clip.py
+32-19 b/‎sharktank_models/clip/test_clip.py
+32-19
diff --git a/‎sharktank_models/requirements.txt
+5 b/‎sharktank_models/requirements.txt
+5
diff --git a/‎sharktank_models/test_suite/README.md
+3 b/‎sharktank_models/test_suite/README.md
+3
diff --git a/‎sharktank_models/test_suite/benchmarks/README.md
+45 b/‎sharktank_models/test_suite/benchmarks/README.md
+45
diff --git a/‎sharktank_models/test_suite/benchmarks/external_test_files/sdxl_pipeline_bench_f16.mlir
+23 b/‎sharktank_models/test_suite/benchmarks/external_test_files/sdxl_pipeline_bench_f16.mlir
+23
diff --git a/‎sharktank_models/test_suite/benchmarks/run_benchmarks.py
+44 b/‎sharktank_models/test_suite/benchmarks/run_benchmarks.py
+44
diff --git a/‎sharktank_models/test_suite/benchmarks/sdxl/clip_rocm.json
+35 b/‎sharktank_models/test_suite/benchmarks/sdxl/clip_rocm.json
+35
diff --git a/‎sharktank_models/test_suite/benchmarks/sdxl/e2e_rocm.json
+44 b/‎sharktank_models/test_suite/benchmarks/sdxl/e2e_rocm.json
+44
diff --git a/‎sharktank_models/test_suite/benchmarks/sdxl/punet_int8_fp16_rocm.json
+35 b/‎sharktank_models/test_suite/benchmarks/sdxl/punet_int8_fp16_rocm.json
+35
@@ -65,7 +65,8 @@ jobs:
             --durations=0 \
             --log-cli-level=info \
             --html=${HTML_REPORT_PATH} \
-            --self-contained-html
+            --self-contained-html \
+            --ignore=sharktank_models/test_suite
 
       - name: Upload HTML report
         uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
 
@@ -17,47 +17,51 @@
 
 THIS_DIR = pathlib.Path(__file__).parent
 
+
 def load_tensor_from_irpa(path: PathLike) -> np.ndarray:
     index = iree.runtime.ParameterIndex()
     index.load(str(path))
     index_entry: iree.runtime.ParameterIndexEntry = index.items()[0][1]
     return iree.runtime.parameter_index_entry_as_numpy_ndarray(index_entry)
 
+
 @pytest.fixture(
-        params=[
-            pytest.param("local-task", marks=pytest.mark.target_cpu),
-            pytest.param("hip", marks=pytest.mark.target_hip),
-        ]
+    params=[
+        pytest.param("local-task", marks=pytest.mark.target_cpu),
+        pytest.param("hip", marks=pytest.mark.target_hip),
+    ]
 )
 def device_id(request: pytest.FixtureRequest) -> str:
     return request.param
 
 
-@pytest.fixture(
-        params=["bf16", "f32"]
-)
+@pytest.fixture(params=["bf16", "f32"])
 def model_variant(request: pytest.FixtureRequest) -> str:
     return request.param
 
 
 mlir_path = {
     "bf16": THIS_DIR / "assets/text_model/toy/bf16.mlir",
-    "f32": THIS_DIR / "assets/text_model/toy/f32.mlir"
+    "f32": THIS_DIR / "assets/text_model/toy/f32.mlir",
 }
 
 parameters_path = {
     "bf16": THIS_DIR / "assets/text_model/toy/bf16_parameters.irpa",
-    "f32": THIS_DIR / "assets/text_model/toy/f32_parameters.irpa"
+    "f32": THIS_DIR / "assets/text_model/toy/f32_parameters.irpa",
 }
 
 function_arg0_path = THIS_DIR / "assets/text_model/toy/forward_bs4_arg0_input_ids.irpa"
-function_expected_result0 = THIS_DIR / "assets/text_model/toy/forward_bs4_expected_result0_last_hidden_state_f32.irpa"
+function_expected_result0 = (
+    THIS_DIR
+    / "assets/text_model/toy/forward_bs4_expected_result0_last_hidden_state_f32.irpa"
+)
 
 absolute_tolerance = {
     "bf16": 1e-3,
-    "f32" : 1e-5,
+    "f32": 1e-5,
 }
 
+
 def compiler_args(device_id: str) -> list[str]:
     if device_id == "local-task":
         return ["--iree-hal-target-device=llvm-cpu", "--iree-llvmcpu-target-cpu=host"]
@@ -70,16 +74,21 @@ def compiler_args(device_id: str) -> list[str]:
 
     raise KeyError(f"Compiler args for {device_id} not found")
 
-def compile_and_run(mlir_path: str, compiler_args: list[str], function: str, args: list[np.ndarray]) -> list[np.ndarray]:
+
+def compile_and_run(
+    mlir_path: str, compiler_args: list[str], function: str, args: list[np.ndarray]
+) -> list[np.ndarray]:
     iree.compiler.compile_file(
         mlir_path,
         extra_args=compiler_args,
     )
 
+
 @pytest.fixture(scope="session")
 def iree_module(model_variant, device_id) -> iree.runtime.VmModule:
     compiler_arguments = compiler_args(device_id)
 
+
 def device_array_to_host(device_array: iree.runtime.DeviceArray) -> np.ndarray:
     def reinterpret_hal_buffer_view_element_type(
         buffer_view: iree.runtime.HalBufferView,
@@ -157,11 +166,12 @@ def assert_text_encoder_state_close(
         rtol=0,
     )
 
+
 def test_results_close(model_variant, device_id):
     module_buffer = iree.compiler.compile_file(
-            str(mlir_path[model_variant]),
-            extra_args=compiler_args(device_id),
-        )
+        str(mlir_path[model_variant]),
+        extra_args=compiler_args(device_id),
+    )
 
     vm_instance = iree.runtime.VmInstance()
     paramIndex = iree.runtime.ParameterIndex()
@@ -173,13 +183,16 @@ def test_results_close(model_variant, device_id):
     device = iree.runtime.get_device(device_id)
     hal_module = iree.runtime.create_hal_module(instance=vm_instance, devices=[device])
     vm_module = iree.runtime.VmModule.from_buffer(vm_instance, module_buffer)
-    config=iree.runtime.Config(device=device)
-    bound_modules = iree.runtime.load_vm_modules(hal_module, parameters_module, vm_module,
-                                              config=config)
+    config = iree.runtime.Config(device=device)
+    bound_modules = iree.runtime.load_vm_modules(
+        hal_module, parameters_module, vm_module, config=config
+    )
     module = bound_modules[-1]
     result = module.forward_bs4(load_tensor_from_irpa(function_arg0_path))[0]
 
     expected_result = load_tensor_from_irpa(function_expected_result0)
     result = device_array_to_host(result).astype(dtype=expected_result.dtype)
 
-    assert_text_encoder_state_close(result, expected_result, absolute_tolerance[model_variant])
+    assert_text_encoder_state_close(
+        result, expected_result, absolute_tolerance[model_variant]
+    )
@@ -1,10 +1,15 @@
 # Baseline requirements for running the test suite.
 #   * See requirements-iree.txt for using IREE packages.
 
+azure-storage-blob
 ml_dtypes
 numpy
 pytest
+pytest-check
+pytest-dependency
 pytest-html
 pytest-reportlog
+pytest-retry
 pytest-timeout
 pytest-xdist
+tabulate
@@ -0,0 +1,3 @@
+## Regression Test Suite
+
+details to come!
@@ -0,0 +1,45 @@
+## Benchmark tests
+
+### Adding your own model
+
+- To add your own model, create a directory under `benchmarks` and add JSON files that correspond to the submodels and chip. Please follow the [JSON file schema in this README file](#required-and-optional-fields-for-the-json-model-file)
+
+### How to run
+
+```
+python sharktank_models/test_suite/benchmarks/run_benchmarks.py --model=sdxl --filename=*
+
+python sharktank_models/test_suite/benchmarks/run_benchmarks.py --model=sdxl --filename=clip_rocm
+```
+
+Argument options for the script
+
+| Argument Name | Default value | Description                                                                                                                                      |
+| ------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
+| --model       | sdxl          | Runs benchmark tests for a specific model                                                                                                        |
+| --filename    | \*            | If specified, the benchmark tests will run for a specific filename (ex: `--filename clip`). If not specified, it will run tests on all filenames |
+| --sku         | mi300         | The benchmark tests will run on this sku and retrieve golden values from the specified sku                                                       |
+| --rocm-chip   | gfx942        | The benchmark tests will run on this ROCM chip                                                                                                   |
+
+### Required and optional fields for the JSON model file
+
+| Field Name                       | Required | Type    | Description                                                                                                                  |
+| -------------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------- |
+| inputs                           | required | array   | An array of input strings for the benchmark module (ex: `["1xi64", "1xf16"]`)                                                |
+| compilation_required             | optional | boolean | If true, this will let the benchmark test know that it needs to compile a file                                               |
+| compiled_file_name               | optional | string  | When the compilation occurs, this will be the file name                                                                      |
+| compile_flags                    | optional | array   | An array of compiler flag options                                                                                            |
+| mlir_file_path                   | optional | string  | Path to where the mlir file to compile is                                                                                    |
+| modules                          | optional | array   | Specific to e2e, add modules here to include in the benchmarking test                                                        |
+| function_run                     | required | string  | The function that the `iree-benchmark-module` will run and benchmark                                                         |
+| benchmark_repetitions            | required | float   | The number of times the benchmark tests will repeat                                                                          |
+| benchmark_min_warmup_time        | required | float   | The minimum warm up time for the benchmark test                                                                              |
+| device                           | required | string  | The device that the benchmark tests are running                                                                              |
+| golden_time_tolerance_multiplier | optional | object  | An object of tolerance multipliers, where the key is the sku and the value is the multiplier, (ex: `{"mi250": 1.3}`)         |
+| golden_time_ms                   | optional | object  | An object of golden times, where the key is the sku and the value is the golden time in ms, (ex: `{"mi250": 100}`)           |
+| golden_dispatch                  | optional | object  | An object of golden dispatches, where the key is the sku and the value is the golden dispatch count, (ex: `{"mi250": 1602}`) |
+| golden_size                      | optional | object  | An object of golden sizes, where the key is the sku and the value is the golden size in bytes, (ex: `{"mi250": 2000000}`)    |
+| specific_chip_to_ignore          | optional | array   | An array of chip values, where the benchmark tests will ignore the chips specified                                           |
+| real_weights_file_name           | optional | string  | If real weights is a different file name, specify it here in order to get the correct real weights file                      |
+
+Please feel free to look at any JSON examples under a model directory (ex: sdxl)
@@ -0,0 +1,23 @@
+module @sdxl_compiled_pipeline {
+  func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor<i64>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"}
+  func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"}
+  func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"}
+  func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"}
+
+  func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> {
+    %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>)
+    %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor<i64>)
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %steps_int = tensor.extract %steps[] : tensor<i64>
+    %n_steps = arith.index_cast %steps_int: i64 to index
+    %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) {
+      %step_64 = arith.index_cast %arg0 : index to i64
+      %this_step = tensor.from_elements %step_64 : tensor<1xi64>
+      %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16>
+      scf.yield %inner : tensor<1x4x128x128xf16>
+    }
+    %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16>
+    return %image : tensor<1x3x1024x1024xf16>
+  }
+}
@@ -0,0 +1,44 @@
+# Copyright 2025 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import subprocess
+import os
+from pathlib import Path
+import argparse
+import sys
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, default="sdxl")
+    parser.add_argument("--filename", type=str, default="*")
+    parser.add_argument("--sku", type=str, default="mi300")
+    parser.add_argument("--rocm-chip", type=str, default="gfx942")
+    args = parser.parse_args()
+    model = args.model
+    filename = args.filename
+    sku = args.sku
+    rocm_chip = args.rocm_chip
+
+    os.environ["BENCHMARK_MODEL"] = model
+    os.environ["BENCHMARK_FILE_NAME"] = filename
+    os.environ["SKU"] = sku
+    os.environ["ROCM_CHIP"] = rocm_chip
+
+    THIS_DIR = Path(__file__).parent
+
+    command = [
+        "pytest",
+        THIS_DIR / "test_model_benchmark.py",
+        "--log-cli-level=info",
+        "--timeout=600",
+        "--retries=7",
+    ]
+    subprocess.run(command)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,35 @@
+{
+    "inputs": [
+        "1x64xi64",
+        "1x64xi64",
+        "1x64xi64",
+        "1x64xi64"
+    ],
+    "function_run": "encode_prompts", 
+    "benchmark_flags": [
+        "--benchmark_repetitions=10",
+        "--benchmark_min_warmup_time=3.0",
+        "--device_allocator=caching"
+    ],
+    "device": "hip",
+    "golden_time_tolerance_multiplier": {
+        "mi250": 1.3,
+        "mi300": 1.1,
+        "mi308": 1.1
+    },
+    "golden_time_ms": {
+        "mi250": 14.5,
+        "mi300": 15.0,
+        "mi308": 15.0
+    },
+    "golden_dispatch": {
+        "mi250": 1139,
+        "mi300": 1139,
+        "mi308": 1139
+    },
+    "golden_size": {
+        "mi250": 860000,
+        "mi300": 860000,
+        "mi308": 860000
+    }
+}
@@ -0,0 +1,44 @@
+{
+    "inputs": [
+        "1x4x128x128xf16",
+        "1xf16",
+        "1x64xi64",
+        "1x64xi64",
+        "1x64xi64",
+        "1x64xi64"
+    ],
+    "compilation_required": true,
+    "compiled_file_name": "sdxl_full_pipeline_fp16_rocm",
+    "compile_flags": [
+        "--iree-global-opt-propagate-transposes=true",
+        "--iree-codegen-llvmgpu-use-vector-distribution",
+        "--iree-codegen-gpu-native-math-precision=true",
+        "--iree-hip-waves-per-eu=2",
+        "--iree-opt-outer-dim-concat=true",
+        "--iree-llvmgpu-enable-prefetch",
+        "--iree-hal-target-backends=rocm"
+    ],
+    "mlir_file_path": "external_test_files/sdxl_pipeline_bench_f16.mlir",
+    "modules": [
+        "sdxl_clip",
+        "sdxl_unet_fp16",
+        "sdxl_vae"
+    ],
+    "function_run": "tokens_to_image", 
+    "benchmark_flags": [
+        "--benchmark_repetitions=10",
+        "--benchmark_min_warmup_time=3.0",
+        "--device_allocator=caching"
+    ],
+    "device": "hip",
+    "golden_time_tolerance_multiplier": {
+        "mi250": 1.3,
+        "mi300": 1.1,
+        "mi308": 1.1
+    },
+    "golden_time_ms": {
+        "mi250": 1100,
+        "mi300": 325,
+        "mi308": 800
+    }
+}
@@ -0,0 +1,35 @@
+{
+    "inputs": [
+        "1x4x128x128xf16",
+        "1xf16",
+        "2x64x2048xf16",
+        "2x1280xf16",
+        "2x6xf16",
+        "1xf16"
+    ],
+    "function_run": "main", 
+    "benchmark_flags": [
+        "--benchmark_repetitions=10",
+        "--benchmark_min_warmup_time=3.0",
+        "--device_allocator=caching"
+    ],
+    "device": "hip",
+    "golden_time_tolerance_multiplier": {
+        "mi300": 1.1,
+        "mi308": 1.1
+    },
+    "golden_time_ms": {
+        "mi300": 50,
+        "mi308": 140
+    },
+    "golden_dispatch": {
+        "mi300": 1424,
+        "mi308": 1424
+    },
+    "golden_size": {
+        "mi300": 2560000,
+        "mi308": 2560000
+    },
+    "specific_chip_to_ignore": ["gfx90a"],
+    "real_weights_file_name": "punet_weights.irpa"
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+## Regression Test Suite`
	`2`	`+`
	`3`	`+details to come!`