Commit 4867822 (authored Mar 12, 2025)

[sharktank] Add toy size Flux transformer (#1075)

We don't have proper tests for a toy-size variant of the model, which is desirable for CI tests on every commit. Some of the tests fail during IREE buffer destruction, which is a known issue; see #1050.

1 parent: d3c462c
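For orientation, a minimal sketch of how the new toy-size model might be exercised in eager mode (helper names are taken from the diff below; the exact invocation is an assumption):

import torch
from sharktank.models.flux.flux import FluxModelV1
from sharktank.models.flux.testing import make_random_theta, make_toy_config

config = make_toy_config()                       # toy-size FluxParams (added below)
theta = make_random_theta(config, dtype=torch.float32)
model = FluxModelV1(theta=theta, params=config)  # runs params.validate()

# sample_inputs() returns (args, kwargs); args is empty for "forward".
args, kwargs = model.sample_inputs(batch_size=1)
output = model(*args, **kwargs)                  # eager forward pass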

File tree: 11 files changed, +248 -125 lines
.github/workflows/ci-sharktank.yml (+3)

@@ -144,6 +144,9 @@ jobs:
         --with-flux-data \
         --with-vae-data \
         --with-quark-data \
+        --iree-hal-target-device=hip \
+        --iree-hip-target=gfx942 \
+        --iree-device=hip://0 \
         sharktank/tests/models/clip/clip_test.py \
         sharktank/tests/models/t5/t5_test.py \
         sharktank/tests/models/flux/flux_test.py \

sharktank/sharktank/layers/mmdit.py (+3 -5)

@@ -162,11 +162,12 @@ def forward(


 class MMDITSingleBlock(ThetaLayer):
-    def __init__(self, theta, num_heads: int, hidden_size: int):
+    def __init__(self, theta, num_heads: int, hidden_size: int, mlp_ratio: float):
         super().__init__(theta)

         self.num_heads = num_heads
         self.hidden_size = hidden_size
+        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
         self.add_module("mod", ModulationLayer(theta("modulation"), double=False))
         self.add_module(
             "attn_norm_q",
@@ -179,9 +180,6 @@ def __init__(self, theta, num_heads: int, hidden_size: int):

         self.add_module("linear1", LinearLayer(theta("linear1")))
         self.add_module("linear2", LinearLayer(theta("linear2")))
-        # TODO: There should be a way to refactor out the following two constants and just reference model shapes
-        self.hidden_size = 3072
-        self.mlp_hidden_dim = 3072

     def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]:
         mod, _ = self.mod(vec)
@@ -191,7 +189,7 @@ def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]:
         x_mod = (1 + mod.scale) * x_norm + mod.shift
         x_lin = self.linear1(x_mod)
         qkv, mlp = torch.split(
-            x_lin, [3 * self.hidden_size, 4 * self.mlp_hidden_dim], dim=-1
+            x_lin, [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1
         )

         qkv_2 = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1)  #
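Note on the split change: for the full-size model the old and new split sizes coincide, because the old code hardcoded mlp_hidden_dim = 3072 while the new mlp_hidden_dim is hidden_size * mlp_ratio. A quick arithmetic check, with full-size defaults taken from the diff:

# Old: hidden_size = mlp_hidden_dim = 3072 (hardcoded),
# so the split was [3 * 3072, 4 * 3072] = [9216, 12288].
# New: mlp_hidden_dim = int(hidden_size * mlp_ratio).
hidden_size, mlp_ratio = 3072, 4.0
mlp_hidden_dim = int(hidden_size * mlp_ratio)  # 12288
assert [3 * hidden_size, mlp_hidden_dim] == [3 * 3072, 4 * 3072]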

sharktank/sharktank/layers/testing.py (+37 -36)

@@ -95,24 +95,22 @@ def make_latent_attention_block_theta(


 def make_mmdit_double_block_random_theta(
-    in_channels: int = 128,
     hidden_size: int = 3072,
+    num_heads: int = 24,
     mlp_ratio: float = 4.0,
     dtype: torch.dtype | None = None,
 ) -> Theta:
-    in_channels = 128
-    hidden_size = 3072
-    mlp_ratio = 4.0
-    mlp_hidden_size = int((mlp_ratio - 1) * hidden_size)
-    mlp_hidden_size2 = int(mlp_ratio * hidden_size)
-    mlp_hidden_size3 = int(2 * (mlp_ratio - 1) * hidden_size)
+    head_dim = hidden_size // num_heads
+    mlp_hidden_size = int(mlp_ratio * hidden_size)
+    qkv_out_size = 3 * hidden_size
+    modulation_size = hidden_size * 6
     return Theta(
         {
             "img_attn.norm.key_norm.scale": DefaultPrimitiveTensor(  #
-                data=make_rand_torch((in_channels,), dtype=dtype)
+                data=make_rand_torch((head_dim,), dtype=dtype)
             ),
             "img_attn.norm.query_norm.scale": DefaultPrimitiveTensor(  #
-                data=make_rand_torch((in_channels,), dtype=dtype)
+                data=make_rand_torch((head_dim,), dtype=dtype)
             ),
             "img_attn.proj.bias": DefaultPrimitiveTensor(
                 data=make_rand_torch((hidden_size,), dtype=dtype)
@@ -121,34 +119,34 @@ def make_mmdit_double_block_random_theta(
                 data=make_rand_torch((hidden_size, hidden_size), dtype=dtype)
             ),
             "img_attn.qkv.bias": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size,), dtype=dtype)
+                data=make_rand_torch((qkv_out_size,), dtype=dtype)
             ),
             "img_attn.qkv.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size, hidden_size), dtype=dtype)
+                data=make_rand_torch((qkv_out_size, hidden_size), dtype=dtype)
             ),
             "img_mlp.0.bias": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size2), dtype=dtype)
+                data=make_rand_torch((mlp_hidden_size), dtype=dtype)
             ),
             "img_mlp.0.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size2, hidden_size), dtype=dtype)
+                data=make_rand_torch((mlp_hidden_size, hidden_size), dtype=dtype)
             ),
             "img_mlp.2.bias": DefaultPrimitiveTensor(
                 data=make_rand_torch((hidden_size), dtype=dtype)
             ),
             "img_mlp.2.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((hidden_size, mlp_hidden_size2), dtype=dtype)
+                data=make_rand_torch((hidden_size, mlp_hidden_size), dtype=dtype)
             ),
             "img_mod.lin.bias": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size3,), dtype=dtype)
+                data=make_rand_torch((modulation_size,), dtype=dtype)
             ),
             "img_mod.lin.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size3, hidden_size), dtype=dtype)
+                data=make_rand_torch((modulation_size, hidden_size), dtype=dtype)
             ),
             "txt_attn.norm.key_norm.scale": DefaultPrimitiveTensor(  #
-                data=make_rand_torch((in_channels,), dtype=dtype)
+                data=make_rand_torch((head_dim,), dtype=dtype)
             ),
             "txt_attn.norm.query_norm.scale": DefaultPrimitiveTensor(  #
-                data=make_rand_torch((in_channels,), dtype=dtype)
+                data=make_rand_torch((head_dim,), dtype=dtype)
             ),
             "txt_attn.proj.bias": DefaultPrimitiveTensor(
                 data=make_rand_torch((hidden_size,), dtype=dtype)
@@ -157,49 +155,50 @@ def make_mmdit_double_block_random_theta(
                 data=make_rand_torch((hidden_size, hidden_size), dtype=dtype)
             ),
             "txt_attn.qkv.bias": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size,), dtype=dtype)
+                data=make_rand_torch((qkv_out_size,), dtype=dtype)
             ),
             "txt_attn.qkv.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size, hidden_size), dtype=dtype)
+                data=make_rand_torch((qkv_out_size, hidden_size), dtype=dtype)
             ),
             "txt_mlp.0.bias": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size2), dtype=dtype)
+                data=make_rand_torch((mlp_hidden_size), dtype=dtype)
             ),
             "txt_mlp.0.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size2, hidden_size), dtype=dtype)
+                data=make_rand_torch((mlp_hidden_size, hidden_size), dtype=dtype)
             ),
             "txt_mlp.2.bias": DefaultPrimitiveTensor(
                 data=make_rand_torch((hidden_size), dtype=dtype)
             ),
             "txt_mlp.2.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((hidden_size, mlp_hidden_size2), dtype=dtype)
+                data=make_rand_torch((hidden_size, mlp_hidden_size), dtype=dtype)
             ),
             "txt_mod.lin.bias": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size3,), dtype=dtype)
+                data=make_rand_torch((modulation_size,), dtype=dtype)
             ),
             "txt_mod.lin.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size3, hidden_size), dtype=dtype)
+                data=make_rand_torch((modulation_size, hidden_size), dtype=dtype)
             ),
         }
     )


 def make_mmdit_single_block_random_theta(
-    in_channels: int = 128,
     hidden_size: int = 3072,
+    num_heads: int = 24,
     mlp_ratio: float = 4.0,
     dtype: torch.dtype | None = None,
 ) -> Theta:
-    mlp_hidden_size = int((mlp_ratio - 1) * hidden_size)
-    mlp_hidden_size2 = int((mlp_ratio + 1) * hidden_size)
-    mlp_hidden_size3 = int((2 * mlp_ratio - 1) * hidden_size)
+    mlp_hidden_dim = int(hidden_size * mlp_ratio)
+    head_dim = hidden_size // num_heads
+    modulation_size = 3 * hidden_size
+    linear1_hidden_size = hidden_size * 3 + mlp_hidden_dim
     return Theta(
         {
             "norm.key_norm.scale": DefaultPrimitiveTensor(  #
-                data=make_rand_torch((in_channels,), dtype=dtype)
+                data=make_rand_torch((head_dim,), dtype=dtype)
             ),
             "norm.query_norm.scale": DefaultPrimitiveTensor(  #
-                data=make_rand_torch((in_channels,), dtype=dtype)
+                data=make_rand_torch((head_dim,), dtype=dtype)
             ),
             "attn.proj.bias": DefaultPrimitiveTensor(
                 data=make_rand_torch((hidden_size,), dtype=dtype)
@@ -208,22 +207,24 @@ def make_mmdit_single_block_random_theta(
                 data=make_rand_torch((hidden_size, hidden_size), dtype=dtype)
             ),
             "linear1.bias": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size3,), dtype=dtype)
+                data=make_rand_torch((linear1_hidden_size,), dtype=dtype)
             ),
             "linear1.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size3, hidden_size), dtype=dtype)
+                data=make_rand_torch((linear1_hidden_size, hidden_size), dtype=dtype)
             ),
             "linear2.bias": DefaultPrimitiveTensor(
                 data=make_rand_torch((hidden_size), dtype=dtype)
             ),
             "linear2.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((hidden_size, mlp_hidden_size2), dtype=dtype)
+                data=make_rand_torch(
+                    (hidden_size, hidden_size + mlp_hidden_dim), dtype=dtype
+                )
             ),
             "modulation.lin.bias": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size,), dtype=dtype)
+                data=make_rand_torch((modulation_size,), dtype=dtype)
             ),
             "modulation.lin.weight": DefaultPrimitiveTensor(
-                data=make_rand_torch((mlp_hidden_size, hidden_size), dtype=dtype)
+                data=make_rand_torch((modulation_size, hidden_size), dtype=dtype)
             ),
         }
     )

sharktank/sharktank/models/flux/export.py (+6 -1)

@@ -16,6 +16,9 @@
 from ...types import Dataset
 from ...utils.hf_datasets import get_dataset
 from sharktank.transforms.dataset import set_float_dtype
+from iree.turbine.aot import (
+    ExternalTensorTrait,
+)

 flux_transformer_default_batch_sizes = [1]

@@ -35,6 +38,8 @@ def export_flux_transformer_model_mlir(
     else:
         model = model_or_parameters_path

+    for t in model.theta.flatten().values():
+        ExternalTensorTrait(external_name=t.name, external_scope="").set(t.as_torch())
     export_static_model_mlir(model, output_path=output_path, batch_sizes=batch_sizes)


@@ -60,7 +65,7 @@ def export_flux_transformer(
 ):
     export_flux_transformer_iree_parameters(model, parameters_output_path)
     export_flux_transformer_model_mlir(
-        parameters_output_path, output_path=mlir_output_path, batch_sizes=batch_sizes
+        model, output_path=mlir_output_path, batch_sizes=batch_sizes
    )
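For context, ExternalTensorTrait is the iree-turbine mechanism for exporting weights as named external parameters, resolved at load time from the parameter (.irpa) file, rather than as inlined MLIR constants. A minimal sketch of the pattern, with an illustrative tensor name (the trait API is assumed from iree.turbine.aot):

import torch
from iree.turbine.aot import ExternalTensorTrait

weight = torch.randn(4, 4)
# Tag the tensor so AOT export emits a named external parameter
# ("my.weight" is a hypothetical name) instead of inlining the data.
ExternalTensorTrait(external_name="my.weight", external_scope="").set(weight)

This also appears to be why export_flux_transformer now passes the in-memory model instead of the parameters path: the tensors themselves carry the parameter names needed by the exported MLIR.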

sharktank/sharktank/models/flux/flux.py (+59 -27)

@@ -25,6 +25,7 @@
 from ... import ops

 __all__ = [
+    "FluxParams",
     "FluxModelV1",
 ]

@@ -49,6 +50,18 @@ class FluxParams:
     qkv_bias: bool
     guidance_embed: bool

+    time_dim: int = 256
+    txt_context_length: int = 512
+
+    # The allowed range of these values is dependent on the model size.
+    # They will not work for all variants, specifically toy-sized models.
+    output_img_height: int = 1024
+    output_img_width: int = 1024
+    output_img_channels: int = 3
+
+    # def __post_init__(self):
+    #     assert self.hidden_size == self.vec_in_dim * int(self.mlp_ratio)
+
     def to_hugging_face_properties(self) -> dict[str, Any]:
         hparams = {
             "in_channels": self.in_channels,
@@ -71,14 +84,12 @@ def from_hugging_face_properties(properties: dict[str, Any]) -> "FluxParams":
         vec_in_dim = p["pooled_projection_dim"]
         context_in_dim = p["joint_attention_dim"]
         mlp_ratio = 4.0
-        hidden_size = vec_in_dim * int(mlp_ratio)
+        hidden_size = int(vec_in_dim * mlp_ratio)
         num_heads = p["num_attention_heads"]
         depth = p["num_layers"]
         depth_single_blocks = p["num_single_layers"]

-        # TODO: figure out relation between hidden_size, num_heads and
-        # attention_head_dim.
-        # diffusers.FluxTransformer2DModel also hardcodes this.
+        # diffusers.FluxTransformer2DModel hardcodes this.
         axes_dim = [16, 56, 56]
         assert sum(axes_dim) == p["attention_head_dim"]

@@ -102,6 +113,29 @@ def from_hugging_face_properties(properties: dict[str, Any]) -> "FluxParams":
             guidance_embed=guidance_embed,
         )

+    def validate(self):
+        if self.in_channels % 4 != 0:
+            raise ValueError(f"In channels {self.in_channels} must be a multiple of 4")
+        if self.hidden_size != self.vec_in_dim * self.mlp_ratio:
+            raise ValueError(
+                "Equality hidden_size == vec_in_dim * mlp_ratio does not hold. "
+                f"{self.hidden_size} != {self.vec_in_dim} * {self.mlp_ratio}"
+            )
+        if self.hidden_size % self.num_heads != 0:
+            raise ValueError(
+                f"Hidden size {self.hidden_size} must be divisible by num_heads {self.num_heads}"
+            )
+        pe_dim = self.hidden_size // self.num_heads
+        if sum(self.axes_dim) != pe_dim:
+            raise ValueError(
+                f"axes_dim {self.axes_dim} must sum up to the positional embeddings"
+                f" dimension size {pe_dim}"
+            )
+        if any(d % 2 != 0 for d in self.axes_dim):
+            raise ValueError(
+                f"All elements of axes_dim {self.axes_dim} must be a multiple of 2"
+            )
+

 class FluxModelV1(ThetaLayer):
     """FluxModel adapted from Black Forest Lab's implementation."""
@@ -111,18 +145,11 @@ def __init__(self, theta: Theta, params: FluxParams):
             theta,
         )

+        params.validate()
         self.params = copy(params)
         self.in_channels = params.in_channels
         self.out_channels = self.in_channels
-        if params.hidden_size % params.num_heads != 0:
-            raise ValueError(
-                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
-            )
         pe_dim = params.hidden_size // params.num_heads
-        if sum(params.axes_dim) != pe_dim:
-            raise ValueError(
-                f"Got {params.axes_dim} but expected positional dim {pe_dim}"
-            )
         self.hidden_size = params.hidden_size
         self.num_heads = params.num_heads
         self.pe_embedder = EmbedND(
@@ -154,6 +181,7 @@ def __init__(self, theta: Theta, params: FluxParams):
                     theta("single_blocks", i),
                     num_heads=self.num_heads,
                     hidden_size=self.hidden_size,
+                    mlp_ratio=params.mlp_ratio,
                 )
                 for i in range(params.depth_single_blocks)
             ]
@@ -181,13 +209,15 @@ def forward(

         # running on sequences img
         img = self.img_in(img)
-        vec = self.time_in(timestep_embedding(timesteps, 256))
+        vec = self.time_in(timestep_embedding(timesteps, self.params.time_dim))
         if self.guidance:
             if guidance is None:
                 raise ValueError(
                     "Didn't get guidance strength for guidance distilled model."
                 )
-            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
+            vec = vec + self.guidance_in(
+                timestep_embedding(guidance, self.params.time_dim)
+            )

         vec = vec + self.vector_in(y)

@@ -213,14 +243,13 @@ def sample_inputs(
         if not (function is None or function == "forward"):
             raise ValueError(f'Only function "forward" is supported. Got "{function}"')

-        # The allowed range of these values is dependent on the model size.
-        # They will not work for all variants, specifically toy-sized models.
-        output_img_height = 1024
-        output_img_width = 1024
-        output_img_channels = 3
+        output_img_channels = self.params.output_img_channels

         img = self._get_noise(
-            batch_size, output_img_height, output_img_width, self.dtype
+            batch_size,
+            self.params.output_img_height,
+            self.params.output_img_width,
+            self.dtype,
         )

         _, c, h, w = img.shape
@@ -233,16 +262,17 @@ def sample_inputs(
         img_ids = img_ids.repeat(batch_size, 1, 1)

         # T5 encoder output
-        txt_context_length = 512
-        txt_dims_per_token = 4096
-        txt = torch.rand([1, txt_context_length, txt_dims_per_token], dtype=self.dtype)
+        txt_dims_per_token = self.params.context_in_dim
+        txt = torch.rand(
+            [1, self.params.txt_context_length, txt_dims_per_token], dtype=self.dtype
+        )
         txt = txt.repeat(batch_size, 1, 1)
         txt_ids = torch.zeros(batch_size, txt.shape[1], output_img_channels)

         timesteps = torch.rand([batch_size], dtype=self.dtype)

         # CLIP text model output
-        y = make_rand_torch([1, 768], dtype=self.dtype)
+        y = make_rand_torch([1, self.params.vec_in_dim], dtype=self.dtype)
         y = y.repeat(batch_size, 1)

         args = tuple()
@@ -269,12 +299,14 @@ def _get_noise(
         width: int,
         dtype: torch.dtype,
     ):
+        assert self.params.in_channels % 4 == 0
+        channels = self.params.in_channels // 4
         return torch.randn(
             batch_size,
-            16,
+            channels,
             # allow for packing
-            2 * math.ceil(height / 16),
-            2 * math.ceil(width / 16),
+            2 * math.ceil(height / channels),
+            2 * math.ceil(width / channels),
             dtype=dtype,
         )
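The generalized packing arithmetic in _get_noise recovers the constants that were previously hardcoded when in_channels is 64 (the full-size Flux value, an assumption here), and scales down for the toy config. A small check:

import math

def noise_shape(batch_size: int, height: int, width: int, in_channels: int):
    # Mirrors _get_noise above: 4 latent values are packed per channel.
    assert in_channels % 4 == 0
    channels = in_channels // 4
    return (
        batch_size,
        channels,
        2 * math.ceil(height / channels),
        2 * math.ceil(width / channels),
    )

print(noise_shape(1, 1024, 1024, 64))  # (1, 16, 128, 128) -- the old hardcoded 16s
print(noise_shape(1, 18, 27, 36))      # (1, 9, 4, 6) -- toy config dimensions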

sharktank/sharktank/models/flux/testing.py (+45 -8)

@@ -10,7 +10,7 @@

 from .flux import FluxParams, FluxModelV1
 from .export import export_flux_transformer, flux_transformer_default_batch_sizes
-from ...types import DefaultPrimitiveTensor, Theta, save_load_theta
+from ...types import DefaultPrimitiveTensor, Theta
 from ...layers.testing import (
     make_rand_torch,
     make_mmdit_double_block_random_theta,
@@ -41,14 +41,11 @@ def convert_flux_transformer_input_for_hugging_face_model(


 def make_random_theta(config: FluxParams, dtype: torch.dtype):
-    # TODO: do not hardcode values.
-
     in_channels = config.in_channels
-    in_channels2 = 128
     hidden_size = config.hidden_size
     mlp_ratio = config.mlp_ratio
     context_in_dim = config.context_in_dim
-    time_dim = 256
+    time_dim = config.time_dim
     vec_dim = config.vec_in_dim
     patch_size = 1
     out_channels = config.out_channels
@@ -107,12 +104,18 @@ def make_random_theta(config: FluxParams, dtype: torch.dtype):

     for i in range(config.depth):
         tensor_dict[f"double_blocks.{i}"] = make_mmdit_double_block_random_theta(
-            in_channels=in_channels, hidden_size=hidden_size, mlp_ratio=mlp_ratio
+            hidden_size=hidden_size,
+            mlp_ratio=mlp_ratio,
+            num_heads=config.num_heads,
+            dtype=dtype,
         ).flatten()

     for i in range(config.depth_single_blocks):
         tensor_dict[f"single_blocks.{i}"] = make_mmdit_single_block_random_theta(
-            in_channels=in_channels2, hidden_size=hidden_size, mlp_ratio=mlp_ratio
+            hidden_size=hidden_size,
+            mlp_ratio=mlp_ratio,
+            num_heads=config.num_heads,
+            dtype=dtype,
         ).flatten()

     if config.guidance_embed:
@@ -141,7 +144,9 @@ def make_random_theta(config: FluxParams, dtype: torch.dtype):
             data=make_rand_torch((hidden_size,), dtype=dtype)
         )

-    return Theta(tensor_dict)
+    res = Theta(tensor_dict)
+    res.rename_tensors_to_paths()
+    return res


 def make_dev_single_layer_config():
@@ -162,6 +167,38 @@ def make_dev_single_layer_config():
     )


+def make_toy_config() -> FluxParams:
+    num_heads = 5
+    mlp_ratio = 2
+    axes_dim = [4 * 2, 4 * 3, 4 * 4]
+    in_channels = sum(axes_dim)
+    hidden_size = in_channels * num_heads
+    vec_in_dim = hidden_size // mlp_ratio
+    assert hidden_size == mlp_ratio * vec_in_dim
+    output_img_height = 2 * in_channels // 4
+    output_img_width = 3 * in_channels // 4
+    return FluxParams(
+        in_channels=in_channels,
+        out_channels=in_channels,
+        time_dim=13,
+        vec_in_dim=vec_in_dim,
+        context_in_dim=7,
+        txt_context_length=11,
+        hidden_size=hidden_size,
+        mlp_ratio=float(mlp_ratio),
+        num_heads=num_heads,
+        depth=3,
+        depth_single_blocks=2,
+        axes_dim=axes_dim,
+        theta=10_000,
+        qkv_bias=True,
+        guidance_embed=True,
+        output_img_height=output_img_height,
+        output_img_width=output_img_width,
+        output_img_channels=3,
+    )
+
+
 def export_dev_random_single_layer(
     dtype: torch.dtype,
     mlir_output_path: PathLike,
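As a sanity check, the toy values above satisfy every constraint in FluxParams.validate() (plain arithmetic, numbers copied from make_toy_config):

num_heads = 5
mlp_ratio = 2
axes_dim = [8, 12, 16]                 # 4*2, 4*3, 4*4
in_channels = sum(axes_dim)            # 36, a multiple of 4
hidden_size = in_channels * num_heads  # 180
vec_in_dim = hidden_size // mlp_ratio  # 90

assert in_channels % 4 == 0
assert hidden_size == vec_in_dim * mlp_ratio      # 180 == 90 * 2
assert hidden_size % num_heads == 0               # 180 % 5 == 0
assert sum(axes_dim) == hidden_size // num_heads  # 36 == pe_dim
assert all(d % 2 == 0 for d in axes_dim)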

sharktank/sharktank/utils/logging.py (+5)

@@ -5,8 +5,13 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 import logging
+import torch

 from iree.turbine.support.logging import get_logger


 transform_logger: logging.Logger = get_logger("sharktank.transforms")
+
+
+def format_tensor_statistics(tensor: torch.Tensor):
+    return f"mean = {tensor.mean()}, median = {tensor.median()}, std dev = {tensor.std()}, min = {tensor.min()}, max = {tensor.max()}"

sharktank/sharktank/utils/testing.py (+3)

@@ -7,6 +7,7 @@
 from typing import Optional
 import contextlib
 from pathlib import Path
+import pytest
 from os import PathLike
 import os
 import shutil
@@ -21,6 +22,8 @@
 from ..types import *
 from .math import cosine_similarity

+is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
+
 # Range of torch.rand() is [0,1)
 # Range of torch.rand() * 2 - 1 is [-1, 1), includes negative values
 def make_rand_torch(shape: list[int], dtype: Optional[torch.dtype] = torch.float32):

sharktank/tests/evaluate/perplexity_iree_test.py (+1 -1)

@@ -10,8 +10,8 @@
 import numpy as np

 from sharktank.evaluate import perplexity_iree
+from sharktank.utils.testing import is_mi300x

-is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
 skipif_run_quick_llama_test = pytest.mark.skipif(
     'not config.getoption("run-nightly-llama-tests")',
     reason="Run large tests if --run-nightly-llama-tests is passed",

sharktank/tests/layers/mmdit_test.py (+8 -2)

@@ -56,10 +56,16 @@ def _(model, img, txt, vec, rot) -> torch.Tensor:
         asm = str(output.mlir_module)

     def testSingleExport(self):
-        theta = make_mmdit_single_block_random_theta(hidden_size=self.hidden_size)
+        mlp_ratio = 4.0
+        theta = make_mmdit_single_block_random_theta(
+            hidden_size=self.hidden_size, num_heads=self.num_heads, mlp_ratio=mlp_ratio
+        )
         theta = self.save_load_theta(theta)
         mmdit = MMDITSingleBlock(
-            theta=theta, num_heads=self.num_heads, hidden_size=self.hidden_size
+            theta=theta,
+            num_heads=self.num_heads,
+            hidden_size=self.hidden_size,
+            mlp_ratio=mlp_ratio,
         )

         inp = torch.rand([self.batch_size, 1024, self.hidden_size])

sharktank/tests/models/flux/flux_test.py (+78 -45)

@@ -21,20 +21,20 @@
 from sharktank.models.flux.testing import (
     convert_flux_transformer_input_for_hugging_face_model,
     export_dev_random_single_layer,
-    make_dev_single_layer_config,
+    make_toy_config,
     make_random_theta,
 )
 from sharktank.models.flux.flux import FluxModelV1, FluxParams
-from sharktank.utils.testing import TempDirTestBase
+from sharktank.utils.testing import TempDirTestBase, skip, is_mi300x
 from sharktank.utils.iree import (
-    get_iree_devices,
     load_iree_module,
     run_iree_module_function,
     prepare_iree_module_function_args,
     call_torch_module_function,
     flatten_for_iree_signature,
     iree_to_torch,
 )
+from sharktank.utils.logging import format_tensor_statistics
 from sharktank import ops
 from sharktank.transforms.dataset import set_float_dtype
 from sharktank.types import Dataset, Theta
@@ -44,8 +44,6 @@
 with_flux_data = pytest.mark.skipif("not config.getoption('with_flux_data')")

 iree_compile_flags = [
-    "--iree-hal-target-device=hip",
-    "--iree-hip-target=gfx942",
     "--iree-opt-const-eval=false",
     "--iree-opt-strip-assertions=true",
     "--iree-global-opt-propagate-transposes=true",
@@ -74,6 +72,15 @@ def convert_dtype_if_dtype(
     return t


+def convert_input_dtype(input: dict[str, torch.Tensor], dtype: torch.dtype):
+    always_float32_input_arg_names = set(["img_ids", "txt_ids"])
+    return OrderedDict(
+        (k, t if k in always_float32_input_arg_names else t.to(dtype=dtype))
+        for k, t in input.items()
+    )
+
+
+@pytest.mark.usefixtures("path_prefix", "get_iree_flags")
 class FluxTest(TempDirTestBase):
     def setUp(self):
         super().setUp()
@@ -96,6 +103,7 @@ def runCompareIreeAgainstTorchEager(
         target_theta = reference_model.theta.transform(
             functools.partial(set_float_dtype, dtype=target_dtype)
         )
+
         target_torch_model = FluxModelV1(
             theta=target_theta,
             params=reference_model.params,
@@ -115,30 +123,22 @@ def runCompareIreeAgainstTorchEager(

         iree_module_path = self._temp_dir / "model.vmfb"
         logger.info("Compiling MLIR file...")
+        compile_flags = iree_compile_flags + [
+            f"--iree-hal-target-device={self.iree_hal_target_device}",
+            f"--iree-hip-target={self.iree_hip_target}",
+        ]
         iree.compiler.compile_file(
             str(mlir_path),
             output_file=str(iree_module_path),
-            extra_args=iree_compile_flags,
+            extra_args=compile_flags,
         )

-        target_input_args, target_input_kwargs = target_torch_model.sample_inputs(
+        reference_input_args, reference_input_kwargs = reference_model.sample_inputs(
             batch_size
         )
-
-        reference_input_args = [
-            convert_dtype_if_dtype(
-                t, source_dtype=target_dtype, target_dtype=reference_model.dtype
-            )
-            for t in target_input_args
-        ]
-        reference_input_kwargs = OrderedDict(
-            (
-                k,
-                convert_dtype_if_dtype(
-                    t, source_dtype=target_dtype, target_dtype=reference_model.dtype
-                ),
-            )
-            for k, t in target_input_kwargs.items()
+        assert len(reference_input_args) == 0
+        target_input_kwargs = convert_input_dtype(
+            reference_input_kwargs, dtype=target_dtype
         )

         logger.info("Invoking reference torch function...")
@@ -150,15 +150,15 @@ def runCompareIreeAgainstTorchEager(
         )
         expected_outputs = flatten_for_iree_signature(reference_result_dict)

-        iree_devices = get_iree_devices(driver="hip", device_count=1)
+        iree_devices = [iree.runtime.get_device(self.iree_device)]
         logger.info("Loading IREE module...")
         iree_module, iree_vm_context, iree_vm_instance = load_iree_module(
             module_path=iree_module_path,
             devices=iree_devices,
             parameters_path=parameters_path,
         )
         iree_args = prepare_iree_module_function_args(
-            args=flatten_for_iree_signature([target_input_args, target_input_kwargs]),
+            args=flatten_for_iree_signature(target_input_kwargs),
             devices=iree_devices,
         )

@@ -177,9 +177,14 @@ def runCompareIreeAgainstTorchEager(
             for i in range(len(expected_outputs))
         ]
         logger.info("Comparing outputs...")
+        logger.info(f"Expected output {format_tensor_statistics(expected_outputs[0])}")
+        abs_diff = (actual_outputs[0] - expected_outputs[0]).abs()
+        logger.info(
+            f"Actual vs expected abs diff {format_tensor_statistics(abs_diff[0])}"
+        )
         torch.testing.assert_close(actual_outputs, expected_outputs, atol=atol, rtol=0)

-    def runTestCompareDevIreeAgainstHuggingFace(
+    def runTestCompareDevIreeAgainstEager(
         self, reference_dtype: torch.dtype, target_dtype: torch.dtype, atol: float
     ):
         parameters_output_path = self._temp_dir / "parameters.irpa"
@@ -211,21 +216,12 @@ def runTestCompareTorchEagerAgainstHuggingFace(
     ):
         target_input_args, target_input_kwargs = target_model.sample_inputs()

-        reference_input_args = [
-            convert_dtype_if_dtype(
-                t, source_dtype=target_model.dtype, target_dtype=reference_dtype
-            )
-            for t in target_input_args
-        ]
-        reference_input_kwargs = OrderedDict(
-            (
-                k,
-                convert_dtype_if_dtype(
-                    t, source_dtype=target_model.dtype, target_dtype=reference_dtype
-                ),
-            )
-            for k, t in target_input_kwargs.items()
+        assert len(target_input_args) == 0
+        reference_input_args = []
+        reference_input_kwargs = convert_input_dtype(
+            target_input_kwargs, dtype=reference_dtype
         )
+
         reference_input_kwargs = convert_flux_transformer_input_for_hugging_face_model(
             *reference_input_args, **reference_input_kwargs
         )
@@ -238,18 +234,55 @@ def runTestCompareTorchEagerAgainstHuggingFace(

         torch.testing.assert_close(target_output, reference_output, atol=atol, rtol=0)

+    def runTestCompareToyIreeAgainstEager(
+        self, reference_dtype: torch.dtype, target_dtype: torch.dtype, atol: float
+    ):
+        config = make_toy_config()
+        reference_theta = make_random_theta(config, dtype=reference_dtype)
+        reference_model = FluxModelV1(theta=reference_theta, params=config)
+        self.runCompareIreeAgainstTorchEager(
+            reference_model=reference_model, target_dtype=target_dtype, atol=atol
+        )
+
+    @is_mi300x
+    def testCompareToyIreeF32AgainstEagerF64(self):
+        """atol is apparently high because the expected output range is large.
+        Its absolute maximum is 3915. Observed atol is 0.036."""
+        self.runTestCompareToyIreeAgainstEager(
+            reference_dtype=torch.float64, target_dtype=torch.float32, atol=1e-1
+        )
+
+    @skip(
+        reason=(
+            "Sporadic segmentation fault during buffer destruction."
+            " See https://github.com/nod-ai/shark-ai/issues/1050"
+        )
+    )
+    @is_mi300x
+    def testCompareToyIreeBf16AgainstEagerF64(self):
+        """atol is apparently high because the expected output range is large.
+        Its absolute maximum is 3915. Observed atol is 260.6.
+        This is consistent with the expectation that bf16 atol should be worse by ~10^4
+        compared to f32. f32 can represent ~7 digits and bf16 can represent ~3."""
+        self.runTestCompareToyIreeAgainstEager(
+            reference_dtype=torch.float64, target_dtype=torch.bfloat16, atol=5e2
+        )
+
     @with_flux_data
-    def testCompareDevIreeF32AgainstHuggingFaceF32(self):
-        self.runTestCompareDevIreeAgainstHuggingFace(
+    def testCompareDevIreeF32AgainstEagerF32(self):
+        self.runTestCompareDevIreeAgainstEager(
             reference_dtype=torch.float32, target_dtype=torch.float32, atol=1e-2
         )

-    @pytest.mark.skip(
-        reason="Segmentation fault during output comparison. See https://github.com/nod-ai/shark-ai/issues/1050"
+    @skip(
+        reason=(
+            "Sporadic segmentation fault during buffer destruction."
+            " See https://github.com/nod-ai/shark-ai/issues/1050"
+        )
     )
     @with_flux_data
-    def testCompareDevIreeBf16AgainstHuggingFaceF32(self):
-        self.runTestCompareDevIreeAgainstHuggingFace(
+    def testCompareDevIreeBf16AgainstEagerF32(self):
+        self.runTestCompareDevIreeAgainstEager(
             reference_dtype=torch.float32, target_dtype=torch.bfloat16, atol=1
         )