Commit 7a8f360

Make the T5 model tests use cosine similarity (#895)
There were several xfail tests that relied on a poorly chosen metric. Cosine similarity is a better metric for language embeddings. The comparison between bf16 and f32 exhibits a small fraction of outlier tokens with a higher per-token numerical error than the majority of tokens. To account for that, the testing metric is expanded to check separate inlier and outlier absolute tolerances.
1 parent f12ed07 commit 7a8f360
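
For illustration (not part of the commit), a minimal sketch of the metric this change adopts: cosine similarity is computed per token over the embedding dimension and compared against 1.0, with a tight tolerance for the bulk of tokens and a small budget for outlier tokens. The tensors, shapes, and tolerance values below are invented for the example.

    import torch

    torch.manual_seed(0)

    # Hypothetical encoder states of shape (batch, tokens, embedding_dim).
    actual = torch.randn(2, 8, 16)
    expected = actual + 1e-3 * torch.randn_like(actual)

    # One similarity value per token, reduced over the embedding dimension.
    cosine_similarity_per_token = torch.nn.functional.cosine_similarity(
        actual, expected, dim=-1
    )
    error = (cosine_similarity_per_token - 1.0).abs()

    atol = 0.2                    # no token may deviate more than this
    inlier_atol = 0.01            # most tokens must be at least this close
    max_outliers_fraction = 0.03  # budget for tokens exceeding inlier_atol

    assert error.max() <= atol
    outliers_fraction = (error > inlier_atol).count_nonzero() / error.numel()
    assert outliers_fraction <= max_outliers_fraction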

File tree: 2 files changed (+126 -63)

sharktank/sharktank/utils/testing.py (+55 -5)
@@ -239,8 +239,56 @@ def assert_iterables_equal(
         ), f"Iterables not equal at index {i} for elements {v1} and {v2}"
 
 
+def assert_tensor_close(
+    actual: torch.Tensor,
+    expected: torch.Tensor,
+    atol: float,
+    max_outliers_fraction: Optional[float] = None,
+    inlier_atol: Optional[float] = None,
+):
+    if (max_outliers_fraction is None and inlier_atol is not None) or (
+        max_outliers_fraction is not None and inlier_atol is None
+    ):
+        raise ValueError(
+            "max_outliers_fraction and inlier_atol must both be provided or both be None."
+        )
+
+    try:
+        torch.testing.assert_close(
+            actual,
+            expected,
+            atol=atol,
+            rtol=0,
+        )
+
+        if inlier_atol is not None:
+            outliers = (actual - expected).abs() > inlier_atol
+            outliers_fraction = outliers.count_nonzero() / outliers.numel()
+            if outliers_fraction > max_outliers_fraction:
+                raise AssertionError(
+                    f"The fraction of outliers {outliers_fraction:%} is above the allowed "
+                    f"{max_outliers_fraction:%}. Inlier atol={inlier_atol}."
+                )
+    except AssertionError as ex:
+        diff = actual - expected
+        std, mean = torch.std_mean(diff)
+        msg = (
+            "Difference (actual - expected):\n"
+            f"mean = {mean}\n"
+            f"median = {diff.median()}\n"
+            f"std dev = {std}\n"
+            f"min = {diff.min()}\n"
+            f"max = {diff.max()}\n"
+        )
+        raise AssertionError(msg) from ex
+
+
 def assert_text_encoder_state_close(
-    actual: torch.Tensor, expected: torch.Tensor, atol: float
+    actual: torch.Tensor,
+    expected: torch.Tensor,
+    atol: float,
+    max_outliers_fraction: Optional[float] = None,
+    inlier_atol: Optional[float] = None,
 ):
     """The cosine similarity has been suggested to compare encoder states.
 
@@ -261,11 +309,13 @@ def assert_text_encoder_state_close(
         expected,
         dim=-1,
     )
-    torch.testing.assert_close(
-        cosine_similarity_per_token,
-        torch.ones_like(cosine_similarity_per_token),
+
+    assert_tensor_close(
+        actual=cosine_similarity_per_token,
+        expected=torch.ones_like(cosine_similarity_per_token),
         atol=atol,
-        rtol=0,
+        max_outliers_fraction=max_outliers_fraction,
+        inlier_atol=inlier_atol,
     )
 
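For context, a hedged usage sketch of the helper added above. The encoder states are invented; the call mirrors the new assert_text_encoder_state_close signature, which tolerates a bounded fraction of outlier tokens.

    import torch

    from sharktank.utils.testing import assert_text_encoder_state_close

    # Invented encoder states of shape (batch, tokens, embedding_dim).
    actual = torch.ones(1, 4, 8)
    expected = actual.clone()
    expected[0, 0, 0] += 0.5  # skew one token's direction to create a single outlier

    # Passes: every token's cosine similarity is within atol of 1.0, and only
    # 1 of 4 tokens (25%) deviates by more than inlier_atol, within the 30% budget.
    assert_text_encoder_state_close(
        actual,
        expected,
        atol=0.2,
        max_outliers_fraction=0.3,
        inlier_atol=0.01,
    )
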
sharktank/tests/models/t5/t5_test.py (+71 -58)
@@ -41,6 +41,7 @@
     export_encoder_iree_parameters,
 )
 from sharktank.utils.testing import (
+    assert_text_encoder_state_close,
     make_rand_torch,
     make_random_mask,
     TempDirTestBase,
@@ -107,17 +108,16 @@ def testXxlBf16AgainstFluxGolden(self):
         ) as f:
             reference_last_hidden_state = torch.load(f)
 
-        torch.testing.assert_close(
-            outputs["last_hidden_state"], reference_last_hidden_state
+        assert_text_encoder_state_close(
+            outputs["last_hidden_state"], reference_last_hidden_state, atol=1e-1
         )
 
     def runTestV1_1CompareTorchEagerHuggingFace(
         self,
         huggingface_repo_id: str,
         reference_dtype: torch.dtype,
         target_dtype: torch.dtype,
-        atol: Optional[float] = None,
-        rtol: Optional[float] = None,
+        atol: float,
     ):
         get_dataset(
             huggingface_repo_id,
@@ -146,17 +146,18 @@ def runTestV1_1CompareTorchEagerHuggingFace(
             lambda t: ops.to(t, dtype=reference_dtype), actual_outputs
         )
 
-        torch.testing.assert_close(
-            actual_outputs, expected_outputs, atol=atol, rtol=rtol
+        assert_text_encoder_state_close(
+            actual_outputs["last_hidden_state"],
+            expected_outputs["last_hidden_state"],
+            atol,
         )
 
     def runTestV1_1CompareTorchEagerAgainstHuggingFace(
         self,
         huggingface_repo_id: str,
         reference_dtype: torch.dtype,
         target_dtype: torch.dtype,
-        atol: Optional[float] = None,
-        rtol: Optional[float] = None,
+        atol: float,
     ):
         get_dataset(
             huggingface_repo_id,
@@ -199,8 +200,10 @@ def runTestV1_1CompareTorchEagerAgainstHuggingFace(
         )
 
         logger.info("Comparing outputs...")
-        torch.testing.assert_close(
-            actual_outputs, expected_outputs, atol=atol, rtol=rtol
+        assert_text_encoder_state_close(
+            actual_outputs["last_hidden_state"],
+            expected_outputs["last_hidden_state"],
+            atol,
         )
 
     @pytest.mark.xfail(
@@ -213,12 +216,31 @@ def runTestV1_1CompareTorchEagerAgainstHuggingFace(
     )
     @with_t5_data
    def testV1_1SmallCompareTorchEagerHuggingFaceBf16AgainstF32(self):
+        """Hugging Face model test to estimate a numerical error baseline for reference.
+        We don't want to run this test regularly, but we would like to keep it around
+        as a reference. It provides some baseline of what numerical error to expect.
+        """
         self.runTestV1_1CompareTorchEagerHuggingFace(
             "google/t5-v1_1-small",
             reference_dtype=torch.float32,
             target_dtype=torch.bfloat16,
-            atol=1e-2,
-            rtol=1.6e-2,
+            # The observed error is 0.05.
+            atol=1e-1,
+        )
+
+    @pytest.mark.skip
+    @with_t5_data
+    def testV1_1XxlCompareTorchEagerHuggingFaceBf16AgainstF32(self):
+        """Hugging Face model test to estimate a numerical error baseline for reference.
+        We don't want to run this test regularly, but we would like to keep it around
+        as a reference. It provides some baseline of what numerical error to expect.
+        """
+        self.runTestV1_1CompareTorchEagerHuggingFace(
+            "google/t5-v1_1-xxl",
+            reference_dtype=torch.float32,
+            target_dtype=torch.bfloat16,
+            # The observed error is 0.026.
+            atol=1e-1,
         )
 
     @with_t5_data
@@ -227,24 +249,16 @@ def testV1_1SmallF32CompareTorchEagerAgainstHuggingFace(self):
             "google/t5-v1_1-small",
             reference_dtype=torch.float32,
             target_dtype=torch.float32,
+            atol=1e-5,
         )
 
-    @pytest.mark.xfail(
-        raises=AssertionError,
-        reason=(
-            "The accuracy is bad, "
-            "but for XXL we get the same result as the Flux pipeline. "
-            "This need further investigation how Flux works at all like that."
-        ),
-    )
     @with_t5_data
     def testV1_1SmallBf16CompareTorchEagerAgainstHuggingFaceF32(self):
         self.runTestV1_1CompareTorchEagerAgainstHuggingFace(
             "google/t5-v1_1-small",
             reference_dtype=torch.float32,
             target_dtype=torch.bfloat16,
-            atol=1e-2,
-            rtol=1.6e-2,
+            atol=1e-1,
         )
 
     @with_t5_data
@@ -253,6 +267,7 @@ def testV1_1SmallBf16CompareTorchEagerAgainstHuggingFace(self):
             "google/t5-v1_1-small",
             reference_dtype=torch.bfloat16,
             target_dtype=torch.bfloat16,
+            atol=1e-1,
         )
 
     @with_t5_data
@@ -261,23 +276,16 @@ def testV1_1XxlF32CompareTorchEagerAgainstHuggingFace(self):
             "google/t5-v1_1-xxl",
             reference_dtype=torch.float32,
             target_dtype=torch.float32,
+            atol=1e-5,
         )
 
-    @pytest.mark.xfail(
-        raises=AssertionError,
-        reason=(
-            "The accuracy is bad, but we get the same result as the Flux pipeline. "
-            "This need further investigation how Flux works at all like that."
-        ),
-    )
     @with_t5_data
     def testV1_1XxlBf16CompareTorchEagerAgainstHuggingFaceF32(self):
         self.runTestV1_1CompareTorchEagerAgainstHuggingFace(
             "google/t5-v1_1-xxl",
             reference_dtype=torch.float32,
             target_dtype=torch.bfloat16,
-            atol=1e-2,
-            rtol=1.6e-2,
+            atol=5e-2,
         )
 
@@ -293,8 +301,9 @@ def runTestV1_1CompareIreeAgainstTorchEager(
         huggingface_repo_id: str,
         reference_dtype: torch.dtype,
         target_dtype: torch.dtype,
-        atol: Optional[float] = None,
-        rtol: Optional[float] = None,
+        atol: float,
+        max_outliers_fraction: Optional[float] = None,
+        inlier_atol: Optional[float] = None,
     ):
         get_dataset(
             huggingface_repo_id,
@@ -386,34 +395,35 @@ def runTestV1_1CompareIreeAgainstTorchEager(
         ]
 
         logger.info("Comparing outputs...")
-        torch.testing.assert_close(reference_result, iree_result, atol=atol, rtol=rtol)
+        reference_result_last_hidden_state = reference_result[0]
+        iree_result_last_hidden_state = iree_result[0]
+        assert_text_encoder_state_close(
+            iree_result_last_hidden_state,
+            reference_result_last_hidden_state,
+            atol=atol,
+            max_outliers_fraction=max_outliers_fraction,
+            inlier_atol=inlier_atol,
+        )
 
     @with_t5_data
     def testV1_1CompareSmallIreeF32AgainstTorchEagerF32(self):
         self.runTestV1_1CompareIreeAgainstTorchEager(
             "google/t5-v1_1-small",
             reference_dtype=torch.float32,
             target_dtype=torch.float32,
-            atol=1e-4,
-            rtol=2.0e-3,
+            atol=1e-5,
         )
 
-    @pytest.mark.xfail(
-        raises=AssertionError,
-        reason=(
-            "The accuracy is bad, "
-            "but but it is no worse than the accuracy for of eager bfloat16. "
-            "This need further investigation how Flux works at all like that."
-        ),
-    )
     @with_t5_data
     def testV1_1CompareSmallIreeBf16AgainstTorchEagerF32(self):
         self.runTestV1_1CompareIreeAgainstTorchEager(
             "google/t5-v1_1-small",
             reference_dtype=torch.float32,
             target_dtype=torch.bfloat16,
-            atol=1e-2,
-            rtol=1.6e-2,
+            # The observed error is 0.12.
+            atol=0.2,
+            max_outliers_fraction=0.03,
+            inlier_atol=0.01,
         )
 
     @with_t5_data
@@ -422,26 +432,29 @@ def testV1_1CompareXxlIreeF32AgainstTorchEagerF32(self):
             "google/t5-v1_1-xxl",
             reference_dtype=torch.float32,
             target_dtype=torch.float32,
-            atol=1e-4,
-            rtol=2.0e-3,
+            atol=1e-5,
         )
 
-    @pytest.mark.xfail(
-        raises=AssertionError,
-        reason=(
-            "The accuracy is bad, "
-            "but but it is no worse than the accuracy for of eager bfloat16. "
-            "This need further investigation how Flux works at all like that."
-        ),
-    )
     @with_t5_data
     def testV1_1CompareXxlIreeBf16AgainstTorchEagerF32(self):
+        """The observed absolute numerical error is 0.21.
+        Per-token cosine similarity metrics are
+        mean = 0.997
+        std dev = 0.018
+        min = 0.789
+
+        The error seems high, as it corresponds to a 38° angular difference.
+        For comparison, the bf16 Hugging Face small model exhibits a worst-token
+        error of 0.05. Although the error here is worse, it may be reasonable, as it
+        comes from a single token outlier. The majority of tokens have an error of
+        less than 0.01.
+        """
         self.runTestV1_1CompareIreeAgainstTorchEager(
             "google/t5-v1_1-xxl",
             reference_dtype=torch.float32,
             target_dtype=torch.bfloat16,
-            atol=1e-2,
-            rtol=1.6e-2,
+            atol=2.5e-1,
+            max_outliers_fraction=0.03,
+            inlier_atol=0.01,
         )
 
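As a sanity check on the 38° figure in the docstring above: a worst-token cosine similarity of 0.789 corresponds to an angle of arccos(0.789) between the actual and expected embedding vectors.

    import math

    # Worst per-token cosine similarity reported for the bf16 XXL IREE test.
    worst_cosine_similarity = 0.789
    angle = math.degrees(math.acos(worst_cosine_similarity))
    print(f"{angle:.1f}")  # ~37.9, matching the roughly 38 degrees cited above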