From 6d91f9158baba23bd795f33f9faf85e0e35d40b6 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Mon, 14 Apr 2025 15:36:49 +0000
Subject: [PATCH 01/16] adds available device to nlp tests #3335

---
 tests/ignite/metrics/nlp/test_bleu.py  | 32 +++++++++++++++-----------
 tests/ignite/metrics/nlp/test_rouge.py |  9 ++++----
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index 9de9c6de78c5..16c6a818da2c 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -107,8 +107,9 @@ def test_micro_bleu_smooth2(candidates, references):
     _test(candidates, references, "micro", "smooth2", SmoothingFunction().method2, 3)
 
 
-def test_accumulation_macro_bleu():
-    bleu = Bleu(ngram=4, smooth="smooth2")
+def test_accumulation_macro_bleu(available_device):
+    bleu = Bleu(ngram=4, smooth="smooth2", device=available_device)
+    assert bleu._device == torch.device(available_device)
     bleu.update(([corpus.cand_1], [corpus.references_1]))
     bleu.update(([corpus.cand_2a], [corpus.references_2]))
     bleu.update(([corpus.cand_2b], [corpus.references_2]))
@@ -120,8 +121,9 @@ def test_accumulation_macro_bleu():
     assert bleu.compute() == value / 4
 
 
-def test_accumulation_micro_bleu():
-    bleu = Bleu(ngram=4, smooth="smooth2", average="micro")
+def test_accumulation_micro_bleu(available_device):
+    bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device)
+    assert bleu._device == torch.device(available_device)
     bleu.update(([corpus.cand_1], [corpus.references_1]))
     bleu.update(([corpus.cand_2a], [corpus.references_2]))
     bleu.update(([corpus.cand_2b], [corpus.references_2]))
@@ -133,8 +135,9 @@ def test_accumulation_micro_bleu():
     assert bleu.compute() == value
 
 
-def test_bleu_batch_macro():
-    bleu = Bleu(ngram=4)
+def test_bleu_batch_macro(available_device):
+    bleu = Bleu(ngram=4, device=available_device)
+    assert bleu._device == torch.device(available_device)
 
     # Batch size 3
     hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
@@ -162,8 +165,9 @@ def test_bleu_batch_macro():
     assert pytest.approx(ref_2) == reference_bleu_score
 
 
-def test_bleu_batch_micro():
-    bleu = Bleu(ngram=4, average="micro")
+def test_bleu_batch_micro(available_device):
+    bleu = Bleu(ngram=4, average="micro", device=available_device)
+    assert bleu._device == torch.device(available_device)
 
     # Batch size 3
     hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
@@ -187,8 +191,10 @@ def test_bleu_batch_micro():
         (corpus.cand_1, corpus.references_1),
     ],
 )
-def test_n_gram_counter(candidates, references):
-    bleu = Bleu(ngram=4)
+def test_n_gram_counter(candidates, references, available_device):
+    bleu = Bleu(ngram=4, device=available_device)
+    assert bleu._device == torch.device(available_device)
+
     hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter())
     assert hyp_length == len(candidates)
 
@@ -212,9 +218,9 @@ def _test_macro_distrib_integration(device):
     def update(_, i):
         return data[i + size * rank]
 
-    def _test(metric_device):
+    def _test(device):
         engine = Engine(update)
-        m = Bleu(ngram=4, smooth="smooth2")
+        m = Bleu(ngram=4, smooth="smooth2", device=device)
         m.attach(engine, "bleu")
 
         engine.run(data=list(range(size)), max_epochs=1)
@@ -256,7 +262,7 @@ def update(_, i):
 
     def _test(metric_device):
         engine = Engine(update)
-        m = Bleu(ngram=4, smooth="smooth2", average="micro")
+        m = Bleu(ngram=4, smooth="smooth2", average="micro", device=metric_device)
         m.attach(engine, "bleu")
 
         engine.run(data=list(range(size)), max_epochs=1)
diff --git a/tests/ignite/metrics/nlp/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py
index 5d8562866c83..5dbf4c9bde8f 100644
--- a/tests/ignite/metrics/nlp/test_rouge.py
+++ b/tests/ignite/metrics/nlp/test_rouge.py
@@ -84,9 +84,9 @@ def test_wrong_inputs():
         (2, "abcdef", "zbdfz", (0, 0)),
     ],
 )
-def test_rouge_n_alpha(ngram, candidate, reference, expected):
+def test_rouge_n_alpha(ngram, candidate, reference, expected, available_device):
     for alpha in [0, 1, 0.3, 0.5, 0.8]:
-        rouge = RougeN(ngram=ngram, alpha=alpha)
+        rouge = RougeN(ngram=ngram, alpha=alpha, device=available_device)
         rouge.update(([candidate], [[reference]]))
         results = rouge.compute()
         assert results[f"Rouge-{ngram}-P"] == expected[0]
@@ -101,7 +101,7 @@ def test_rouge_n_alpha(ngram, candidate, reference, expected):
 @pytest.mark.parametrize(
     "candidates, references", [corpus.sample_1, corpus.sample_2, corpus.sample_3, corpus.sample_4, corpus.sample_5]
 )
-def test_rouge_metrics(candidates, references):
+def test_rouge_metrics(candidates, references, available_device):
     for multiref in ["average", "best"]:
         # PERL 1.5.5 reference
         apply_avg = multiref == "average"
@@ -123,7 +123,8 @@ def test_rouge_metrics(candidates, references):
 
         lower_split_candidates = [candidate.lower().split() for candidate in candidates]
 
-        m = Rouge(variants=[1, 2, 4, "L"], multiref=multiref, alpha=0.5)
+        m = Rouge(variants=[1, 2, 4, "L"], multiref=multiref, alpha=0.5, device=available_device)
+        assert m._device == torch.device(available_device)
         m.update((lower_split_candidates, lower_split_references))
         results = m.compute()
 

From c1dcffc9082b79da800b8a0130730e1fc068c752 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Mon, 14 Apr 2025 17:15:15 +0000
Subject: [PATCH 02/16] avoiding float64: compare BLEU results via float32 .item() and np.allclose

---
 tests/ignite/metrics/nlp/test_bleu.py | 69 +++++++++++++++++----------
 1 file changed, 44 insertions(+), 25 deletions(-)

diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index 16c6a818da2c..edca544fbc32 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -2,6 +2,7 @@
 import warnings
 from collections import Counter
 
+import numpy as np
 import pytest
 import torch
 from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
@@ -44,10 +45,10 @@ def test_wrong_inputs():
 )
 
 
-def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8):
+def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"):
     for i in range(1, ngram_range):
         weights = tuple([1 / i] * i)
-        bleu = Bleu(ngram=i, average=average, smooth=smooth)
+        bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device)
 
         if average == "macro":
             with warnings.catch_warnings():
@@ -55,56 +56,65 @@ def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=No
                 reference = sentence_bleu(
                     references[0], candidates[0], weights=weights, smoothing_function=smooth_nltk_fn
                 )
-            assert pytest.approx(reference) == bleu._sentence_bleu(references[0], candidates[0])
+            computed = bleu._sentence_bleu(references[0], candidates[0])
+            if isinstance(computed, torch.Tensor):
+                computed = computed.cpu().float().item()
+            assert np.allclose(computed, reference, rtol=1e-6)
 
         elif average == "micro":
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 reference = corpus_bleu(references, candidates, weights=weights, smoothing_function=smooth_nltk_fn)
-            assert pytest.approx(reference) == bleu._corpus_bleu(references, candidates)
+            computed = bleu._corpus_bleu(references, candidates)
+            if isinstance(computed, torch.Tensor):
+                computed = computed.cpu().float().item()
+            assert np.allclose(computed, reference, rtol=1e-6)
 
         bleu.update((candidates, references))
-        assert pytest.approx(reference) == bleu.compute()
+        computed = bleu.compute()
+        if isinstance(computed, torch.Tensor):
+            computed = computed.cpu().float().item()
+        assert np.allclose(computed, reference, rtol=1e-6)
 
 
 @pytest.mark.parametrize(*parametrize_args)
-def test_macro_bleu(candidates, references):
-    _test(candidates, references, "macro")
+def test_macro_bleu(candidates, references, available_device):
+    _test(candidates, references, "macro", device=available_device)
 
 
 @pytest.mark.parametrize(*parametrize_args)
-def test_micro_bleu(candidates, references):
-    _test(candidates, references, "micro")
+def test_micro_bleu(candidates, references, available_device):
+    _test(candidates, references, "micro", device=available_device)
 
 
 @pytest.mark.parametrize(*parametrize_args)
-def test_macro_bleu_smooth1(candidates, references):
-    _test(candidates, references, "macro", "smooth1", SmoothingFunction().method1)
+def test_macro_bleu_smooth1(candidates, references, available_device):
+    _test(candidates, references, "macro", "smooth1", SmoothingFunction().method1, device=available_device)
 
 
 @pytest.mark.parametrize(*parametrize_args)
-def test_micro_bleu_smooth1(candidates, references):
-    _test(candidates, references, "micro", "smooth1", SmoothingFunction().method1)
+def test_micro_bleu_smooth1(candidates, references, available_device):
+    _test(candidates, references, "micro", "smooth1", SmoothingFunction().method1, device=available_device)
 
 
 @pytest.mark.parametrize(*parametrize_args)
-def test_macro_bleu_nltk_smooth2(candidates, references):
-    _test(candidates, references, "macro", "nltk_smooth2", SmoothingFunction().method2)
+def test_macro_bleu_nltk_smooth2(candidates, references, available_device):
+    _test(candidates, references, "macro", "nltk_smooth2", SmoothingFunction().method2, device=available_device)
 
 
 @pytest.mark.parametrize(*parametrize_args)
-def test_micro_bleu_nltk_smooth2(candidates, references):
-    _test(candidates, references, "micro", "nltk_smooth2", SmoothingFunction().method2)
+def test_micro_bleu_nltk_smooth2(candidates, references, available_device):
+    _test(candidates, references, "micro", "nltk_smooth2", SmoothingFunction().method2, device=available_device)
 
 
 @pytest.mark.parametrize(*parametrize_args)
-def test_macro_bleu_smooth2(candidates, references):
-    _test(candidates, references, "macro", "smooth2", SmoothingFunction().method2, 3)
+def test_macro_bleu_smooth2(candidates, references, available_device):
+    _test(candidates, references, "macro", "smooth2", SmoothingFunction().method2, 3, available_device)
 
 
 @pytest.mark.parametrize(*parametrize_args)
-def test_micro_bleu_smooth2(candidates, references):
-    _test(candidates, references, "micro", "smooth2", SmoothingFunction().method2, 3)
+def test_micro_bleu_smooth2(candidates, references, available_device):
+    _test(candidates, references, "micro", "smooth2", SmoothingFunction().method2, 3, device=available_device)
 
 
 def test_accumulation_macro_bleu(available_device):
@@ -118,7 +128,10 @@ def test_accumulation_macro_bleu(available_device):
     value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a)
     value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b)
     value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3)
-    assert bleu.compute() == value / 4
+    computed = bleu.compute()
+    if isinstance(computed, torch.Tensor):
+        computed = computed.cpu().float().item()
+    assert np.allclose(computed, value / 4, rtol=1e-6)
 
 
 def test_accumulation_micro_bleu(available_device):
@@ -151,7 +164,11 @@ def test_bleu_batch_macro(available_device):
             + sentence_bleu(refs[1], hypotheses[1])
             + sentence_bleu(refs[2], hypotheses[2])
         ) / 3
-    assert pytest.approx(bleu.compute()) == reference_bleu_score
+    reference_bleu_score = np.float32(reference_bleu_score)
+    computed = bleu.compute()
+    if isinstance(computed, torch.Tensor):
+        computed = computed.cpu().float().item()
+    assert np.allclose(computed, reference_bleu_score, rtol=1e-6)
 
     value = 0
     for _hypotheses, _refs in zip(hypotheses, refs):
@@ -161,8 +178,8 @@ def test_bleu_batch_macro(available_device):
     ref_1 = value / len(refs)
     ref_2 = bleu.compute()
 
-    assert pytest.approx(ref_1) == reference_bleu_score
-    assert pytest.approx(ref_2) == reference_bleu_score
+    assert np.allclose(ref_1, reference_bleu_score, rtol=1e-6)
+    assert np.allclose(ref_2, reference_bleu_score, rtol=1e-6)
 
 
 def test_bleu_batch_micro(available_device):
@@ -196,6 +213,8 @@ def test_n_gram_counter(candidates, references, available_device):
     assert bleu._device == torch.device(available_device)
 
     hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter())
+    hyp_length = int(hyp_length)
+    ref_length = int(ref_length)
     assert hyp_length == len(candidates)
 
     ref_lens = (len(reference) for reference in references)

From 6dd44ce83563ccc6c87a6980a9ebb626fc3f12a6 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Tue, 15 Apr 2025 16:00:43 +0000
Subject: [PATCH 03/16] converts candidates and references to float32 on MPS

---
 tests/ignite/metrics/nlp/test_bleu.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index edca544fbc32..5787fb187f31 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -16,6 +16,14 @@
 corpus = CorpusForTest(lower_split=True)
 
 
+def to_float32_if_mps(x, device):
+    if isinstance(x, torch.Tensor) and device == "mps" and x.dtype == torch.float64:
+        return x.to(torch.float32)
+    elif isinstance(x, np.ndarray) and device == "mps" and x.dtype == np.float64:
+        return x.astype(np.float32)
+    return x
+
+
 def test_wrong_inputs():
     with pytest.raises(ValueError, match=r"ngram order must be greater than zero"):
         Bleu(ngram=0)
@@ -46,6 +54,10 @@ def test_wrong_inputs():
 
 
 def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"):
+
+    candidates = to_float32_if_mps(candidates, device)
+    references = to_float32_if_mps(references, device)
+
     for i in range(1, ngram_range):
         weights = tuple([1 / i] * i)
         bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device)

From d1d949fb894b506aca6aa76ed059172c5f0d8690 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Tue, 15 Apr 2025 16:19:11 +0000
Subject: [PATCH 04/16] more conversions to float32 on MPS

---
 tests/ignite/metrics/nlp/test_bleu.py | 57 ++++++++++++++++++---------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index 5787fb187f31..8ed98cd30f1c 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -132,14 +132,21 @@ def test_micro_bleu_smooth2(candidates, references, available_device):
 def test_accumulation_macro_bleu(available_device):
     bleu = Bleu(ngram=4, smooth="smooth2", device=available_device)
     assert bleu._device == torch.device(available_device)
-    bleu.update(([corpus.cand_1], [corpus.references_1]))
-    bleu.update(([corpus.cand_2a], [corpus.references_2]))
-    bleu.update(([corpus.cand_2b], [corpus.references_2]))
-    bleu.update(([corpus.cand_3], [corpus.references_2]))
-    value = bleu._sentence_bleu(corpus.references_1, corpus.cand_1)
-    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a)
-    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b)
-    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3)
+    cand_1 = to_float32_if_mps(corpus.cand_1, available_device)
+    cand_2a = to_float32_if_mps(corpus.cand_2a, available_device)
+    cand_2b = to_float32_if_mps(corpus.cand_2b, available_device)
+    cand_3 = to_float32_if_mps(corpus.cand_3, available_device)
+    ref_1 = to_float32_if_mps(corpus.references_1, available_device)
+    ref_2 = to_float32_if_mps(corpus.references_2, available_device)
+
+    bleu.update(([cand_1], [ref_1]))
+    bleu.update(([cand_2a], [ref_2]))
+    bleu.update(([cand_2b], [ref_2]))
+    bleu.update(([cand_3], [ref_2]))
+    value = bleu._sentence_bleu(ref_1, cand_1)
+    value += bleu._sentence_bleu(ref_2, cand_2a)
+    value += bleu._sentence_bleu(ref_2, cand_2b)
+    value += bleu._sentence_bleu(ref_2, cand_3)
     computed = bleu.compute()
     if isinstance(computed, torch.Tensor):
         computed = computed.cpu().float().item()
@@ -149,13 +156,20 @@ def test_accumulation_macro_bleu(available_device):
 def test_accumulation_micro_bleu(available_device):
     bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device)
     assert bleu._device == torch.device(available_device)
-    bleu.update(([corpus.cand_1], [corpus.references_1]))
-    bleu.update(([corpus.cand_2a], [corpus.references_2]))
-    bleu.update(([corpus.cand_2b], [corpus.references_2]))
-    bleu.update(([corpus.cand_3], [corpus.references_2]))
+    cand_1 = to_float32_if_mps(corpus.cand_1, available_device)
+    cand_2a = to_float32_if_mps(corpus.cand_2a, available_device)
+    cand_2b = to_float32_if_mps(corpus.cand_2b, available_device)
+    cand_3 = to_float32_if_mps(corpus.cand_3, available_device)
+    ref_1 = to_float32_if_mps(corpus.references_1, available_device)
+    ref_2 = to_float32_if_mps(corpus.references_2, available_device)
+
+    bleu.update(([cand_1], [ref_1]))
+    bleu.update(([cand_2a], [ref_2]))
+    bleu.update(([cand_2b], [ref_2]))
+    bleu.update(([cand_3], [ref_2]))
     value = bleu._corpus_bleu(
-        [corpus.references_1, corpus.references_2, corpus.references_2, corpus.references_2],
-        [corpus.cand_1, corpus.cand_2a, corpus.cand_2b, corpus.cand_3],
+        [ref_1, ref_2, ref_2, ref_2],
+        [cand_1, cand_2a, cand_2b, cand_3],
     )
     assert bleu.compute() == value
 
@@ -165,8 +179,10 @@ def test_bleu_batch_macro(available_device):
     assert bleu._device == torch.device(available_device)
 
     # Batch size 3
-    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
-    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
+    hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]]
+    refs = [
+        to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2]
+    ]
     bleu.update((hypotheses, refs))
 
     with warnings.catch_warnings():
@@ -199,8 +215,10 @@ def test_bleu_batch_micro(available_device):
     assert bleu._device == torch.device(available_device)
 
     # Batch size 3
-    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
-    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
+    hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]]
+    refs = [
+        to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2]
+    ]
     bleu.update((hypotheses, refs))
 
     with warnings.catch_warnings():
@@ -224,6 +242,9 @@ def test_n_gram_counter(candidates, references, available_device):
     bleu = Bleu(ngram=4, device=available_device)
     assert bleu._device == torch.device(available_device)
 
+    candidates = to_float32_if_mps(candidates, available_device)
+    references = to_float32_if_mps(references, available_device)
+
     hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter())
     hyp_length = int(hyp_length)
     ref_length = int(ref_length)

From fabc322d5c7f84541525f4e2ae18cb2f3e461a8a Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Thu, 17 Apr 2025 08:13:55 +0000
Subject: [PATCH 05/16] rolls back some unnecessary conversions to float32

---
 tests/ignite/metrics/nlp/test_bleu.py | 77 ++++++++-------------------
 1 file changed, 23 insertions(+), 54 deletions(-)

diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index 8ed98cd30f1c..d7c48485fe6e 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -16,14 +16,6 @@
 corpus = CorpusForTest(lower_split=True)
 
 
-def to_float32_if_mps(x, device):
-    if isinstance(x, torch.Tensor) and device == "mps" and x.dtype == torch.float64:
-        return x.to(torch.float32)
-    elif isinstance(x, np.ndarray) and device == "mps" and x.dtype == np.float64:
-        return x.astype(np.float32)
-    return x
-
-
 def test_wrong_inputs():
     with pytest.raises(ValueError, match=r"ngram order must be greater than zero"):
         Bleu(ngram=0)
@@ -55,9 +47,6 @@ def test_wrong_inputs():
 
 def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"):
 
-    candidates = to_float32_if_mps(candidates, device)
-    references = to_float32_if_mps(references, device)
-
     for i in range(1, ngram_range):
         weights = tuple([1 / i] * i)
         bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device)
@@ -132,21 +121,15 @@ def test_micro_bleu_smooth2(candidates, references, available_device):
 def test_accumulation_macro_bleu(available_device):
     bleu = Bleu(ngram=4, smooth="smooth2", device=available_device)
     assert bleu._device == torch.device(available_device)
-    cand_1 = to_float32_if_mps(corpus.cand_1, available_device)
-    cand_2a = to_float32_if_mps(corpus.cand_2a, available_device)
-    cand_2b = to_float32_if_mps(corpus.cand_2b, available_device)
-    cand_3 = to_float32_if_mps(corpus.cand_3, available_device)
-    ref_1 = to_float32_if_mps(corpus.references_1, available_device)
-    ref_2 = to_float32_if_mps(corpus.references_2, available_device)
-
-    bleu.update(([cand_1], [ref_1]))
-    bleu.update(([cand_2a], [ref_2]))
-    bleu.update(([cand_2b], [ref_2]))
-    bleu.update(([cand_3], [ref_2]))
-    value = bleu._sentence_bleu(ref_1, cand_1)
-    value += bleu._sentence_bleu(ref_2, cand_2a)
-    value += bleu._sentence_bleu(ref_2, cand_2b)
-    value += bleu._sentence_bleu(ref_2, cand_3)
+    bleu.update(([corpus.cand_1], [corpus.references_1]))
+    bleu.update(([corpus.cand_2a], [corpus.references_2]))
+    bleu.update(([corpus.cand_2b], [corpus.references_2]))
+    bleu.update(([corpus.cand_3], [corpus.references_2]))
+    value = bleu._sentence_bleu(corpus.references_1, corpus.cand_1)
+    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a)
+    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b)
+    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3)
+
     computed = bleu.compute()
     if isinstance(computed, torch.Tensor):
         computed = computed.cpu().float().item()
@@ -156,22 +139,18 @@ def test_accumulation_macro_bleu(available_device):
 def test_accumulation_micro_bleu(available_device):
     bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device)
     assert bleu._device == torch.device(available_device)
-    cand_1 = to_float32_if_mps(corpus.cand_1, available_device)
-    cand_2a = to_float32_if_mps(corpus.cand_2a, available_device)
-    cand_2b = to_float32_if_mps(corpus.cand_2b, available_device)
-    cand_3 = to_float32_if_mps(corpus.cand_3, available_device)
-    ref_1 = to_float32_if_mps(corpus.references_1, available_device)
-    ref_2 = to_float32_if_mps(corpus.references_2, available_device)
-
-    bleu.update(([cand_1], [ref_1]))
-    bleu.update(([cand_2a], [ref_2]))
-    bleu.update(([cand_2b], [ref_2]))
-    bleu.update(([cand_3], [ref_2]))
+    bleu.update(([corpus.cand_1], [corpus.references_1]))
+    bleu.update(([corpus.cand_2a], [corpus.references_2]))
+    bleu.update(([corpus.cand_2b], [corpus.references_2]))
+    bleu.update(([corpus.cand_3], [corpus.references_2]))
     value = bleu._corpus_bleu(
-        [ref_1, ref_2, ref_2, ref_2],
-        [cand_1, cand_2a, cand_2b, cand_3],
+        [corpus.references_1, corpus.references_2, corpus.references_2, corpus.references_2],
+        [corpus.cand_1, corpus.cand_2a, corpus.cand_2b, corpus.cand_3],
     )
-    assert bleu.compute() == value
+    computed = bleu.compute()
+    if isinstance(computed, torch.Tensor):
+        computed = computed.cpu().float().item()
+    assert np.allclose(computed, value, rtol=1e-6)
 
 
 def test_bleu_batch_macro(available_device):
@@ -179,10 +158,8 @@ def test_bleu_batch_macro(available_device):
     assert bleu._device == torch.device(available_device)
 
     # Batch size 3
-    hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]]
-    refs = [
-        to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2]
-    ]
+    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
+    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
     bleu.update((hypotheses, refs))
 
     with warnings.catch_warnings():
@@ -192,7 +169,6 @@ def test_bleu_batch_macro(available_device):
             + sentence_bleu(refs[1], hypotheses[1])
             + sentence_bleu(refs[2], hypotheses[2])
         ) / 3
-    reference_bleu_score = np.float32(reference_bleu_score)
     computed = bleu.compute()
     if isinstance(computed, torch.Tensor):
         computed = computed.cpu().float().item()
@@ -215,10 +191,8 @@ def test_bleu_batch_micro(available_device):
     assert bleu._device == torch.device(available_device)
 
     # Batch size 3
-    hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]]
-    refs = [
-        to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2]
-    ]
+    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
+    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
     bleu.update((hypotheses, refs))
 
     with warnings.catch_warnings():
@@ -242,12 +216,7 @@ def test_n_gram_counter(candidates, references, available_device):
     bleu = Bleu(ngram=4, device=available_device)
     assert bleu._device == torch.device(available_device)
 
-    candidates = to_float32_if_mps(candidates, available_device)
-    references = to_float32_if_mps(references, available_device)
-
     hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter())
-    hyp_length = int(hyp_length)
-    ref_length = int(ref_length)
     assert hyp_length == len(candidates)
 
     ref_lens = (len(reference) for reference in references)

From e2e1562e332feebaf4fafeef267c3a04802c1568 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Thu, 17 Apr 2025 14:03:12 +0000
Subject: [PATCH 06/16] trying to make tests pass: accept Counter in _n_gram_counter hints, restore to_float32_if_mps

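---
NOTE(review): in test_bleu_batch_macro this patch adds the call
to_float32_if_mps(reference_bleu_score) with a single argument, but the helper
is defined as to_float32_if_mps(x, device); the call raises TypeError as written.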
 ignite/metrics/nlp/bleu.py            |  5 +++--
 tests/ignite/metrics/nlp/test_bleu.py | 12 ++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py
index ed3b14b4dc52..5324fd509c59 100644
--- a/ignite/metrics/nlp/bleu.py
+++ b/ignite/metrics/nlp/bleu.py
@@ -1,4 +1,5 @@
 import math
+from collections import Counter
 from typing import Any, Callable, Sequence, Tuple, Union
 
 import torch
@@ -158,8 +159,8 @@ def _n_gram_counter(
         self,
         references: Sequence[Sequence[Sequence[Any]]],
         candidates: Sequence[Sequence[Any]],
-        p_numerators: torch.Tensor,
-        p_denominators: torch.Tensor,
+        p_numerators: Union[torch.Tensor, Counter],
+        p_denominators: Union[torch.Tensor, Counter],
     ) -> Tuple[int, int]:
         if len(references) != len(candidates):
             raise ValueError(
diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index d7c48485fe6e..b042738ff749 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -16,6 +16,14 @@
 corpus = CorpusForTest(lower_split=True)
 
 
+def to_float32_if_mps(x, device):
+    if isinstance(x, torch.Tensor) and device == "mps" and x.dtype == torch.float64:
+        return x.to(torch.float32)
+    elif isinstance(x, np.ndarray) and device == "mps" and x.dtype == np.float64:
+        return x.astype(np.float32)
+    return x
+
+
 def test_wrong_inputs():
     with pytest.raises(ValueError, match=r"ngram order must be greater than zero"):
         Bleu(ngram=0)
@@ -47,6 +55,9 @@ def test_wrong_inputs():
 
 def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"):
 
+    candidates = to_float32_if_mps(candidates, device)
+    references = to_float32_if_mps(references, device)
+
     for i in range(1, ngram_range):
         weights = tuple([1 / i] * i)
         bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device)
@@ -169,6 +180,7 @@ def test_bleu_batch_macro(available_device):
             + sentence_bleu(refs[1], hypotheses[1])
             + sentence_bleu(refs[2], hypotheses[2])
         ) / 3
+    reference_bleu_score = to_float32_if_mps(reference_bleu_score)
     computed = bleu.compute()
     if isinstance(computed, torch.Tensor):
         computed = computed.cpu().float().item()

From 9681a0b5ba5e7899f6e6317b65ebb38d44399f75 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Thu, 17 Apr 2025 14:27:00 +0000
Subject: [PATCH 07/16] rollback to previously passing tests

---
 tests/ignite/metrics/nlp/test_bleu.py | 67 +++++++++++++++++----------
 1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index b042738ff749..8ed98cd30f1c 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -132,15 +132,21 @@ def test_micro_bleu_smooth2(candidates, references, available_device):
 def test_accumulation_macro_bleu(available_device):
     bleu = Bleu(ngram=4, smooth="smooth2", device=available_device)
     assert bleu._device == torch.device(available_device)
-    bleu.update(([corpus.cand_1], [corpus.references_1]))
-    bleu.update(([corpus.cand_2a], [corpus.references_2]))
-    bleu.update(([corpus.cand_2b], [corpus.references_2]))
-    bleu.update(([corpus.cand_3], [corpus.references_2]))
-    value = bleu._sentence_bleu(corpus.references_1, corpus.cand_1)
-    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a)
-    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b)
-    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3)
-
+    cand_1 = to_float32_if_mps(corpus.cand_1, available_device)
+    cand_2a = to_float32_if_mps(corpus.cand_2a, available_device)
+    cand_2b = to_float32_if_mps(corpus.cand_2b, available_device)
+    cand_3 = to_float32_if_mps(corpus.cand_3, available_device)
+    ref_1 = to_float32_if_mps(corpus.references_1, available_device)
+    ref_2 = to_float32_if_mps(corpus.references_2, available_device)
+
+    bleu.update(([cand_1], [ref_1]))
+    bleu.update(([cand_2a], [ref_2]))
+    bleu.update(([cand_2b], [ref_2]))
+    bleu.update(([cand_3], [ref_2]))
+    value = bleu._sentence_bleu(ref_1, cand_1)
+    value += bleu._sentence_bleu(ref_2, cand_2a)
+    value += bleu._sentence_bleu(ref_2, cand_2b)
+    value += bleu._sentence_bleu(ref_2, cand_3)
     computed = bleu.compute()
     if isinstance(computed, torch.Tensor):
         computed = computed.cpu().float().item()
@@ -150,18 +156,22 @@ def test_accumulation_macro_bleu(available_device):
 def test_accumulation_micro_bleu(available_device):
     bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device)
     assert bleu._device == torch.device(available_device)
-    bleu.update(([corpus.cand_1], [corpus.references_1]))
-    bleu.update(([corpus.cand_2a], [corpus.references_2]))
-    bleu.update(([corpus.cand_2b], [corpus.references_2]))
-    bleu.update(([corpus.cand_3], [corpus.references_2]))
+    cand_1 = to_float32_if_mps(corpus.cand_1, available_device)
+    cand_2a = to_float32_if_mps(corpus.cand_2a, available_device)
+    cand_2b = to_float32_if_mps(corpus.cand_2b, available_device)
+    cand_3 = to_float32_if_mps(corpus.cand_3, available_device)
+    ref_1 = to_float32_if_mps(corpus.references_1, available_device)
+    ref_2 = to_float32_if_mps(corpus.references_2, available_device)
+
+    bleu.update(([cand_1], [ref_1]))
+    bleu.update(([cand_2a], [ref_2]))
+    bleu.update(([cand_2b], [ref_2]))
+    bleu.update(([cand_3], [ref_2]))
     value = bleu._corpus_bleu(
-        [corpus.references_1, corpus.references_2, corpus.references_2, corpus.references_2],
-        [corpus.cand_1, corpus.cand_2a, corpus.cand_2b, corpus.cand_3],
+        [ref_1, ref_2, ref_2, ref_2],
+        [cand_1, cand_2a, cand_2b, cand_3],
     )
-    computed = bleu.compute()
-    if isinstance(computed, torch.Tensor):
-        computed = computed.cpu().float().item()
-    assert np.allclose(computed, value, rtol=1e-6)
+    assert bleu.compute() == value
 
 
 def test_bleu_batch_macro(available_device):
@@ -169,8 +179,10 @@ def test_bleu_batch_macro(available_device):
     assert bleu._device == torch.device(available_device)
 
     # Batch size 3
-    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
-    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
+    hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]]
+    refs = [
+        to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2]
+    ]
     bleu.update((hypotheses, refs))
 
     with warnings.catch_warnings():
@@ -180,7 +192,7 @@ def test_bleu_batch_macro(available_device):
             + sentence_bleu(refs[1], hypotheses[1])
             + sentence_bleu(refs[2], hypotheses[2])
         ) / 3
-    reference_bleu_score = to_float32_if_mps(reference_bleu_score)
+    reference_bleu_score = np.float32(reference_bleu_score)
     computed = bleu.compute()
     if isinstance(computed, torch.Tensor):
         computed = computed.cpu().float().item()
@@ -203,8 +215,10 @@ def test_bleu_batch_micro(available_device):
     assert bleu._device == torch.device(available_device)
 
     # Batch size 3
-    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
-    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
+    hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]]
+    refs = [
+        to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2]
+    ]
     bleu.update((hypotheses, refs))
 
     with warnings.catch_warnings():
@@ -228,7 +242,12 @@ def test_n_gram_counter(candidates, references, available_device):
     bleu = Bleu(ngram=4, device=available_device)
     assert bleu._device == torch.device(available_device)
 
+    candidates = to_float32_if_mps(candidates, available_device)
+    references = to_float32_if_mps(references, available_device)
+
     hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter())
+    hyp_length = int(hyp_length)
+    ref_length = int(ref_length)
     assert hyp_length == len(candidates)
 
     ref_lens = (len(reference) for reference in references)

From 1d6dd7d2a7c61c84ef0361dc5395644f3c13968e Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Thu, 17 Apr 2025 14:45:06 +0000
Subject: [PATCH 08/16] rollback _n_gram_counter parameter type change

---
 ignite/metrics/nlp/bleu.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py
index 5324fd509c59..ed3b14b4dc52 100644
--- a/ignite/metrics/nlp/bleu.py
+++ b/ignite/metrics/nlp/bleu.py
@@ -1,5 +1,4 @@
 import math
-from collections import Counter
 from typing import Any, Callable, Sequence, Tuple, Union
 
 import torch
@@ -159,8 +158,8 @@ def _n_gram_counter(
         self,
         references: Sequence[Sequence[Sequence[Any]]],
         candidates: Sequence[Sequence[Any]],
-        p_numerators: Union[torch.Tensor, Counter],
-        p_denominators: Union[torch.Tensor, Counter],
+        p_numerators: torch.Tensor,
+        p_denominators: torch.Tensor,
     ) -> Tuple[int, int]:
         if len(references) != len(candidates):
             raise ValueError(

From 908ec7726045349dffe4d76e22063d8f8f43164f Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Fri, 18 Apr 2025 14:04:54 +0000
Subject: [PATCH 09/16] in bleu.py do not use torch.double

---
 ignite/metrics/nlp/bleu.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py
index ed3b14b4dc52..3b6053967744 100644
--- a/ignite/metrics/nlp/bleu.py
+++ b/ignite/metrics/nlp/bleu.py
@@ -236,7 +236,8 @@ def _corpus_bleu(self, references: Sequence[Sequence[Sequence[Any]]], candidates
     @reinit__is_reduced
     def reset(self) -> None:
         if self.average == "macro":
-            self._sum_of_bleu = torch.tensor(0.0, dtype=torch.double, device=self._device)
+            dtype = torch.get_default_dtype() if self._device.type == "mps" else torch.double
+            self._sum_of_bleu = torch.tensor(0.0, dtype=dtype, device=self._device)
             self._num_sentences = 0
 
         if self.average == "micro":

From 3ad6b5e01c39d4a45453d5d69da837076769124d Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Wed, 23 Apr 2025 08:06:46 +0000
Subject: [PATCH 10/16] clean up BLEU tests: drop to_float32_if_mps helper, use pytest.approx

---
 tests/ignite/metrics/nlp/test_bleu.py | 102 +++++++-------------------
 1 file changed, 28 insertions(+), 74 deletions(-)

diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index 8ed98cd30f1c..19ddd878d043 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -2,7 +2,6 @@
 import warnings
 from collections import Counter
 
-import numpy as np
 import pytest
 import torch
 from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
@@ -16,14 +15,6 @@
 corpus = CorpusForTest(lower_split=True)
 
 
-def to_float32_if_mps(x, device):
-    if isinstance(x, torch.Tensor) and device == "mps" and x.dtype == torch.float64:
-        return x.to(torch.float32)
-    elif isinstance(x, np.ndarray) and device == "mps" and x.dtype == np.float64:
-        return x.astype(np.float32)
-    return x
-
-
 def test_wrong_inputs():
     with pytest.raises(ValueError, match=r"ngram order must be greater than zero"):
         Bleu(ngram=0)
@@ -54,10 +45,6 @@ def test_wrong_inputs():
 
 
 def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"):
-
-    candidates = to_float32_if_mps(candidates, device)
-    references = to_float32_if_mps(references, device)
-
     for i in range(1, ngram_range):
         weights = tuple([1 / i] * i)
         bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device)
@@ -68,25 +55,16 @@ def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=No
                 reference = sentence_bleu(
                     references[0], candidates[0], weights=weights, smoothing_function=smooth_nltk_fn
                 )
-            computed = bleu._sentence_bleu(references[0], candidates[0])
-            if isinstance(computed, torch.Tensor):
-                computed = computed.cpu().float().item()
-            assert np.allclose(computed, reference, rtol=1e-6)
+            assert pytest.approx(reference) == bleu._sentence_bleu(references[0], candidates[0])
 
         elif average == "micro":
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 reference = corpus_bleu(references, candidates, weights=weights, smoothing_function=smooth_nltk_fn)
-            computed = bleu._corpus_bleu(references, candidates)
-            if isinstance(computed, torch.Tensor):
-                computed = computed.cpu().float().item()
-            assert np.allclose(computed, reference, rtol=1e-6)
+            assert pytest.approx(reference) == bleu._corpus_bleu(references, candidates)
 
         bleu.update((candidates, references))
-        computed = bleu.compute()
-        if isinstance(computed, torch.Tensor):
-            computed = computed.cpu().float().item()
-        assert np.allclose(computed, reference, rtol=1e-6)
+        assert pytest.approx(reference) == bleu.compute()
 
 
 @pytest.mark.parametrize(*parametrize_args)
@@ -132,44 +110,32 @@ def test_micro_bleu_smooth2(candidates, references, available_device):
 def test_accumulation_macro_bleu(available_device):
     bleu = Bleu(ngram=4, smooth="smooth2", device=available_device)
     assert bleu._device == torch.device(available_device)
-    cand_1 = to_float32_if_mps(corpus.cand_1, available_device)
-    cand_2a = to_float32_if_mps(corpus.cand_2a, available_device)
-    cand_2b = to_float32_if_mps(corpus.cand_2b, available_device)
-    cand_3 = to_float32_if_mps(corpus.cand_3, available_device)
-    ref_1 = to_float32_if_mps(corpus.references_1, available_device)
-    ref_2 = to_float32_if_mps(corpus.references_2, available_device)
-
-    bleu.update(([cand_1], [ref_1]))
-    bleu.update(([cand_2a], [ref_2]))
-    bleu.update(([cand_2b], [ref_2]))
-    bleu.update(([cand_3], [ref_2]))
-    value = bleu._sentence_bleu(ref_1, cand_1)
-    value += bleu._sentence_bleu(ref_2, cand_2a)
-    value += bleu._sentence_bleu(ref_2, cand_2b)
-    value += bleu._sentence_bleu(ref_2, cand_3)
+
+    bleu.update(([corpus.cand_1], [corpus.references_1]))
+    bleu.update(([corpus.cand_2a], [corpus.references_2]))
+    bleu.update(([corpus.cand_2b], [corpus.references_2]))
+    bleu.update(([corpus.cand_3], [corpus.references_2]))
+    value = bleu._sentence_bleu(corpus.references_1, corpus.cand_1)
+    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a)
+    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b)
+    value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3)
     computed = bleu.compute()
     if isinstance(computed, torch.Tensor):
         computed = computed.cpu().float().item()
-    assert np.allclose(computed, value / 4, rtol=1e-6)
+    assert computed == value / 4
 
 
 def test_accumulation_micro_bleu(available_device):
     bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device)
     assert bleu._device == torch.device(available_device)
-    cand_1 = to_float32_if_mps(corpus.cand_1, available_device)
-    cand_2a = to_float32_if_mps(corpus.cand_2a, available_device)
-    cand_2b = to_float32_if_mps(corpus.cand_2b, available_device)
-    cand_3 = to_float32_if_mps(corpus.cand_3, available_device)
-    ref_1 = to_float32_if_mps(corpus.references_1, available_device)
-    ref_2 = to_float32_if_mps(corpus.references_2, available_device)
-
-    bleu.update(([cand_1], [ref_1]))
-    bleu.update(([cand_2a], [ref_2]))
-    bleu.update(([cand_2b], [ref_2]))
-    bleu.update(([cand_3], [ref_2]))
+
+    bleu.update(([corpus.cand_1], [corpus.references_1]))
+    bleu.update(([corpus.cand_2a], [corpus.references_2]))
+    bleu.update(([corpus.cand_2b], [corpus.references_2]))
+    bleu.update(([corpus.cand_3], [corpus.references_2]))
     value = bleu._corpus_bleu(
-        [ref_1, ref_2, ref_2, ref_2],
-        [cand_1, cand_2a, cand_2b, cand_3],
+        [corpus.references_1, corpus.references_2, corpus.references_2, corpus.references_2],
+        [corpus.cand_1, corpus.cand_2a, corpus.cand_2b, corpus.cand_3],
     )
     assert bleu.compute() == value
 
@@ -179,10 +145,8 @@ def test_bleu_batch_macro(available_device):
     assert bleu._device == torch.device(available_device)
 
     # Batch size 3
-    hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]]
-    refs = [
-        to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2]
-    ]
+    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
+    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
     bleu.update((hypotheses, refs))
 
     with warnings.catch_warnings():
@@ -192,11 +156,8 @@ def test_bleu_batch_macro(available_device):
             + sentence_bleu(refs[1], hypotheses[1])
             + sentence_bleu(refs[2], hypotheses[2])
         ) / 3
-    reference_bleu_score = np.float32(reference_bleu_score)
     computed = bleu.compute()
-    if isinstance(computed, torch.Tensor):
-        computed = computed.cpu().float().item()
-    assert np.allclose(computed, reference_bleu_score, rtol=1e-6)
+    assert pytest.approx(computed) == reference_bleu_score
 
     value = 0
     for _hypotheses, _refs in zip(hypotheses, refs):
@@ -204,10 +165,10 @@ def test_bleu_batch_macro(available_device):
         bleu.update(([_hypotheses], [_refs]))
 
     ref_1 = value / len(refs)
-    ref_2 = bleu.compute()
+    ref_2 = bleu.compute().cpu().numpy()
 
-    assert np.allclose(ref_1, reference_bleu_score, rtol=1e-6)
-    assert np.allclose(ref_2, reference_bleu_score, rtol=1e-6)
+    assert pytest.approx(ref_1) == reference_bleu_score
+    assert pytest.approx(ref_2) == reference_bleu_score
 
 
 def test_bleu_batch_micro(available_device):
@@ -215,10 +176,8 @@ def test_bleu_batch_micro(available_device):
     assert bleu._device == torch.device(available_device)
 
     # Batch size 3
-    hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]]
-    refs = [
-        to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2]
-    ]
+    hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]
+    refs = [corpus.references_1, corpus.references_2, corpus.references_2]
     bleu.update((hypotheses, refs))
 
     with warnings.catch_warnings():
@@ -242,12 +201,7 @@ def test_n_gram_counter(candidates, references, available_device):
     bleu = Bleu(ngram=4, device=available_device)
     assert bleu._device == torch.device(available_device)
 
-    candidates = to_float32_if_mps(candidates, available_device)
-    references = to_float32_if_mps(references, available_device)
-
     hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter())
-    hyp_length = int(hyp_length)
-    ref_length = int(ref_length)
     assert hyp_length == len(candidates)
 
     ref_lens = (len(reference) for reference in references)

From ed54722995193a9f7429b7c858c141730337e925 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Wed, 23 Apr 2025 08:41:10 +0000
Subject: [PATCH 11/16] sets dtype in bleu.py

---
 ignite/metrics/nlp/bleu.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py
index 3b6053967744..9172f80de625 100644
--- a/ignite/metrics/nlp/bleu.py
+++ b/ignite/metrics/nlp/bleu.py
@@ -236,13 +236,12 @@ def _corpus_bleu(self, references: Sequence[Sequence[Sequence[Any]]], candidates
     @reinit__is_reduced
     def reset(self) -> None:
         if self.average == "macro":
-            dtype = torch.get_default_dtype() if self._device.type == "mps" else torch.double
-            self._sum_of_bleu = torch.tensor(0.0, dtype=dtype, device=self._device)
+            self._sum_of_bleu = torch.tensor(0.0, dtype=self._double_dtype, device=self._device)
             self._num_sentences = 0
 
         if self.average == "micro":
-            self.p_numerators = torch.zeros(self.ngrams_order + 1)
-            self.p_denominators = torch.zeros(self.ngrams_order + 1)
+            self.p_numerators = torch.zeros(self.ngrams_order + 1, dtype=self._double_dtype)
+            self.p_denominators = torch.zeros(self.ngrams_order + 1, dtype=self._double_dtype)
             self.hyp_length_sum = 0
             self.ref_length_sum = 0
 
@@ -279,7 +278,7 @@ def _compute_micro(self) -> float:
         )
         return bleu_score
 
-    def compute(self) -> None:
+    def compute(self):
         if self.average == "macro":
             return self._compute_macro()
         elif self.average == "micro":

From 2c841935dbbcdfc1be3da1a590120af61ae5fa11 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Wed, 23 Apr 2025 08:47:28 +0000
Subject: [PATCH 12/16] adds return type to Bleu.compute

---
 ignite/metrics/nlp/bleu.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py
index 9172f80de625..bdca2e31428f 100644
--- a/ignite/metrics/nlp/bleu.py
+++ b/ignite/metrics/nlp/bleu.py
@@ -2,6 +2,7 @@
 from typing import Any, Callable, Sequence, Tuple, Union
 
 import torch
+from torch import Tensor
 
 from ignite.exceptions import NotComputableError
 from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce
@@ -278,7 +279,7 @@ def _compute_micro(self) -> float:
         )
         return bleu_score
 
-    def compute(self):
+    def compute(self) -> None | Tensor | float:
         if self.average == "macro":
             return self._compute_macro()
         elif self.average == "micro":

From 59806b4f3575395bdcfa3c3736649a974df815cf Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Wed, 23 Apr 2025 09:05:46 +0000
Subject: [PATCH 13/16] removes unnecessary conversion

---
 tests/ignite/metrics/nlp/test_bleu.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index 19ddd878d043..a87de88d8abb 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -119,10 +119,7 @@ def test_accumulation_macro_bleu(available_device):
     value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a)
     value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b)
     value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3)
-    computed = bleu.compute()
-    if isinstance(computed, torch.Tensor):
-        computed = computed.cpu().float().item()
-    assert computed == value / 4
+    assert bleu.compute() == value / 4
 
 
 def test_accumulation_micro_bleu(available_device):
@@ -156,8 +153,7 @@ def test_bleu_batch_macro(available_device):
             + sentence_bleu(refs[1], hypotheses[1])
             + sentence_bleu(refs[2], hypotheses[2])
         ) / 3
-    computed = bleu.compute()
-    assert pytest.approx(computed) == reference_bleu_score
+    assert pytest.approx(bleu.compute()) == reference_bleu_score
 
     value = 0
     for _hypotheses, _refs in zip(hypotheses, refs):
@@ -165,7 +161,7 @@ def test_bleu_batch_macro(available_device):
         bleu.update(([_hypotheses], [_refs]))
 
     ref_1 = value / len(refs)
-    ref_2 = bleu.compute().cpu().numpy()
+    ref_2 = bleu.compute()
 
     assert pytest.approx(ref_1) == reference_bleu_score
     assert pytest.approx(ref_2) == reference_bleu_score

From 385fb27106659eba799cecc6a4dbf8cf831fa013 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Wed, 23 Apr 2025 09:10:49 +0000
Subject: [PATCH 14/16] typing: use Union for Bleu.compute return annotation

---
 ignite/metrics/nlp/bleu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py
index bdca2e31428f..6529c11f1f8f 100644
--- a/ignite/metrics/nlp/bleu.py
+++ b/ignite/metrics/nlp/bleu.py
@@ -279,7 +279,7 @@ def _compute_micro(self) -> float:
         )
         return bleu_score
 
-    def compute(self) -> None | Tensor | float:
+    def compute(self) -> Union[None, Tensor, float]:
         if self.average == "macro":
             return self._compute_macro()
         elif self.average == "micro":

From 0ed7c3a991f14916de5e332f4d5cd0b793cb1c26 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Wed, 23 Apr 2025 09:20:51 +0000
Subject: [PATCH 15/16] typing: add explicit return None to Bleu.compute

---
 ignite/metrics/nlp/bleu.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py
index 6529c11f1f8f..0ca724a2ddc3 100644
--- a/ignite/metrics/nlp/bleu.py
+++ b/ignite/metrics/nlp/bleu.py
@@ -284,3 +284,4 @@ def compute(self) -> Union[None, Tensor, float]:
             return self._compute_macro()
         elif self.average == "micro":
             return self._compute_micro()
+        return None

From 07360519e00e5763211f3cf08448436b66c64365 Mon Sep 17 00:00:00 2001
From: BanzaiTokyo <banzaitokyo@gmail.com>
Date: Wed, 23 Apr 2025 09:46:46 +0000
Subject: [PATCH 16/16] transfer tensors in tests to cpu

---
 tests/ignite/metrics/nlp/test_bleu.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index a87de88d8abb..b191cd8ded6f 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -64,7 +64,10 @@ def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=No
             assert pytest.approx(reference) == bleu._corpus_bleu(references, candidates)
 
         bleu.update((candidates, references))
-        assert pytest.approx(reference) == bleu.compute()
+        computed = bleu.compute()
+        if isinstance(computed, torch.Tensor):
+            computed = computed.cpu().item()
+        assert pytest.approx(reference) == computed
 
 
 @pytest.mark.parametrize(*parametrize_args)
@@ -153,7 +156,11 @@ def test_bleu_batch_macro(available_device):
             + sentence_bleu(refs[1], hypotheses[1])
             + sentence_bleu(refs[2], hypotheses[2])
         ) / 3
-    assert pytest.approx(bleu.compute()) == reference_bleu_score
+    computed = bleu.compute()
+    if isinstance(computed, torch.Tensor):
+        computed = computed.cpu().item()
+
+    assert pytest.approx(computed) == reference_bleu_score
 
     value = 0
     for _hypotheses, _refs in zip(hypotheses, refs):
@@ -161,10 +168,12 @@ def test_bleu_batch_macro(available_device):
         bleu.update(([_hypotheses], [_refs]))
 
     ref_1 = value / len(refs)
-    ref_2 = bleu.compute()
+    computed = bleu.compute()
+    if isinstance(computed, torch.Tensor):
+        computed = computed.cpu().item()
 
     assert pytest.approx(ref_1) == reference_bleu_score
-    assert pytest.approx(ref_2) == reference_bleu_score
+    assert pytest.approx(computed) == reference_bleu_score
 
 
 def test_bleu_batch_micro(available_device):