From 6d91f9158baba23bd795f33f9faf85e0e35d40b6 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Mon, 14 Apr 2025 15:36:49 +0000 Subject: [PATCH 01/16] adds available device to nlp tests #3335 --- tests/ignite/metrics/nlp/test_bleu.py | 32 +++++++++++++++----------- tests/ignite/metrics/nlp/test_rouge.py | 9 ++++---- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 9de9c6de78c5..16c6a818da2c 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -107,8 +107,9 @@ def test_micro_bleu_smooth2(candidates, references): _test(candidates, references, "micro", "smooth2", SmoothingFunction().method2, 3) -def test_accumulation_macro_bleu(): - bleu = Bleu(ngram=4, smooth="smooth2") +def test_accumulation_macro_bleu(available_device): + bleu = Bleu(ngram=4, smooth="smooth2", device=available_device) + assert bleu._device == torch.device(available_device) bleu.update(([corpus.cand_1], [corpus.references_1])) bleu.update(([corpus.cand_2a], [corpus.references_2])) bleu.update(([corpus.cand_2b], [corpus.references_2])) @@ -120,8 +121,9 @@ def test_accumulation_macro_bleu(): assert bleu.compute() == value / 4 -def test_accumulation_micro_bleu(): - bleu = Bleu(ngram=4, smooth="smooth2", average="micro") +def test_accumulation_micro_bleu(available_device): + bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device) + assert bleu._device == torch.device(available_device) bleu.update(([corpus.cand_1], [corpus.references_1])) bleu.update(([corpus.cand_2a], [corpus.references_2])) bleu.update(([corpus.cand_2b], [corpus.references_2])) @@ -133,8 +135,9 @@ def test_accumulation_micro_bleu(): assert bleu.compute() == value -def test_bleu_batch_macro(): - bleu = Bleu(ngram=4) +def test_bleu_batch_macro(available_device): + bleu = Bleu(ngram=4, device=available_device) + assert bleu._device == torch.device(available_device) # Batch size 3 hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] @@ -162,8 +165,9 @@ def test_bleu_batch_macro(): assert pytest.approx(ref_2) == reference_bleu_score -def test_bleu_batch_micro(): - bleu = Bleu(ngram=4, average="micro") +def test_bleu_batch_micro(available_device): + bleu = Bleu(ngram=4, average="micro", device=available_device) + assert bleu._device == torch.device(available_device) # Batch size 3 hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] @@ -187,8 +191,10 @@ def test_bleu_batch_micro(): (corpus.cand_1, corpus.references_1), ], ) -def test_n_gram_counter(candidates, references): - bleu = Bleu(ngram=4) +def test_n_gram_counter(candidates, references, available_device): + bleu = Bleu(ngram=4, device=available_device) + assert bleu._device == torch.device(available_device) + hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter()) assert hyp_length == len(candidates) @@ -212,9 +218,9 @@ def _test_macro_distrib_integration(device): def update(_, i): return data[i + size * rank] - def _test(metric_device): + def _test(device): engine = Engine(update) - m = Bleu(ngram=4, smooth="smooth2") + m = Bleu(ngram=4, smooth="smooth2", device=device) m.attach(engine, "bleu") engine.run(data=list(range(size)), max_epochs=1) @@ -256,7 +262,7 @@ def update(_, i): def _test(metric_device): engine = Engine(update) - m = Bleu(ngram=4, smooth="smooth2", average="micro") + m = Bleu(ngram=4, smooth="smooth2", average="micro", device=metric_device) m.attach(engine, "bleu") engine.run(data=list(range(size)), max_epochs=1) diff --git a/tests/ignite/metrics/nlp/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py index 5d8562866c83..5dbf4c9bde8f 100644 --- a/tests/ignite/metrics/nlp/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -84,9 +84,9 @@ def test_wrong_inputs(): (2, "abcdef", "zbdfz", (0, 0)), ], ) -def test_rouge_n_alpha(ngram, candidate, reference, expected): +def test_rouge_n_alpha(ngram, candidate, reference, expected, available_device): for alpha in [0, 1, 0.3, 0.5, 0.8]: - rouge = RougeN(ngram=ngram, alpha=alpha) + rouge = RougeN(ngram=ngram, alpha=alpha, device=available_device) rouge.update(([candidate], [[reference]])) results = rouge.compute() assert results[f"Rouge-{ngram}-P"] == expected[0] @@ -101,7 +101,7 @@ def test_rouge_n_alpha(ngram, candidate, reference, expected): @pytest.mark.parametrize( "candidates, references", [corpus.sample_1, corpus.sample_2, corpus.sample_3, corpus.sample_4, corpus.sample_5] ) -def test_rouge_metrics(candidates, references): +def test_rouge_metrics(candidates, references, available_device): for multiref in ["average", "best"]: # PERL 1.5.5 reference apply_avg = multiref == "average" @@ -123,7 +123,8 @@ def test_rouge_metrics(candidates, references): lower_split_candidates = [candidate.lower().split() for candidate in candidates] - m = Rouge(variants=[1, 2, 4, "L"], multiref=multiref, alpha=0.5) + m = Rouge(variants=[1, 2, 4, "L"], multiref=multiref, alpha=0.5, device=available_device) + assert m._device == torch.device(available_device) m.update((lower_split_candidates, lower_split_references)) results = m.compute() From c1dcffc9082b79da800b8a0130730e1fc068c752 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Mon, 14 Apr 2025 17:15:15 +0000 Subject: [PATCH 02/16] avoiding float64 --- tests/ignite/metrics/nlp/test_bleu.py | 69 +++++++++++++++++---------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 16c6a818da2c..edca544fbc32 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -2,6 +2,7 @@ import warnings from collections import Counter +import numpy as np import pytest import torch from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction @@ -44,10 +45,10 @@ def test_wrong_inputs(): ) -def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8): +def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"): for i in range(1, ngram_range): weights = tuple([1 / i] * i) - bleu = Bleu(ngram=i, average=average, smooth=smooth) + bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device) if average == "macro": with warnings.catch_warnings(): @@ -55,56 +56,65 @@ def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=No reference = sentence_bleu( references[0], candidates[0], weights=weights, smoothing_function=smooth_nltk_fn ) - assert pytest.approx(reference) == bleu._sentence_bleu(references[0], candidates[0]) + computed = bleu._sentence_bleu(references[0], candidates[0]) + if isinstance(computed, torch.Tensor): + computed = computed.cpu().float().item() + assert np.allclose(computed, reference, rtol=1e-6) elif average == "micro": with warnings.catch_warnings(): warnings.simplefilter("ignore") reference = corpus_bleu(references, candidates, weights=weights, smoothing_function=smooth_nltk_fn) - assert pytest.approx(reference) == bleu._corpus_bleu(references, candidates) + computed = bleu._corpus_bleu(references, candidates) + if isinstance(computed, torch.Tensor): + computed = computed.cpu().float().item() + assert np.allclose(computed, reference, rtol=1e-6) bleu.update((candidates, references)) - assert pytest.approx(reference) == bleu.compute() + computed = bleu.compute() + if isinstance(computed, torch.Tensor): + computed = computed.cpu().float().item() + assert np.allclose(computed, reference, rtol=1e-6) @pytest.mark.parametrize(*parametrize_args) -def test_macro_bleu(candidates, references): - _test(candidates, references, "macro") +def test_macro_bleu(candidates, references, available_device): + _test(candidates, references, "macro", device=available_device) @pytest.mark.parametrize(*parametrize_args) -def test_micro_bleu(candidates, references): - _test(candidates, references, "micro") +def test_micro_bleu(candidates, references, available_device): + _test(candidates, references, "micro", device=available_device) @pytest.mark.parametrize(*parametrize_args) -def test_macro_bleu_smooth1(candidates, references): - _test(candidates, references, "macro", "smooth1", SmoothingFunction().method1) +def test_macro_bleu_smooth1(candidates, references, available_device): + _test(candidates, references, "macro", "smooth1", SmoothingFunction().method1, device=available_device) @pytest.mark.parametrize(*parametrize_args) -def test_micro_bleu_smooth1(candidates, references): - _test(candidates, references, "micro", "smooth1", SmoothingFunction().method1) +def test_micro_bleu_smooth1(candidates, references, available_device): + _test(candidates, references, "micro", "smooth1", SmoothingFunction().method1, device=available_device) @pytest.mark.parametrize(*parametrize_args) -def test_macro_bleu_nltk_smooth2(candidates, references): - _test(candidates, references, "macro", "nltk_smooth2", SmoothingFunction().method2) +def test_macro_bleu_nltk_smooth2(candidates, references, available_device): + _test(candidates, references, "macro", "nltk_smooth2", SmoothingFunction().method2, device=available_device) @pytest.mark.parametrize(*parametrize_args) -def test_micro_bleu_nltk_smooth2(candidates, references): - _test(candidates, references, "micro", "nltk_smooth2", SmoothingFunction().method2) +def test_micro_bleu_nltk_smooth2(candidates, references, available_device): + _test(candidates, references, "micro", "nltk_smooth2", SmoothingFunction().method2, device=available_device) @pytest.mark.parametrize(*parametrize_args) -def test_macro_bleu_smooth2(candidates, references): - _test(candidates, references, "macro", "smooth2", SmoothingFunction().method2, 3) +def test_macro_bleu_smooth2(candidates, references, available_device): + _test(candidates, references, "macro", "smooth2", SmoothingFunction().method2, 3, available_device) @pytest.mark.parametrize(*parametrize_args) -def test_micro_bleu_smooth2(candidates, references): - _test(candidates, references, "micro", "smooth2", SmoothingFunction().method2, 3) +def test_micro_bleu_smooth2(candidates, references, available_device): + _test(candidates, references, "micro", "smooth2", SmoothingFunction().method2, 3, device=available_device) def test_accumulation_macro_bleu(available_device): @@ -118,7 +128,10 @@ def test_accumulation_macro_bleu(available_device): value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a) value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b) value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3) - assert bleu.compute() == value / 4 + computed = bleu.compute() + if isinstance(computed, torch.Tensor): + computed = computed.cpu().float().item() + assert np.allclose(computed, value / 4, rtol=1e-6) def test_accumulation_micro_bleu(available_device): @@ -151,7 +164,11 @@ def test_bleu_batch_macro(available_device): + sentence_bleu(refs[1], hypotheses[1]) + sentence_bleu(refs[2], hypotheses[2]) ) / 3 - assert pytest.approx(bleu.compute()) == reference_bleu_score + reference_bleu_score = np.float32(reference_bleu_score) + computed = bleu.compute() + if isinstance(computed, torch.Tensor): + computed = computed.cpu().float().item() + assert np.allclose(computed, reference_bleu_score, rtol=1e-6) value = 0 for _hypotheses, _refs in zip(hypotheses, refs): @@ -161,8 +178,8 @@ def test_bleu_batch_macro(available_device): ref_1 = value / len(refs) ref_2 = bleu.compute() - assert pytest.approx(ref_1) == reference_bleu_score - assert pytest.approx(ref_2) == reference_bleu_score + assert np.allclose(ref_1, reference_bleu_score, rtol=1e-6) + assert np.allclose(ref_2, reference_bleu_score, rtol=1e-6) def test_bleu_batch_micro(available_device): @@ -196,6 +213,8 @@ def test_n_gram_counter(candidates, references, available_device): assert bleu._device == torch.device(available_device) hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter()) + hyp_length = int(hyp_length) + ref_length = int(ref_length) assert hyp_length == len(candidates) ref_lens = (len(reference) for reference in references) From 6dd44ce83563ccc6c87a6980a9ebb626fc3f12a6 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Tue, 15 Apr 2025 16:00:43 +0000 Subject: [PATCH 03/16] converts candidates and references float32 on MPS --- tests/ignite/metrics/nlp/test_bleu.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index edca544fbc32..5787fb187f31 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -16,6 +16,14 @@ corpus = CorpusForTest(lower_split=True) +def to_float32_if_mps(x, device): + if isinstance(x, torch.Tensor) and device == "mps" and x.dtype == torch.float64: + return x.to(torch.float32) + elif isinstance(x, np.ndarray) and device == "mps" and x.dtype == np.float64: + return x.astype(np.float32) + return x + + def test_wrong_inputs(): with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): Bleu(ngram=0) @@ -46,6 +54,10 @@ def test_wrong_inputs(): def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"): + + candidates = to_float32_if_mps(candidates, device) + references = to_float32_if_mps(references, device) + for i in range(1, ngram_range): weights = tuple([1 / i] * i) bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device) From d1d949fb894b506aca6aa76ed059172c5f0d8690 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Tue, 15 Apr 2025 16:19:11 +0000 Subject: [PATCH 04/16] more conversions to float32 on MPS --- tests/ignite/metrics/nlp/test_bleu.py | 57 ++++++++++++++++++--------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 5787fb187f31..8ed98cd30f1c 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -132,14 +132,21 @@ def test_micro_bleu_smooth2(candidates, references, available_device): def test_accumulation_macro_bleu(available_device): bleu = Bleu(ngram=4, smooth="smooth2", device=available_device) assert bleu._device == torch.device(available_device) - bleu.update(([corpus.cand_1], [corpus.references_1])) - bleu.update(([corpus.cand_2a], [corpus.references_2])) - bleu.update(([corpus.cand_2b], [corpus.references_2])) - bleu.update(([corpus.cand_3], [corpus.references_2])) - value = bleu._sentence_bleu(corpus.references_1, corpus.cand_1) - value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a) - value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b) - value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3) + cand_1 = to_float32_if_mps(corpus.cand_1, available_device) + cand_2a = to_float32_if_mps(corpus.cand_2a, available_device) + cand_2b = to_float32_if_mps(corpus.cand_2b, available_device) + cand_3 = to_float32_if_mps(corpus.cand_3, available_device) + ref_1 = to_float32_if_mps(corpus.references_1, available_device) + ref_2 = to_float32_if_mps(corpus.references_2, available_device) + + bleu.update(([cand_1], [ref_1])) + bleu.update(([cand_2a], [ref_2])) + bleu.update(([cand_2b], [ref_2])) + bleu.update(([cand_3], [ref_2])) + value = bleu._sentence_bleu(ref_1, cand_1) + value += bleu._sentence_bleu(ref_2, cand_2a) + value += bleu._sentence_bleu(ref_2, cand_2b) + value += bleu._sentence_bleu(ref_2, cand_3) computed = bleu.compute() if isinstance(computed, torch.Tensor): computed = computed.cpu().float().item() @@ -149,13 +156,20 @@ def test_accumulation_macro_bleu(available_device): def test_accumulation_micro_bleu(available_device): bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device) assert bleu._device == torch.device(available_device) - bleu.update(([corpus.cand_1], [corpus.references_1])) - bleu.update(([corpus.cand_2a], [corpus.references_2])) - bleu.update(([corpus.cand_2b], [corpus.references_2])) - bleu.update(([corpus.cand_3], [corpus.references_2])) + cand_1 = to_float32_if_mps(corpus.cand_1, available_device) + cand_2a = to_float32_if_mps(corpus.cand_2a, available_device) + cand_2b = to_float32_if_mps(corpus.cand_2b, available_device) + cand_3 = to_float32_if_mps(corpus.cand_3, available_device) + ref_1 = to_float32_if_mps(corpus.references_1, available_device) + ref_2 = to_float32_if_mps(corpus.references_2, available_device) + + bleu.update(([cand_1], [ref_1])) + bleu.update(([cand_2a], [ref_2])) + bleu.update(([cand_2b], [ref_2])) + bleu.update(([cand_3], [ref_2])) value = bleu._corpus_bleu( - [corpus.references_1, corpus.references_2, corpus.references_2, corpus.references_2], - [corpus.cand_1, corpus.cand_2a, corpus.cand_2b, corpus.cand_3], + [ref_1, ref_2, ref_2, ref_2], + [cand_1, cand_2a, cand_2b, cand_3], ) assert bleu.compute() == value @@ -165,8 +179,10 @@ def test_bleu_batch_macro(available_device): assert bleu._device == torch.device(available_device) # Batch size 3 - hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] - refs = [corpus.references_1, corpus.references_2, corpus.references_2] + hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]] + refs = [ + to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2] + ] bleu.update((hypotheses, refs)) with warnings.catch_warnings(): @@ -199,8 +215,10 @@ def test_bleu_batch_micro(available_device): assert bleu._device == torch.device(available_device) # Batch size 3 - hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] - refs = [corpus.references_1, corpus.references_2, corpus.references_2] + hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]] + refs = [ + to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2] + ] bleu.update((hypotheses, refs)) with warnings.catch_warnings(): @@ -224,6 +242,9 @@ def test_n_gram_counter(candidates, references, available_device): bleu = Bleu(ngram=4, device=available_device) assert bleu._device == torch.device(available_device) + candidates = to_float32_if_mps(candidates, available_device) + references = to_float32_if_mps(references, available_device) + hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter()) hyp_length = int(hyp_length) ref_length = int(ref_length) From fabc322d5c7f84541525f4e2ae18cb2f3e461a8a Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Thu, 17 Apr 2025 08:13:55 +0000 Subject: [PATCH 05/16] rolls back some unnecessary conversions to float32 --- tests/ignite/metrics/nlp/test_bleu.py | 77 ++++++++------------------- 1 file changed, 23 insertions(+), 54 deletions(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 8ed98cd30f1c..d7c48485fe6e 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -16,14 +16,6 @@ corpus = CorpusForTest(lower_split=True) -def to_float32_if_mps(x, device): - if isinstance(x, torch.Tensor) and device == "mps" and x.dtype == torch.float64: - return x.to(torch.float32) - elif isinstance(x, np.ndarray) and device == "mps" and x.dtype == np.float64: - return x.astype(np.float32) - return x - - def test_wrong_inputs(): with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): Bleu(ngram=0) @@ -55,9 +47,6 @@ def test_wrong_inputs(): def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"): - candidates = to_float32_if_mps(candidates, device) - references = to_float32_if_mps(references, device) - for i in range(1, ngram_range): weights = tuple([1 / i] * i) bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device) @@ -132,21 +121,15 @@ def test_micro_bleu_smooth2(candidates, references, available_device): def test_accumulation_macro_bleu(available_device): bleu = Bleu(ngram=4, smooth="smooth2", device=available_device) assert bleu._device == torch.device(available_device) - cand_1 = to_float32_if_mps(corpus.cand_1, available_device) - cand_2a = to_float32_if_mps(corpus.cand_2a, available_device) - cand_2b = to_float32_if_mps(corpus.cand_2b, available_device) - cand_3 = to_float32_if_mps(corpus.cand_3, available_device) - ref_1 = to_float32_if_mps(corpus.references_1, available_device) - ref_2 = to_float32_if_mps(corpus.references_2, available_device) - - bleu.update(([cand_1], [ref_1])) - bleu.update(([cand_2a], [ref_2])) - bleu.update(([cand_2b], [ref_2])) - bleu.update(([cand_3], [ref_2])) - value = bleu._sentence_bleu(ref_1, cand_1) - value += bleu._sentence_bleu(ref_2, cand_2a) - value += bleu._sentence_bleu(ref_2, cand_2b) - value += bleu._sentence_bleu(ref_2, cand_3) + bleu.update(([corpus.cand_1], [corpus.references_1])) + bleu.update(([corpus.cand_2a], [corpus.references_2])) + bleu.update(([corpus.cand_2b], [corpus.references_2])) + bleu.update(([corpus.cand_3], [corpus.references_2])) + value = bleu._sentence_bleu(corpus.references_1, corpus.cand_1) + value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a) + value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b) + value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3) + computed = bleu.compute() if isinstance(computed, torch.Tensor): computed = computed.cpu().float().item() @@ -156,22 +139,18 @@ def test_accumulation_macro_bleu(available_device): def test_accumulation_micro_bleu(available_device): bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device) assert bleu._device == torch.device(available_device) - cand_1 = to_float32_if_mps(corpus.cand_1, available_device) - cand_2a = to_float32_if_mps(corpus.cand_2a, available_device) - cand_2b = to_float32_if_mps(corpus.cand_2b, available_device) - cand_3 = to_float32_if_mps(corpus.cand_3, available_device) - ref_1 = to_float32_if_mps(corpus.references_1, available_device) - ref_2 = to_float32_if_mps(corpus.references_2, available_device) - - bleu.update(([cand_1], [ref_1])) - bleu.update(([cand_2a], [ref_2])) - bleu.update(([cand_2b], [ref_2])) - bleu.update(([cand_3], [ref_2])) + bleu.update(([corpus.cand_1], [corpus.references_1])) + bleu.update(([corpus.cand_2a], [corpus.references_2])) + bleu.update(([corpus.cand_2b], [corpus.references_2])) + bleu.update(([corpus.cand_3], [corpus.references_2])) value = bleu._corpus_bleu( - [ref_1, ref_2, ref_2, ref_2], - [cand_1, cand_2a, cand_2b, cand_3], + [corpus.references_1, corpus.references_2, corpus.references_2, corpus.references_2], + [corpus.cand_1, corpus.cand_2a, corpus.cand_2b, corpus.cand_3], ) - assert bleu.compute() == value + computed = bleu.compute() + if isinstance(computed, torch.Tensor): + computed = computed.cpu().float().item() + assert np.allclose(computed, value, rtol=1e-6) def test_bleu_batch_macro(available_device): @@ -179,10 +158,8 @@ def test_bleu_batch_macro(available_device): assert bleu._device == torch.device(available_device) # Batch size 3 - hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]] - refs = [ - to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2] - ] + hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] + refs = [corpus.references_1, corpus.references_2, corpus.references_2] bleu.update((hypotheses, refs)) with warnings.catch_warnings(): @@ -192,7 +169,6 @@ def test_bleu_batch_macro(available_device): + sentence_bleu(refs[1], hypotheses[1]) + sentence_bleu(refs[2], hypotheses[2]) ) / 3 - reference_bleu_score = np.float32(reference_bleu_score) computed = bleu.compute() if isinstance(computed, torch.Tensor): computed = computed.cpu().float().item() @@ -215,10 +191,8 @@ def test_bleu_batch_micro(available_device): assert bleu._device == torch.device(available_device) # Batch size 3 - hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]] - refs = [ - to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2] - ] + hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] + refs = [corpus.references_1, corpus.references_2, corpus.references_2] bleu.update((hypotheses, refs)) with warnings.catch_warnings(): @@ -242,12 +216,7 @@ def test_n_gram_counter(candidates, references, available_device): bleu = Bleu(ngram=4, device=available_device) assert bleu._device == torch.device(available_device) - candidates = to_float32_if_mps(candidates, available_device) - references = to_float32_if_mps(references, available_device) - hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter()) - hyp_length = int(hyp_length) - ref_length = int(ref_length) assert hyp_length == len(candidates) ref_lens = (len(reference) for reference in references) From e2e1562e332feebaf4fafeef267c3a04802c1568 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Thu, 17 Apr 2025 14:03:12 +0000 Subject: [PATCH 06/16] trying to make tests pass --- ignite/metrics/nlp/bleu.py | 5 +++-- tests/ignite/metrics/nlp/test_bleu.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index ed3b14b4dc52..5324fd509c59 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -1,4 +1,5 @@ import math +from collections import Counter from typing import Any, Callable, Sequence, Tuple, Union import torch @@ -158,8 +159,8 @@ def _n_gram_counter( self, references: Sequence[Sequence[Sequence[Any]]], candidates: Sequence[Sequence[Any]], - p_numerators: torch.Tensor, - p_denominators: torch.Tensor, + p_numerators: Union[torch.Tensor, Counter], + p_denominators: Union[torch.Tensor, Counter], ) -> Tuple[int, int]: if len(references) != len(candidates): raise ValueError( diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index d7c48485fe6e..b042738ff749 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -16,6 +16,14 @@ corpus = CorpusForTest(lower_split=True) +def to_float32_if_mps(x, device): + if isinstance(x, torch.Tensor) and device == "mps" and x.dtype == torch.float64: + return x.to(torch.float32) + elif isinstance(x, np.ndarray) and device == "mps" and x.dtype == np.float64: + return x.astype(np.float32) + return x + + def test_wrong_inputs(): with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): Bleu(ngram=0) @@ -47,6 +55,9 @@ def test_wrong_inputs(): def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"): + candidates = to_float32_if_mps(candidates, device) + references = to_float32_if_mps(references, device) + for i in range(1, ngram_range): weights = tuple([1 / i] * i) bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device) @@ -169,6 +180,7 @@ def test_bleu_batch_macro(available_device): + sentence_bleu(refs[1], hypotheses[1]) + sentence_bleu(refs[2], hypotheses[2]) ) / 3 + reference_bleu_score = to_float32_if_mps(reference_bleu_score) computed = bleu.compute() if isinstance(computed, torch.Tensor): computed = computed.cpu().float().item() From 9681a0b5ba5e7899f6e6317b65ebb38d44399f75 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Thu, 17 Apr 2025 14:27:00 +0000 Subject: [PATCH 07/16] rollback to previously passing tests --- tests/ignite/metrics/nlp/test_bleu.py | 67 +++++++++++++++++---------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index b042738ff749..8ed98cd30f1c 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -132,15 +132,21 @@ def test_micro_bleu_smooth2(candidates, references, available_device): def test_accumulation_macro_bleu(available_device): bleu = Bleu(ngram=4, smooth="smooth2", device=available_device) assert bleu._device == torch.device(available_device) - bleu.update(([corpus.cand_1], [corpus.references_1])) - bleu.update(([corpus.cand_2a], [corpus.references_2])) - bleu.update(([corpus.cand_2b], [corpus.references_2])) - bleu.update(([corpus.cand_3], [corpus.references_2])) - value = bleu._sentence_bleu(corpus.references_1, corpus.cand_1) - value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a) - value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b) - value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3) - + cand_1 = to_float32_if_mps(corpus.cand_1, available_device) + cand_2a = to_float32_if_mps(corpus.cand_2a, available_device) + cand_2b = to_float32_if_mps(corpus.cand_2b, available_device) + cand_3 = to_float32_if_mps(corpus.cand_3, available_device) + ref_1 = to_float32_if_mps(corpus.references_1, available_device) + ref_2 = to_float32_if_mps(corpus.references_2, available_device) + + bleu.update(([cand_1], [ref_1])) + bleu.update(([cand_2a], [ref_2])) + bleu.update(([cand_2b], [ref_2])) + bleu.update(([cand_3], [ref_2])) + value = bleu._sentence_bleu(ref_1, cand_1) + value += bleu._sentence_bleu(ref_2, cand_2a) + value += bleu._sentence_bleu(ref_2, cand_2b) + value += bleu._sentence_bleu(ref_2, cand_3) computed = bleu.compute() if isinstance(computed, torch.Tensor): computed = computed.cpu().float().item() @@ -150,18 +156,22 @@ def test_accumulation_macro_bleu(available_device): def test_accumulation_micro_bleu(available_device): bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device) assert bleu._device == torch.device(available_device) - bleu.update(([corpus.cand_1], [corpus.references_1])) - bleu.update(([corpus.cand_2a], [corpus.references_2])) - bleu.update(([corpus.cand_2b], [corpus.references_2])) - bleu.update(([corpus.cand_3], [corpus.references_2])) + cand_1 = to_float32_if_mps(corpus.cand_1, available_device) + cand_2a = to_float32_if_mps(corpus.cand_2a, available_device) + cand_2b = to_float32_if_mps(corpus.cand_2b, available_device) + cand_3 = to_float32_if_mps(corpus.cand_3, available_device) + ref_1 = to_float32_if_mps(corpus.references_1, available_device) + ref_2 = to_float32_if_mps(corpus.references_2, available_device) + + bleu.update(([cand_1], [ref_1])) + bleu.update(([cand_2a], [ref_2])) + bleu.update(([cand_2b], [ref_2])) + bleu.update(([cand_3], [ref_2])) value = bleu._corpus_bleu( - [corpus.references_1, corpus.references_2, corpus.references_2, corpus.references_2], - [corpus.cand_1, corpus.cand_2a, corpus.cand_2b, corpus.cand_3], + [ref_1, ref_2, ref_2, ref_2], + [cand_1, cand_2a, cand_2b, cand_3], ) - computed = bleu.compute() - if isinstance(computed, torch.Tensor): - computed = computed.cpu().float().item() - assert np.allclose(computed, value, rtol=1e-6) + assert bleu.compute() == value def test_bleu_batch_macro(available_device): @@ -169,8 +179,10 @@ def test_bleu_batch_macro(available_device): assert bleu._device == torch.device(available_device) # Batch size 3 - hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] - refs = [corpus.references_1, corpus.references_2, corpus.references_2] + hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]] + refs = [ + to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2] + ] bleu.update((hypotheses, refs)) with warnings.catch_warnings(): @@ -180,7 +192,7 @@ def test_bleu_batch_macro(available_device): + sentence_bleu(refs[1], hypotheses[1]) + sentence_bleu(refs[2], hypotheses[2]) ) / 3 - reference_bleu_score = to_float32_if_mps(reference_bleu_score) + reference_bleu_score = np.float32(reference_bleu_score) computed = bleu.compute() if isinstance(computed, torch.Tensor): computed = computed.cpu().float().item() @@ -203,8 +215,10 @@ def test_bleu_batch_micro(available_device): assert bleu._device == torch.device(available_device) # Batch size 3 - hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] - refs = [corpus.references_1, corpus.references_2, corpus.references_2] + hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]] + refs = [ + to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2] + ] bleu.update((hypotheses, refs)) with warnings.catch_warnings(): @@ -228,7 +242,12 @@ def test_n_gram_counter(candidates, references, available_device): bleu = Bleu(ngram=4, device=available_device) assert bleu._device == torch.device(available_device) + candidates = to_float32_if_mps(candidates, available_device) + references = to_float32_if_mps(references, available_device) + hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter()) + hyp_length = int(hyp_length) + ref_length = int(ref_length) assert hyp_length == len(candidates) ref_lens = (len(reference) for reference in references) From 1d6dd7d2a7c61c84ef0361dc5395644f3c13968e Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Thu, 17 Apr 2025 14:45:06 +0000 Subject: [PATCH 08/16] rollback _n_gram_counter parameter type change --- ignite/metrics/nlp/bleu.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index 5324fd509c59..ed3b14b4dc52 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -1,5 +1,4 @@ import math -from collections import Counter from typing import Any, Callable, Sequence, Tuple, Union import torch @@ -159,8 +158,8 @@ def _n_gram_counter( self, references: Sequence[Sequence[Sequence[Any]]], candidates: Sequence[Sequence[Any]], - p_numerators: Union[torch.Tensor, Counter], - p_denominators: Union[torch.Tensor, Counter], + p_numerators: torch.Tensor, + p_denominators: torch.Tensor, ) -> Tuple[int, int]: if len(references) != len(candidates): raise ValueError( From 908ec7726045349dffe4d76e22063d8f8f43164f Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Fri, 18 Apr 2025 14:04:54 +0000 Subject: [PATCH 09/16] in bleu.py do not use torch.double --- ignite/metrics/nlp/bleu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index ed3b14b4dc52..3b6053967744 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -236,7 +236,8 @@ def _corpus_bleu(self, references: Sequence[Sequence[Sequence[Any]]], candidates @reinit__is_reduced def reset(self) -> None: if self.average == "macro": - self._sum_of_bleu = torch.tensor(0.0, dtype=torch.double, device=self._device) + dtype = torch.get_default_dtype() if self._device.type == "mps" else torch.double + self._sum_of_bleu = torch.tensor(0.0, dtype=dtype, device=self._device) self._num_sentences = 0 if self.average == "micro": From 3ad6b5e01c39d4a45453d5d69da837076769124d Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Wed, 23 Apr 2025 08:06:46 +0000 Subject: [PATCH 10/16] clean up --- tests/ignite/metrics/nlp/test_bleu.py | 102 +++++++------------------- 1 file changed, 28 insertions(+), 74 deletions(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 8ed98cd30f1c..19ddd878d043 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -2,7 +2,6 @@ import warnings from collections import Counter -import numpy as np import pytest import torch from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction @@ -16,14 +15,6 @@ corpus = CorpusForTest(lower_split=True) -def to_float32_if_mps(x, device): - if isinstance(x, torch.Tensor) and device == "mps" and x.dtype == torch.float64: - return x.to(torch.float32) - elif isinstance(x, np.ndarray) and device == "mps" and x.dtype == np.float64: - return x.astype(np.float32) - return x - - def test_wrong_inputs(): with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): Bleu(ngram=0) @@ -54,10 +45,6 @@ def test_wrong_inputs(): def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=None, ngram_range=8, device="cpu"): - - candidates = to_float32_if_mps(candidates, device) - references = to_float32_if_mps(references, device) - for i in range(1, ngram_range): weights = tuple([1 / i] * i) bleu = Bleu(ngram=i, average=average, smooth=smooth, device=device) @@ -68,25 +55,16 @@ def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=No reference = sentence_bleu( references[0], candidates[0], weights=weights, smoothing_function=smooth_nltk_fn ) - computed = bleu._sentence_bleu(references[0], candidates[0]) - if isinstance(computed, torch.Tensor): - computed = computed.cpu().float().item() - assert np.allclose(computed, reference, rtol=1e-6) + assert pytest.approx(reference) == bleu._sentence_bleu(references[0], candidates[0]) elif average == "micro": with warnings.catch_warnings(): warnings.simplefilter("ignore") reference = corpus_bleu(references, candidates, weights=weights, smoothing_function=smooth_nltk_fn) - computed = bleu._corpus_bleu(references, candidates) - if isinstance(computed, torch.Tensor): - computed = computed.cpu().float().item() - assert np.allclose(computed, reference, rtol=1e-6) + assert pytest.approx(reference) == bleu._corpus_bleu(references, candidates) bleu.update((candidates, references)) - computed = bleu.compute() - if isinstance(computed, torch.Tensor): - computed = computed.cpu().float().item() - assert np.allclose(computed, reference, rtol=1e-6) + assert pytest.approx(reference) == bleu.compute() @pytest.mark.parametrize(*parametrize_args) @@ -132,44 +110,32 @@ def test_micro_bleu_smooth2(candidates, references, available_device): def test_accumulation_macro_bleu(available_device): bleu = Bleu(ngram=4, smooth="smooth2", device=available_device) assert bleu._device == torch.device(available_device) - cand_1 = to_float32_if_mps(corpus.cand_1, available_device) - cand_2a = to_float32_if_mps(corpus.cand_2a, available_device) - cand_2b = to_float32_if_mps(corpus.cand_2b, available_device) - cand_3 = to_float32_if_mps(corpus.cand_3, available_device) - ref_1 = to_float32_if_mps(corpus.references_1, available_device) - ref_2 = to_float32_if_mps(corpus.references_2, available_device) - - bleu.update(([cand_1], [ref_1])) - bleu.update(([cand_2a], [ref_2])) - bleu.update(([cand_2b], [ref_2])) - bleu.update(([cand_3], [ref_2])) - value = bleu._sentence_bleu(ref_1, cand_1) - value += bleu._sentence_bleu(ref_2, cand_2a) - value += bleu._sentence_bleu(ref_2, cand_2b) - value += bleu._sentence_bleu(ref_2, cand_3) + + bleu.update(([corpus.cand_1], [corpus.references_1])) + bleu.update(([corpus.cand_2a], [corpus.references_2])) + bleu.update(([corpus.cand_2b], [corpus.references_2])) + bleu.update(([corpus.cand_3], [corpus.references_2])) + value = bleu._sentence_bleu(corpus.references_1, corpus.cand_1) + value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a) + value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b) + value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3) computed = bleu.compute() if isinstance(computed, torch.Tensor): computed = computed.cpu().float().item() - assert np.allclose(computed, value / 4, rtol=1e-6) + assert computed == value / 4 def test_accumulation_micro_bleu(available_device): bleu = Bleu(ngram=4, smooth="smooth2", average="micro", device=available_device) assert bleu._device == torch.device(available_device) - cand_1 = to_float32_if_mps(corpus.cand_1, available_device) - cand_2a = to_float32_if_mps(corpus.cand_2a, available_device) - cand_2b = to_float32_if_mps(corpus.cand_2b, available_device) - cand_3 = to_float32_if_mps(corpus.cand_3, available_device) - ref_1 = to_float32_if_mps(corpus.references_1, available_device) - ref_2 = to_float32_if_mps(corpus.references_2, available_device) - - bleu.update(([cand_1], [ref_1])) - bleu.update(([cand_2a], [ref_2])) - bleu.update(([cand_2b], [ref_2])) - bleu.update(([cand_3], [ref_2])) + + bleu.update(([corpus.cand_1], [corpus.references_1])) + bleu.update(([corpus.cand_2a], [corpus.references_2])) + bleu.update(([corpus.cand_2b], [corpus.references_2])) + bleu.update(([corpus.cand_3], [corpus.references_2])) value = bleu._corpus_bleu( - [ref_1, ref_2, ref_2, ref_2], - [cand_1, cand_2a, cand_2b, cand_3], + [corpus.references_1, corpus.references_2, corpus.references_2, corpus.references_2], + [corpus.cand_1, corpus.cand_2a, corpus.cand_2b, corpus.cand_3], ) assert bleu.compute() == value @@ -179,10 +145,8 @@ def test_bleu_batch_macro(available_device): assert bleu._device == torch.device(available_device) # Batch size 3 - hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]] - refs = [ - to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2] - ] + hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] + refs = [corpus.references_1, corpus.references_2, corpus.references_2] bleu.update((hypotheses, refs)) with warnings.catch_warnings(): @@ -192,11 +156,8 @@ def test_bleu_batch_macro(available_device): + sentence_bleu(refs[1], hypotheses[1]) + sentence_bleu(refs[2], hypotheses[2]) ) / 3 - reference_bleu_score = np.float32(reference_bleu_score) computed = bleu.compute() - if isinstance(computed, torch.Tensor): - computed = computed.cpu().float().item() - assert np.allclose(computed, reference_bleu_score, rtol=1e-6) + assert pytest.approx(computed) == reference_bleu_score value = 0 for _hypotheses, _refs in zip(hypotheses, refs): @@ -204,10 +165,10 @@ def test_bleu_batch_macro(available_device): bleu.update(([_hypotheses], [_refs])) ref_1 = value / len(refs) - ref_2 = bleu.compute() + ref_2 = bleu.compute().cpu().numpy() - assert np.allclose(ref_1, reference_bleu_score, rtol=1e-6) - assert np.allclose(ref_2, reference_bleu_score, rtol=1e-6) + assert pytest.approx(ref_1) == reference_bleu_score + assert pytest.approx(ref_2) == reference_bleu_score def test_bleu_batch_micro(available_device): @@ -215,10 +176,8 @@ def test_bleu_batch_micro(available_device): assert bleu._device == torch.device(available_device) # Batch size 3 - hypotheses = [to_float32_if_mps(c, available_device) for c in [corpus.cand_1, corpus.cand_2a, corpus.cand_2b]] - refs = [ - to_float32_if_mps(r, available_device) for r in [corpus.references_1, corpus.references_2, corpus.references_2] - ] + hypotheses = [corpus.cand_1, corpus.cand_2a, corpus.cand_2b] + refs = [corpus.references_1, corpus.references_2, corpus.references_2] bleu.update((hypotheses, refs)) with warnings.catch_warnings(): @@ -242,12 +201,7 @@ def test_n_gram_counter(candidates, references, available_device): bleu = Bleu(ngram=4, device=available_device) assert bleu._device == torch.device(available_device) - candidates = to_float32_if_mps(candidates, available_device) - references = to_float32_if_mps(references, available_device) - hyp_length, ref_length = bleu._n_gram_counter([references], [candidates], Counter(), Counter()) - hyp_length = int(hyp_length) - ref_length = int(ref_length) assert hyp_length == len(candidates) ref_lens = (len(reference) for reference in references) From ed54722995193a9f7429b7c858c141730337e925 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Wed, 23 Apr 2025 08:41:10 +0000 Subject: [PATCH 11/16] sets dtype in bleu.py --- ignite/metrics/nlp/bleu.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index 3b6053967744..9172f80de625 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -236,13 +236,12 @@ def _corpus_bleu(self, references: Sequence[Sequence[Sequence[Any]]], candidates @reinit__is_reduced def reset(self) -> None: if self.average == "macro": - dtype = torch.get_default_dtype() if self._device.type == "mps" else torch.double - self._sum_of_bleu = torch.tensor(0.0, dtype=dtype, device=self._device) + self._sum_of_bleu = torch.tensor(0.0, dtype=self._double_dtype, device=self._device) self._num_sentences = 0 if self.average == "micro": - self.p_numerators = torch.zeros(self.ngrams_order + 1) - self.p_denominators = torch.zeros(self.ngrams_order + 1) + self.p_numerators = torch.zeros(self.ngrams_order + 1, dtype=self._double_dtype) + self.p_denominators = torch.zeros(self.ngrams_order + 1, dtype=self._double_dtype) self.hyp_length_sum = 0 self.ref_length_sum = 0 @@ -279,7 +278,7 @@ def _compute_micro(self) -> float: ) return bleu_score - def compute(self) -> None: + def compute(self): if self.average == "macro": return self._compute_macro() elif self.average == "micro": From 2c841935dbbcdfc1be3da1a590120af61ae5fa11 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Wed, 23 Apr 2025 08:47:28 +0000 Subject: [PATCH 12/16] adds return type to Bleu.compute --- ignite/metrics/nlp/bleu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index 9172f80de625..bdca2e31428f 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -2,6 +2,7 @@ from typing import Any, Callable, Sequence, Tuple, Union import torch +from torch import Tensor from ignite.exceptions import NotComputableError from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce @@ -278,7 +279,7 @@ def _compute_micro(self) -> float: ) return bleu_score - def compute(self): + def compute(self) -> None | Tensor | float: if self.average == "macro": return self._compute_macro() elif self.average == "micro": From 59806b4f3575395bdcfa3c3736649a974df815cf Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Wed, 23 Apr 2025 09:05:46 +0000 Subject: [PATCH 13/16] removes unnecessary conversion --- tests/ignite/metrics/nlp/test_bleu.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 19ddd878d043..a87de88d8abb 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -119,10 +119,7 @@ def test_accumulation_macro_bleu(available_device): value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2a) value += bleu._sentence_bleu(corpus.references_2, corpus.cand_2b) value += bleu._sentence_bleu(corpus.references_2, corpus.cand_3) - computed = bleu.compute() - if isinstance(computed, torch.Tensor): - computed = computed.cpu().float().item() - assert computed == value / 4 + assert bleu.compute() == value / 4 def test_accumulation_micro_bleu(available_device): @@ -156,8 +153,7 @@ def test_bleu_batch_macro(available_device): + sentence_bleu(refs[1], hypotheses[1]) + sentence_bleu(refs[2], hypotheses[2]) ) / 3 - computed = bleu.compute() - assert pytest.approx(computed) == reference_bleu_score + assert pytest.approx(bleu.compute()) == reference_bleu_score value = 0 for _hypotheses, _refs in zip(hypotheses, refs): @@ -165,7 +161,7 @@ def test_bleu_batch_macro(available_device): bleu.update(([_hypotheses], [_refs])) ref_1 = value / len(refs) - ref_2 = bleu.compute().cpu().numpy() + ref_2 = bleu.compute() assert pytest.approx(ref_1) == reference_bleu_score assert pytest.approx(ref_2) == reference_bleu_score From 385fb27106659eba799cecc6a4dbf8cf831fa013 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Wed, 23 Apr 2025 09:10:49 +0000 Subject: [PATCH 14/16] typing --- ignite/metrics/nlp/bleu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index bdca2e31428f..6529c11f1f8f 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -279,7 +279,7 @@ def _compute_micro(self) -> float: ) return bleu_score - def compute(self) -> None | Tensor | float: + def compute(self) -> Union[None, Tensor, float]: if self.average == "macro": return self._compute_macro() elif self.average == "micro": From 0ed7c3a991f14916de5e332f4d5cd0b793cb1c26 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Wed, 23 Apr 2025 09:20:51 +0000 Subject: [PATCH 15/16] typing --- ignite/metrics/nlp/bleu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index 6529c11f1f8f..0ca724a2ddc3 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -284,3 +284,4 @@ def compute(self) -> Union[None, Tensor, float]: return self._compute_macro() elif self.average == "micro": return self._compute_micro() + return None From 07360519e00e5763211f3cf08448436b66c64365 Mon Sep 17 00:00:00 2001 From: BanzaiTokyo Date: Wed, 23 Apr 2025 09:46:46 +0000 Subject: [PATCH 16/16] transfer tensors in tests to cpu --- tests/ignite/metrics/nlp/test_bleu.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index a87de88d8abb..b191cd8ded6f 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -64,7 +64,10 @@ def _test(candidates, references, average, smooth="no_smooth", smooth_nltk_fn=No assert pytest.approx(reference) == bleu._corpus_bleu(references, candidates) bleu.update((candidates, references)) - assert pytest.approx(reference) == bleu.compute() + computed = bleu.compute() + if isinstance(computed, torch.Tensor): + computed = computed.cpu().item() + assert pytest.approx(reference) == computed @pytest.mark.parametrize(*parametrize_args) @@ -153,7 +156,11 @@ def test_bleu_batch_macro(available_device): + sentence_bleu(refs[1], hypotheses[1]) + sentence_bleu(refs[2], hypotheses[2]) ) / 3 - assert pytest.approx(bleu.compute()) == reference_bleu_score + computed = bleu.compute() + if isinstance(computed, torch.Tensor): + computed = computed.cpu().item() + + assert pytest.approx(computed) == reference_bleu_score value = 0 for _hypotheses, _refs in zip(hypotheses, refs): @@ -161,10 +168,12 @@ def test_bleu_batch_macro(available_device): bleu.update(([_hypotheses], [_refs])) ref_1 = value / len(refs) - ref_2 = bleu.compute() + computed = bleu.compute() + if isinstance(computed, torch.Tensor): + computed = computed.cpu().item() assert pytest.approx(ref_1) == reference_bleu_score - assert pytest.approx(ref_2) == reference_bleu_score + assert pytest.approx(computed) == reference_bleu_score def test_bleu_batch_micro(available_device):