inference_time.py
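"""
Benchmark the inference time of the paraphrase similarity metrics: each metric
is run once over the MRPC validation split and the average time per sentence
pair is reported in milliseconds.
"""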
import sys
from pathlib import Path
import timeit
from sacrerouge.metrics import ChrF, SentBleu, BertScore
from nmtscore import NMTScorer
sys.path.insert(1, str(Path(__file__).resolve().parent.parent.parent))
from experiments import paraphrase_tasks
from experiments.metrics.benchmark_metrics import BenchmarkMetric
from experiments.metrics.sbert import SBERT
from experiments.metrics.symmetric_metric import SymmetricMetric
from experiments.metrics.nmtscore_metrics import DirectNMTScoreMetric, PivotNMTScoreMetric, CrossLikelihoodNMTScoreMetric
BATCH_SIZE = 32
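# Caching is disabled so that each run measures the full translation/scoring
# cost instead of returning memoized results.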
NMT_TRANSLATE_KWARGS = {
    "batch_size": BATCH_SIZE,
    "use_cache": False,
}
NMT_SCORE_KWARGS = {
    "batch_size": BATCH_SIZE,
    "use_cache": False,
}
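# Metrics to benchmark: surface, embedding, and translation-based similarity measures.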
benchmark_metrics = [
    # Surface similarity baselines
    BenchmarkMetric(
        title="ChrF",
        metric_names=["chrf"],
        load_func=lambda a_lang, b_lang: SymmetricMetric(ChrF()),
    ),
    BenchmarkMetric(
        title="SentBLEU",
        metric_names=["sent-bleu"],
        load_func=lambda a_lang, b_lang: SymmetricMetric(
            SentBleu(trg_lang=a_lang, tokenize=None),
            SentBleu(trg_lang=b_lang, tokenize=None),
        ),
    ),
    # Embedding baselines
    BenchmarkMetric(
        title="Sentence-BERT",
        metric_names=["sbert"],
        load_func=lambda a_lang, b_lang: SBERT("paraphrase-xlm-r-multilingual-v1"),
    ),
    BenchmarkMetric(
        title="BERTScore-F1",
        metric_names=["bertscore_f1"],
        load_func=lambda a_lang, b_lang: BertScore("xlm-roberta-large", num_layers=17, batch_size=BATCH_SIZE),
    ),
    # Translation-based measures
    BenchmarkMetric(
        title="Direct_Translation_Probability (normalized)",
        metric_names=["nmtscore-direct"],
        load_func=lambda a_lang, b_lang: DirectNMTScoreMetric(
            a_lang,
            b_lang,
            scorer=NMTScorer("prism", device=0),
            both_directions=True,
            score_kwargs=NMT_SCORE_KWARGS,
        ),
    ),
    BenchmarkMetric(
        title="Pivot_Translation_Probability (normalized)",
        metric_names=["nmtscore-pivot"],
        load_func=lambda a_lang, b_lang: PivotNMTScoreMetric(
            a_lang,
            b_lang,
            scorer=NMTScorer("prism", device=0),
            both_directions=True,
            translate_kwargs=NMT_TRANSLATE_KWARGS,
            score_kwargs=NMT_SCORE_KWARGS,
        ),
    ),
    BenchmarkMetric(
        title="Translation_Cross-Likelihood (normalized)",
        metric_names=["nmtscore-cross"],
        load_func=lambda a_lang, b_lang: CrossLikelihoodNMTScoreMetric(
            scorer=NMTScorer("prism", device=0),
            both_directions=True,
            translate_kwargs=NMT_TRANSLATE_KWARGS,
            score_kwargs=NMT_SCORE_KWARGS,
        ),
    ),
    BenchmarkMetric(
        title="Direct_Translation_Probability (unnormalized)",
        metric_names=["nmtscore-direct"],
        load_func=lambda a_lang, b_lang: DirectNMTScoreMetric(
            a_lang,
            b_lang,
            scorer=NMTScorer("prism", device=0),
            normalize=False,
            both_directions=True,
            score_kwargs=NMT_SCORE_KWARGS,
        ),
    ),
    BenchmarkMetric(
        title="Pivot_Translation_Probability (unnormalized)",
        metric_names=["nmtscore-pivot"],
        load_func=lambda a_lang, b_lang: PivotNMTScoreMetric(
            a_lang,
            b_lang,
            scorer=NMTScorer("prism", device=0),
            normalize=False,
            both_directions=True,
            translate_kwargs=NMT_TRANSLATE_KWARGS,
            score_kwargs=NMT_SCORE_KWARGS,
        ),
    ),
    BenchmarkMetric(
        title="Translation_Cross-Likelihood (unnormalized)",
        metric_names=["nmtscore-cross"],
        load_func=lambda a_lang, b_lang: CrossLikelihoodNMTScoreMetric(
            scorer=NMTScorer("prism", device=0),
            normalize=False,
            both_directions=True,
            translate_kwargs=NMT_TRANSLATE_KWARGS,
            score_kwargs=NMT_SCORE_KWARGS,
        ),
    ),
]
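# Load the MRPC validation split that serves as the timing dataset.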
mrpc = paraphrase_tasks.MRPCTask("validation")
num_pairs = len(mrpc.get_samples())
print("Number of pairs: ", num_pairs)
for benchmark_metric in benchmark_metrics:
    print(benchmark_metric.title)
    metric = benchmark_metric.load_metric(a_lang="en", b_lang="en")
    time = timeit.timeit(lambda: mrpc.evaluate(metric, benchmark_metric.metric_names), number=1)
    print(time / num_pairs * 1000, "ms")
    del metric