diff --git a/results/sample/factcc_summary.json b/results/sample/factcc_summary.json new file mode 100644 index 0000000..821d467 --- /dev/null +++ b/results/sample/factcc_summary.json @@ -0,0 +1,398 @@ +{ + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": { + "0": 0.5815545229, + "average": 0.5815545229 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": { + "0": -0.0600319914, + "average": -0.0600319914 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": { + "0": 0.0277402752, + "average": 0.0277402752 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": { + "0": 0.7169653488, + "average": 0.7169653488 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": { + "0": -0.0528466412, + "average": -0.0528466412 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": { + "0": 0.0967947792, + "average": 0.0967947792 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": { + "0": 0.3208156661, + "average": 0.3208156661 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": { + "0": -0.5156587126, + "average": -0.5156587126 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": { + "0": -0.470599931, + "average": -0.470599931 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": { + "0": 0.6595192702, + "average": 0.6595192702 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": { + "0": 0.0757447464, + "average": 0.0757447464 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": { + "0": 0.1639806392, + "average": 0.1639806392 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": { + "0": 0.6717421438, + "average": 0.6717421438 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": { + "0": 0.3588681114, + "average": 0.3588681114 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": { + "0": -0.2877454493, + "average": -0.2877454493 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": { + "0": -0.2861778665, + "average": -0.2861778665 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": { + "0": -0.6681691136, + "average": -0.6681691136 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": { + "0": -0.6737821313, + "average": -0.6737821313 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": { + "0": 0.6666898215, + "average": 0.6666898215 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": { + "0": 0.3804341873, + "average": 0.3804341873 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": { + "0": 0.4086553414, + "average": 0.4086553414 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": { + "0": 0.6806158226, + "average": 0.6806158226 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": { + "0": 0.6709422745, + "average": 0.6709422745 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": { + "0": 0.3458599955, + "average": 0.3458599955 + }, + "('pearsonr', 'human', 'new', 
'bertscore-sentence-mnli-deberta-not_neutral', 'P')": { + "0": 0.2855930523, + "average": 0.2855930523 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": { + "0": -0.5795375818, + "average": -0.5795375818 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": { + "0": -0.5553977183, + "average": -0.5553977183 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": { + "0": 0.5867073853, + "average": 0.5867073853 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": { + "0": 0.262299699, + "average": 0.262299699 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": { + "0": 0.2984548025, + "average": 0.2984548025 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": { + "0": 0.591284334, + "average": 0.591284334 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": { + "0": 0.5146585507, + "average": 0.5146585507 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": { + "0": -0.2794090753, + "average": -0.2794090753 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": { + "0": 0.4868644956, + "average": 0.4868644956 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": { + "0": -0.1217161239, + "average": -0.1217161239 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": { + "0": 0.0, + "average": 0.0 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": { + "0": 0.7302967433, + "average": 0.7302967433 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": { + "0": -0.0608580619, + "average": -0.0608580619 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": { + "0": 0.2434322478, + "average": 0.2434322478 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": { + "0": -0.1217161239, + "average": -0.1217161239 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": { + "0": -0.3042903097, + "average": -0.3042903097 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": { + "0": -0.3042903097, + "average": -0.3042903097 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": { + "0": 0.4868644956, + "average": 0.4868644956 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": { + "0": 0.1825741858, + "average": 0.1825741858 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": { + "0": 0.1825741858, + "average": 0.1825741858 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": { + "0": 0.5477225575, + "average": 0.5477225575 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": { + "0": 0.2434322478, + "average": 0.2434322478 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": { + "0": -0.3042903097, + "average": -0.3042903097 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": { + "0": -0.4868644956, + "average": -0.4868644956 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": { + "0": 
-0.5477225575, + "average": -0.5477225575 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": { + "0": -0.5477225575, + "average": -0.5477225575 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": { + "0": 0.6085806195, + "average": 0.6085806195 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": { + "0": 0.4260064336, + "average": 0.4260064336 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": { + "0": 0.4868644956, + "average": 0.4868644956 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": { + "0": 0.6085806195, + "average": 0.6085806195 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": { + "0": 0.5477225575, + "average": 0.5477225575 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": { + "0": 0.1825741858, + "average": 0.1825741858 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": { + "0": -0.3651483717, + "average": -0.3651483717 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": { + "0": -0.3651483717, + "average": -0.3651483717 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": { + "0": -0.3042903097, + "average": -0.3042903097 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": { + "0": 0.5477225575, + "average": 0.5477225575 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": { + "0": 0.3042903097, + "average": 0.3042903097 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": { + "0": 0.3042903097, + "average": 0.3042903097 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": { + "0": 0.5477225575, + "average": 0.5477225575 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": { + "0": 0.3651483717, + "average": 0.3651483717 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": { + "0": -0.0608580619, + "average": -0.0608580619 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": { + "0": 0.5685352436, + "average": 0.5685352436 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": { + "0": -0.1421338109, + "average": -0.1421338109 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": { + "0": 0.0, + "average": 0.0 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": { + "0": 0.8528028654, + "average": 0.8528028654 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": { + "0": -0.0710669055, + "average": -0.0710669055 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": { + "0": 0.2842676218, + "average": 0.2842676218 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": { + "0": -0.1421338109, + "average": -0.1421338109 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": { + "0": -0.3553345273, + "average": -0.3553345273 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": { + "0": -0.3553345273, + "average": -0.3553345273 + }, + "('spearmanr', 'human', 'new', 
'bertscore-sentence-mnli-roberta-entail_only', 'P')": { + "0": 0.5685352436, + "average": 0.5685352436 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": { + "0": 0.2132007164, + "average": 0.2132007164 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": { + "0": 0.2132007164, + "average": 0.2132007164 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": { + "0": 0.6396021491, + "average": 0.6396021491 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": { + "0": 0.2842676218, + "average": 0.2842676218 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": { + "0": -0.3553345273, + "average": -0.3553345273 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": { + "0": -0.5685352436, + "average": -0.5685352436 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": { + "0": -0.6396021491, + "average": -0.6396021491 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": { + "0": -0.6396021491, + "average": -0.6396021491 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": { + "0": 0.7106690545, + "average": 0.7106690545 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": { + "0": 0.4974683382, + "average": 0.4974683382 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": { + "0": 0.5685352436, + "average": 0.5685352436 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": { + "0": 0.7106690545, + "average": 0.7106690545 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": { + "0": 0.6396021491, + "average": 0.6396021491 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": { + "0": 0.2132007164, + "average": 0.2132007164 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": { + "0": -0.4264014327, + "average": -0.4264014327 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": { + "0": -0.4264014327, + "average": -0.4264014327 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": { + "0": -0.3553345273, + "average": -0.3553345273 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": { + "0": 0.6396021491, + "average": 0.6396021491 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": { + "0": 0.3553345273, + "average": 0.3553345273 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": { + "0": 0.3553345273, + "average": 0.3553345273 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": { + "0": 0.6396021491, + "average": 0.6396021491 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": { + "0": 0.4264014327, + "average": 0.4264014327 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": { + "0": -0.0710669055, + "average": -0.0710669055 + } +} \ No newline at end of file diff --git a/results/sample/factcc_summary.txt b/results/sample/factcc_summary.txt new file mode 100644 index 0000000..945aeb3 --- 
/dev/null +++ b/results/sample/factcc_summary.txt @@ -0,0 +1,100 @@ +corr_metric aspect approach model score_name +pearsonr human new bertscore-sentence-cos-mpnet P 0.582 + R -0.060 + F 0.028 + bertscore-sentence-cos-roberta P 0.717 + R -0.053 + F 0.097 + bertscore-sentence-mnli-roberta-not_neutral P 0.321 + R -0.516 + F -0.471 + bertscore-sentence-mnli-roberta-entail_only P 0.660 + R 0.076 + F 0.164 + bertscore-sentence-mnli-roberta-entail_contradict P 0.672 + R 0.359 + F -0.288 + bertscore-sentence-mnli-bart-not_neutral P -0.286 + R -0.668 + F -0.674 + bertscore-sentence-mnli-bart-entail_only P 0.667 + R 0.380 + F 0.409 + bertscore-sentence-mnli-bart-entail_contradict P 0.681 + R 0.671 + F 0.346 + bertscore-sentence-mnli-deberta-not_neutral P 0.286 + R -0.580 + F -0.555 + bertscore-sentence-mnli-deberta-entail_only P 0.587 + R 0.262 + F 0.298 + bertscore-sentence-mnli-deberta-entail_contradict P 0.591 + R 0.515 + F -0.279 +kendalltau human new bertscore-sentence-cos-mpnet P 0.487 + R -0.122 + F 0.000 + bertscore-sentence-cos-roberta P 0.730 + R -0.061 + F 0.243 + bertscore-sentence-mnli-roberta-not_neutral P -0.122 + R -0.304 + F -0.304 + bertscore-sentence-mnli-roberta-entail_only P 0.487 + R 0.183 + F 0.183 + bertscore-sentence-mnli-roberta-entail_contradict P 0.548 + R 0.243 + F -0.304 + bertscore-sentence-mnli-bart-not_neutral P -0.487 + R -0.548 + F -0.548 + bertscore-sentence-mnli-bart-entail_only P 0.609 + R 0.426 + F 0.487 + bertscore-sentence-mnli-bart-entail_contradict P 0.609 + R 0.548 + F 0.183 + bertscore-sentence-mnli-deberta-not_neutral P -0.365 + R -0.365 + F -0.304 + bertscore-sentence-mnli-deberta-entail_only P 0.548 + R 0.304 + F 0.304 + bertscore-sentence-mnli-deberta-entail_contradict P 0.548 + R 0.365 + F -0.061 +spearmanr human new bertscore-sentence-cos-mpnet P 0.569 + R -0.142 + F 0.000 + bertscore-sentence-cos-roberta P 0.853 + R -0.071 + F 0.284 + bertscore-sentence-mnli-roberta-not_neutral P -0.142 + R -0.355 + F -0.355 + bertscore-sentence-mnli-roberta-entail_only P 0.569 + R 0.213 + F 0.213 + bertscore-sentence-mnli-roberta-entail_contradict P 0.640 + R 0.284 + F -0.355 + bertscore-sentence-mnli-bart-not_neutral P -0.569 + R -0.640 + F -0.640 + bertscore-sentence-mnli-bart-entail_only P 0.711 + R 0.497 + F 0.569 + bertscore-sentence-mnli-bart-entail_contradict P 0.711 + R 0.640 + F 0.213 + bertscore-sentence-mnli-deberta-not_neutral P -0.426 + R -0.426 + F -0.355 + bertscore-sentence-mnli-deberta-entail_only P 0.640 + R 0.355 + F 0.355 + bertscore-sentence-mnli-deberta-entail_contradict P 0.640 + R 0.426 + F -0.071 \ No newline at end of file diff --git a/results/sample/factcc_system.json b/results/sample/factcc_system.json new file mode 100644 index 0000000..09ef229 --- /dev/null +++ b/results/sample/factcc_system.json @@ -0,0 +1,101 @@ +{ + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": 0.5815545229, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": -0.0600319914, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": 0.0277402752, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": 0.7169653488, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": -0.0528466412, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": 0.0967947792, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": 0.3208156661, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": -0.5156587126, + 
"('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": -0.470599931, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": 0.6595192702, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": 0.0757447464, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": 0.1639806392, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": 0.6717421438, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": 0.3588681114, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": -0.2877454493, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": -0.2861778665, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": -0.6681691136, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": -0.6737821313, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": 0.6666898215, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": 0.3804341873, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": 0.4086553414, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": 0.6806158226, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": 0.6709422745, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": 0.3458599955, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": 0.2855930523, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": -0.5795375818, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": -0.5553977183, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": 0.5867073853, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": 0.262299699, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": 0.2984548025, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": 0.591284334, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": 0.5146585507, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": -0.2794090753, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": 0.4868644956, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": -0.1217161239, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": 0.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": 0.7302967433, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": -0.0608580619, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": 0.2434322478, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": -0.1217161239, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": -0.3042903097, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": -0.3042903097, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": 0.4868644956, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": 
0.1825741858, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": 0.1825741858, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": 0.5477225575, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": 0.2434322478, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": -0.3042903097, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": -0.4868644956, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": -0.5477225575, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": -0.5477225575, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": 0.6085806195, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": 0.4260064336, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": 0.4868644956, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": 0.6085806195, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": 0.5477225575, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": 0.1825741858, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": -0.3651483717, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": -0.3651483717, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": -0.3042903097, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": 0.5477225575, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": 0.3042903097, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": 0.3042903097, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": 0.5477225575, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": 0.3651483717, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": -0.0608580619, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": 0.5685352436, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": -0.1421338109, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": 0.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": 0.8528028654, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": -0.0710669055, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": 0.2842676218, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": -0.1421338109, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": -0.3553345273, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": -0.3553345273, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": 0.5685352436, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": 0.2132007164, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": 0.2132007164, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": 0.6396021491, + "('spearmanr', 'human', 
'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": 0.2842676218, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": -0.3553345273, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": -0.5685352436, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": -0.6396021491, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": -0.6396021491, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": 0.7106690545, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": 0.4974683382, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": 0.5685352436, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": 0.7106690545, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": 0.6396021491, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": 0.2132007164, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": -0.4264014327, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": -0.4264014327, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": -0.3553345273, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": 0.6396021491, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": 0.3553345273, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": 0.3553345273, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": 0.6396021491, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": 0.4264014327, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": -0.0710669055 +} \ No newline at end of file diff --git a/results/sample/factcc_system.txt b/results/sample/factcc_system.txt new file mode 100644 index 0000000..945aeb3 --- /dev/null +++ b/results/sample/factcc_system.txt @@ -0,0 +1,100 @@ +corr_metric aspect approach model score_name +pearsonr human new bertscore-sentence-cos-mpnet P 0.582 + R -0.060 + F 0.028 + bertscore-sentence-cos-roberta P 0.717 + R -0.053 + F 0.097 + bertscore-sentence-mnli-roberta-not_neutral P 0.321 + R -0.516 + F -0.471 + bertscore-sentence-mnli-roberta-entail_only P 0.660 + R 0.076 + F 0.164 + bertscore-sentence-mnli-roberta-entail_contradict P 0.672 + R 0.359 + F -0.288 + bertscore-sentence-mnli-bart-not_neutral P -0.286 + R -0.668 + F -0.674 + bertscore-sentence-mnli-bart-entail_only P 0.667 + R 0.380 + F 0.409 + bertscore-sentence-mnli-bart-entail_contradict P 0.681 + R 0.671 + F 0.346 + bertscore-sentence-mnli-deberta-not_neutral P 0.286 + R -0.580 + F -0.555 + bertscore-sentence-mnli-deberta-entail_only P 0.587 + R 0.262 + F 0.298 + bertscore-sentence-mnli-deberta-entail_contradict P 0.591 + R 0.515 + F -0.279 +kendalltau human new bertscore-sentence-cos-mpnet P 0.487 + R -0.122 + F 0.000 + bertscore-sentence-cos-roberta P 0.730 + R -0.061 + F 0.243 + bertscore-sentence-mnli-roberta-not_neutral P -0.122 + R -0.304 + F -0.304 + bertscore-sentence-mnli-roberta-entail_only P 0.487 + R 0.183 + F 0.183 + bertscore-sentence-mnli-roberta-entail_contradict P 0.548 + R 0.243 + F -0.304 + bertscore-sentence-mnli-bart-not_neutral P -0.487 + R 
-0.548 + F -0.548 + bertscore-sentence-mnli-bart-entail_only P 0.609 + R 0.426 + F 0.487 + bertscore-sentence-mnli-bart-entail_contradict P 0.609 + R 0.548 + F 0.183 + bertscore-sentence-mnli-deberta-not_neutral P -0.365 + R -0.365 + F -0.304 + bertscore-sentence-mnli-deberta-entail_only P 0.548 + R 0.304 + F 0.304 + bertscore-sentence-mnli-deberta-entail_contradict P 0.548 + R 0.365 + F -0.061 +spearmanr human new bertscore-sentence-cos-mpnet P 0.569 + R -0.142 + F 0.000 + bertscore-sentence-cos-roberta P 0.853 + R -0.071 + F 0.284 + bertscore-sentence-mnli-roberta-not_neutral P -0.142 + R -0.355 + F -0.355 + bertscore-sentence-mnli-roberta-entail_only P 0.569 + R 0.213 + F 0.213 + bertscore-sentence-mnli-roberta-entail_contradict P 0.640 + R 0.284 + F -0.355 + bertscore-sentence-mnli-bart-not_neutral P -0.569 + R -0.640 + F -0.640 + bertscore-sentence-mnli-bart-entail_only P 0.711 + R 0.497 + F 0.569 + bertscore-sentence-mnli-bart-entail_contradict P 0.711 + R 0.640 + F 0.213 + bertscore-sentence-mnli-deberta-not_neutral P -0.426 + R -0.426 + F -0.355 + bertscore-sentence-mnli-deberta-entail_only P 0.640 + R 0.355 + F 0.355 + bertscore-sentence-mnli-deberta-entail_contradict P 0.640 + R 0.426 + F -0.071 \ No newline at end of file diff --git a/results/sample/frank_summary.json b/results/sample/frank_summary.json new file mode 100644 index 0000000..18f8066 --- /dev/null +++ b/results/sample/frank_summary.json @@ -0,0 +1,497 @@ +{ + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": { + "0": 0.6523168225, + "1": 0.8836796854, + "average": 0.767998254 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": { + "0": 0.9964892049, + "1": 0.8216000345, + "average": 0.9090446197 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": { + "0": 0.9925326034, + "1": 0.881378723, + "average": 0.9369556632 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": { + "0": 0.7616395933, + "1": 0.8423049729, + "average": 0.8019722831 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": { + "0": 0.9928982106, + "1": 0.5333381858, + "average": 0.7631181982 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": { + "0": 0.988378774, + "1": 0.7037292882, + "average": 0.8460540311 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": { + "0": 0.05947937, + "1": -0.6886233732, + "average": -0.3145720016 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": { + "0": 0.4938209264, + "1": -0.9630750169, + "average": -0.2346270452 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": { + "0": 0.4330768299, + "1": -0.949933078, + "average": -0.258428124 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": { + "0": 0.3226130539, + "1": 0.9998038969, + "average": 0.6612084754 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": { + "0": 0.5877117304, + "1": 0.933071025, + "average": 0.7603913777 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": { + "0": 0.5645184323, + "1": 0.9568984177, + "average": 0.760708425 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": { + "0": 0.3521710885, + "1": 0.9998703776, + "average": 0.6760207331 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 
'R')": { + "0": 0.6881255546, + "1": 0.9688964374, + "average": 0.828510996 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": { + "0": 0.6805524563, + "1": 0.9061032489, + "average": 0.7933278526 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": { + "0": 0.2051125563, + "1": 0.5603971085, + "average": 0.3827548324 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": { + "0": 0.1295401693, + "1": 0.9281542471, + "average": 0.5288472082 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": { + "0": 0.1144687052, + "1": 0.9252607751, + "average": 0.5198647401 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": { + "0": 0.921654899, + "1": 0.9537935653, + "average": 0.9377242322 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": { + "0": 0.770946228, + "1": 0.8372773713, + "average": 0.8041117997 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": { + "0": 0.7948027011, + "1": 0.8649306494, + "average": 0.8298666752 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": { + "0": 0.9240466736, + "1": 0.9618711344, + "average": 0.942958904 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": { + "0": 0.8108151236, + "1": 0.8552488098, + "average": 0.8330319667 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": { + "0": 0.8319969862, + "1": 0.8811877631, + "average": 0.8565923746 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": { + "0": -0.2552969672, + "1": 0.9779677455, + "average": 0.3613353892 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": { + "0": -0.6462106152, + "1": -0.3784708357, + "average": -0.5123407254 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": { + "0": -0.5926340368, + "1": 0.2590484996, + "average": -0.1667927686 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": { + "0": 0.1684693914, + "1": 0.9651428983, + "average": 0.5668061449 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": { + "0": -0.6139918453, + "1": 0.7191224856, + "average": 0.0525653202 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": { + "0": -0.524233183, + "1": 0.7601658352, + "average": 0.1179663261 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": { + "0": 0.1776970083, + "1": 0.9512483383, + "average": 0.5644726733 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": { + "0": -0.5578563201, + "1": 0.8738149265, + "average": 0.1579793032 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": { + "0": -0.4785149282, + "1": 0.9011416499, + "average": 0.2113133608 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": { + "0": 0.632455532, + "1": 
0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": { + "0": 0.632455532, + "1": 0.316227766, + "average": 0.474341649 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": { + "0": 0.316227766, + "1": -0.632455532, + "average": -0.158113883 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": { + "0": 0.316227766, + "1": -0.632455532, + "average": -0.158113883 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": { + "0": 0.316227766, + "1": -0.632455532, + "average": -0.158113883 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": { + "0": 0.316227766, + "1": 0.632455532, + "average": 0.474341649 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": { + "0": 0.316227766, + "1": 0.632455532, + "average": 0.474341649 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": { + "0": 0.316227766, + "1": 0.632455532, + "average": 0.474341649 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": { + "0": 0.0, + "1": 0.632455532, + "average": 0.316227766 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": { + "0": 0.316227766, + "1": 0.632455532, + "average": 0.474341649 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": { + "0": 0.0, + "1": 0.632455532, + "average": 0.316227766 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": { + "0": 0.632455532, + "1": 0.632455532, + "average": 0.632455532 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": { + "0": 0.0, + "1": 0.632455532, + "average": 0.316227766 + }, + "('kendalltau', 'human', 'new', 
'bertscore-sentence-mnli-deberta-not_neutral', 'R')": { + "0": -0.632455532, + "1": -0.316227766, + "average": -0.474341649 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": { + "0": -0.632455532, + "1": 0.316227766, + "average": -0.158113883 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": { + "0": 0.0, + "1": 0.632455532, + "average": 0.316227766 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": { + "0": -0.632455532, + "1": 0.632455532, + "average": 0.0 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": { + "0": -0.632455532, + "1": 0.632455532, + "average": 0.0 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": { + "0": 0.0, + "1": 0.632455532, + "average": 0.316227766 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": { + "0": -0.632455532, + "1": 0.632455532, + "average": 0.0 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": { + "0": -0.632455532, + "1": 0.632455532, + "average": 0.0 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": { + "0": 0.7071067812, + "1": 0.3535533906, + "average": 0.5303300859 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": { + "0": 0.3535533906, + "1": -0.7071067812, + "average": -0.1767766953 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": { + "0": 0.3535533906, + "1": -0.7071067812, + "average": -0.1767766953 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": { + "0": 0.3535533906, + "1": -0.7071067812, + "average": -0.1767766953 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": { + "0": 0.3535533906, + "1": 0.7071067812, + "average": 0.5303300859 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": { + "0": 0.3535533906, + "1": 0.7071067812, + "average": 0.5303300859 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": { + "0": 0.3535533906, + "1": 0.7071067812, + "average": 0.5303300859 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": { + "0": 0.7071067812, + "1": 
0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": { + "0": 0.0, + "1": 0.7071067812, + "average": 0.3535533906 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": { + "0": 0.3535533906, + "1": 0.7071067812, + "average": 0.5303300859 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": { + "0": 0.0, + "1": 0.7071067812, + "average": 0.3535533906 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": { + "0": 0.7071067812, + "1": 0.7071067812, + "average": 0.7071067812 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": { + "0": 0.0, + "1": 0.7071067812, + "average": 0.3535533906 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": { + "0": -0.7071067812, + "1": -0.3535533906, + "average": -0.5303300859 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": { + "0": -0.7071067812, + "1": 0.3535533906, + "average": -0.1767766953 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": { + "0": 0.0, + "1": 0.7071067812, + "average": 0.3535533906 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": { + "0": -0.7071067812, + "1": 0.7071067812, + "average": 0.0 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": { + "0": -0.7071067812, + "1": 0.7071067812, + "average": 0.0 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": { + "0": 0.0, + "1": 0.7071067812, + "average": 0.3535533906 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": { + "0": -0.7071067812, + "1": 0.7071067812, + "average": 0.0 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": { + "0": -0.7071067812, + "1": 0.7071067812, + "average": 0.0 + } +} \ No newline at end of file diff --git a/results/sample/frank_summary.txt b/results/sample/frank_summary.txt new file mode 100644 index 0000000..842d431 --- /dev/null +++ b/results/sample/frank_summary.txt @@ -0,0 +1,100 @@ +corr_metric aspect approach model score_name +pearsonr human new bertscore-sentence-cos-mpnet P 0.768 + R 0.909 + F 0.937 + bertscore-sentence-cos-roberta P 0.802 + R 0.763 + F 0.846 + bertscore-sentence-mnli-roberta-not_neutral P -0.315 + R -0.235 + F -0.258 + bertscore-sentence-mnli-roberta-entail_only P 0.661 + R 0.760 + F 0.761 + bertscore-sentence-mnli-roberta-entail_contradict P 0.676 + R 0.829 + F 0.793 + 
bertscore-sentence-mnli-bart-not_neutral P 0.383 + R 0.529 + F 0.520 + bertscore-sentence-mnli-bart-entail_only P 0.938 + R 0.804 + F 0.830 + bertscore-sentence-mnli-bart-entail_contradict P 0.943 + R 0.833 + F 0.857 + bertscore-sentence-mnli-deberta-not_neutral P 0.361 + R -0.512 + F -0.167 + bertscore-sentence-mnli-deberta-entail_only P 0.567 + R 0.053 + F 0.118 + bertscore-sentence-mnli-deberta-entail_contradict P 0.564 + R 0.158 + F 0.211 +kendalltau human new bertscore-sentence-cos-mpnet P 0.632 + R 0.632 + F 0.632 + bertscore-sentence-cos-roberta P 0.632 + R 0.474 + F 0.632 + bertscore-sentence-mnli-roberta-not_neutral P -0.158 + R -0.158 + F -0.158 + bertscore-sentence-mnli-roberta-entail_only P 0.474 + R 0.632 + F 0.474 + bertscore-sentence-mnli-roberta-entail_contradict P 0.474 + R 0.632 + F 0.632 + bertscore-sentence-mnli-bart-not_neutral P 0.316 + R 0.474 + F 0.316 + bertscore-sentence-mnli-bart-entail_only P 0.632 + R 0.632 + F 0.632 + bertscore-sentence-mnli-bart-entail_contradict P 0.632 + R 0.632 + F 0.632 + bertscore-sentence-mnli-deberta-not_neutral P 0.316 + R -0.474 + F -0.158 + bertscore-sentence-mnli-deberta-entail_only P 0.316 + R 0.000 + F 0.000 + bertscore-sentence-mnli-deberta-entail_contradict P 0.316 + R 0.000 + F 0.000 +spearmanr human new bertscore-sentence-cos-mpnet P 0.707 + R 0.707 + F 0.707 + bertscore-sentence-cos-roberta P 0.707 + R 0.530 + F 0.707 + bertscore-sentence-mnli-roberta-not_neutral P -0.177 + R -0.177 + F -0.177 + bertscore-sentence-mnli-roberta-entail_only P 0.530 + R 0.707 + F 0.530 + bertscore-sentence-mnli-roberta-entail_contradict P 0.530 + R 0.707 + F 0.707 + bertscore-sentence-mnli-bart-not_neutral P 0.354 + R 0.530 + F 0.354 + bertscore-sentence-mnli-bart-entail_only P 0.707 + R 0.707 + F 0.707 + bertscore-sentence-mnli-bart-entail_contradict P 0.707 + R 0.707 + F 0.707 + bertscore-sentence-mnli-deberta-not_neutral P 0.354 + R -0.530 + F -0.177 + bertscore-sentence-mnli-deberta-entail_only P 0.354 + R 0.000 + F 0.000 + bertscore-sentence-mnli-deberta-entail_contradict P 0.354 + R 0.000 + F 0.000 \ No newline at end of file diff --git a/results/sample/frank_system.json b/results/sample/frank_system.json new file mode 100644 index 0000000..799ca47 --- /dev/null +++ b/results/sample/frank_system.json @@ -0,0 +1,101 @@ +{ + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": 1.0, + "('pearsonr', 'human', 'new', 
'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": -1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": 1.0, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": 1.0, + "('kendalltau', 'human', 'new', 
'bertscore-sentence-mnli-bart-entail_only', 'F')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": -1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": 1.0, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": -1.0, + "('spearmanr', 
'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": -1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": 1.0, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": 1.0 +} \ No newline at end of file diff --git a/results/sample/frank_system.txt b/results/sample/frank_system.txt new file mode 100644 index 0000000..c8c8909 --- /dev/null +++ b/results/sample/frank_system.txt @@ -0,0 +1,100 @@ +corr_metric aspect approach model score_name +pearsonr human new bertscore-sentence-cos-mpnet P 1.000 + R -1.000 + F -1.000 + bertscore-sentence-cos-roberta P 1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-roberta-not_neutral P -1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-roberta-entail_only P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-roberta-entail_contradict P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-bart-not_neutral P 1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-bart-entail_only P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-bart-entail_contradict P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-deberta-not_neutral P -1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-deberta-entail_only P -1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-deberta-entail_contradict P -1.000 + R 1.000 + F 1.000 +kendalltau human new bertscore-sentence-cos-mpnet P 1.000 + R -1.000 + F -1.000 + bertscore-sentence-cos-roberta P 1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-roberta-not_neutral P -1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-roberta-entail_only P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-roberta-entail_contradict P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-bart-not_neutral P 1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-bart-entail_only P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-bart-entail_contradict P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-deberta-not_neutral P -1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-deberta-entail_only P -1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-deberta-entail_contradict P -1.000 + R 1.000 + F 1.000 +spearmanr human new bertscore-sentence-cos-mpnet P 1.000 + R -1.000 + F -1.000 + bertscore-sentence-cos-roberta P 1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-roberta-not_neutral P -1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-roberta-entail_only P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-roberta-entail_contradict P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-bart-not_neutral P 1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-bart-entail_only P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-bart-entail_contradict P 1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-deberta-not_neutral P -1.000 + R -1.000 + F -1.000 + bertscore-sentence-mnli-deberta-entail_only P -1.000 + R 1.000 + F 1.000 + bertscore-sentence-mnli-deberta-entail_contradict P -1.000 + R 1.000 + F 1.000 \ No newline at end of file diff --git a/results/sample/qags_summary.json b/results/sample/qags_summary.json new file mode 100644 index 0000000..5c12b2b 
--- /dev/null +++ b/results/sample/qags_summary.json @@ -0,0 +1,398 @@ +{ + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": { + "0": 0.392532985, + "average": 0.392532985 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": { + "0": 0.2435308866, + "average": 0.2435308866 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": { + "0": 0.3042979733, + "average": 0.3042979733 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": { + "0": 0.4129099992, + "average": 0.4129099992 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": { + "0": 0.166766945, + "average": 0.166766945 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": { + "0": 0.2525273414, + "average": 0.2525273414 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": { + "0": -0.1711217893, + "average": -0.1711217893 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": { + "0": -0.175536578, + "average": -0.175536578 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": { + "0": -0.1655423697, + "average": -0.1655423697 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": { + "0": 0.2813473235, + "average": 0.2813473235 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": { + "0": 0.2935981135, + "average": 0.2935981135 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": { + "0": 0.2976186778, + "average": 0.2976186778 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": { + "0": 0.2874685137, + "average": 0.2874685137 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": { + "0": 0.3047171654, + "average": 0.3047171654 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": { + "0": 0.304254812, + "average": 0.304254812 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": { + "0": 0.0203709744, + "average": 0.0203709744 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": { + "0": -0.3128092426, + "average": -0.3128092426 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": { + "0": -0.2909581206, + "average": -0.2909581206 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": { + "0": 0.2757640792, + "average": 0.2757640792 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": { + "0": 0.3734603417, + "average": 0.3734603417 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": { + "0": 0.3633644666, + "average": 0.3633644666 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": { + "0": 0.297161942, + "average": 0.297161942 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": { + "0": 0.4032365407, + "average": 0.4032365407 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": { + "0": 0.3978617381, + "average": 0.3978617381 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": { + "0": 0.0119973098, + "average": 0.0119973098 + }, + "('pearsonr', 'human', 'new', 
'bertscore-sentence-mnli-deberta-not_neutral', 'R')": { + "0": -0.2230155052, + "average": -0.2230155052 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": { + "0": -0.2021040946, + "average": -0.2021040946 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": { + "0": 0.2609029888, + "average": 0.2609029888 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": { + "0": 0.3157773364, + "average": 0.3157773364 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": { + "0": 0.3138601256, + "average": 0.3138601256 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": { + "0": 0.2761733855, + "average": 0.2761733855 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": { + "0": 0.36250906, + "average": 0.36250906 + }, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": { + "0": 0.3605971503, + "average": 0.3605971503 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": { + "0": 0.3385555834, + "average": 0.3385555834 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": { + "0": 0.0967301667, + "average": 0.0967301667 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": { + "0": 0.0967301667, + "average": 0.0967301667 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": { + "0": 0.2901905, + "average": 0.2901905 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": { + "0": 0.0967301667, + "average": 0.0967301667 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": { + "0": 0.0483650833, + "average": 0.0483650833 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": { + "0": 0.0483650833, + "average": 0.0483650833 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": { + "0": -0.1934603334, + "average": -0.1934603334 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": { + "0": -0.1934603334, + "average": -0.1934603334 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": { + "0": 0.3385555834, + "average": 0.3385555834 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": { + "0": 0.2901905, + "average": 0.2901905 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": { + "0": 0.3385555834, + "average": 0.3385555834 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": { + "0": 0.2901905, + "average": 0.2901905 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": { + "0": 0.3385555834, + "average": 0.3385555834 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": { + "0": 0.3869206667, + "average": 0.3869206667 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": { + "0": 0.1934603334, + "average": 0.1934603334 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": { + "0": -0.3385555834, + "average": -0.3385555834 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": { + "0": -0.3385555834, + 
"average": -0.3385555834 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": { + "0": 0.14509525, + "average": 0.14509525 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": { + "0": 0.2901905, + "average": 0.2901905 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": { + "0": 0.2901905, + "average": 0.2901905 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": { + "0": 0.1934603334, + "average": 0.1934603334 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": { + "0": 0.2901905, + "average": 0.2901905 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": { + "0": 0.2901905, + "average": 0.2901905 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": { + "0": 0.2418254167, + "average": 0.2418254167 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": { + "0": -0.0967301667, + "average": -0.0967301667 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": { + "0": -0.0967301667, + "average": -0.0967301667 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": { + "0": 0.1934603334, + "average": 0.1934603334 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": { + "0": 0.2418254167, + "average": 0.2418254167 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": { + "0": 0.2901905, + "average": 0.2901905 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": { + "0": 0.1934603334, + "average": 0.1934603334 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": { + "0": 0.1934603334, + "average": 0.1934603334 + }, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": { + "0": 0.1934603334, + "average": 0.1934603334 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": { + "0": 0.4300756154, + "average": 0.4300756154 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": { + "0": 0.093494699, + "average": 0.093494699 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": { + "0": 0.1371255585, + "average": 0.1371255585 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": { + "0": 0.3926777358, + "average": 0.3926777358 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": { + "0": 0.1308925786, + "average": 0.1308925786 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": { + "0": 0.0373978796, + "average": 0.0373978796 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": { + "0": 0.1059606589, + "average": 0.1059606589 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": { + "0": -0.3116489967, + "average": -0.3116489967 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": { + "0": -0.3116489967, + "average": -0.3116489967 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": { + "0": 0.4363085953, + "average": 0.4363085953 + }, + "('spearmanr', 'human', 'new', 
'bertscore-sentence-mnli-roberta-entail_only', 'R')": { + "0": 0.3802117759, + "average": 0.3802117759 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": { + "0": 0.4051436957, + "average": 0.4051436957 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": { + "0": 0.373978796, + "average": 0.373978796 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": { + "0": 0.4425415753, + "average": 0.4425415753 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": { + "0": 0.467473495, + "average": 0.467473495 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": { + "0": 0.2119213177, + "average": 0.2119213177 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": { + "0": -0.4737064749, + "average": -0.4737064749 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": { + "0": -0.4737064749, + "average": -0.4737064749 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": { + "0": 0.2181542977, + "average": 0.2181542977 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": { + "0": 0.3428138963, + "average": 0.3428138963 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": { + "0": 0.3428138963, + "average": 0.3428138963 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": { + "0": 0.2493191973, + "average": 0.2493191973 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": { + "0": 0.3428138963, + "average": 0.3428138963 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": { + "0": 0.3428138963, + "average": 0.3428138963 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": { + "0": 0.280484097, + "average": 0.280484097 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": { + "0": -0.1495915184, + "average": -0.1495915184 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": { + "0": -0.1059606589, + "average": -0.1059606589 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": { + "0": 0.3054160167, + "average": 0.3054160167 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": { + "0": 0.3116489967, + "average": 0.3116489967 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": { + "0": 0.373978796, + "average": 0.373978796 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": { + "0": 0.3054160167, + "average": 0.3054160167 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": { + "0": 0.280484097, + "average": 0.280484097 + }, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": { + "0": 0.280484097, + "average": 0.280484097 + } +} \ No newline at end of file diff --git a/results/sample/qags_summary.txt b/results/sample/qags_summary.txt new file mode 100644 index 0000000..e964f38 --- /dev/null +++ b/results/sample/qags_summary.txt @@ -0,0 +1,100 @@ +corr_metric aspect approach model score_name +pearsonr human new bertscore-sentence-cos-mpnet P 
0.393 + R 0.244 + F 0.304 + bertscore-sentence-cos-roberta P 0.413 + R 0.167 + F 0.253 + bertscore-sentence-mnli-roberta-not_neutral P -0.171 + R -0.176 + F -0.166 + bertscore-sentence-mnli-roberta-entail_only P 0.281 + R 0.294 + F 0.298 + bertscore-sentence-mnli-roberta-entail_contradict P 0.287 + R 0.305 + F 0.304 + bertscore-sentence-mnli-bart-not_neutral P 0.020 + R -0.313 + F -0.291 + bertscore-sentence-mnli-bart-entail_only P 0.276 + R 0.373 + F 0.363 + bertscore-sentence-mnli-bart-entail_contradict P 0.297 + R 0.403 + F 0.398 + bertscore-sentence-mnli-deberta-not_neutral P 0.012 + R -0.223 + F -0.202 + bertscore-sentence-mnli-deberta-entail_only P 0.261 + R 0.316 + F 0.314 + bertscore-sentence-mnli-deberta-entail_contradict P 0.276 + R 0.363 + F 0.361 +kendalltau human new bertscore-sentence-cos-mpnet P 0.339 + R 0.097 + F 0.097 + bertscore-sentence-cos-roberta P 0.290 + R 0.097 + F 0.048 + bertscore-sentence-mnli-roberta-not_neutral P 0.048 + R -0.193 + F -0.193 + bertscore-sentence-mnli-roberta-entail_only P 0.339 + R 0.290 + F 0.339 + bertscore-sentence-mnli-roberta-entail_contradict P 0.290 + R 0.339 + F 0.387 + bertscore-sentence-mnli-bart-not_neutral P 0.193 + R -0.339 + F -0.339 + bertscore-sentence-mnli-bart-entail_only P 0.145 + R 0.290 + F 0.290 + bertscore-sentence-mnli-bart-entail_contradict P 0.193 + R 0.290 + F 0.290 + bertscore-sentence-mnli-deberta-not_neutral P 0.242 + R -0.097 + F -0.097 + bertscore-sentence-mnli-deberta-entail_only P 0.193 + R 0.242 + F 0.290 + bertscore-sentence-mnli-deberta-entail_contradict P 0.193 + R 0.193 + F 0.193 +spearmanr human new bertscore-sentence-cos-mpnet P 0.430 + R 0.093 + F 0.137 + bertscore-sentence-cos-roberta P 0.393 + R 0.131 + F 0.037 + bertscore-sentence-mnli-roberta-not_neutral P 0.106 + R -0.312 + F -0.312 + bertscore-sentence-mnli-roberta-entail_only P 0.436 + R 0.380 + F 0.405 + bertscore-sentence-mnli-roberta-entail_contradict P 0.374 + R 0.443 + F 0.467 + bertscore-sentence-mnli-bart-not_neutral P 0.212 + R -0.474 + F -0.474 + bertscore-sentence-mnli-bart-entail_only P 0.218 + R 0.343 + F 0.343 + bertscore-sentence-mnli-bart-entail_contradict P 0.249 + R 0.343 + F 0.343 + bertscore-sentence-mnli-deberta-not_neutral P 0.280 + R -0.150 + F -0.106 + bertscore-sentence-mnli-deberta-entail_only P 0.305 + R 0.312 + F 0.374 + bertscore-sentence-mnli-deberta-entail_contradict P 0.305 + R 0.280 + F 0.280 \ No newline at end of file diff --git a/results/sample/qags_system.json b/results/sample/qags_system.json new file mode 100644 index 0000000..05539ba --- /dev/null +++ b/results/sample/qags_system.json @@ -0,0 +1,101 @@ +{ + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": 0.392532985, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": 0.2435308866, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": 0.3042979733, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": 0.4129099992, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": 0.166766945, + "('pearsonr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": 0.2525273414, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": -0.1711217893, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": -0.175536578, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": -0.1655423697, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": 0.2813473235, + 
"('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": 0.2935981135, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": 0.2976186778, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": 0.2874685137, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": 0.3047171654, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": 0.304254812, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": 0.0203709744, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": -0.3128092426, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": -0.2909581206, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": 0.2757640792, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": 0.3734603417, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": 0.3633644666, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": 0.297161942, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": 0.4032365407, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": 0.3978617381, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": 0.0119973098, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": -0.2230155052, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": -0.2021040946, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": 0.2609029888, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": 0.3157773364, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": 0.3138601256, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": 0.2761733855, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": 0.36250906, + "('pearsonr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": 0.3605971503, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": 0.3385555834, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": 0.0967301667, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": 0.0967301667, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": 0.2901905, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": 0.0967301667, + "('kendalltau', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": 0.0483650833, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": 0.0483650833, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": -0.1934603334, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": -0.1934603334, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": 0.3385555834, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": 0.2901905, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": 0.3385555834, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": 
0.2901905, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": 0.3385555834, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": 0.3869206667, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'P')": 0.1934603334, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": -0.3385555834, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": -0.3385555834, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": 0.14509525, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": 0.2901905, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": 0.2901905, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": 0.1934603334, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": 0.2901905, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": 0.2901905, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": 0.2418254167, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": -0.0967301667, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": -0.0967301667, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": 0.1934603334, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": 0.2418254167, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": 0.2901905, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": 0.1934603334, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": 0.1934603334, + "('kendalltau', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": 0.1934603334, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P')": 0.4300756154, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'R')": 0.093494699, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'F')": 0.1371255585, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'P')": 0.3926777358, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'R')": 0.1308925786, + "('spearmanr', 'human', 'new', 'bertscore-sentence-cos-roberta', 'F')": 0.0373978796, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'P')": 0.1059606589, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'R')": -0.3116489967, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-not_neutral', 'F')": -0.3116489967, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'P')": 0.4363085953, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'R')": 0.3802117759, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_only', 'F')": 0.4051436957, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'P')": 0.373978796, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'R')": 0.4425415753, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-roberta-entail_contradict', 'F')": 0.467473495, + "('spearmanr', 'human', 'new', 
'bertscore-sentence-mnli-bart-not_neutral', 'P')": 0.2119213177, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'R')": -0.4737064749, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-not_neutral', 'F')": -0.4737064749, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'P')": 0.2181542977, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'R')": 0.3428138963, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_only', 'F')": 0.3428138963, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'P')": 0.2493191973, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'R')": 0.3428138963, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-bart-entail_contradict', 'F')": 0.3428138963, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'P')": 0.280484097, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'R')": -0.1495915184, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-not_neutral', 'F')": -0.1059606589, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'P')": 0.3054160167, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'R')": 0.3116489967, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_only', 'F')": 0.373978796, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'P')": 0.3054160167, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'R')": 0.280484097, + "('spearmanr', 'human', 'new', 'bertscore-sentence-mnli-deberta-entail_contradict', 'F')": 0.280484097 +} \ No newline at end of file diff --git a/results/sample/qags_system.txt b/results/sample/qags_system.txt new file mode 100644 index 0000000..e964f38 --- /dev/null +++ b/results/sample/qags_system.txt @@ -0,0 +1,100 @@ +corr_metric aspect approach model score_name +pearsonr human new bertscore-sentence-cos-mpnet P 0.393 + R 0.244 + F 0.304 + bertscore-sentence-cos-roberta P 0.413 + R 0.167 + F 0.253 + bertscore-sentence-mnli-roberta-not_neutral P -0.171 + R -0.176 + F -0.166 + bertscore-sentence-mnli-roberta-entail_only P 0.281 + R 0.294 + F 0.298 + bertscore-sentence-mnli-roberta-entail_contradict P 0.287 + R 0.305 + F 0.304 + bertscore-sentence-mnli-bart-not_neutral P 0.020 + R -0.313 + F -0.291 + bertscore-sentence-mnli-bart-entail_only P 0.276 + R 0.373 + F 0.363 + bertscore-sentence-mnli-bart-entail_contradict P 0.297 + R 0.403 + F 0.398 + bertscore-sentence-mnli-deberta-not_neutral P 0.012 + R -0.223 + F -0.202 + bertscore-sentence-mnli-deberta-entail_only P 0.261 + R 0.316 + F 0.314 + bertscore-sentence-mnli-deberta-entail_contradict P 0.276 + R 0.363 + F 0.361 +kendalltau human new bertscore-sentence-cos-mpnet P 0.339 + R 0.097 + F 0.097 + bertscore-sentence-cos-roberta P 0.290 + R 0.097 + F 0.048 + bertscore-sentence-mnli-roberta-not_neutral P 0.048 + R -0.193 + F -0.193 + bertscore-sentence-mnli-roberta-entail_only P 0.339 + R 0.290 + F 0.339 + bertscore-sentence-mnli-roberta-entail_contradict P 0.290 + R 0.339 + F 0.387 + bertscore-sentence-mnli-bart-not_neutral P 0.193 + R -0.339 + F -0.339 + bertscore-sentence-mnli-bart-entail_only P 0.145 + R 0.290 + F 0.290 + bertscore-sentence-mnli-bart-entail_contradict P 0.193 + R 0.290 + F 0.290 + bertscore-sentence-mnli-deberta-not_neutral P 0.242 + R -0.097 + F -0.097 + 
bertscore-sentence-mnli-deberta-entail_only P 0.193 + R 0.242 + F 0.290 + bertscore-sentence-mnli-deberta-entail_contradict P 0.193 + R 0.193 + F 0.193 +spearmanr human new bertscore-sentence-cos-mpnet P 0.430 + R 0.093 + F 0.137 + bertscore-sentence-cos-roberta P 0.393 + R 0.131 + F 0.037 + bertscore-sentence-mnli-roberta-not_neutral P 0.106 + R -0.312 + F -0.312 + bertscore-sentence-mnli-roberta-entail_only P 0.436 + R 0.380 + F 0.405 + bertscore-sentence-mnli-roberta-entail_contradict P 0.374 + R 0.443 + F 0.467 + bertscore-sentence-mnli-bart-not_neutral P 0.212 + R -0.474 + F -0.474 + bertscore-sentence-mnli-bart-entail_only P 0.218 + R 0.343 + F 0.343 + bertscore-sentence-mnli-bart-entail_contradict P 0.249 + R 0.343 + F 0.343 + bertscore-sentence-mnli-deberta-not_neutral P 0.280 + R -0.150 + F -0.106 + bertscore-sentence-mnli-deberta-entail_only P 0.305 + R 0.312 + F 0.374 + bertscore-sentence-mnli-deberta-entail_contradict P 0.305 + R 0.280 + F 0.280 \ No newline at end of file
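The result files added in this patch share one layout: keys are stringified Python tuples of (corr_metric, aspect, approach, model, score_name), and values are either a dict of per-item scores with an "average" entry (the *_summary.json files) or a single system-level float (the *_system.json files). As a minimal, illustrative sketch, not part of the patch itself, the snippet below shows one way such a file could be flattened into rows, assuming the paths from this diff and using ast.literal_eval to recover the tuple keys.

# Minimal sketch (not part of the patch): flatten one of the summary files
# from this diff into (corr_metric, aspect, approach, model, score_name, score)
# rows. Keys are stringified tuples, so ast.literal_eval recovers them; the
# column order mirrors the headers used in the .txt files.
import ast
import json

with open("results/sample/qags_summary.json") as f:  # path taken from the diff above
    summary = json.load(f)

rows = []
for key, value in summary.items():
    corr_metric, aspect, approach, model, score_name = ast.literal_eval(key)
    # *_summary.json values are dicts with per-item entries plus an "average";
    # *_system.json values are plain floats, so handle both cases.
    score = value["average"] if isinstance(value, dict) else value
    rows.append((corr_metric, aspect, approach, model, score_name, round(score, 3)))

# e.g. ('pearsonr', 'human', 'new', 'bertscore-sentence-cos-mpnet', 'P', 0.393)
for row in rows[:3]:
    print(row)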